## Importing data from kaggle api

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [2]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import zipfile
zip_ref = zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip','r')
zip_ref.extractall('/content')
zip_ref.close()

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df = pd.read_csv('/content/IMDB Dataset.csv')

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.shape

(50000, 2)

In [8]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df.duplicated().sum()

418

In [10]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Text Preprocessing

In [None]:
lemmatizer = WordNetLemmatizer()
corpus =[]
for i in df['review']:
  data = re.sub('[^a-zA-Z0-9\s]','',i)
  data = data.lower()
  data = data.split()
  data = [lemmatizer.lemmatize(words) for words in data if data not in (stopwords.words('english'))]
  data = ' '.join(data)
  corpus.append(data)

In [None]:
corpus[0]

'one of the other reviewer ha mentioned that after watching just 1 oz episode youll be hooked they are right a this is exactly what happened with mebr br the first thing that struck me about oz wa it brutality and unflinching scene of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pull no punch with regard to drug sex or violence it is hardcore in the classic use of the wordbr br it is called oz a that is the nickname given to the oswald maximum security state penitentary it focus mainly on emerald city an experimental section of the prison where all the cell have glass front and face inwards so privacy is not high on the agenda em city is home to manyaryans muslim gangsta latino christian italian irish and moreso scuffle death stare dodgy dealing and shady agreement are never far awaybr br i would say the main appeal of the show is due to the fact that it go where other show wouldnt dare forget pretty picture painted f

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

## Bag Of Words Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features =100 , binary =True)
bow_x = cv.fit_transform(corpus).toarray()

In [None]:
bow_x

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 0, 1, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 1, 1, 0],
       [1, 0, 1, ..., 1, 1, 1],
       [1, 0, 1, ..., 1, 1, 0]])

TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features =100)
tfidf_x = tfidf.fit_transform(corpus).toarray()

In [None]:
tfidf_x

array([[0.05397688, 0.07141969, 0.04814322, ..., 0.19564057, 0.06149708,
        0.09524377],
       [0.07161777, 0.        , 0.12775509, ..., 0.15574827, 0.        ,
        0.06318582],
       [0.        , 0.        , 0.        , ..., 0.13609133, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.05138611, ..., 0.04176375, 0.06563947,
        0.        ],
       [0.14384222, 0.        , 0.0641481 , ..., 0.15640794, 0.08194135,
        0.06345344],
       [0.07834559, 0.        , 0.06987824, ..., 0.05679312, 0.08926091,
        0.        ]])

## Word2Vec Vectorizer

In [None]:
import gensim
from gensim.utils import simple_preprocess
words =[]
for i in corpus:
  sent = nltk.sent_tokenize(i)
  for j in sent:
    words.append(simple_preprocess(j))

In [None]:
w2v_model = gensim.models.Word2Vec(window =5 ,min_count=2)

In [None]:
w2v_model.build_vocab(words)   ## Here we are building vocab from all words of our corpus

In [None]:
len(w2v_model.wv.index_to_key)  ## This is code to see vocab in our corpus 

71506

In [None]:
w2v_model.corpus_count     ### Number of review in our data

50000

In [None]:
w2v_model.epochs

5

In [None]:
w2v_model.train(words , total_examples = w2v_model.corpus_count , epochs = w2v_model.epochs)  ### it gives vector of each word in review of 100 dim

(41150208, 54294800)

In [None]:
w2v_model.wv['other']   ## Vector of 100 dim of word 'other'

array([-4.0355504e-01,  2.4510441e+00,  2.9365318e+00,  1.0776652e+00,
       -1.1645190e+00, -4.3283084e-01, -1.2502393e+00, -5.9843415e-01,
       -1.0824804e+00,  3.2787485e+00,  7.7678211e-02, -9.8848593e-01,
       -2.5411408e+00,  1.4787580e+00,  2.0636222e+00, -1.7818233e+00,
       -1.3950388e-01,  1.4231637e+00,  2.5152776e+00,  3.4201585e-02,
        2.3765235e+00, -5.8547634e-01,  5.7326376e-01,  1.3387384e+00,
        8.2657123e-01, -1.9806367e+00,  6.2419969e-01, -7.6227501e-02,
       -2.6120756e+00, -3.6505040e-02, -1.6088872e+00,  9.1866404e-01,
       -3.2975323e+00, -7.3956341e-01, -2.0650070e+00,  5.5257940e-01,
        9.1291332e-01,  3.7623718e-01,  1.1230074e+00,  4.5789677e-01,
        3.4273180e-01,  1.7347891e+00, -1.1574714e+00, -1.4668182e-01,
       -2.2518847e+00, -1.5986917e+00, -3.2391086e+00, -1.0136397e+00,
       -1.0036594e+00, -2.1952813e+00, -4.0731859e+00,  8.4589952e-01,
       -2.9147651e+00, -1.7700726e-01,  1.3197781e+00, -6.6831470e-01,
      

In [None]:
value =[]                                 ## Calculating Vector of entire review using average word2vec
vector = []
for i in corpus:
  for j in i.split():
    if j in w2v_model.wv.index_to_key:
      value.append(w2v_model.wv[j])

  vector.append((np.mean(value,axis=0)))


In [None]:
vector