In [None]:
import re, nltk, pandas as pd, numpy as np
from gensim.parsing.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
# Fetch the NLTK 'punkt' tokenizer package and the 'stopwords' corpus.
# NOTE(review): no cell visible in this notebook uses nltk afterwards (stop-word
# removal and stemming below go through gensim) - confirm these downloads are needed.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate through google.colab, then hand the application-default
# credentials to PyDrive so a Drive client can be created.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  # client used by the next cell to fetch the dataset

In [None]:
# Reference the dataset on Google Drive by its file id and save it next to the notebook.
downloaded = drive.CreateFile({'id':'12CUjW29tTTxYAcPhxuKb_qSn0UTzc4BR'}) # replace the id with id of file you want to access
downloaded.GetContentFile('imdb_sentiment.csv')  # local file name read by pd.read_csv below

In [None]:
# Load the downloaded reviews into a DataFrame and preview the first five rows.
# The recorded preview shows a 'review' text column and a 0/1 'sentiment' column.
data = pd.read_csv('imdb_sentiment.csv')
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# NOTE(review): repeats the preview already displayed by the cell that loads `data`;
# this cell looks redundant.
data.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


## Text Cleaning

In [None]:
# Normalise every review: lowercase -> strip punctuation -> drop stop words -> stem.
stemmer = PorterStemmer()
docs_clean = (
    data['review']
    .str.lower()
    # Hyphens become spaces so compounds such as "slow-moving" split into two words.
    .str.replace('-', ' ', regex=False)
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would silently leave all punctuation in place.
    # The raw string avoids the invalid '\w' / '\s' escape warnings.
    # Inside the character class '+' is a literal, so '+' signs survive along
    # with word characters and whitespace.
    .str.replace(r'[^\w+\s]', '', regex=True)
    .apply(remove_stopwords)  # gensim stop-word removal, applied per review
)
docs_clean = stemmer.stem_documents(docs_clean)  # Porter-stem each review; result is a list of strings
docs_clean[:5]


['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint ridicul act poor plot line non exist',
 'littl music speak',
 'best scene movi gerardo try song keep run head']

In [None]:
# Build a bag-of-words vectorizer with default settings, then learn its
# vocabulary from the cleaned reviews (fit returns the fitted vectorizer).
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(docs_clean)

In [None]:
# Vocabulary learned by the fitted vectorizer, one entry per token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. .tolist() keeps `vocab` a plain Python list
# so slicing and printing behave as before.
vocab = vectorizer.get_feature_names_out().tolist()

In [None]:
# Show the first ten vocabulary entries and the total vocabulary size.
# With the default tokenizer the first entries in the recorded output are numeric tokens.
print(vocab[:10])
print('length is :',len(vocab))

['010', '10', '1010', '110', '12', '13', '15', '17', '18th', '1928']
length is : 2373


In [None]:
# Display the current contents of `vocab`.
# NOTE(review): the recorded output lists 81 tokens starting at 'act', which matches
# vectorizer6 rather than the 2373-token vocabulary reported by the previous cell.
# That suggests the cell was last run out of order. On a top-to-bottom run it would
# print the entire vocabulary - consider slicing it.
vocab

['act',
 'actor',
 'actual',
 'art',
 'bad',
 'beauti',
 'believ',
 'best',
 'better',
 'big',
 'bore',
 'camera',
 'cast',
 'charact',
 'come',
 'dialogu',
 'didnt',
 'direct',
 'disappoint',
 'doesnt',
 'dont',
 'effect',
 'end',
 'enjoi',
 'excel',
 'feel',
 'film',
 'funni',
 'good',
 'great',
 'job',
 'know',
 'life',
 'like',
 'line',
 'littl',
 'look',
 'lot',
 'love',
 'make',
 'man',
 'minut',
 'movi',
 'music',
 'peopl',
 'perform',
 'piec',
 'plai',
 'plot',
 'predict',
 'pretti',
 'rate',
 'real',
 'recommend',
 'right',
 'saw',
 'scene',
 'script',
 'seen',
 'set',
 'short',
 'stori',
 'stupid',
 'terribl',
 'thing',
 'think',
 'thought',
 'time',
 'total',
 'understand',
 'wai',
 'want',
 'wast',
 'watch',
 'wonder',
 'work',
 'wors',
 'worst',
 'worth',
 'write',
 'year']

In [None]:
# Alternative tokenisation: token_pattern='[a-z]+' keeps only runs of lowercase
# letters, so numeric tokens disappear and single-letter tokens such as 'a' are allowed.
vectorizer2 = CountVectorizer(token_pattern='[a-z]+').fit(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps `vocab` a plain list.
vocab = vectorizer2.get_feature_names_out().tolist()
print(vocab[:10])
print('length is :',len(vocab))

['a', 'aailiyah', 'abandon', 'abil', 'abroad', 'absolut', 'abstrus', 'abysm', 'academi', 'accent']
length is : 2345


In [None]:
# Keep only whole words made of 3 to 10 lowercase letters.
# The \b anchors matter: without them a longer word is not skipped but chopped into
# pieces of at most 10 letters (the recorded outputs contain fragments such as
# 'cinematogr' and 'aphi'), which pollutes the vocabulary.
vectorizer3 = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b').fit(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer3.get_feature_names_out().tolist()
print(vocab[:10])
print('length is :',len(vocab))

['aailiyah', 'abandon', 'abil', 'abroad', 'absolut', 'abstrus', 'abysm', 'academi', 'accent', 'access']
length is : 2304


In [None]:
# min_df=5 discards every token that occurs in fewer than 5 documents, which
# removes rare tokens and shrinks the vocabulary.
# The \b anchors make the pattern match whole 3-10 letter words only; unanchored,
# longer words were split into fragments (e.g. 'aphi' in the recorded output).
vectorizer4 = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b', min_df=5).fit(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer4.get_feature_names_out().tolist()
print(vocab[:10])
print('length is :',len(vocab))

['absolut', 'act', 'action', 'actor', 'actress', 'actual', 'amaz', 'annoi', 'aphi', 'art']
length is : 255


* length reduced drastically from being 2304 to 255

In [None]:
# Stricter rarity filter: min_df=10 keeps only tokens found in at least 10 documents.
# The \b anchors make the pattern match whole 3-10 letter words only, instead of
# slicing longer words into fragments.
vectorizer5 = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b', min_df=10).fit(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer5.get_feature_names_out().tolist()
print(vocab[:10])
print('length is :',len(vocab))

['act', 'actor', 'actual', 'art', 'bad', 'beauti', 'believ', 'best', 'better', 'big']
length is : 82


In [None]:
# Same filters as before, plus stop_words='english' to drop scikit-learn's built-in
# English stop words from the vocabulary.
# The \b anchors make the pattern match whole 3-10 letter words only, instead of
# slicing longer words into fragments.
# NOTE(review): anchoring should also silence the stop-word/tokenizer inconsistency
# warning visible in the recorded output - confirm on re-run.
vectorizer6 = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b', min_df=10, stop_words='english').fit(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer6.get_feature_names_out().tolist()
print(vocab[:10])
print('length is :',len(vocab))

['act', 'actor', 'actual', 'art', 'bad', 'beauti', 'believ', 'best', 'better', 'big']
length is : 81


  'stop_words.' % sorted(inconsistent))


* So far we have only extracted valid tokens using get_feature_names()

### Tranforming text doc in to document term matrix

In [None]:
# Encode every cleaned review as token counts over vectorizer6's vocabulary.
dtm = vectorizer6.transform(raw_documents=docs_clean)
dtm  # echo the sparse-matrix summary (shape, dtype, stored elements)

<748x81 sparse matrix of type '<class 'numpy.int64'>'
	with 1711 stored elements in Compressed Sparse Row format>

In [None]:
dtm.toarray() # expand the sparse document-term matrix into a dense NumPy array for inspection

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Wrap the dense counts in a DataFrame; columns are plain integer positions here.
# Caution: densifying a large sparse matrix can exhaust memory on bigger datasets.
df_dtm = pd.DataFrame(data=dtm.toarray())
df_dtm.shape

(748, 81)

In [None]:
# Preview the first rows; column labels are integer positions because no token
# names were passed when the frame was built.
df_dtm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Label the count columns with their tokens.
# The names are read from vectorizer6 itself (the vectorizer that produced `dtm`)
# rather than from the shared `vocab` variable. Every vectorizer cell above overwrites
# `vocab`, so it only matches `dtm` when the cells happen to run in order.
# get_feature_names_out() is the scikit-learn >= 1.0 accessor for the token names.
df_dtm = pd.DataFrame(dtm.toarray(), columns=vectorizer6.get_feature_names_out())
df_dtm.head()

Unnamed: 0,act,actor,actual,art,bad,beauti,believ,best,better,big,bore,camera,cast,charact,come,dialogu,didnt,direct,disappoint,doesnt,dont,effect,end,enjoi,excel,feel,film,funni,good,great,job,know,life,like,line,littl,look,lot,love,make,...,minut,movi,music,peopl,perform,piec,plai,plot,predict,pretti,rate,real,recommend,right,saw,scene,script,seen,set,short,stori,stupid,terribl,thing,think,thought,time,total,understand,wai,want,wast,watch,wonder,work,wors,worst,worth,write,year
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Bigram Vectorizer** - 
* single token - two words
* using CountVectorizer

In [None]:
# Bigram features: ngram_range=(2,2) makes every feature a pair of adjacent tokens.
# min_df=2 keeps only pairs that occur in at least two documents.
# The \b anchors restrict tokens to whole 3-10 letter words; unanchored, longer
# words were split and produced bogus pairs such as 'cinematogr aphi'.
bigram_vectorizer = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b', min_df=2,
                                    stop_words='english',
                                    ngram_range=(2,2)).fit(docs_clean)
bigram_dtm = bigram_vectorizer.transform(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
bigram_vocab = bigram_vectorizer.get_feature_names_out().tolist()

# Dense view with one column per bigram (fine at this size; avoid for large corpora).
df_dtm_bigram = pd.DataFrame(bigram_dtm.toarray(), columns=bigram_vocab)
print(df_dtm_bigram.shape)
df_dtm_bigram.head()

(748, 202)


  'stop_words.' % sorted(inconsistent))


Unnamed: 0,accus murder,act bad,act suck,action movi,action scene,actor plai,actor screen,aerial scene,amaz film,appreci subtl,art movi,avoid cost,bad act,bad bad,bad direct,bad film,bad idea,bad movi,bad review,bad script,best film,best movi,best scene,big fan,bit predict,black white,bore movi,bother movi,camera angl,camera work,cast good,cast plai,charact film,charact man,charact plai,cinematogr aphi,clever camera,cover girl,crowd pleaser,cult classic,...,script bad,script big,seen entir,seen movi,self indulg,short film,site film,solid act,sound effect,sound like,south africa,special effect,start watch,stori line,stori stupid,suck act,terrif cast,thing happen,think best,think film,think peopl,thoroughli enjoi,thought provok,time film,time movi,time watch,tom wilkinson,total believ,turn good,volcano angel,wast monei,wast time,watch film,watch movi,wind lion,wonder film,wonder stori,worst seri,worth check,year old
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Trigram Vectorizer** - 
* single token - three words
* using CountVectorizer

In [None]:
# Trigram features: ngram_range=(3,3) makes every feature a run of three adjacent tokens.
# min_df=2 keeps only triples that occur in at least two documents.
# The \b anchors restrict tokens to whole 3-10 letter words instead of slicing
# longer words into fragments.
trigram_vectorizer = CountVectorizer(token_pattern=r'\b[a-z]{3,10}\b', min_df=2,
                                    stop_words='english',
                                    ngram_range=(3,3)).fit(docs_clean)
trigram_dtm = trigram_vectorizer.transform(docs_clean)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
trigram_vocab = trigram_vectorizer.get_feature_names_out().tolist()

# Dense view with one column per trigram.
df_dtm_trigram = pd.DataFrame(trigram_dtm.toarray(), columns=trigram_vocab)
print(df_dtm_trigram.shape)
df_dtm_trigram.head()

  'stop_words.' % sorted(inconsistent))


(748, 9)


Unnamed: 0,best scene movi,definit worth check,dont wast time,errol flynn brilliant,film great director,good film great,movi recent year,seen entir life,wast time watch
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
