## **Sentiment Analysis on IMDB Movie Review Data**

In [2]:
import pandas as pd
# Read the movie data CSV into a DataFrame.
df = pd.read_csv('/content/data/movie_data.csv')

In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents used to demonstrate the bag-of-words model.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary from the documents and build the term-count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [7]:
# Show the mapping from each learned term to its column index.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [8]:
# Show the dense term-count matrix: one row per document, one column per term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with smoothed, L2-normalized tf-idf and show them.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
print(
    tfidf.fit_transform(
        count.fit_transform(docs)
    ).toarray()
)

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


In [52]:
# Peek at the last 99 characters of the first review.
# NOTE(review): this cell's execution count (52) is higher than that of the cells
# below it, so it appears to have been run after them -- presumably after the
# preprocessing step, which would explain the already-cleaned text in the output.
# Confirm with Restart & Run All.
df.loc[0,'review'][-99:]

'xperiment das weisse rauschen muxmã uschenstill out of ten because of the topic and the photography'

In [14]:
import re
def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    Steps:
      1. Remove anything that looks like an HTML/XML tag.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with any ``-`` nose removed),
         space-separated, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings keep the regex escapes (\), \( and \W) from being parsed as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # If the cleaned text ends in a word, add a separator so the last word
    # does not fuse with the first emoticon (e.g. 'love:)').
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix


In [16]:
# Quick check: the stray closing tag should be stripped from the result.
preprocessor("</a>hello world")

'hello world'

In [18]:
# Clean every review, replacing the original text in the 'review' column.
df['review'] = df['review'].apply(preprocessor)

In [22]:
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

In [24]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [25]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [26]:
# Demo: plain whitespace tokenization leaves each word form untouched.
tokenizer('runners like running and ths they run')

['runners', 'like', 'running', 'and', 'ths', 'they', 'run']

In [27]:
# Demo: the stemming tokenizer reduces words to their stems (e.g. 'running' -> 'run').
tokenizer_porter('runners like running and thus they')

['runner', 'like', 'run', 'and', 'thu', 'they']

In [28]:
import nltk 
# Download the NLTK stop-word lists used in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [30]:

from nltk.corpus import stopwords

# English stop words from NLTK.
stop = stopwords.words('english')

# Demo: stem a sample sentence, then drop any stems that are stop words.
[
    stem
    for stem in tokenizer_porter('a running like running')[-10:]
    if stem not in stop
]

['run', 'like', 'run']

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build tf-idf features from the review text. Lowercasing and the built-in
# preprocessor are disabled (the reviews are presumably already cleaned by the
# earlier preprocessing cell), and tokens are Porter-stemmed.
# NOTE(review): the `stop` list built earlier is not passed as stop_words here.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
    tokenizer=tokenizer_porter,
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# NOTE(review): the vectorizer is fit on every review before the train/test
# split made in a later cell, so the idf weights also see the test documents.
X = tfidf.fit_transform(df.review)
y = df.sentiment.values

In [39]:
import pickle

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Split the rows 50/50 into train and test sets. With shuffle=False the split
# is purely positional, so random_state has no effect on the result.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.5, shuffle=False
)

# Fit logistic regression with built-in 5-fold cross-validation scored on
# accuracy, using all available cores.
clf = LogisticRegressionCV(
    cv=5,
    scoring='accuracy',
    random_state=0,
    n_jobs=-1,
    verbose=3,
    max_iter=300,
).fit(X_train, y_train)

# Persist the fitted model. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open('/content/saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.6min finished


In [40]:
# Reload the model from the same absolute path it was written to, so loading
# does not depend on the notebook's current working directory.
filename = '/content/saved_model.sav'

# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The context manager ensures the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

In [41]:
# Score the reloaded model on the held-out test split.
saved_clf.score(X_test,y_test)

0.8969381628977386