# Home task: Sentiment analysis

In [25]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [26]:
# Load the negative-polarity reviews; the file stores one review per line.
fn = './data/rt-polarity.neg'

# The file contains a few invalid symbols, so undecodable bytes are
# skipped (errors='ignore') rather than raising a decode error.
with open(fn, mode="r", encoding='utf-8', errors='ignore') as reviews_file:
    content = reviews_file.read()

texts_neg = content.splitlines()
n_neg = len(texts_neg)
print('len of texts_neg = {:,}'.format(n_neg))

# Preview the first five reviews, each preceded by an empty line.
neg_preview = texts_neg[:5]
for review in neg_preview:
    print('\n', review)

len of texts_neg = 5,331

 simplistic , silly and tedious . 

 it's so laddish and juvenile , only teenage boys could possibly find it funny . 

 exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

 [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

 a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 


In [27]:
# Load the positive-polarity reviews; the file stores one review per line.
fn = './data/rt-polarity.pos'

# Undecodable bytes are skipped (errors='ignore') rather than raising.
with open(fn, mode="r", encoding='utf-8', errors='ignore') as reviews_file:
    content = reviews_file.read()

texts_pos = content.splitlines()
n_pos = len(texts_pos)
print('len of texts_pos = {:,}'.format(n_pos))

# Preview the first five reviews, each preceded by an empty line.
pos_preview = texts_pos[:5]
for review in pos_preview:
    print('\n', review)

len of texts_pos = 5,331

 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

 effective but too-tepid biopic

 if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

 emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


In [28]:
# Build one corpus with the negative reviews first, then the positive ones.
X = [*texts_neg, *texts_pos]
# Labels follow the same order: 0 for each negative review, 1 for each positive one.
y = [0 for _ in texts_neg] + [1 for _ in texts_pos]

Split into train and test sets

In [29]:
# Fixed seed so that Restart & Run All always yields the same split and
# therefore the same reported metrics.
RANDOM_STATE = 42

# Hold out a test set using scikit-learn's default split proportions.
# stratify=y keeps the negative/positive ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE, stratify=y
)

Transform text into a feature representation

In [30]:
# Bag-of-words over unigrams and bigrams. Terms appearing in fewer than
# 5 training documents are dropped, and the vocabulary is capped at 50,000.
vect = CountVectorizer(min_df=5, max_features=50000, ngram_range=(1, 2))

# Learn the vocabulary and vectorize the training set in a single pass
# (fit followed by transform would tokenize the training corpus twice).
X_train_vectorized = vect.fit_transform(X_train)

# The test set is only transformed, so the vocabulary comes from training data alone.
X_test_vectorized = vect.transform(X_test)

Train the model

In [31]:
# Logistic regression on the count features; max_iter is raised to 2000
# iterations for the solver.
logreg = LogisticRegression(max_iter=2000)
clf = logreg.fit(X_train_vectorized, y_train)

Evaluate the model

In [32]:
# Reuse the already-vectorized test matrix instead of transforming X_test again.
# F1 is computed from the hard 0/1 class predictions.
predictions = clf.predict(X_test_vectorized)
print('f1: ', f1_score(y_test, predictions))

# ROC AUC is computed from continuous confidence scores rather than hard labels.
scores = clf.decision_function(X_test_vectorized)
print('AUC: ', roc_auc_score(y_test, scores))

f1:  0.7641369047619048
AUC:  0.8344878258886721


Review relevant features

In [33]:
# Vocabulary terms, positioned to match the columns of the feature matrix.
feature_names = np.array(vect.get_feature_names_out())

# Feature indices ordered from the most negative to the most positive coefficient.
sorted_coef_index = np.argsort(clf.coef_[0])

# Ten most negative coefficients: terms pushing predictions towards class 0.
lowest_idx = sorted_coef_index[:10]
# Ten most positive coefficients, largest first: terms pushing towards class 1.
highest_idx = sorted_coef_index[::-1][:10]

print('Smallest coefs:\n{}\n'.format(feature_names[lowest_idx]))
print('Largest Coefs: \n{}'.format(feature_names[highest_idx]))

Smallest coefs:
['dull' 'bore' 'unfunny' 'neither' 'mediocre' 'routine' 'lacks'
 'pretentious' 'plodding' 'ill']

Largest Coefs: 
['unexpected' 'entertaining' 'cinema' 'works' 'wonderful' 'refreshing'
 'imax' 'witty' 'better than' 'solid']


Predicting sentiment for new reviews

In [34]:
# Hand-written reviews used as a quick sanity check of the trained model.
new_reviews = [
    "This movie is awful",
    "Disgusting!",
    "The best movie I've ever seen",
]

# Vectorize with the vocabulary learned on the training set, then classify
# (0 = negative, 1 = positive).
X_predict_vectorized = vect.transform(new_reviews)
predictions = clf.predict(X_predict_vectorized)
predictions

array([0, 0, 1])