# Sentiment Analysis on imdb dataset

##### Dataset used: http://ai.stanford.edu/~amaas/data/sentiment/

##### By: Emmanuel Raj, Jyothi Nandikonda, Naz Syeda

# Step 1: Import data

In [1]:
reviews_train = []
for line in open('./Dataset/full_train.txt', 'r'):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('./Dataset/full_test.txt', 'r'):
    reviews_test.append(line.strip())

# Step 2: Clean and Preprocess data

In [2]:
import re
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [3]:
len(reviews_train_clean)

25000

In [4]:
len(reviews_test_clean)

25000

# Step 3: Vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [6]:
#save cv as pkl file
#import pickle
#with open('/home/nbuser/library/vectorizer.pkl', 'wb') as model_pkl:
#    pickle.dump(cv, model_pkl)


# Step 4: Build Sentiment analysis model and find best parameters

##### Note: 

The targets/labels we use will be the same for training and testing because both datasets are structured the same, where the first 12.5k are positive and the last 12.5k are negative.

In [7]:
#Ignore warnings on Jupyter notebook
import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

#Grid search for c value
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.8768
Accuracy for C=0.05: 0.88864
Accuracy for C=0.25: 0.88384
Accuracy for C=0.5: 0.88096
Accuracy for C=1: 0.8768


# Step 5.1: Train Final model

In [8]:
sentiment = LogisticRegression(C=0.05)
sentiment.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, sentiment.predict(X_test)))

Final Accuracy: 0.88152


In [9]:
from sklearn.metrics import f1_score

In [10]:
f1_score(y_val, sentiment.predict(X_val), average='macro') 

0.9542395314128017

# Step 5.2: Training SVM classifier

In [17]:
from sklearn.svm import SVC

In [18]:
clf_svc = SVC(gamma='auto')

In [19]:
%%time
clf_svc.fit(X_train, y_train) 

CPU times: user 12min 13s, sys: 365 ms, total: 12min 13s
Wall time: 12min 13s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
print("Accuracy for Support vector machine:   "+str(accuracy_score(y_val, clf_svc.predict(X_val))))

Accuracy for Support vector machine:   0.50064


In [21]:
from sklearn.metrics import f1_score
f1_score(y_val, clf_svc.predict(X_val), average='macro') 

0.3350370021973224

# Step 5.3: Train Random Forrest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

In [13]:
%%time
clf_rf.fit(X_train, y_train)

CPU times: user 1.2 s, sys: 9.91 ms, total: 1.21 s
Wall time: 1.21 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [14]:
print("Accuracy for random forrest:   "+str(accuracy_score(y_val, clf_rf.predict(X_val))))

Accuracy for random forrest:   0.7656


In [16]:
from sklearn.metrics import f1_score
f1_score(y_val, clf_rf.predict(X_val), average='macro') 

0.7648153729078457

#### Check 5 most discriminating words for both positive and negative reviews

In [22]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), sentiment.coef_[0]
    )
}

#Best positive words
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    

('excellent', 0.9292549017181694)
('perfect', 0.7907005565370882)
('great', 0.6745323515415729)
('amazing', 0.6127039824916363)
('superb', 0.6019368131550034)


In [23]:
#Best negetive words
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('worst', -1.3645958840794268)
('waste', -1.166424244219479)
('awful', -1.0324190211775237)
('poorly', -0.8752018744646883)
('boring', -0.8563543419889986)


# Step 6: Predict using the model

In [24]:
pred = sentiment.predict(cv.transform(['It was awful!']))

In [25]:
if pred == 0:
    print("Negative")
else: 
    print("Positive")

Negative


# Step 7: Function to predict sentiment of a given input

In [26]:
def sentiment_predict(input):
    pred = sentiment.predict(cv.transform([input]))
    confidence_score = sentiment.predict_proba(cv.transform([input]))
    if pred == 0:
         output = "Negative" 
         confidence_score = float("{0:.2f}".format(confidence_score[:,0][0]))
    else: 
         output = "Positive"
         confidence_score = float("{0:.2f}".format(confidence_score[:,1][0]))
    
    return output, confidence_score
    
    

In [27]:
sentance = 'It was excellent!'

In [28]:
sentiment_predict(sentance)

('Positive', 0.74)

# Step 8: Save the model as a serialized file

In [30]:
#import pickle
#with open('/home/nbuser/library/sentiment_model.pkl', 'wb') as model_pkl:
#pickle.dump(sentiment, model_pkl)