In [1]:
# ! pip install wget
import wget
import tarfile
import glob
import re
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

## Download the dataset

In [2]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
imdb_dataset = wget.download(url)

In [3]:
# open file
file = tarfile.open(imdb_dataset)

# extracting file
file.extractall()
  
file.close()

## Import the datasets to dataframes

In [4]:
def text_files_to_list(files):
    ls = []
    for file in files:
        f = open(file, "r")
        review = f.readline()
        ls.append(review)
    return ls

In [5]:
train_pos_files = glob.glob("aclImdb/train/pos/*.txt")
train_neg_files = glob.glob("aclImdb/train/neg/*.txt")
test_pos_files = glob.glob("aclImdb/test/pos/*.txt")
test_neg_files = glob.glob("aclImdb/test/neg/*.txt")

train_pos_ls = text_files_to_list(train_pos_files)
train_neg_ls = text_files_to_list(train_neg_files)
test_pos_ls = text_files_to_list(test_pos_files)
test_neg_ls = text_files_to_list(test_neg_files)


In [6]:
print ("There are {0} positive reviews in training set, {1} negative reviews in training set.".format(len(train_pos_ls), len(train_neg_ls)))
print ("There are {0} positive reviews in testing set, {1} negative reviews in testing set.".format(len(test_pos_ls),len(test_neg_ls)))

There are 12500 positive reviews in training set, 12500 negative reviews in training set.
There are 12500 positive reviews in testing set, 12500 negative reviews in testing set.


In [7]:
def reviews_to_df(ls):
    df = pd.DataFrame()
    df['review'] = ls
    return df

In [8]:
df_train_pos = reviews_to_df(train_pos_ls)
df_train_pos['label'] = 1

df_train_neg = reviews_to_df(train_neg_ls)
df_train_neg['label'] = 0

df_test_pos = reviews_to_df(test_pos_ls)
df_test_pos['label'] = 1

df_test_neg = reviews_to_df(test_neg_ls)
df_test_neg['label'] = 0

In [9]:
df_train = pd.concat([df_train_pos , df_train_neg])
df_test = pd.concat([df_test_pos , df_test_neg])

## Text preprocessing

In [10]:
pd.set_option('display.max_colwidth', None)

In [11]:
df_train.head()

Unnamed: 0,review,label
0,"For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan ""The Skipper"" Hale jr. as a police Sgt.",1
1,"Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's ""Flamingo Road"") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The movie is nearly a cross-pollination of ""Rosemary's Baby"" and ""The Exorcist""--but what a combination! Based on the best-seller by Jeffrey Konvitz, ""The Sentinel"" is entertainingly spooky, full of shocks brought off well by director Michael Winner, who mounts a thoughtfully downbeat ending with skill. ***1/2 from ****",1
2,"A solid, if unremarkable film. Matthau, as Einstein, was wonderful. My favorite part, and the only thing that would make me go out of my way to see this again, was the wonderful scene with the physicists playing badmitton, I loved the sweaters and the conversation while they waited for Robbins to retrieve the birdie.",1
3,"It's a strange feeling to sit alone in a theater occupied by parents and their rollicking kids. I felt like instead of a movie ticket, I should have been given a NAMBLA membership.<br /><br />Based upon Thomas Rockwell's respected Book, How To Eat Fried Worms starts like any children's story: moving to a new town. The new kid, fifth grader Billy Forrester was once popular, but has to start anew. Making friends is never easy, especially when the only prospect is Poindexter Adam. Or Erica, who at 4 1/2 feet, is a giant.<br /><br />Further complicating things is Joe the bully. His freckled face and sleeveless shirts are daunting. He antagonizes kids with the Death Ring: a Crackerjack ring that is rumored to kill you if you're punched with it. But not immediately. No, the death ring unleashes a poison that kills you in the eight grade.<br /><br />Joe and his axis of evil welcome Billy by smuggling a handful of slimy worms into his thermos. Once discovered, Billy plays it cool, swearing that he eats worms all the time. Then he throws them at Joe's face. Ewww! To win them over, Billy reluctantly bets that he can eat 10 worms. Fried, boiled, marinated in hot sauce, squashed and spread on a peanut butter sandwich. Each meal is dubbed an exotic name like the ""Radioactive Slime Delight,"" in which the kids finally live out their dream of microwaving a living organism.<br /><br />If you've ever met me, you'll know that I have an uncontrollably hearty laugh. I felt like a creep erupting at a toddler whining that his ""dilly dick"" hurts. But Fried Worms is wonderfully disgusting. Like a G-rated Farrelly brothers film, it is both vomitous and delightful.<br /><br />Writer/director Bob Dolman is also a savvy storyteller. To raise the stakes the worms must be consumed by 7 pm. In addition Billy holds a dark secret: he has an ultra-sensitive stomach.<br /><br />Dolman also has a keen sense of perspective. With such accuracy, he draws on children's insecurities and tendency to exaggerate mundane dilemmas.<br /><br />If you were to hyperbolize this movie the way kids do their quandaries, you will see that it is essentially about war. Freedom-fighter and freedom-hater use pubescent boys as pawns in proxy wars, only to learn a valuable lesson in unity. International leaders can learn a thing or two about global peacekeeping from Fried Worms.<br /><br />At the end of the film, I was comforted when two chaperoning mothers behind me, looked at each other with befuddlement and agreed, ""That was a great movie."" Great, now I won't have to register myself in any lawful databases.",1
4,"You probably all already know this by now, but 5 additional episodes never aired can be viewed on ABC.com I've watched a lot of television over the years and this is possibly my favorite show, ever. It's a crime that this beautifully written and acted show was canceled. The actors that played Laura, Whit, Carlos, Mae, Damian, Anya and omg, Steven Caseman - are all incredible and so natural in those roles. Even the kids are great. Wonderful show. So sad that it's gone. Of course I wonder about the reasons it was canceled. There is no way I'll let myself believe that Ms. Moynahan's pregnancy had anything to do with it. It was in the perfect time slot in this market. I've watched all the episodes again on ABC.com - I hope they all come out on DVD some day. Thanks for reading.",1


In [12]:
punctuations = set(string.punctuation)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def text_prep(review):
    # Remove HTML tags
    review_html = re.sub(r'<.*?>', ' ', review)
    # Remove punctuations
    review_punc = "".join(char for char in review_html if char not in punctuations)
    # Remove stop words
    review_stop = " ".join(word.lower() for word in review_punc.split() if word not in stop_words)
    # Lemmatize each word
    review_lem = [lemmatizer.lemmatize(review_stop, pos='v') for review_stop in review_stop.split()] 
    review_final = " ".join(review_lem)
    return (review_final)


In [13]:
df_train['review_final'] = df_train['review'].apply(lambda x:text_prep(x))
df_test['review_final'] = df_test['review'].apply(lambda x:text_prep(x))

In [14]:
df_train.head(1)

Unnamed: 0,review,label,review_final
0,"For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan ""The Skipper"" Hale jr. as a police Sgt.",1,for movie get respect sure lot memorable quote list gem imagine movie joe piscopo actually funny maureen stapleton scene stealer the moroni character absolute scream watch alan the skipper hale jr police sgt


In [15]:
df_test.head(1)

Unnamed: 0,review,label,review_final
0,"Based on an actual story, John Boorman shows the struggle of an American doctor, whose husband and son were murdered and she was continually plagued with her loss. A holiday to Burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in Rangoon, she could not leave the country with her sister, and was forced to stay back until she could get I.D. papers from the American embassy. To fill in a day before she could fly out, she took a trip into the countryside with a tour guide. ""I tried finding something in those stone statues, but nothing stirred in me. I was stone myself."" <br /><br />Suddenly all hell broke loose and she was caught in a political revolt. Just when it looked like she had escaped and safely boarded a train, she saw her tour guide get beaten and shot. In a split second she decided to jump from the moving train and try to rescue him, with no thought of herself. Continually her life was in danger. <br /><br />Here is a woman who demonstrated spontaneous, selfless charity, risking her life to save another. Patricia Arquette is beautiful, and not just to look at; she has a beautiful heart. This is an unforgettable story. <br /><br />""We are taught that suffering is the one promise that life always keeps.""",1,base actual story john boorman show struggle american doctor whose husband son murder continually plague loss a holiday burma sister seem like good idea get away passport steal rangoon could leave country sister force stay back could get id paper american embassy to fill day could fly take trip countryside tour guide i try find something stone statues nothing stir i stone suddenly hell break loose catch political revolt just look like escape safely board train saw tour guide get beat shoot in split second decide jump move train try rescue think continually life danger here woman demonstrate spontaneous selfless charity risk life save another patricia arquette beautiful look beautiful heart this unforgettable story we teach suffer one promise life always keep


## Feature engineering

In [19]:
tfidf = TfidfVectorizer(ngram_range = (1,2))

X_train = tfidf.fit_transform(df_train['review_final'])
y_train = df_train['label']

X_test = tfidf.transform(df_test['review_final'])
y_test = df_test['label']

## Traditional ML models 

In [20]:
algo_dict = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial Naive Bayes': MultinomialNB()
}
param_dict = {
    'Logistic Regression': {
             "penalty": ['l2'],
             "C": [100, 10, 1.0, 0.1, 0.01],
             "solver": ['newton-cg', 'saga', 'liblinear', 'lbfgs'] 
            },
    
    'Multinomial Naive Bayes': {
        'alpha': [10, 1, 0.1, 0.01]
    }

}

In [22]:
clf_dict = dict()
for algo in algo_dict:
    grid_search = GridSearchCV(algo_dict[algo], param_dict[algo], cv=10, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best accuracy score and hyperparameters for {0} model using 10-fold cross validation: {1} using {2}".format(algo, np.round(grid_search.best_score_, 3), grid_search.best_params_))

    clf = grid_search.best_estimator_
    clf_dict[algo] = clf


Best accuracy score and hyperparameters for Logistic Regression model using 10-fold cross validation: 0.902 using {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
Best accuracy score and hyperparameters for Multinomial Naive Bayes model using 10-fold cross validation: 0.89 using {'alpha': 0.1}


In [48]:
clf_dict

{'Logistic Regression': LogisticRegression(C=100, solver='saga'),
 'Multinomial Naive Bayes': MultinomialNB(alpha=0.1)}

In [23]:
for name, clf in clf_dict.items():
    y_pred = clf.predict(X_test)
    print("Accuracy on testing set for {0} model: {1}".format(name, np.around(accuracy_score(y_test, y_pred), 3)))
    print (classification_report(y_test, y_pred))
    print ()

Accuracy on testing set for Logistic Regression model: 0.895
              precision    recall  f1-score   support

           0       0.90      0.89      0.89     12500
           1       0.89      0.90      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000


Accuracy on testing set for Multinomial Naive Bayes model: 0.859
              precision    recall  f1-score   support

           0       0.83      0.90      0.87     12500
           1       0.89      0.82      0.85     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000




Based on the test results, Logistic Regression model has better accuracy, hence we chose it as our final model.

## Saving the tf-idf vectorizer and the final model for future use

In [50]:
import pickle
v_pkl = "tfidf.pkl"
m_pkl = "lr_model.pkl"

# Save the tf-idf vectorizer
pickle.dump(tfidf, open(v_pkl, "wb"))

# Save the ML model
pickle.dump(clf_dict['Logistic Regression'], open(m_pkl, 'wb'))

## Predict sentiment on current reviews

In [107]:
with open('tfidf.pkl', 'rb') as v:
    tfidf_vec = pickle.load(v)
    
with open('lr_model.pkl', 'rb') as m:
    ml_model = pickle.load(m)

In [108]:
def get_sentiment(review):
    df = pd.DataFrame([review], columns=['review'])
    df['review_final'] = df['review'].apply(text_prep)
    
    X_test = tfidf_vec.transform(df['review_final'])
    y_pred = ml_model.predict(X_test)
    
    result = y_pred[0]
    if result == 1:
        print ('Positive review')
    else:
        print ('Negative review')


In [109]:
movie_review = 'this IS the worst Movie'
get_sentiment(movie_review)

Negative review
