# IMPORTING

In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


In [2]:
from nltk import download
# Fetch the NLTK data packages this notebook relies on (presumably 'punkt' for
# word_tokenize and 'wordnet' for WordNetLemmatizer). The recorded output shows
# that an already-installed package is reported as up-to-date.
download('punkt')
download('wordnet')
# Tokenizers and normalizers used by the token / tokenstem / tokenlemma helpers below.
from nltk import word_tokenize 
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gokul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# PROCESSING THE DATA

Importing the datasets

In [3]:
# Directory holding the CSV files. Edit this single constant to point the
# notebook at a different location instead of patching every path.
DATA_DIR = Path('D:\\tensor\\venv')

# Labelled reviews used for training and validation.
data = pd.read_csv(DATA_DIR / 'imdb_small.csv')

# Reviews to be scored at the end of the notebook. The file is read without a
# header row, so its single column is named 'review' explicitly.
data_test = pd.read_csv(DATA_DIR / 'test.csv', header=None, names=['review'])

Exploring the data

In [4]:
# Preview the first rows of the labelled data (review text and sentiment label).
data.head()

Unnamed: 0,review,sentiment
0,I have to differ from the other comments poste...,negative
1,I saw this movie with low expectations and was...,negative
2,Taran Adarsh a reputed critic praised such a d...,negative
3,When I first heard that the subject matter for...,positive
4,"With the release of Peter Jackson's famed ""Lor...",positive


In [5]:
# Summary of the text columns: row count, number of distinct values,
# most frequent value and its frequency.
data.describe()

Unnamed: 0,review,sentiment
count,5000,5000
unique,4996,2
top,Not only was this movie better than all the fi...,positive
freq,2,2500


In [6]:
# Show one full raw review (row label 1) to inspect the markup it contains,
# e.g. the "<br />" tags that are removed in the next step.
data.review[1]

"I saw this movie with low expectations and was not disappointed. Its so bad that it is actually funny in a very cringe worthy way.<br /><br />Gael is absolutely terrible, I mean he just cannot act, period. He should give up now, as acting is clearly not his thing.. His co-stars are about the same caliber, i'm sure my 5 year old cousin could do a better job than all of them! The director should be ashamed to have put his name on something so ridiculous.. Somehow I don't think an Oscar is on the cards for this guy.<br /><br />I have never written a comment on IMDb, but this movie was so bad I felt compelled to do so.<br /><br />If you get the chance to see this film, don't 0/10 if there was a 0"

Removing the `<br />` tags from the reviews and encoding the sentiment as a boolean target (`True` for negative, `False` for positive)

In [7]:
# Replace the HTML line-break tag with a single space in both review sets so
# that the markup does not end up in the vocabulary.
BR_TAG = "<br />"
for frame in (data, data_test):
    frame['review'] = frame['review'].replace([BR_TAG], " ", regex=True)

# Boolean target: True for negative reviews, False for everything else.
# A vectorized comparison replaces the per-row lambda and gives the same result.
data['y'] = data['sentiment'].eq('negative')

Splitting data into training and testing set

In [8]:
# Hold out part of the labelled reviews for evaluation. The split is stratified
# on the target so both parts keep the same class balance, and the seed is
# fixed so the split is reproducible.
train_text, test_text, train_y, test_y = train_test_split(
    data['review'],
    data['y'],
    stratify=data['y'],
    random_state=0,
)

Reviewing the training set

In [9]:
# Display the training reviews (pandas truncates the listing).
train_text

1966    The New Batman Adventures (also called Gotham ...
1215    This has to be the worst movie i've seen this ...
2261    That was the first thing that sprang to mind a...
2250    This has to be one of the most beautiful, movi...
753     When I am watching a film, I am aware that it ...
                              ...                        
4527    Othello is set to burn the eyes of the viewers...
4225    This movie is an almost forgotten gem from 197...
3347    Babette's Feast, for me, is about healing: men...
2636    I'm sorry, but this really does feel like a mo...
2976    In December 1945 a train leaves the central st...
Name: review, Length: 3750, dtype: object

In [10]:
# Display the matching boolean targets (True = negative review).
train_y

1966    False
1215     True
2261    False
2250    False
753      True
        ...  
4527     True
4225    False
3347    False
2636     True
2976    False
Name: y, Length: 3750, dtype: bool

# Vectorizing the data

Here we look at different types of tokenizers suitable for our task. The best tokenizer is chosen by its cross-validation score.

Using PorterStemmer with word_tokenize

In [11]:
def token(doc):
    """Tokenize ``doc`` with ``word_tokenize`` and Porter-stem the word-like tokens.

    A token is kept only if it consists entirely of ASCII letters, digits,
    ``_``, ``-``, ``'`` or ``/``. Every other token (for example a token that
    is only punctuation) is dropped.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # Build the stemmer and the compiled filter once per call rather than once
    # per token; the returned tokens are unchanged.
    stem = PorterStemmer().stem
    is_wordlike = re.compile("^[-a-zA-Z0-9_'/]+$").match
    return [stem(t) for t in word_tokenize(doc) if is_wordlike(t)]

Using PorterStemmer with Regular expression specified for our particular task

In [12]:
def tokenstem(doc):
    """Split ``doc`` with a task-specific regular expression and Porter-stem each token.

    The pattern yields, in order of preference:

    * digits optionally followed by slashes and more digits (e.g. ``10/10``),
    * runs of ``?`` and ``!``,
    * runs of word characters that are not immediately followed by ``'t``.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # Build the stemmer and tokenizer once per call rather than creating a new
    # stemmer for every token; the returned tokens are unchanged.
    stem = PorterStemmer().stem
    tokenizer = RegexpTokenizer(r'\d+/*\d*|[\?\!]+|[\w]+(?!\'t)')
    return [stem(t) for t in tokenizer.tokenize(doc)]

Using Lemmatization with Regular expression specified for our particular task

In [13]:
def tokenlemma(doc):
    """Split ``doc`` with a task-specific regular expression and lemmatize each token.

    The tokenizing pattern is the same one used by ``tokenstem``: digit groups
    such as ``10/10``, runs of ``?``/``!``, and runs of word characters that
    are not immediately followed by ``'t``. Each token is lemmatized with
    ``pos='a'``.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    # Build the lemmatizer and tokenizer once per call rather than creating a
    # new lemmatizer for every token; the returned tokens are unchanged.
    lemmatize = WordNetLemmatizer().lemmatize
    tokenizer = RegexpTokenizer(r'\d+/*\d*|[\?\!]+|[\w]+(?!\'t)')
    return [lemmatize(t, pos='a') for t in tokenizer.tokenize(doc)]

From this example we can see that the tokenstem function does a good job in tokenizing the words and required characters

In [14]:
# Sample text covering the tricky cases: hyphens, URLs, slashes, quotes,
# contractions, brackets, punctuation and a "10/10" style rating.
g="hello eight-man https://folder/web/net apple/orange 'beed' dude/ it's hadn't haven't (hello) 1 grade: D ('hello') !wait kool! help? 10/10 "

# Print the raw NLTK tokenization first, then each custom tokenizer in the
# order they were defined, so the outputs can be compared line by line.
for tokenize in (word_tokenize, token, tokenstem, tokenlemma):
    print(tokenize(g))

['hello', 'eight-man', 'https', ':', '//folder/web/net', 'apple/orange', "'beed", "'", 'dude/', 'it', "'s", 'had', "n't", 'have', "n't", '(', 'hello', ')', '1', 'grade', ':', 'D', '(', "'hello", "'", ')', '!', 'wait', 'kool', '!', 'help', '?', '10/10']
['hello', 'eight-man', 'http', '//folder/web/net', 'apple/orang', "'beed", "'", 'dude/', 'it', "'s", 'had', "n't", 'have', "n't", 'hello', '1', 'grade', 'D', "'hello", "'", 'wait', 'kool', 'help', '10/10']
['hello', 'eight', 'man', 'http', 'folder', 'web', 'net', 'appl', 'orang', 'beed', 'dude', 'it', 's', 'had', 't', 'have', 't', 'hello', '1', 'grade', 'D', 'hello', '!', 'wait', 'kool', '!', 'help', '?', '10/10']


['hello', 'eight', 'man', 'https', 'folder', 'web', 'net', 'apple', 'orange', 'beed', 'dude', 'it', 's', 'had', 't', 'have', 't', 'hello', '1', 'grade', 'D', 'hello', '!', 'wait', 'kool', '!', 'help', '?', '10/10']


# Training the model

Here we create a pipeline that vectorizes the reviews using the tf-idf vectorizer with one of the tokenizers above and then fits a logistic regression to the data. A grid search is done to obtain the best parameters. (Only the best parameters found are listed in the grid below. Stop words are not removed because removing them reduced the cross-validation score.)

In [15]:
# Step 1: tf-idf features built from the custom stemming tokenizer. Terms seen
# in fewer than 3 documents or in more than 70% of documents are dropped.
tfidf_step = TfidfVectorizer(tokenizer=tokenstem, min_df=3, max_df=.7)

# Step 2: logistic regression with a generous iteration cap and a fixed seed.
logreg_step = LogisticRegression(random_state=0, max_iter=100000)

pipe = make_pipeline(tfidf_step, logreg_step)

# Only the winning setting of each hyper-parameter is listed, so the search
# reduces to a 5-fold cross-validation of that single configuration.
param_grid = {
    'logisticregression__C': [100],
    "tfidfvectorizer__ngram_range": [(1, 4)],
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(train_text, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.7,
                                                        max_features=None,
                                                        min_df=3,
                                                        ngram_range=(1, 1),
                                               

The best parameters are obtained

In [16]:
# Report the mean cross-validated score of the best configuration and the
# hyper-parameters that produced it.
print("Best cross-validation score: ", grid.best_score_)
# Label typo fixed: "prameters" -> "parameters".
print("parameters:", grid.best_params_)

Best cross-validation score:  0.8728
prameters: {'logisticregression__C': 100, 'tfidfvectorizer__ngram_range': (1, 4)}


In [17]:
# Pull the fitted tf-idf step out of the best pipeline found by the search.
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
X_train = vectorizer.transform(train_text)

# For every feature, take its largest tf-idf value over the training
# documents, then order the feature indices from smallest to largest maximum.
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(). Prefer the new method and fall back to the old one
# so the cell runs on both old and new versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

# RESULTS

Features on the basis of weight given by tfidf

In [18]:
# Vocabulary size: one sorted index per feature.
print(len(sorted_by_tfidf))

# The 20 features with the lowest, then the 20 with the highest, maximum tf-idf.
lowest_tfidf, highest_tfidf = sorted_by_tfidf[:20], sorted_by_tfidf[-20:]
print(feature_names[lowest_tfidf])
print(feature_names[highest_tfidf])

93596
['what s go to' 'hold on to the' 'is that the film' 'for the local'
 'is shown to be' 'archaeologist' 'marriag but' 'here there s a'
 'blown to' '1870' 'hi henchmen' 'but that he' 'so as not to' 'as not to'
 'plot and charact' 'so as not' 'novel ha' 'on the fact'
 'on the fact that' 'receiv it']
['seagal' 'phoeb' 'gypsi' 'fulci' 'greek' 'bart' 'gamera' 'vs' 'lundgren'
 'herschel' 'tarzan' 'gein' 'homer' 'biko' 'edmund' 'cat' 'joan' 'sheba'
 'uzumaki' 'ernest']


The tokens with the largest-magnitude coefficients in the logistic regression

In [19]:
# Order the vocabulary by logistic-regression coefficient, most negative first.
log_reg = grid.best_estimator_.named_steps["logisticregression"]
feature_names_sorted = feature_names[np.argsort(log_reg.coef_)[-1, :]]

# The target is True for negative reviews, so the most negative coefficients
# (start of the array) indicate positive sentiment and the most positive ones
# (end of the array) indicate negative sentiment.
# Each caption is printed before its list; it used to be printed after it,
# which made the output read as if the captions belonged to the wrong lists.
print('Positive sentiment tokens\n', feature_names_sorted[0:40])
print('Negative sentiment tokens\n', feature_names_sorted[-40:])

['great' 'excel' 'well' 'still' 'enjoy' 'love' 'beauti' 'dvd' 'best'
 'veri' 'definit' 'you' 'a great' 'brilliant' 'wonder' 'most' 'alway'
 'fun' 'favorit' 'differ' 'love it' 'the best' 'perfect' 'year' 'life'
 'highli' 'hilari' 'amaz' 'today' 'perform' 'highli recommend' 'superb'
 'have to' 'learn' 'my favorit' 'both' 'true' 'a wonder' 'i love' '10/10'] 
Positive sentiment tokens
 ['instead' 'have been' 'to be' 'annoy' 't even' 'cheap' 'poorli' 'wast of'
 'director' 'line' 'predict' 'idea' 'wors' 'do' 'badli' 'even' 'kill'
 'embarrass' 'unfortun' 'script' 'ridicul' 'fail' 't' 'tri' 'horribl' 'no'
 'stupid' 'noth' 'ani' 'minut' 'disappoint' 'poor' 'terribl' 'bore' 'aw'
 'the worst' '?' 'wast' 'bad' 'worst'] 
Negative sentiment tokens


Test score is calculated

In [20]:
# Score the refit best pipeline on the held-out split. No `scoring` was passed
# to GridSearchCV, so this is the estimator's default score (presumably mean
# accuracy for this classifier).
grid.score(test_text, test_y)

0.8896

In [21]:
# The model predicts the boolean target (True = negative review). Map it back
# to the original string labels in one vectorized step, instead of storing the
# booleans in the column and rewriting them with a row-wise `x == False` lambda.
is_negative = grid.predict(data_test['review'])
data_test['sentiment'] = np.where(is_negative, "negative", "positive")

In [22]:
# Inspect the predicted labels next to the review text (up to 12 rows).
data_test.head(12)

Unnamed: 0,review,sentiment
0,I laughed all the way through this rotten movi...,negative
1,"I've just watched Fingersmith, and I'm stunned...",negative
2,I liked the movie towards the firat half but t...,negative
3,The movie Bad boys for life has a good trailer,negative
4,Is nothing else on TV? Are you really bored? W...,negative
5,Normally I don't like series at all. They're a...,negative
6,This show makes absolutely no sense. Every wee...,negative
7,I watched this movie for the hot guy--and even...,negative
8,I consider myself a bit of a connoisseur of bo...,positive
9,"I have watched this movie countless times, and...",positive
