In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re,html,json
import seaborn as sns
import nltk as nt
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report


In [17]:
df = pd.read_excel("final_df.xlsx")

# Preprocessing

In [18]:
# Function to clean the text.

def clean(text):
    text = str(text)
    #remove numbers
    text = re.sub(r'\d+',"",text)
    #lower
    text = text.lower()
    #tags like <tag>
    text = re.sub(r'<[^<>]*>', ' ',text)
    #Markdown Urls
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)',r'\1',text)
    #Remove Punctuation
    text = re.sub(r'([!?,])\1+', r'\1', text)
    #Remove all URL's
    text = re.sub(r'http.*', ' ', text)
    #Remove @
    text = re.sub(r'@\w*', ' ', text)
    #text or code in brackets
    text = re.sub(r'\[[^\[\]]*\]',' ',text)
    # remove b"
    text = text.replace('b\"',' ') 
    # remove b'
    text = text.replace("b\'",' ') 
    # remove \\n
    text = text.replace('\\n',' ')
    #Remove &amp
    text = text.replace('&amp',' ') 
    # remove UTF-8 code like \\xe2
    text = re.sub(r'(\\x(.){2})', ' ',text) 
    #Standalone sequences for specials
    text = re.sub(r'(?:^|\s)[;.\'\"&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ',text)
    #stand alone sequence of hyphens 
    text= re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ',text)
    # Sequence of white spaces
    text = re.sub(r'\s+',' ',text)
    return text.strip()

In [19]:
def _normalize_contractions_text(text, contractions):
    """
    This function normalizes english contractions.
    """
    new_token_list = []
    token_list = text.split()
    for word_pos in range(len(token_list)):
        word = token_list[word_pos]
        first_upper = False
        if word[0].isupper():
            first_upper = True
        if word.lower() in contractions:
            replacement = contractions[word.lower()]
            if first_upper:
                replacement = replacement[0].upper()+replacement[1:]
            replacement_tokens = replacement.split()
            if len(replacement_tokens)>1:
                new_token_list.append(replacement_tokens[0])
                new_token_list.append(replacement_tokens[1])
            else:
                new_token_list.append(replacement_tokens[0])
        else:
            new_token_list.append(word)
    sentence = " ".join(new_token_list).strip(" ")
    return sentence

In [20]:
def normalize_contractions(sentence_list):
    contraction_list = json.loads(open('english_contractions.json', 'r').read())
    norm_sents = []
    print("Normalizing contractions")
    for sentence in tqdm(sentence_list):
        norm_sents.append(_normalize_contractions_text(sentence, contraction_list))
    return norm_sents

contraction_list = json.loads(open('english_contractions.json', 'r').read())

In [21]:
df.drop("Unnamed: 0",axis=1,inplace=True)

In [22]:
df.head()

Unnamed: 0,Review,Title,Spoiler_flag,Synopsis,Cosine_Similarity,doc_similarity
0,The conclusion to the series hits some of the ...,/title/tt1201607/,0,After burying Dobby at the garden of the Shell...,0.158318,0.106248
1,The Lion King was pretty much my favourite mov...,/title/tt0110357/,1,The Lion King takes place in the Pride Lands o...,0.197134,0.404854
2,"OK, perhaps there is truth in the saying ""lost...",/title/tt0364569/,0,"The film begins in medias res, with the silhou...",0.115011,0.219557
3,Top Gun has been an 80's staple since it first...,/title/tt1745960/,1,"Over three decades after his time at TOPGUN, C...",0.423765,0.262304
4,I'll keep this brief: This is simply one of th...,/title/tt0060196/,0,The film tells the story of three men who purs...,0.084172,0.156379


In [23]:
df['Review'] = df['Review'].map(clean)


In [24]:
df['Review'] = normalize_contractions(df['Review'])


Normalizing contractions


100%|█████████████████████████████████████████████████████████████████████████| 22879/22879 [00:01<00:00, 12999.70it/s]


In [None]:
nltk.download('stopwords')
sw = stopwords.words( 'english' )

In [26]:
def remove_stopwords(text):
    l = []
    for i in text.split():
        if i not in sw:
            l.append(i.strip('\'\"'))
    return l

In [27]:
df['tokens'] = df['Review'].apply(remove_stopwords)


In [28]:
df.head()

Unnamed: 0,Review,Title,Spoiler_flag,Synopsis,Cosine_Similarity,doc_similarity,tokens
0,the conclusion to the series hits some of the ...,/title/tt1201607/,0,After burying Dobby at the garden of the Shell...,0.158318,0.106248,"[conclusion, series, hits, strongest, emotiona..."
1,the lion king was pretty much my favourite mov...,/title/tt0110357/,1,The Lion King takes place in the Pride Lands o...,0.197134,0.404854,"[lion, king, pretty, much, favourite, movie, g..."
2,"ok, perhaps there is truth in the saying ""lost...",/title/tt0364569/,0,"The film begins in medias res, with the silhou...",0.115011,0.219557,"[ok,, perhaps, truth, saying, lost, translatio..."
3,top gun has been an 's staple since it first r...,/title/tt1745960/,1,"Over three decades after his time at TOPGUN, C...",0.423765,0.262304,"[top, gun, s, staple, since, first, released, ..."
4,i will keep this brief: this is simply one of ...,/title/tt0060196/,0,The film tells the story of three men who purs...,0.084172,0.156379,"[keep, brief:, simply, one, entertaining, best..."


In [29]:
df.isnull().sum()

Review               0
Title                0
Spoiler_flag         0
Synopsis             0
Cosine_Similarity    0
doc_similarity       0
tokens               0
dtype: int64

In [15]:
df.to_excel('Cleaned_Data.xlsx')

In [30]:
X = df["Review"]

In [31]:
X

0        the conclusion to the series hits some of the ...
1        the lion king was pretty much my favourite mov...
2        ok, perhaps there is truth in the saying "lost...
3        top gun has been an 's staple since it first r...
4        i will keep this brief: this is simply one of ...
                               ...                        
22874    let us be honest: is this film really worthy o...
22875    paul edgecomb (tom hanks) is the lead guard on...
22876    this is undoubtedly the greatest film ever. th...
22877    no, this is not an horrible movie and i did no...
22878    i had heard so much of this film, and how posi...
Name: Review, Length: 22879, dtype: object

In [32]:
y = df["Spoiler_flag"]

In [33]:
y

0        0
1        1
2        0
3        1
4        0
        ..
22874    0
22875    0
22876    0
22877    0
22878    1
Name: Spoiler_flag, Length: 22879, dtype: int64

In [34]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [35]:
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=1)

In [36]:
X_train_2

285      i have yet to watch a good or a very good mart...
14544    film-making in the s was a different exercise ...
13400    it is because of this movie that i began liste...
6791     marvel seems to be on a roll towards killing o...
1210     i have heard good things around this movie. so...
                               ...                        
11034    what can be said about this film that has not ...
9141     great film. however, does not anyone else noti...
22403    my next movie review is going to be on the hig...
17543    guy richie's follow up to lock stock and two s...
1296     set in during the height of world war i, this ...
Name: Review, Length: 16472, dtype: object

# Vectorizing

In [37]:
#training
cv = CountVectorizer(stop_words='english')
tv = TfidfVectorizer(stop_words='english')


cv_train_features = cv.fit_transform(X_train_2)
tf_train_features = tv.fit_transform(X_train_2)

In [38]:
cv_train_features

<16472x56160 sparse matrix of type '<class 'numpy.int64'>'
	with 1854562 stored elements in Compressed Sparse Row format>

In [21]:
#Vectorizing test data
cv_test_features = cv.transform(X_val)
tf_test_features = tv.transform(X_val)

In [58]:
!pip install sklearn_pandas 

Collecting sklearn_pandas
  Downloading sklearn_pandas-2.2.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: sklearn-pandas
Successfully installed sklearn-pandas-2.2.0




In [100]:
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.compose import ColumnTransformer

In [115]:
trf_1 = ColumnTransformer([('Vect',CountVectorizer(stop_words='english'),['Review'])],remainder="passthrough")
trf_2 = LogisticRegression(max_iter=100,C=1)

In [116]:
pipe = Pipeline([('sect_1',trf_1),('sect_2',trf_2)])

In [117]:
pipe

In [120]:
pipe.fit(df["Review"].iloc[0:200],df['Spoiler_flag'].iloc[0:200])

IndexError: tuple index out of range

In [76]:
a.shape

(100, 5042)

In [77]:
lr = LogisticRegression(max_iter=100,C=1)

In [79]:
lr.fit(a,df['Spoiler_flag'][0:100])

In [90]:
df[["Review",'doc_similarity']].iloc[101]

Review            how can i be disappointed by a good movie? i s...
doc_similarity                                             0.140523
Name: 101, dtype: object

In [91]:
b = mapper.transform(df[["Review",'doc_similarity']].iloc[104:106])



In [94]:
df["Spoiler_flag"].iloc[104:106]

104    1
105    0
Name: Spoiler_flag, dtype: int64

In [92]:
lr.predict(b)

array([1, 0], dtype=int64)

# Model Implementation on Count Vectorizer


In [52]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=100,C=1)
sgd = SGDClassifier(loss='log_loss',max_iter=100)

In [53]:
lr.fit(a,df['Spoiler_flag'])

ValueError: Found input variables with inconsistent numbers of samples: [2, 22879]

In [23]:
lr.fit(cv_train_features,y_train_2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
sgd.fit(cv_train_features,y_train_2)


In [25]:
Y_pred_lr_cv = lr.predict(cv_test_features)


In [26]:
Y_pred_svm_cv = sgd.predict(cv_test_features)


In [27]:
print('Accuracy with Logistic Regression :', accuracy_score(Y_pred_lr_cv,y_val))
print(classification_report(y_val,Y_pred_lr_cv))

Accuracy with Logistic Regression : 0.6683661082787085
              precision    recall  f1-score   support

           0       0.69      0.72      0.71      2299
           1       0.63      0.60      0.61      1820

    accuracy                           0.67      4119
   macro avg       0.66      0.66      0.66      4119
weighted avg       0.67      0.67      0.67      4119



In [28]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, Y_pred_lr_cv)

print(cm)

# Extract the false negatives count from the confusion matrix
false_negatives = cm[1, 0]

# Calculate the false negative ratio
false_neg_ratio = false_negatives / sum(cm[0])

# Print the false negative ratio
print("False negative ratio: ", false_neg_ratio)

[[1666  633]
 [ 733 1087]]
False negative ratio:  0.3188342757720748


In [29]:
print('Accuracy with SGD Regression :', accuracy_score(y_val,Y_pred_svm_cv))
print(classification_report(y_val,Y_pred_svm_cv))

Accuracy with SGD Regression : 0.6695799951444525
              precision    recall  f1-score   support

           0       0.69      0.74      0.71      2299
           1       0.64      0.58      0.61      1820

    accuracy                           0.67      4119
   macro avg       0.66      0.66      0.66      4119
weighted avg       0.67      0.67      0.67      4119



In [30]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, Y_pred_svm_cv)

print(cm)

# Extract the false negatives count from the confusion matrix
false_negatives = cm[1, 0]

# Calculate the false negative ratio
false_neg_ratio = false_negatives / sum(cm[0])

# Print the false negative ratio
print("False negative ratio: ", false_neg_ratio)

[[1696  603]
 [ 758 1062]]
False negative ratio:  0.3297085689430187


In [39]:
sample_review = "For me, bringing back both the Grace character and the evil colonel felt too much. One of them, Grace (Sigorney Weaver), would have been sufficient, the Colonel could easily have been a brother like in Die Hard 3, although this would require additional story changes regarding the adolescent « Spider » character. But that would have been better than the whole « importance of Family » theme running through everything. Just too much of that for me, as well as let's make sure only incidental characters die."

In [40]:
sample_review = clean(sample_review)

In [41]:
vectotized_sample = cv.transform([sample_review])

In [42]:
lr.predict(vectotized_sample)

array([0], dtype=int64)

In [43]:
lr.predict_log_proba(vectotized_sample)

array([[-0.2037932 , -1.69081623]])

In [63]:
import json,codecs

model_param = {}
model_param['coef'] = lr.coef_.tolist()
model_param['intercepts'] = lr.intercept_.tolist()
file_path = './lr.json'

In [64]:
model_param

{'coef': [[-0.02561629398916542,
   0.14651746597310253,
   -0.04741759781525748,
   0.04072336878105915,
   0.03390795369918199,
   0.03390795369918199,
   -0.05438687118703611,
   0.02643549023262049,
   -0.09126927467976502,
   0.023084118798052604,
   -0.0058814939571116736,
   0.02530200923918683,
   -0.05438687118703611,
   -0.05438687118703611,
   0.08855703233715566,
   -0.05438687118703611,
   0.08522276567851586,
   -0.12963969068678868,
   0.17011913325285882,
   0.016000258185050197,
   -0.0418069893124803,
   -0.04741759781525748,
   -0.017700580558262013,
   -0.020976760496869404,
   -0.19721301258323198,
   0.016000258185050197,
   -0.05438687118703611,
   -0.0058814939571116736,
   0.013516463987530883,
   -0.0418069893124803,
   -0.0058814939571116736,
   -0.0418069893124803,
   -0.0418069893124803,
   -0.05438687118703611,
   -0.020976760496869404,
   0.016000258185050197,
   -0.017700580558262013,
   -0.40986011306013015,
   0.08522276567851586,
   -0.041170457699782

In [65]:
json.dump(model_param, codecs.open(file_path, 'w', encoding='utf-8'), 
          separators=(',', ':'), 
          sort_keys=True, 
          indent=4)

In [62]:
with open('lr.json','w') as file:
    file.write(json_text)

NameError: name 'json_text' is not defined

In [40]:
import numpy as np
import json

def logistic_regression_to_json(lrmodel, file=None):
    if file is not None:
        serialize = lambda x: json.dump(x, file)
    else:
        serialize = json.dumps
    data = {}
    data['init_params'] = lrmodel.get_params()
    data['model_params'] = mp = {}
    for p in ('coef_', 'intercept_','classes_', 'n_iter_'):
        mp[p] = getattr(lrmodel, p).tolist()
    return serialize(data)

def logistic_regression_from_json(jstring):
    data = json.loads(jstring)
    model = LogisticRegression(**data['init_params'])
    for name, p in data['model_params'].items():
        setattr(model, name, np.array(p))
    return model

In [43]:
logistic_regression_to_json(lr)

str

In [185]:
#Random Forest Classifier

In [195]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42)

In [196]:
rf_clf.fit(cv_train_features,y_train_2)

In [197]:
Y_pred_rf_cv = rf_clf.predict(cv_test_features)

In [198]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, Y_pred_rf_cv)

print(cm)

# Extract the false negatives count from the confusion matrix
false_negatives = cm[1, 0]

# Calculate the false negative ratio
false_neg_ratio = false_negatives / sum(cm[0])

# Print the false negative ratio
print("False negative ratio: ", false_neg_ratio)

[[1750  549]
 [ 780 1040]]
False negative ratio:  0.3392779469334493


# Model Implementation on Tfidf Vectorizer

In [232]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=100,C=1)
sgd = SGDClassifier(loss='log_loss',max_iter=100)

In [233]:
lr.fit(tf_train_features,y_train_2)

In [234]:
sgd.fit(tf_train_features,y_train_2)

In [235]:
Y_pred_lr_tf = lr.predict(tf_test_features)
Y_pred_svm_tf = sgd.predict(tf_test_features)

In [236]:
print('Accuracy with Logistic Regression :', accuracy_score(Y_pred_lr_tf,y_val))
print(classification_report(y_val,Y_pred_lr_tf))

Accuracy with Logistic Regression : 0.7137654770575382
              precision    recall  f1-score   support

           0       0.73      0.78      0.75      2299
           1       0.69      0.63      0.66      1820

    accuracy                           0.71      4119
   macro avg       0.71      0.71      0.71      4119
weighted avg       0.71      0.71      0.71      4119



In [237]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, Y_pred_lr_tf)

print(cm)

# Extract the false negatives count from the confusion matrix
false_negatives = cm[1, 0]

# Calculate the false negative ratio
false_neg_ratio = false_negatives / sum(cm[0])

# Print the false negative ratio
print("False negative ratio: ", false_neg_ratio)

[[1792  507]
 [ 672 1148]]
False negative ratio:  0.2923010004349717


In [238]:
print('Accuracy with SGD Regression :', accuracy_score(y_val,Y_pred_svm_tf))
print(classification_report(y_val,Y_pred_svm_tf))

Accuracy with SGD Regression : 0.714008254430687
              precision    recall  f1-score   support

           0       0.73      0.78      0.75      2299
           1       0.70      0.63      0.66      1820

    accuracy                           0.71      4119
   macro avg       0.71      0.71      0.71      4119
weighted avg       0.71      0.71      0.71      4119



In [239]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, Y_pred_svm_tf)

print(cm)

# Extract the false negatives count from the confusion matrix
false_negatives = cm[1, 0]

# Calculate the false negative ratio
false_neg_ratio = false_negatives / sum(cm[0])

# Print the false negative ratio
print("False negative ratio: ", false_neg_ratio)

[[1798  501]
 [ 677 1143]]
False negative ratio:  0.2944758590691605


# Model Implementation with similarity score

In [243]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

In [247]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(max_iter=100,C=1)
sgd = SGDClassifier(loss='log_loss',max_iter=100)

In [254]:
X = df[['doc_similarity','Review']]
y= df['Spoiler_flag']
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [270]:
X_train[['Review','doc_similarity']]

Unnamed: 0,Review,doc_similarity
13209,every time i watch an iranian film i am amazed...,0.253923
13813,dom de luise once uttered a line i use quite o...,0.118664
6938,it is a great movie! charles chaplin is able t...,0.327167
9289,no need to recap plot or consensus points. in ...,0.064079
15753,tgbh is one of my favourite films ever. it is ...,0.306137
...,...,...
10955,two other titles came to mind as i watched thi...,0.192651
17289,"in , in chicago, the musicians and friends joe...",0.367849
5192,"this adventure flick, which mixes the western ...",0.177938
12172,"""there can be no understanding between the han...",0.283252


In [262]:
X_train['doc_similarity']

13209    0.253923
13813    0.118664
6938     0.327167
9289     0.064079
15753    0.306137
           ...   
10955    0.192651
17289    0.367849
5192     0.177938
12172    0.283252
235      0.392834
Name: doc_similarity, Length: 20591, dtype: float64

In [121]:
df

Unnamed: 0,Review,Title,Spoiler_flag,Synopsis,Cosine_Similarity,doc_similarity,tokens
0,the conclusion to the series hits some of the ...,/title/tt1201607/,0,After burying Dobby at the garden of the Shell...,0.158318,0.106248,"[conclusion, series, hits, strongest, emotiona..."
1,the lion king was pretty much my favourite mov...,/title/tt0110357/,1,The Lion King takes place in the Pride Lands o...,0.197134,0.404854,"[lion, king, pretty, much, favourite, movie, g..."
2,"ok, perhaps there is truth in the saying ""lost...",/title/tt0364569/,0,"The film begins in medias res, with the silhou...",0.115011,0.219557,"[ok,, perhaps, truth, saying, lost, translatio..."
3,top gun has been an 's staple since it first r...,/title/tt1745960/,1,"Over three decades after his time at TOPGUN, C...",0.423765,0.262304,"[top, gun, s, staple, since, first, released, ..."
4,i will keep this brief: this is simply one of ...,/title/tt0060196/,0,The film tells the story of three men who purs...,0.084172,0.156379,"[keep, brief:, simply, one, entertaining, best..."
...,...,...,...,...,...,...,...
22874,let us be honest: is this film really worthy o...,/title/tt0211915/,0,Amelie Poulain (Audrey Tautou) is the only chi...,0.112264,0.048846,"[let, us, honest:, film, really, worthy, top, ..."
22875,paul edgecomb (tom hanks) is the lead guard on...,/title/tt0120689/,0,The movie opens with a group of people running...,0.270954,0.230906,"[paul, edgecomb, (tom, hanks), lead, guard, de..."
22876,this is undoubtedly the greatest film ever. th...,/title/tt0031381/,0,"The film opens in Tara, a cotton plantation ow...",0.106873,0.180562,"[undoubtedly, greatest, film, ever., scale, fi..."
22877,"no, this is not an horrible movie and i did no...",/title/tt0046268/,0,"1950, in the very isolated town of Las Piedras...",0.174709,0.272613,"[no,, horrible, movie, disliked, watching, abs..."


In [127]:
#Deep learning

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
from sklearn.model_selection import train_test_split
import gensim
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
import nltk
from keras.utils import np_utils
import keras
from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from keras.layers.normalization import batch_normalization
from sklearn.preprocessing import LabelEncoder

In [148]:
df = pd.read_excel("final_df.xlsx")

In [150]:
df.head()

Unnamed: 0.1,Unnamed: 0,Review,Title,Spoiler_flag,Synopsis,Cosine_Similarity,doc_similarity
0,21866,The conclusion to the series hits some of the ...,/title/tt1201607/,0,After burying Dobby at the garden of the Shell...,0.158318,0.106248
1,7319,The Lion King was pretty much my favourite mov...,/title/tt0110357/,1,The Lion King takes place in the Pride Lands o...,0.197134,0.404854
2,15075,"OK, perhaps there is truth in the saying ""lost...",/title/tt0364569/,0,"The film begins in medias res, with the silhou...",0.115011,0.219557
3,3203,Top Gun has been an 80's staple since it first...,/title/tt1745960/,1,"Over three decades after his time at TOPGUN, C...",0.423765,0.262304
4,10801,I'll keep this brief: This is simply one of th...,/title/tt0060196/,0,The film tells the story of three men who purs...,0.084172,0.156379


In [151]:
df.drop("Unnamed: 0",inplace=True,axis=1)

In [152]:
X = df["Review"]
y = df["Spoiler_flag"]

In [153]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)

In [154]:
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=1)

In [155]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_train_2.shape)
print(X_val.shape)
print(y_train_2.shape)
print(y_val.shape)


(20591,)
(2288,)
(20591,)
(2288,)
(16472,)
(4119,)
(16472,)
(4119,)


In [156]:
import gensim

def preprocess_text(text):
    if isinstance(text, float):
        text = str(text)
    return gensim.utils.simple_preprocess(text)



In [157]:
tokenize_train = X_train_2.apply(preprocess_text)

In [158]:
tokenize_test = X_val.apply(preprocess_text)

In [159]:
tokenize_train

285      [have, yet, to, watch, good, or, very, good, m...
14544    [film, making, in, the, was, different, exerci...
13400    [it, is, because, of, this, movie, that, began...
6791     [marvel, seems, to, be, on, roll, towards, kil...
1210     [ve, heard, good, things, around, this, movie,...
                               ...                        
11034    [what, can, be, said, about, this, film, that,...
9141     [great, film, however, doesn, anyone, else, no...
22403    [my, next, movie, review, is, going, to, be, o...
17543    [guy, richie, follow, up, to, lock, stock, and...
1296     [set, in, during, the, height, of, world, war,...
Name: Review, Length: 16472, dtype: object

In [161]:
#Building Word2Vec Model
w2v_num_features = 512
w2v_model = gensim.models.Word2Vec(tokenize_train,size=w2v_num_features, window=150,min_count=2,sample=0.001)

In [164]:
def document_vectorizer(corpus,model,num_features):
  vocabulary=set(model.wv.index2entity)

  def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords=0
    for word in words:
      if word in vocabulary:
        nwords=nwords+1
        feature_vector=np.add(feature_vector, model.wv[word])
    if nwords:
      feature_vector = np.divide(feature_vector,nwords)

    return feature_vector
  features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]

  return  np.array(features)

In [165]:
avg_wv_train_features = document_vectorizer(corpus=tokenize_train,model=w2v_model, num_features=w2v_num_features)


In [166]:
avg_wv_test_features = document_vectorizer(corpus=tokenize_test,model=w2v_model, num_features=w2v_num_features)


In [167]:
#Basic Neural network model

In [4]:
from keras.layers.normalization.batch_normalization_v1 import BatchNormalization
from keras.engine import input_layer
from keras.regularizers import l2
from keras import regularizers


def construct_deepnn(num_input_features):
  dnn_model = Sequential()
  dnn_model.add(Dense(256, input_shape=(num_input_features,), kernel_initializer='glorot_uniform'))
  dnn_model.add(BatchNormalization())
  dnn_model.add(Activation('relu'))
  dnn_model.add(Dropout(0.2))

  dnn_model.add(Dense(1, kernel_regularizer=regularizers.l2(0.01)))
  dnn_model.add(Activation('sigmoid'))

  dnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return dnn_model

In [171]:
w2v_dnn = construct_deepnn(num_input_features=w2v_num_features)

In [177]:
batch_size=100

w2v_dnn.fit(avg_wv_train_features, y_train_2, epochs=20, batch_size=batch_size, shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function 

<keras.callbacks.History at 0x23121baf040>

In [193]:
w2v_dnn.save("deep_learning1.h5")

In [178]:
#LSTM

In [179]:
from collections import Counter

token_counter = Counter([token for review in tokenize_train for token in review])

vocab_map = {item[0]: index+1 for index,item in enumerate(dict(token_counter).items())}

In [180]:
max_index = np.max(list(vocab_map.values()))
vocab_map['PAD_INDEX'] = 0
vocab_map['NOT_FOUND_INDEX'] = max_index+1
vocab_size = len(vocab_map)

In [181]:
print('Vocabulary Size:', vocab_size)
print('Sample Size of Vocabulary Size:', dict(list(vocab_map.items())[10:20]))


Vocabulary Size: 56221
Sample Size of Vocabulary Size: {'every': 11, 'watched': 12, 'is': 13, 'also': 14, 'old': 15, 'jack': 16, 'nicholson': 17, 'as': 18, 'young': 19, 'film': 20}


In [182]:
max_len = np.max([len(review) for review in tokenize_train])

In [183]:
# Convert tokeized train to numeric vectors
train_X = [[vocab_map[token] for token in line] for line in tokenize_train]

In [187]:
from keras.preprocessing import sequence


In [188]:
train_X = sequence.pad_sequences(train_X,maxlen=max_len)


In [189]:
# Convert tokeized test to numeric vectors

test_X = [[vocab_map[token] if vocab_map.get(token) else vocab_map['NOT_FOUND_INDEX']for token in line] for line in tokenize_test]

In [190]:
test_X = sequence.pad_sequences(test_X,maxlen=max_len)


In [191]:
# Drop out 40%

from keras.layers import Dropout, Embedding, Dense, SpatialDropout1D
from keras.layers import LSTM

EMBEDDING_DIMENSION = 128
LSTM_DIM = 64

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIMENSION, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(LSTM_DIM, dropout=0.2,recurrent_dropout=0.4))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [192]:
model.fit(train_X,y_train_2,epochs=10, batch_size=100, shuffle=True, validation_split=0.1, verbose=1)


Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function had ()
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: closure mismatch, requested ('self', 'step_function'), but source function 

<keras.callbacks.History at 0x231311cb880>

In [194]:
model.save("lstm1.h5")