In [1]:
import pandas as pd
import numpy as np
import re
import string
import pickle as pk

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier ,ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score ,f1_score

  from pandas import MultiIndex, Int64Index


In [2]:
df_train = pd.read_csv('Train.csv')
# df_train = pd.read_parquet('pre_train.parquet').iloc[:,1:]
df_val = pd.read_csv('Valid.csv')
df_test = pd.read_csv('Test.csv')

In [16]:
np.array((df_val.sample(1).text , df_val.sample(1).label) )

array([['Harrison Ford playing a playing a cop in a crime thriller. The perfect ingredients it SEEMS for top entertainment with Harrison back to his Indy and Han Solo best, protecting a witness from ruthless and merciless murderers. How easy it is to be fooled. If the film concentrated on the main, supposed, themes of crime and suspense instead of putting up barns and shoving ice creams in peoples faces it possibly could have been more worthwhile. Unbelieveably predictable with the best method of despatching of a foe is with corn.'],
       [1]], dtype=object)

In [14]:
df_train.head(2)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0


In [15]:
df_val.head(2)

Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0


In [16]:
df_train.describe(include= "all")

Unnamed: 0,text,label
count,40000,40000.0
unique,39723,
top,"Hilarious, clean, light-hearted, and quote-wor...",
freq,4,
mean,,0.499525
std,,0.500006
min,,0.0
25%,,0.0
50%,,0.0
75%,,1.0


In [17]:
df_train['text_length'] = df_train['text'].apply(lambda x: len(x))

In [18]:
df_train.drop(df_train[df_train.text_length > 5500].index , axis = 0 , inplace = True)

In [19]:
df_train.reset_index(inplace=True)
# print(df_train.shape)
# df_train.head()

## Preprocessing

In [20]:
# Lowercasing
df_train['text'] = df_train['text'].str.lower()
# Tokenization and removal of punctuation
df_train['text'] = df_train['text'].apply(lambda x: word_tokenize(x.translate(str.maketrans('', '', string.punctuation))))

# Removal of stopwords
stop_words = set(stopwords.words('english'))
df_train['text'] = df_train['text'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df_train['text'] = df_train['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Joining the tokens back to text
df_train['text'] = df_train['text'].apply(lambda x: ' '.join(x))

In [21]:
df_train.drop_duplicates(inplace = True)

In [22]:
df_train = df_train.sample(frac = 1)
df_train.reset_index(drop=True,inplace=True)

In [23]:
df_train.head(2)

Unnamed: 0,index,text,label,text_length
0,30871,80 best time worst time james karin starred on...,0,952
1,28297,good movie keep front tv dying see resultbr br...,0,273


## Tokenizing for Deep learning

In [33]:
tokenizer = Tokenizer(num_words=2500)
tokenizer.fit_on_texts(df_train['text'])
train_sequences = tokenizer.texts_to_sequences(df_train['text'])
val_sequences = tokenizer.texts_to_sequences(df_val['text'])
test_sequences = tokenizer.texts_to_sequences(df_test['text'])

In [30]:
# Pad the sequences to the same length
max_length = 256
train_data = pad_sequences(train_sequences, maxlen=max_length)
val_data = pad_sequences(val_sequences, maxlen=max_length)
test_data = pad_sequences(test_sequences, maxlen=max_length)

In [34]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(2500, 256, input_length=max_length))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(16))  
model.add(Dense(1, activation='sigmoid'))


# Compile the model
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

# Train the model
model.fit(train_data, df_train['label'], validation_data = (val_data, df_val['label']), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fb8c7cf340>

In [35]:
# Evaluate the model
predictions = model.predict(val_data)
predictions = np.round(predictions).flatten()
print('F1 Score : ',f1_score(df_val['label'], predictions, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(df_val['label'],predictions))
print("Accuracy:", accuracy_score(df_val['label'], predictions))

F1 Score :  0.8609141484104577
confusion matrix: 
 [[2083  403]
 [ 292 2222]]
Accuracy: 0.861


## VECTORIZE & Machine Leaning

In [36]:
X = df_train['text']
y = np.array(df_train["label"])

In [37]:
X.head(2)

0    80 best time worst time james karin starred on...
1    good movie keep front tv dying see resultbr br...
Name: text, dtype: object

In [38]:
vect = TfidfVectorizer(max_features=12500 , ngram_range=(1,2))
train_x = vect.fit_transform(X)

In [39]:
val_x = vect.transform(df_val['text'])
test_x = vect.transform(df_test['text'])

In [40]:
# Save vectorizer for deployment
#pk.dump(vect , open('vectortizer.sav','wb'))

In [41]:
X_train = train_x.toarray()
X_val = val_x.toarray()
X_test = test_x.toarray()

In [42]:
y_train = y.astype('int')
y_val = df_val['label']

## Modelling

#### RandomForest MODEL

In [None]:
# RF_model = RandomForestClassifier(n_estimators=250,max_depth = 24, min_samples_split=25, n_jobs=-1, verbose= 2)
# RF_model.fit(X_train,y_train)
# rf_y_pred = RF_model.predict(val_x)

In [None]:
# print('train score: ' ,RF_model.score(X_train, y_train))
# print('test score: ' ,RF_model.score(val_x, y_val))
# print('accuracy score: ' ,accuracy_score(rf_y_pred, y_val))
# print('F1 Score : ',f1_score(y_val, rf_y_pred, average='weighted'))
# print('confusion matrix: \n' , confusion_matrix(y_val,rf_y_pred))

# ----------------------------
# ----------------------------

# train score:  0.8699657463151339
# test score:  0.7294
# accuracy score:  0.7294
# F1 Score :  0.7117945023461818
# confusion matrix: 
#  [[1199 1287]
#  [  66 2448]]

#### LogisticRegression Model

In [None]:
LR_model = LogisticRegression(n_jobs=-1) 
LR_model.fit(X_train,y_train)

In [None]:
lr2_y_pred = LR_model.predict(X_val)

In [None]:
print('train score: ' ,LR_model.score(X_train, y_train))
print('test score: ' ,LR_model.score(val_x, y_val))

In [None]:
print('accuracy score: ' ,accuracy_score(lr2_y_pred, np.array(y_val)))
print('F1 Score : ',f1_score(np.array(y_val), lr2_y_pred, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(np.array(y_val),lr2_y_pred))
# accuracy score:  0.8812
# F1 Score :  0.8808737409797148

In [None]:
train_data

In [None]:
# Saving the best model

# pk.dump(LR_model , open('LR_Model88.sav','wb'))

#### XGB Model

In [None]:
XGB_model = XGBClassifier(n_estimators=200, max_depth=10 , eta=0.6, subsample=0.7, colsample_bytree=0.7,n_jobs=-1 ,verbosity =3)
XGB_model.fit(X_train,y_train)
xg_y_pred = XGB_model.predict(val_x)

In [None]:
print('accuracy score: ' ,accuracy_score(xg_y_pred, y_val))
print('F1 Score : ',f1_score(y_val, xg_y_pred, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(y_val,xg_y_pred))

# ---------------------------
# ---------------------------

# accuracy score:  0.848
# F1 Score :  0.8475825642164702
# confusion matrix: 
#  [[1982  504]
#  [ 256 2258]]

#### KNeighborsClassifier Model

In [None]:
knn_model = KNeighborsClassifier(n_neighbors= 100,n_jobs=-1)
knn_model.fit(X_train,y_train)
knn_y_pred = knn_model.predict(val_x)

In [None]:
print('accuracy score: ' ,accuracy_score(knn_y_pred, y_val))
print('F1 Score : ',f1_score(y_val, knn_y_pred, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(y_val,knn_y_pred))

# --------------------
# --------------------

# accuracy score:  0.6982
# F1 Score :  0.6950688294693479
# confusion matrix: 
#  [[1992  494]
#  [1015 1499]]

#### ExtraTreesClassifier Model

In [35]:
etc_model = ExtraTreesClassifier(n_estimators=250,max_depth = 12, min_samples_split=50, n_jobs=-1, verbose= 2)
etc_model.fit(X_train,y_train)
etc_y_pred = etc_model.predict(val_x)

In [None]:
print('train score: ' ,etc_model.score(X_train, y_train))
print('test score: ' ,etc_model.score(val_x, y_val))
print('accuracy score: ' ,accuracy_score(etc_y_pred, y_val))
print('F1 Score : ',f1_score(y_val, etc_y_pred, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(y_val,etc_y_pred))

# ------------------
# ------------------

# train score:  0.8871963877932323
# test score:  0.7446
# accuracy score:  0.7446
# F1 Score :  0.729544218437901
# confusion matrix: 
#  [[1265 1221]
#  [  56 2458]]

#### Multinomial Bayes naive model

In [None]:
NB_model = MultinomialNB(alpha=16)
NB_model.fit(X_train,y_train)
nb_y_pred = NB_model.predict(val_x)

In [None]:
print('train score: ' ,NB_model.score(X_train, y_train))
print('test score: ' ,NB_model.score(val_x, y_val))
print('accuracy score: ' ,accuracy_score(nb_y_pred, y_val))
print('F1 Score : ',f1_score(y_val, nb_y_pred, average='weighted'))
print('confusion matrix: \n' , confusion_matrix(y_val,nb_y_pred))

# ------------------
# ------------------

# train score:  0.8775520479319286
# test score:  0.8662
# accuracy score:  0.8662
# F1 Score :  0.8661798021138692
# confusion matrix: 
#  [[2127  359]
#  [ 310 2204]]

### Trying to see ensembling score

In [43]:
prd = predictions.astype(int)

In [45]:
combined_predictions = np.round((prd + lr2_y_pred + nb_y_pred) / 3)
print("Combined Predictions Accuracy:", accuracy_score(y_val, combined_predictions))
print('F1 Score : ',f1_score(y_val, combined_predictions, average='weighted'))

Combined Predictions Accuracy: 0.8816
F1 Score :  0.8812945027987608


### Save  model , submission

In [46]:
y_test = model.predict(test_data)
y_test = np.round(y_test).flatten()



In [48]:
y_t1 = y_test.astype(int) 

In [49]:
y_t2  = NB_model.predict(X_test)
y_t3  = LR_model.predict(X_test)

In [None]:
y_t4 = knn_model.predict(X_test)
y_t5 = XGB_model.predict(X_test)

In [50]:
combined_predictions = np.round((y_t1 +y_t2 + y_t3 + y_t4 + y_t5) /5)

In [52]:
cp = combined_predictions.astype(int)

In [28]:
# y_test = LR_model.predict(X_test)

In [54]:
import pandas as pd

In [55]:
sub = pd.read_csv('sample_submission.csv')
sub.label = cp

In [56]:
sub.to_csv('sub_DL2.csv' , index=False)