In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')
tfidf_model = TfidfVectorizer(
        max_features=1000,
        analyzer='word',
        ngram_range=(1, 2)
        )
MAINDIR = "./gdrive/My Drive/NLP_Movie/"



In [0]:
#Load data
df = pd.read_csv(MAINDIR + 'data_preproces.csv')

In [0]:
df.head()

Unnamed: 0,docuName,text,category,length,category_id
0,cv000_29590.txt,film adapt from comic book have have plenti of...,pos,717,1
1,cv003_11664.txt,jaw be a rare film that grab -pron- attent bef...,pos,1027,1
2,cv004_11636.txt,moviemak be a lot like be the gener manag of a...,pos,677,1
3,cv008_29435.txt,after bloodi clash and independ win lumumba re...,pos,263,1
4,cv001_18431.txt,everi now and then a movi come along from a su...,pos,705,1


In [0]:
#Select train and test data
X_train, X_test, y_train, y_test = train_test_split(df.text.values, df.category_id.values, test_size=0.05, random_state=38)

In [0]:
#tfidf model
tfidf_model = tfidf_model.fit(X_train)

In [0]:
joblib.dump(tfidf_model, MAINDIR + 'tfidf.pkl')

['./gdrive/My Drive/NLP_Movie/tfidf.pkl']

In [0]:
# transform text to numeric vector
X_train_vec = tfidf_model.transform(X_train)
X_train_vec = X_train_vec.toarray()
X_test_vec = tfidf_model.transform(X_test)
X_test_vec = X_test_vec.toarray()

# Models

## Machine learning models

In [0]:
#To apply RandomizedSearchCV and save machine learning models
def randomize_function(model, x, y,paramDic, classif_folder=None, classifier ='classifier', n_jobs = -1, cv = 3, n_iter =10):
    
    if classif_folder== None:
      classif_folder = ''
    
    #In case of n_iter is greater than the possible combinations
    vals = 1
    for l in paramDic.values(): vals = vals*len(l)
    nIter = min(vals,n_iter)

    clf = RandomizedSearchCV(estimator=model, param_distributions=paramDic,
                             n_jobs=n_jobs, cv=cv, n_iter=nIter, return_train_score=True)

    clf.fit(x, y)
    
    
    with open(MAINDIR + classif_folder+'/'+classifier+'_best_params.txt', "w") as text_file:
      print(clf.best_params_, file=text_file)
     
    
    
    clf2 = model
    print(classifier, clf.best_params_)
    clf2.set_params(**clf.best_params_)
    clf2.fit(x, y)
    
    joblib.dump(clf2, MAINDIR + classif_folder + '/'+ classifier + '.pkl')
    
    acc_train = clf2.score(x, y)
    
    return clf2, acc_train

### Run Machine Learning Models

In [0]:
paramLR = {'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C':[1,.01,.01,.001],'class_weight':['balanced',None]}
paramSVC = {'C': np.arange(1,1000,50), 'kernel': ['linear', 'poly', 'rbf'],'gamma': np.arange(0.5,100,50)}
paramXGB = {'max_depth':[3,5,7], 'learning_rate':[0.1,0.05],'n_estimators':np.arange(90,160,10),'booster':['gbtree', 'gblinear', 'dart']}

models = [('LogisticRegression', LogisticRegression(), paramLR),
          ('SVC',SVC(), paramSVC),
          ('XGBClassifier', XGBClassifier(), paramXGB)
         ]

metrics = []
for name, model, param in models:
  clf, acc_train = randomize_function(model, X_train_vec, y_train, param, classifier=name)
  acc_test = clf.score(X_test_vec, y_test)
  metrics.append((name, acc_train, acc_test))

LogisticRegression {'solver': 'sag', 'class_weight': None, 'C': 1}
SVC {'kernel': 'rbf', 'gamma': 0.5, 'C': 1}
XGBClassifier {'n_estimators': 90, 'max_depth': 5, 'learning_rate': 0.1, 'booster': 'gbtree'}


In [0]:
# results: (model, train accuracy, test accuracy)
metrics

[('LogisticRegression', 0.8642105263157894, 0.77),
 ('SVC', 0.891578947368421, 0.77),
 ('XGBClassifier', 0.998421052631579, 0.77)]

## Neural Networw

In [0]:
import keras
#from keras import backend as k
from keras.models import Sequential, load_model
from keras.layers import Activation
from keras.layers.core import Dense
from keras.optimizers import Adam
#from keras.metrics import categorical_crossentropy
from keras import callbacks
from keras.layers import Dropout

Using TensorFlow backend.


In [0]:
#hot encoding
y_trainNN = pd.get_dummies(y_train).values

In [0]:
#Build NN
model = Sequential([
    Dense(512,input_shape=(X_train_vec.shape[1],),activation='relu'),
    Dense(256,activation='relu'),
    Dense(64,activation='relu'),
    Dropout(0.25),
    Dense(y_trainNN.shape[1], activation = 'softmax')             
])

In [0]:
model.compile(Adam(lr=.00001), loss='categorical_crossentropy', metrics=['accuracy']) #.0001

In [0]:
#Define early stoping and save best model 
patience = 10
callbacks_list = [ callbacks.EarlyStopping(monitor='val_loss', patience=patience, verbose=1),]
  
callbacks_list.append(callbacks.ModelCheckpoint(MAINDIR +'NN_model2.h5',
                                          monitor='val_loss', verbose=1,
                                          save_best_only=True, mode='min'))

In [0]:
#train model
model.fit(X_train_vec, y_trainNN, batch_size=100, epochs=300, verbose = 2, callbacks=callbacks_list, 
          validation_split=.05)

Train on 1805 samples, validate on 95 samples
Epoch 1/300
 - 1s - loss: 0.6926 - acc: 0.5330 - val_loss: 0.6928 - val_acc: 0.4316

Epoch 00001: val_loss improved from inf to 0.69282, saving model to ./gdrive/My Drive/NLP_Movie/NN_model2.h5
Epoch 2/300
 - 0s - loss: 0.6923 - acc: 0.5407 - val_loss: 0.6924 - val_acc: 0.4737

Epoch 00002: val_loss improved from 0.69282 to 0.69242, saving model to ./gdrive/My Drive/NLP_Movie/NN_model2.h5
Epoch 3/300
 - 0s - loss: 0.6919 - acc: 0.5540 - val_loss: 0.6921 - val_acc: 0.4842

Epoch 00003: val_loss improved from 0.69242 to 0.69212, saving model to ./gdrive/My Drive/NLP_Movie/NN_model2.h5
Epoch 4/300
 - 0s - loss: 0.6915 - acc: 0.5518 - val_loss: 0.6922 - val_acc: 0.4737

Epoch 00004: val_loss did not improve from 0.69212
Epoch 5/300
 - 0s - loss: 0.6906 - acc: 0.5850 - val_loss: 0.6920 - val_acc: 0.4842

Epoch 00005: val_loss improved from 0.69212 to 0.69204, saving model to ./gdrive/My Drive/NLP_Movie/NN_model2.h5
Epoch 6/300
 - 0s - loss: 0.69

<keras.callbacks.History at 0x7fcf8e29a8d0>

In [0]:
#to load best saved model
model = load_model(MAINDIR +'NN_model.h5')

In [0]:
#predict train
y_pred_train = np.argmax(model.predict(X_train_vec),axis=1) 

In [0]:
#accuracy train
acc_train = sum(y_train == y_pred_train)/len(y_train)
acc_train

0.9468421052631579

In [0]:
#predict test
y_pred_test = np.argmax(model.predict(X_test_vec),axis=1)

In [0]:
#accuracy test
acc_test = sum(y_test == y_pred_test)/len(y_test)
acc_test

0.81

The Neural Networks had the highest accuray, for this reason this model is chosen