# MsCA 31009 - Machine Learning and Predictive Analytics

## Project - Toxic Comment Classification

## Import files and libraries.

In [49]:
!pip3 install autocorrect
!pip3 install nltk
!pip3 install imblearn
!pip3 install keras

Collecting keras
[?25l  Downloading https://files.pythonhosted.org/packages/5e/10/aa32dad071ce52b5502266b5c659451cfd6ffcbf14e6c8c4f16c0ff5aaab/Keras-2.2.4-py2.py3-none-any.whl (312kB)
[K    100% |################################| 317kB 6.1MB/s ta 0:00:011
Collecting pyyaml (from keras)
[?25l  Downloading https://files.pythonhosted.org/packages/9e/a3/1d13970c3f36777c583f136c136f804d70f500168edc1edea6daa7200769/PyYAML-3.13.tar.gz (270kB)
[K    100% |################################| 276kB 9.7MB/s eta 0:00:01
Building wheels for collected packages: pyyaml
  Running setup.py bdist_wheel for pyyaml ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/ad/da/0c/74eb680767247273e2cf2723482cb9c924fe70af57c334513f
Successfully built pyyaml
Installing collected packages: pyyaml, keras
Successfully installed keras-2.2.4 pyyaml-3.13


In [7]:
import pandas as pd
import numpy as np

import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from autocorrect import spell

from imblearn.over_sampling import SMOTE, RandomOverSampler

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC

import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Download train data.**

In [8]:
!wget 'https://drive.google.com/uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'
!unzip -o 'uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp'

/bin/sh: 1: wget: not found
unzip:  cannot find or open uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp, uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp.zip or uc?export=download&id=1hcoewV5fpD0kx8ysZsZi8EnSjxIgC0lp.ZIP.

No zipfiles found.


In [10]:
toxic = pd.read_csv('train.csv')

## Data Preprocessing

### Text Cleaning

In [11]:
toxic.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


**Remove ID column.**

In [12]:
toxic.drop(['id'], axis=1, inplace=True)

**Remove non-alphabet characters**

In [13]:
toxic['comment_text'] = [re.sub('[^A-Za-z]', ' ', i).lower() for i in toxic['comment_text']]

**Tokenization**

In [14]:
toxic['comment_text_tokenize'] = [word_tokenize(i) for i in toxic['comment_text']]

In [15]:
toxic.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my..."
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour..."
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit..."
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions..."
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re..."


**Standardize contraction**

In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"cant", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

**Stemming**

In [18]:
stemmer = SnowballStemmer('english')
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize']:
    sentence_stemmed = [stemmer.stem(clean_text(word)) for word in sentence]
    stentence_placeholder.append(sentence_stemmed)
toxic['comment_text_tokenize_stemmed'] = stentence_placeholder

**Stopwords Removal**

In [19]:
stentence_placeholder = []
for sentence in toxic.loc[:,'comment_text_tokenize_stemmed']:
    sentence_clean = [word for word in sentence if word not in stopwords.words('english')]
    stentence_placeholder.append(sentence_clean)
toxic['comment_text_clean'] = stentence_placeholder
toxic['comment_text_clean'] = [' '.join(i) for i in toxic['comment_text_clean']]

In [20]:
toxic

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
0,explanation why the edits made under my userna...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[explan, whi, the, edit, made, under, my, user...",explan whi edit made usernam hardcor metallica...
1,d aww he matches this background colour i m s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...","[d, aww, he, match, this, background, colour, ...",aww match background colour seem stuck thank t...
2,hey man i m really not trying to edit war it...,0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...","[hey, man, i, m, realli, not, tri, to, edit, w...",hey man realli tri edit war guy constant remov...
3,more i can t make any real suggestions on im...,0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...","[more, i, can, t, make, ani, real, suggest, on...",make ani real suggest improv wonder section st...
4,you sir are my hero any chance you remember...,0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[you, sir, are, my, hero, ani, chanc, you, rem...",sir hero ani chanc rememb page
5,congratulations from me as well use the to...,0,0,0,0,0,0,"[congratulations, from, me, as, well, use, the...","[congratul, from, me, as, well, use, the, tool...",congratul well use tool well talk
6,cocksucker before you piss around on my work,1,1,1,0,1,0,"[cocksucker, before, you, piss, around, on, my...","[cocksuck, befor, you, piss, around, on, my, w...",cocksuck befor piss around work
7,your vandalism to the matt shirvington article...,0,0,0,0,0,0,"[your, vandalism, to, the, matt, shirvington, ...","[your, vandal, to, the, matt, shirvington, art...",vandal matt shirvington articl revert pleas ban
8,sorry if the word nonsense was offensive to ...,0,0,0,0,0,0,"[sorry, if, the, word, nonsense, was, offensiv...","[sorri, if, the, word, nonsens, was, offens, t...",sorri word nonsens offens anyway intend write ...
9,alignment on this subject and which are contra...,0,0,0,0,0,0,"[alignment, on, this, subject, and, which, are...","[align, on, this, subject, and, which, are, co...",align subject contrari dulithgow


In [21]:
toxic.to_csv('train_cleaned.csv', index=False)

### Create feature spaces

In [22]:
toxic = pd.read_csv('train_cleaned.csv')

**Drop NA**

In [23]:
toxic.describe(include='all')

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_tokenize,comment_text_tokenize_stemmed,comment_text_clean
count,159571,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571,159571,159521
unique,159305,,,,,,,158250,158225,157648
top,jun utc,,,,,,,['january'],['januari'],januari
freq,11,,,,,,,21,21,22
mean,,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,,,
std,,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,,,
min,,0.0,0.0,0.0,0.0,0.0,0.0,,,
25%,,0.0,0.0,0.0,0.0,0.0,0.0,,,
50%,,0.0,0.0,0.0,0.0,0.0,0.0,,,
75%,,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [24]:
toxic.dropna(axis=0, inplace=True)

**Split Train and Test**

In [133]:
x_train, x_test, y_train, y_test = train_test_split(toxic.loc[:,'comment_text_clean'], toxic.iloc[:,1:7], test_size = .3, random_state = 43)

In [26]:
x_train.head()

21524    thank note worri wait period get permiss owner...
56229    page need massiv edit initi section befor hit ...
93765                                       okaaaaaay test
87443    apologis make remark sidaway return perhap cou...
73667    newspap headlin newspap headlin blank adult sw...
Name: comment_text_clean, dtype: object

In [27]:
x_train.shape

(111664,)

In [28]:
y_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
21524,0,0,0,0,0,0
56229,0,0,0,0,0,0
93765,0,0,0,0,0,0
87443,0,0,0,0,0,0
73667,0,0,0,0,0,0


**Create feature spaces**

In [129]:
#Count Vectors as features

count_vect = CountVectorizer(max_features=5000)
count_vect.fit(x_train)
x_train_cv = count_vect.transform(x_train)
x_test_cv = count_vect.transform(x_test)

#TF-IDF Vectors as features

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(x_train)
x_train_tfidf =  tfidf_vect.transform(x_train)
x_test_tfidf =  tfidf_vect.transform(x_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(x_train)
x_train_tfidf_ngram =  tfidf_vect_ngram.transform(x_train)
x_test_tfidf_ngram =  tfidf_vect_ngram.transform(x_test)

In [130]:
feature_name_cv = count_vect.get_feature_names()
feature_name_tfidf = tfidf_vect.get_feature_names()
feature_name_ngram = tfidf_vect_ngram.get_feature_names()

In [131]:
print(feature_name_tfidf)

['aa', 'aaron', 'ab', 'abandon', 'abbrevi', 'abc', 'abid', 'abil', 'abl', 'abort', 'abov', 'abraham', 'abroad', 'absenc', 'absent', 'absolut', 'abstract', 'absurd', 'abund', 'abus', 'ac', 'academ', 'academi', 'acceler', 'accent', 'accept', 'access', 'accid', 'accident', 'accommod', 'accompani', 'accomplish', 'accord', 'account', 'accur', 'accuraci', 'accus', 'ace', 'achiev', 'acid', 'acknowledg', 'acquir', 'acronym', 'across', 'act', 'action', 'activ', 'activist', 'actor', 'actress', 'actual', 'ad', 'adam', 'adapt', 'add', 'addict', 'addit', 'address', 'adequ', 'adher', 'adject', 'adjust', 'admin', 'administ', 'administr', 'adminship', 'admir', 'admiss', 'admit', 'adolf', 'adopt', 'adress', 'adult', 'advanc', 'advantag', 'adventur', 'advert', 'advertis', 'advic', 'advis', 'advoc', 'advocaci', 'ae', 'aesthet', 'afc', 'afd', 'affair', 'affect', 'affili', 'affirm', 'afford', 'afghan', 'afghanistan', 'aforement', 'afraid', 'africa', 'african', 'afternoon', 'afterward', 'age', 'agenc', 'age

### Oversampling (RandomOverSampler)

In [134]:
x_train_cv_os_all = []
y_train_cv_os_all = []

x_train_tfidf_os_all = []
y_train_tfidf_os_all = []

x_train_ngram_os_all = []
y_train_ngram_os_all = []


for i in range(6):
    sm_cv = RandomOverSampler(random_state=40)
    x_train_cv_os, y_train_cv_os = sm_cv.fit_resample(x_train_cv, y_train.iloc[:,i])
    x_train_cv_os_all.append(x_train_cv_os)
    y_train_cv_os_all.append(y_train_cv_os)
    
    sm_tfidf = RandomOverSampler(random_state=40)
    x_train_tfidf_os, y_train_tfidf_os = sm_tfidf.fit_resample(x_train_tfidf, y_train.iloc[:,i])
    x_train_tfidf_os_all.append(x_train_tfidf_os)
    y_train_tfidf_os_all.append(y_train_tfidf_os)
    
    sm_ngram = RandomOverSampler(random_state=40)
    x_train_ngram_os, y_train_ngram_os = sm_ngram.fit_resample(x_train_tfidf_ngram, y_train.iloc[:,i])
    x_train_ngram_os_all.append(x_train_ngram_os)
    y_train_ngram_os_all.append(y_train_ngram_os)

In [135]:
x_train_y_train_all = [x_train_cv_os_all, y_train_cv_os_all, x_train_tfidf_os_all, y_train_tfidf_os_all, x_train_ngram_os_all, y_train_ngram_os_all]

In [136]:
x_test_y_test_all = [x_test_cv, y_test, x_test_tfidf, y_test, x_test_tfidf_ngram, y_test]

In [137]:
for i in x_train_cv_os_all:
    print(i.shape)

(201698, 5000)
(221074, 5000)
(211334, 5000)
(222628, 5000)
(212120, 5000)
(221320, 5000)


In [138]:
import pickle

# where do I want to store this file?
# Open the file to save as pkl file
train_data_path = 'train_data_array.pkl'
train_data_path_pkl = open(train_data_path, 'wb')
pickle.dump(x_train_y_train_all, train_data_path_pkl)

# Close the pickle instances
train_data_path_pkl.close()

In [139]:
test_data_path = 'test_data_array.pkl'
test_data_path_pkl = open(test_data_path, 'wb')
pickle.dump(x_test_y_test_all, test_data_path_pkl)

## Load Feature Matrices

In [140]:
import pickle
x_train_y_train_all_load = pickle.load(open('train_data_array.pkl', 'rb'))
x_test_y_test_all_load = pickle.load(open('test_data_array.pkl', 'rb'))

In [141]:
x_train_cv_os_all = x_train_y_train_all_load[0]
y_train_cv_os_all = x_train_y_train_all_load[1]

x_train_tfidf_os_all = x_train_y_train_all_load[2]
y_train_tfidf_os_all = x_train_y_train_all_load[3]

x_train_ngram_os_all = x_train_y_train_all_load[4]
y_train_ngram_os_all = x_train_y_train_all_load[5]

In [142]:
x_test_cv = x_test_y_test_all_load[0]
x_test_tfidf = x_test_y_test_all_load[2]
x_test_tfidf_ngram = x_test_y_test_all_load[4]
y_test = x_test_y_test_all_load[1]

y_test = [np.array(y_test.iloc[:,i]).reshape(-1,1) for i in range(6)]

## Model Selection

### Logistic Regression

#### Count Vector Feature Space

In [143]:
class toxicmodel:
    def __init__(self, x_train, y_train, x_test, y_test, n = 6):
        self.n = n
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
        self.best_params = []
        self.best_estimator = []
        
        self.y_predict_train = []
        self.y_predict_test = []
        self.y_predict_proba_train = []
        self.y_predict_proba_test = []

        self.acc_score_train = []
        self.acc_score_test = []

        self.roc_auc_score_train = []
        self.roc_auc_score_test = []

        self.f1_score_train = []
        self.f1_score_test = []

        self.confusion_matrix_train = []
        self.confusion_matrix_test = []

        self.classification_report_train = []
        self.classification_report_test = []

    
    def trainmodel(self, model_name, hyper_param_grid):
        for i in range(self.n):
            grid_search_model = GridSearchCV(model_name, hyper_param_grid, scoring = 'f1', cv = 5,refit = True, n_jobs=-1, verbose = 5)
            grid_search_model.fit(self.x_train[i], self.y_train[i])
            self.best_params.append(grid_search_model.best_params_)
            self.best_estimator.append(grid_search_model.best_estimator_)
    
    
    def predictmodel(self):
        for i in range(self.n):
            
            y_predict_train = self.best_estimator[i].predict(self.x_train[i])
            y_predict_test = self.best_estimator[i].predict(self.x_test)
             
            #y_predict_proba_train = self.best_estimator[i].predict_proba(self.x_train[i])[:,1]
            #y_predict_proba_test = self.best_estimator[i].predict_proba(self.x_test)[:,1]
            

            #self.y_predict_train.append(y_predict_train)
            #self.y_predict_test.append(y_predict_test)
            
            #self.y_predict_proba_train.append(y_predict_proba_train)
            #self.y_predict_proba_test.append(y_predict_proba_test)

            #self.roc_auc_score_train.append(roc_auc_score(self.y_train[i], y_predict_proba_train))
            #self.roc_auc_score_test.append(roc_auc_score(self.y_test[i], y_predict_proba_test))
            
            self.acc_score_train.append(accuracy_score(self.y_train[i], y_predict_train))
            self.acc_score_test.append(accuracy_score(self.y_test[i], y_predict_test))
            
            self.f1_score_train.append(f1_score(self.y_train[i], y_predict_train))
            self.f1_score_test.append(f1_score(self.y_test[i], y_predict_test))

            self.confusion_matrix_train.append(confusion_matrix(self.y_train[i], y_predict_train))
            self.confusion_matrix_test.append(confusion_matrix(self.y_test[i], y_predict_test))

            self.classification_report_train.append(classification_report(self.y_train[i], y_predict_train))
            self.classification_report_test.append(classification_report(self.y_test[i], y_predict_test))

In [147]:
if __name__ == '__main__':
    log_toxic = toxicmodel(x_train_tfidf_os_all, y_train_tfidf_os_all, x_test_tfidf, y_test)
    log_toxic.trainmodel(LogisticRegression(), {'random_state':[0]})
    log_toxic.predictmodel()

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.2s remaining:    3.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.5s finished


In [148]:
log_toxic.best_estimator

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=0, solver='warn',
           tol=0.0001, verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=0, solver='warn',
           tol=0.0001, verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=0, solver='warn',
           tol=0.0001, verbose=0, warm_start=False),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=0, solv

In [149]:
log_toxic.f1_score_test

[0.70342553581242,
 0.3905268245529242,
 0.7417824466282615,
 0.239247311827957,
 0.6252743806836,
 0.3128984796468858]

In [150]:
print(log_toxic.classification_report_test[0])

              precision    recall  f1-score   support

           0       0.98      0.94      0.96     43378
           1       0.60      0.86      0.70      4479

   micro avg       0.93      0.93      0.93     47857
   macro avg       0.79      0.90      0.83     47857
weighted avg       0.95      0.93      0.94     47857



### Neural Network

In [151]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import np_utils

In [158]:
model = Sequential()
model.add(Dense(512, input_shape=(5000,)))
model.add(Activation('relu')) 

model.add(Dropout(0.2))   

model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dropout(0.2))
model.add(Dense(512))
model.add(Activation('relu'))

model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('softmax'))

In [159]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 512)               2560512   
_________________________________________________________________
activation_16 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_17 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 512)               262656    
__________

In [160]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [161]:
y_train = np_utils.to_categorical(y_train_tfidf_os_all[0], 2)

In [156]:
y_test0 = np_utils.to_categorical(y_test[0], 2)

In [162]:
history = model.fit(x_train_tfidf_os_all[0], y_train,
          batch_size=128, epochs=20,
          verbose=1,
          validation_split=0.1)

Train on 181528 samples, validate on 20170 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [163]:
y_test.shape

(47857, 2)

In [164]:
score = model.evaluate(x_test_tfidf, y_test0, verbose=0)
print('Test loss:', score)

Test loss: 0.3995403140610922


In [58]:
from matplotlib import pyplot as plt

plt.figure(figsize=[8,6])
plt.plot(history.history['loss'],'r',linewidth=3.0)
plt.plot(history.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves',fontsize=16)

Text(0.5, 1.0, 'Loss Curves')

In [165]:
prediction = model.predict(x_test_tfidf)

In [166]:
prediction[:,0]

array([1.0000000e+00, 1.0000000e+00, 1.0000000e+00, ..., 1.0000000e+00,
       1.1753622e-13, 1.0000000e+00], dtype=float32)

In [167]:
y_test0[:,1].reshape(-1,1)
prediction[:,1].reshape(-1,1)

array([[8.569500e-21],
       [6.400106e-17],
       [9.678090e-12],
       ...,
       [1.843435e-20],
       [1.000000e+00],
       [9.754150e-11]], dtype=float32)

In [169]:
print(classification_report(y_test0[:,1], prediction[:,1] > 0.5))

              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98     43378
         1.0       0.82      0.66      0.73      4479

   micro avg       0.95      0.95      0.95     47857
   macro avg       0.89      0.82      0.85     47857
weighted avg       0.95      0.95      0.95     47857

