In [None]:
import os
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import  PIL
from PIL import Image

import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm_notebook
from sklearn.metrics import confusion_matrix
import pylab as pl

In [None]:
# Load the training set; later cells read its `question_text` and `target` columns.
data = pd.read_csv(filepath_or_buffer='train.csv')
print(data.shape)  # (rows, columns)
data.head()        # last expression: rendered preview of the first rows

In [None]:
# Count missing values per column.
# (Author's note from the original run: no NA observations were found.)
data.isna().sum()

In [None]:
# Count fully duplicated rows (every occurrence after the first counts as a duplicate).
# (Author's note from the original run: no duplicated observations were found.)
data.duplicated(keep='first').sum()

In [None]:
# Class sizes: target == 0 is reported as sincere, target == 1 as insincere.
print(len(data.loc[data['target'] == 0, 'question_text']), 'sincere questions')
print(len(data.loc[data['target'] == 1, 'question_text']), 'insincere questions')

In [None]:
# Bar chart of how many rows carry each `target` value; also written to countplot.png.
sns.set(style="darkgrid")
plt.figure(figsize=(7, 6))
sns.countplot(data=data, x="target")
plt.gcf().savefig("countplot.png")  # the current figure is the one created above

In [None]:
stopwords = set(STOPWORDS)

def generate_wordcloud(data, title = None):
    """Draw a word cloud of the most frequent words in ``data``.

    Parameters
    ----------
    data : str or iterable
        Either one text, or an iterable of texts (for example a pandas
        Series of questions). Items of an iterable are converted with
        ``str`` and joined with single spaces.
    title : str, optional
        When given, it is shown as the figure's super-title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Join the real texts. str(<Series>) would hand WordCloud pandas' display
    # repr instead: only the first/last rows, a "..." marker and the
    # "Name: ..., dtype: ..." footer.
    if isinstance(data, str):
        text = data
    else:
        text = ' '.join(str(item) for item in data)

    wordcloud = WordCloud(
        background_color = 'white',
        # Built from wordcloud's STOPWORDS directly: the module-level name
        # `stopwords` is rebound by `from nltk.corpus import stopwords` in a
        # later cell, so relying on it would break on re-execution.
        stopwords = set(STOPWORDS),
        max_words = 80,
        max_font_size = 50,
        scale = 3,
        random_state = 1 # fixed seed so the layout is reproducible
    ).generate(text)

    # A fresh figure per call, so consecutive calls do not draw over each other.
    fig = plt.figure(figsize = (12, 10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud, interpolation="bilinear")
    plt.show()

generate_wordcloud(data.question_text[data['target'] == 0])
generate_wordcloud(data.question_text[data['target'] == 1])

In [None]:
def remove_html(x):
    """Replace every HTML-like tag in ``x`` with a single space.

    A tag is the shortest run that starts at '<' and ends at the next '>'
    on the same line.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', x)

def remove_numbers(x):
    """Delete every run of ASCII digits from ``x`` (nothing is put in their place)."""
    return re.compile(r'[0-9]+').sub('', x)

def remove_punct(x):
    """Strip punctuation from ``x``.

    The characters ``? ! ' " # $ * - + = |`` are deleted outright; the
    characters ``. , ) ( \\ /`` are each replaced by a single space so the
    words around them stay separated.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Inside [...] a '|' is a literal pipe, not alternation, and an unescaped
    # '-' between two characters forms a range. The hyphen is therefore
    # escaped so it is actually removed; '|' is listed once because the
    # previous pattern removed it as well.
    x = re.sub(r'[?!\'"#$*\-+=|]', r'', x)
    # '\\\\' in the pattern source below is one literal backslash ('\|' only
    # ever matched a pipe).
    x = re.sub(r'[.,)(\\/]', r' ', x)
    return x

# Misspelling / contraction correction table.
# Each key is a literal piece of text to look for and each value is the text
# that replaces it. The entries cover English contractions ("can't" ->
# "cannot"), British spellings ('colour' -> 'color') and typos or run-together
# words ('howdo' -> 'how do'). _get_mispell compiles the keys into a regex
# without any flags, so matching is case-sensitive and not limited to whole
# words.
# NOTE(review): the key 'citicise' looks like it may itself be a typo for
# 'criticise' — confirm against the data before changing it.
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": 
"we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Lookup table and compiled pattern, built once from mispell_dict.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Return ``text`` with each match of ``mispellings_re`` replaced by its ``mispellings`` entry."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [None]:
# Text preprocessing: builds two views of every question.
#   ml_x   - cleaned sentence strings (used later for the TF-IDF features)
#   mydata - DataFrame with a list of stemmed, stop-word-free tokens per question
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# Kept as a list because it is also passed to CountVectorizer(stop_words=...) later on.
stop_words = list(stopwords.words('english'))
# Set copy for the per-word membership test in the loop below (constant-time lookups).
stop_word_set = set(stop_words)

temp = []
ml_x = []
snow = nltk.stem.SnowballStemmer('english') # reduces each word to its stem
for sentence in data.question_text:
    sentence = sentence.lower() # convert to lowercase
    sentence = remove_html(sentence) # remove HTML tags
    # Expand contractions / fix spellings BEFORE digits and apostrophes are
    # stripped: keys such as "don't" or '2k17' can no longer match once
    # remove_numbers / remove_punct have run.
    # Because the text is already lowercased, keys containing capitals
    # ("I'd", 'Qoura', ...) cannot match here; their lowercase variants do.
    sentence = replace_typical_misspell(sentence) # clean spellings
    sentence = remove_numbers(sentence) # remove numbers
    sentence = remove_punct(sentence) # remove punctuation

    ml_x.append(sentence)

    # stemming and removing stop words
    words = [snow.stem(word) for word in sentence.split() if word not in stop_word_set]
    temp.append(words)
X = temp

mydata = pd.DataFrame({'text': X, 'target': data.target})

In [None]:
# Hold out 20% of all rows as the test set, then 20% of the remainder as the
# validation set. Both splits are made repeatable by random_state; the
# random.seed calls only reseed Python's own `random` module.
random.seed(1)
train_1, test = train_test_split(mydata, random_state=1, test_size=0.2)
random.seed(2)
train, val = train_test_split(train_1, random_state=1, test_size=0.2)

In [None]:
# Tokenization: turn each token list into a fixed-length sequence of integer ids.
n_words = 50000  # vocabulary cap for the tokenizer; also the number of rows of the embedding matrix
max_len = 100    # every question is padded / truncated to this many tokens

# The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=n_words)
tokenizer.fit_on_texts(train.text.tolist())

# Same encode-then-pad treatment for each split.
x_train, x_val, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split.text), maxlen=max_len)
    for split in (train, val, test)
)

y_train, y_val, y_test = train.target, val.target, test.target

print('train.shape =', train.shape)
print('val.shape =', val.shape)
print('test.shape =', test.shape)

In [None]:
# GloVe embeddings: every line is split on single spaces; the first field is
# the token and the remaining fields are parsed as its float32 vector.
glove_dic = {}
# The context manager closes the file once the loop finishes (or fails).
with open("glove.840B.300d.txt", encoding="utf8") as emb_file:
    for line in tqdm_notebook(emb_file):
        fields = line.split(" ")
        glove_dic[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Embedding matrix: row i holds the GloVe vector of the token whose tokenizer
# id is i. Rows stay all-zero for ids without a GloVe entry, and ids at or
# beyond n_words are skipped.
word_index = tokenizer.word_index
emb_size = glove_dic['.'].shape[0]  # vector length, read off the entry for '.'
emb_matrix = np.zeros((n_words, emb_size))
for token, row in word_index.items():
    if row < n_words and token in glove_dic:
        emb_matrix[row, :] = glove_dic[token]

In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D

from keras.models import Sequential, Model
from keras import backend as K
from keras.optimizers import Adam

In [None]:
def build_model(emb_matrix, n_words, emb_size):
    """Assemble and compile the stacked BiLSTM -> BiGRU binary classifier.

    Parameters
    ----------
    emb_matrix : array
        Initial weights handed to the Embedding layer.
    n_words : int
        Embedding input dimension (vocabulary size).
    emb_size : int
        Embedding output dimension.

    Returns
    -------
    Model
        Compiled with the adam optimizer, binary cross-entropy loss and an
        accuracy metric. It takes sequences of length ``max_len`` (read from
        module scope) and outputs one sigmoid unit.
    """
    tokens_in = Input(shape=(max_len,))
    embedded = Embedding(n_words, emb_size, weights=[emb_matrix])(tokens_in)
    embedded = SpatialDropout1D(0.2)(embedded)

    # The GRU reads the LSTM's full output sequence, not the raw embeddings.
    lstm_seq = Bidirectional(LSTM(128, return_sequences=True))(embedded)
    gru_seq = Bidirectional(GRU(128, return_sequences=True))(lstm_seq)

    # Max-pool each recurrent layer over the sequence and join the two summaries.
    pooled_lstm = GlobalMaxPooling1D()(lstm_seq)
    pooled_gru = GlobalMaxPooling1D()(gru_seq)
    merged = Concatenate()([pooled_lstm, pooled_gru])

    hidden = Dense(64, activation = 'relu')(merged)
    probability = Dense(1, activation = 'sigmoid')(hidden)

    model = Model(inputs = tokens_in, outputs = probability)
    model.compile(optimizer = "adam", loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

In [None]:
# Training configuration; batch_size is reused by the prediction cells below.
batch_size = 512
num_epoch = 3

model = build_model(emb_matrix, n_words, emb_size)
# Validation metrics are computed on (x_val, y_val) after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    validation_data=(x_val, y_val),
    verbose=1,
)

In [None]:
# Model outputs (sigmoid activations) for the held-out test sequences.
test_pred = model.predict(x_test, verbose=1, batch_size=batch_size)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
# `test_pred` already holds the model outputs for x_test from the preceding
# cell; predicting again here would only repeat the same inference pass.
# Outputs above 0.5 are labelled 1, everything else 0.
y_pred = np.where(test_pred > 0.5, 1, 0)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

print("Classification Report", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Heat-map view of the confusion matrix `cm`; matshow opens a new figure.
pl.matshow(cm)
pl.gca().set_title('Confusion matrix of LSTM classifier')
pl.colorbar()
pl.show()

In [None]:
def accuracy(confusion_matrix):
    """Return the sum of the diagonal of ``confusion_matrix`` divided by the sum of all its entries.

    ``confusion_matrix`` must provide ``.trace()`` and ``.sum()`` (for
    example a NumPy array).
    """
    return confusion_matrix.trace() / confusion_matrix.sum()

# Accuracy derived from the confusion matrix `cm`.
test_acc = accuracy(confusion_matrix=cm)
print("test accuracy =", test_acc)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  

# Second dataset: the cleaned (unstemmed) sentence strings with their labels.
newdata = pd.DataFrame({'text': ml_x, 'target': data.target})

# split train, test (repeatable through random_state)
random.seed(1)
train_2, test_2 = train_test_split(newdata, test_size = 0.2, random_state = 1)

vectorizer = CountVectorizer(max_features=500, min_df=2, max_df=0.7, stop_words=stop_words)
tfidfconverter = TfidfTransformer()

# Keep the term counts sparse: TfidfTransformer accepts sparse input, so
# converting the counts to a dense array first only costs memory. The TF-IDF
# result is still turned into a dense array because later cells feed it to
# MinMaxScaler.
train_counts = vectorizer.fit_transform(train_2.text)
train_x = tfidfconverter.fit_transform(train_counts).toarray()

# The test split reuses the vocabulary and IDF weights fitted on the training split.
test_counts = vectorizer.transform(test_2.text)
test_x = tfidfconverter.transform(test_counts).toarray()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: the scaler is fitted on the training features only and
# then applied unchanged to both splits.
scaler = MinMaxScaler().fit(train_x)
train_x_scale = scaler.transform(train_x)
test_x_scale = scaler.transform(test_x)

train_y, test_y = train_2.target, test_2.target

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Gradient-boosted trees on the scaled TF-IDF features.
# NOTE(review): per the scikit-learn docs, validation_fraction only takes
# effect when n_iter_no_change is set — confirm whether early stopping was intended.
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=1e-3,
    max_features=10,
    max_depth=2,
    random_state=0,
    validation_fraction=0.2,
)
gb.fit(train_x_scale, train_y)
predictions = gb.predict(test_x_scale)

cm_2 = confusion_matrix(test_y, predictions)
print("Confusion Matrix:", cm_2, sep="\n")
print("Classification Report", classification_report(test_y, predictions), sep="\n")

In [None]:
# Heat-map view of the confusion matrix `cm_2`; matshow opens a new figure.
pl.matshow(cm_2)
pl.gca().set_title('Confusion matrix of GB classifier')
pl.colorbar()
pl.show()

In [None]:
# Accuracy derived from the confusion matrix `cm_2`.
test_acc_2 = accuracy(confusion_matrix=cm_2)
print("test accuracy =", test_acc_2)