In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score , precision_recall_fscore_support,accuracy_score,confusion_matrix

from keras.models import Model , load_model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback


Using TensorFlow backend.


In [2]:
def text_cleaning(comment):
    comment = comment.strip("b'").strip('b"') 
    comment = re.sub("\\[\\[(.*?)\\]\\]","",comment)        # Removing GIFs and images from comments
    comment = re.sub(r'\\x\S+',"",comment)  
    #   Replacing apostrophes
    comment = re.sub(r"'s",' is',comment)                   
    comment = re.sub(r"'re",' are',comment)
    comment = re.sub(r"'t",' not',comment)
    comment = re.sub(r"'m",' am',comment)
    comment = re.sub(r"'d",' would',comment)
    comment = re.sub(r"'ll",' will',comment)
    comment = re.sub(r"'ve",' have',comment)
    comment = re.sub(r'[0-9]',"",comment)                    # Removing numbers
    comment = re.sub(r'http\S+',"",comment)                  # Removing Url
    comment = re.sub(r'[\n]'," ",comment) 
    return comment.strip(" ")

In [3]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(GRU(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(3, activation="softmax")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [4]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [5]:
def clean_pred(file_path,col_name):
    data = pd.read_csv(file_path)
    data = data[data[col_name].notnull()]
    data = data[col_name].apply(lambda x : text_cleaning(x))
    list_sentences_data = data.values
    list_tokenized_data = tokenizer.texts_to_sequences(list_sentences_data)
    comments = sequence.pad_sequences(list_tokenized_data,maxlen=maxlen)
    return comments,data

In [6]:
def pred_df(X_test,df):
    y_test = model.predict([X_test], batch_size=1024, verbose=1)
    y_test = y_test.round(2)
    prediction = []
    for i in (y_test):
        prediction.append(np.argmax(i,axis=0))
    df = pd.DataFrame(df)
    df['prediction'] = prediction
    df = df[df['comment_message'].apply(lambda x:x != "")]
    return df

In [7]:
max_features = 20000 # vocabulary size , # of unique words to be used.
maxlen = 100         # maximum number of words to be used for each comment

train = pd.read_csv("mergedDataSet.csv",usecols=['comment_text','merged_rating'])
train = train.sample(frac=1,random_state=42)
# test = pd.read_csv("Test_Data.csv")

list_sentences_train = train['comment_text'].apply(lambda x: text_cleaning(x))
list_sentences_train = list_sentences_train.fillna("None").values

list_classes = ['merged_rating']
y = train[list_classes].values

one_hot_encoder = OneHotEncoder(sparse=False)
y = one_hot_encoder.fit_transform(y)


tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
# X_test = sequence.pad_sequences(list_tokenized_test,maxlen=maxlen)


In [8]:
[X_train, X_val, y_train, y_val] = train_test_split(X_t, y, train_size=0.75, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)



In [128]:
model1 = get_model()
batch_size = 32
epochs = 2
model1.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)

Train on 120460 samples, validate on 40154 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.965943 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.964809 



<keras.callbacks.History at 0x206514af518>

In [9]:
model = load_model('GRU-2Epochs-Cleaned.h5')

In [10]:
sports,sports_df = clean_pred('sports_data.csv','comment_message')

In [11]:
sports_pred = pred_df(sports,sports_df)



In [135]:
y_test = model.predict([X_val], batch_size=32, verbose=1)
y_test = y_test.round(2)
prediction = []
for i in (y_test):
    prediction.append(np.argmax(i,axis=0))
y_pred = prediction



In [139]:
true = []
for i in (y_val):
    true.append(np.argmax(i,axis=0))
y_true = true

In [152]:
zz = precision_recall_fscore_support(y_true,y_pred,average='weighted')

In [153]:
zz

(0.94146941914887883, 0.94807491159037705, 0.94390464803365037, None)

In [155]:
accuracy_score(y_true,y_pred)

0.94807491159037705

In [172]:
y_true_series = pd.Series(y_true)
y_true_series.value_counts()

0    36007
2     2198
1     1949
dtype: int64

In [157]:
confusion_matrix(y_true,y_pred)

array([[35555,   384,    68],
       [  898,   727,   324],
       [  193,   218,  1787]], dtype=int64)