In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split   
import sklearn.metrics as metrics
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences

In [36]:
#paths required
datafolder_p = r'D:\18-DS\github\SDSHL\data\processed'
datafolder_e = r'D:\18-DS\github\SDSHL\data\external'
datafolder_i = r'D:\18-DS\github\SDSHL\data\internal'
modelfolder  = r'D:\18-DS\data\models'

file_train = datafolder_p + r'\2-train.csv'
file_test  = datafolder_p + r'\2-test.csv'
file_data  = datafolder_p + r'\2-Hinglish_Sarcasm_Clean.csv'
file_FE  = datafolder_p + r'\4-Hinglish_Sarcasm_Clean_FE.csv'

prediction={}
sent_size     = 100 #Max number of the words. If some sentence is more than this then that will be ignored.
embedding_dim = 100 

## <font color=red> CNN from Embedding Data</font>

In [None]:
#Embedding data cannot work on CNN models. Because CNN & RNN need tokens. 
#CNN generates weights and biases. Which are same as embedding, so embedding cannot be input
#In CNN input we must specify sentence length and dictionary length and token for each word.
#in Embedding we do don't have this.

## <font color=red> CNN from Lexical FE File </font>

In [37]:
df_train  = pd.read_csv(file_train, sep='\t', index_col="ID")
idx_train = df_train.index

df_val   = pd.read_csv(file_test, sep='\t', index_col="ID")
idx_val  = df_val.index

df  = pd.read_csv(file_FE, sep='\t', index_col="ID")
df_train = df.loc[idx_train]
df_val   = df.loc[idx_val]

X_train = df_train.drop('label', axis=1)
y_train = df_train['label']

X_val = df_val.drop('label', axis=1)
y_val = df_val['label']

# X_train = tokenizer.texts_to_sequences(df_train['sentence'])
# X_train = pad_sequences(X_train, padding='post', maxlen=sent_size)
# X_train = pd.DataFrame(X_train, index=idx_train)
# y_train = df_train['label']

# X_val = tokenizer.texts_to_sequences(df_val['sentence'])
# X_val = pad_sequences(X_val, padding='post', maxlen=sent_size)
# X_val = pd.DataFrame(X_val, index=idx_val)
# y_val = df_val['label']

vocab_size=20
embedding_dim=20
sent_size=20

## <font color=red> CNN from Original File </font>

In [38]:
df_train  = pd.read_csv(file_train, sep='\t', index_col="ID")
df_train = df_train[['sentence','label']]
idx_train = df_train.index

df_val   = pd.read_csv(file_test, sep='\t', index_col="ID")
df_val   = df_val[['sentence','label']]
idx_val  = df_val.index

df  = pd.read_csv(file_data, sep='\t', index_col="ID")
full_text = df['sentence']


tokenizer = Tokenizer(num_words=5000) #pickup only 5000 top words
tokenizer.fit_on_texts(full_text)

X_train = tokenizer.texts_to_sequences(df_train['sentence'])
X_train = pad_sequences(X_train, padding='post', maxlen=sent_size)
X_train = pd.DataFrame(X_train, index=idx_train)
y_train = df_train['label']

X_val = tokenizer.texts_to_sequences(df_val['sentence'])
X_val = pad_sequences(X_val, padding='post', maxlen=sent_size)
X_val = pd.DataFrame(X_val, index=idx_val)
y_val = df_val['label']

# tokenizer = Tokenizer(num_words=5000) #pickup only 5000 top words
# tokenizer.fit_on_texts(df['sentence'])

# df_train = tokenizer.texts_to_sequences(df_train['sentence'])
# df_train = pad_sequences(df_train, padding='post', maxlen=sent_size)
# df_train = pd.DataFrame(df_train, index=idx_train)

# df_val = tokenizer.texts_to_sequences(df_val['sentence'])
# df_val = pad_sequences(df_val, padding='post', maxlen=sent_size)
# df_val = pd.DataFrame(df_val, index=idx_val)

vocab_size=len(tokenizer.word_index) + 1 #+1 for padding

In [39]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((1800, 20), (1800,), (200, 20), (200,))

#### Token to Sequence (embedding)

In [None]:
# history = model.fit( X_train, y_train,
#                     epochs=10,
#                     validation_data=(X_val, y_val),
#                     batch_size=batch_size)

In [None]:
def generator(df, batch_size):
    print( 'batch size =', batch_size)
    
    while True:  
        df_size = len(df)
        
        num_batches = df_size//batch_size # calculate the number of batches
        remaining_records= df_size %batch_size
        i=0
        for j in range(num_batches):
            tempdf = df.iloc[i*batch_size: (i+1)*batch_size,:]
            print (j,'\n',tempdf.shape)
            yield tempdf.iloc[:,:100], tempdf['label']
 
        
        # write the code for the remaining data points which are left after full batches
        if remaining_records!=0:
            tempdf = df.iloc[(i+1)*batch_size:,:]
            print ('here',j,'\n',tempdf.shape)
            yield tempdf.iloc[:,:100], tempdf['label']

# Develop Model

### Model Architecture : CNN Architecture

In [40]:
embedding_dim, vocab_size, sent_size

(20, 9115, 20)

In [41]:
from keras.models import Sequential
from keras import layers


cnnmodel = Sequential()
cnnmodel.add(layers.Embedding(vocab_size, embedding_dim, input_length=sent_size))
cnnmodel.add(layers.Conv1D(128, 5, activation='relu'))
cnnmodel.add(layers.GlobalMaxPooling1D())
cnnmodel.add(layers.Dense(10, activation='relu'))
cnnmodel.add(layers.Dense(1, activation='sigmoid'))
cnnmodel.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

#Print summary of model
print(cnnmodel.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 20)            182300    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 16, 128)           12928     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 196,529
Trainable params: 196,529
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
batch_size=100

# history = model.fit_generator( train_generator,
#                               steps_per_epoch=training_steps_per_epoch,
#                               epochs=10,
#                               validation_data=val_generator )


history = cnnmodel.fit( X_train, y_train,
                    epochs=10,
                    validation_data=(X_val, y_val),
                    batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model Architecture : RNN Architecture

In [43]:
#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

embedding_dim = 100

rnnmodel=Sequential()

#embedding layer
rnnmodel.add(layers.Embedding(vocab_size, embedding_dim, input_length=sent_size))

#lstm layer
rnnmodel.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
rnnmodel.add(GlobalMaxPooling1D())

#Dense Layer
rnnmodel.add(Dense(64,activation='relu')) 
rnnmodel.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
rnnmodel.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)  
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(rnnmodel.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 100)           911500    
_________________________________________________________________
lstm (LSTM)                  (None, 20, 128)           117248    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 1,037,069
Trainable params: 1,037,069
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# #get train & test data generator. Only for large dataset size, which can not be loaded in memeory.
# batch_size = 200
# training_steps_per_epoch = round(len(X_train) / batch_size)

# train_generator = generator(df_train, batch_size)
# val_generator   = generator(df_val, batch_size)

In [44]:
batch_size=100

# history = model.fit_generator( train_generator,
#                               steps_per_epoch=training_steps_per_epoch,
#                               epochs=10,
#                               validation_data=val_generator )

history = rnnmodel.fit( X_train, y_train,
                    epochs=10,
                    validation_data=(X_val, y_val),
                    batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Performance of model on Validation Data

In [45]:
def print_metrics(y_pred):
    threshold=0.5
    roc = np.round( metrics.roc_auc_score(y_val,y_pred), 2)
    
    y_pred1=[]
    for i in y_pred :  

        if i>threshold:
            y_pred1.append(1)
        else:
            y_pred1.append(0)

    acc = np.round( metrics.accuracy_score(y_val,y_pred1), 2)
    recall = np.round( metrics.recall_score(y_val,y_pred1), 2)
    precision = np.round( metrics.precision_score(y_val,y_pred1), 2)
    f1 = np.round( metrics.f1_score(y_val,y_pred1), 2)

    print("Accuracy : ", acc )
    print("Recall   : ", recall )
    print("Precision: ", precision )
    print("F1       : ", f1 )
    print("ROC      : ", roc )

In [46]:
prediction={}
prediction["CNN"] = list(np.reshape( cnnmodel.predict(X_val),-1))
prediction["RNN"] = list(np.reshape( rnnmodel.predict(X_val),-1))

In [47]:
print_metrics(prediction['CNN'])

Accuracy :  0.68
Recall   :  0.62
Precision:  0.7
F1       :  0.66
ROC      :  0.73


In [48]:
print_metrics(prediction['RNN'])

Accuracy :  0.62
Recall   :  0.63
Precision:  0.62
F1       :  0.62
ROC      :  0.65


### Save CNN, RNN Predictions Results to compare the models.

In [49]:
df_prediction = pd.DataFrame(prediction, columns=["CNN","RNN"], index=idx_val)

df_prediction.to_csv(datafolder_p + r'\model_predictions_NN.csv')

In [50]:
df_prediction

Unnamed: 0_level_0,CNN,RNN
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
5212,0.569488,0.865231
8028,0.059986,0.749970
2364,0.019504,0.000362
5805,0.008435,0.396357
5236,0.011073,0.955523
...,...,...
7171,0.000628,0.000176
8819,0.265164,0.999915
2686,0.129757,0.000343
8692,0.607230,0.015694


In [51]:
print_metrics(prediction['CNN'])

Accuracy :  0.68
Recall   :  0.62
Precision:  0.7
F1       :  0.66
ROC      :  0.73
