In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split   
import sklearn.metrics as metrics
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences

In [27]:
#paths required
datafolder_p = r'D:\18-DS\github\SDSHL\data\processed'
datafolder_e = r'D:\18-DS\github\SDSHL\data\external'
datafolder_i = r'D:\18-DS\github\SDSHL\data\internal'
modelfolder  = r'D:\18-DS\data\models'
resultsfolder = r'D:\18-DS\github\SDSHL\data\results'

file_train = datafolder_p + r'\2-train.csv'
file_test  = datafolder_p + r'\2-test.csv'
file_data  = datafolder_p + r'\2-Hinglish_Sarcasm_Clean.csv'
file_FE  = datafolder_p + r'\4-Hinglish_Sarcasm_Clean_FE.csv'

modelfolder_ft       = r'D:\18-DS\data\models\fasttext_wiki.hi'
modelfolder_ft_ind   = r'D:\18-DS\data\models\fasttext_indicnlp.hi'

prediction={}

## <font color=red> Load "2-Hinglish_Sarcasm_Clean.csv" File & Train-Test Split</font>

In [28]:
df=pd.read_csv(file_data, sep='\t', index_col="ID")
sent_size = max([len(s.split()) for s in df.sentence])
sent_size

119

In [29]:
df_train  = pd.read_csv(file_train, sep='\t', index_col="ID")
df_train = df_train[['sentence','label']]
idx_train = df_train.index

df_val   = pd.read_csv(file_test, sep='\t', index_col="ID")
df_val   = df_val[['sentence','label']]
idx_val  = df_val.index

df  = pd.read_csv(file_data, sep='\t', index_col="ID")
full_text = df['sentence']


tokenizer = Tokenizer(num_words=5000) #pickup only 5000 top words
tokenizer.fit_on_texts(full_text)

X_train = tokenizer.texts_to_sequences(df_train['sentence'])
X_train = pad_sequences(X_train, padding='post', maxlen=sent_size)
X_train = pd.DataFrame(X_train, index=idx_train)
y_train = df_train['label']

X_val = tokenizer.texts_to_sequences(df_val['sentence'])
X_val = pad_sequences(X_val, padding='post', maxlen=sent_size)
X_val = pd.DataFrame(X_val, index=idx_val)
y_val = df_val['label']

## <font color=red> CNN from Original File </font>

In [30]:
tokenizer = Tokenizer(num_words=5000) #pickup only 5000 top words
tokenizer.fit_on_texts(df['sentence'])

df_train = tokenizer.texts_to_sequences(df_train['sentence'])
df_train = pad_sequences(df_train, padding='post', maxlen=sent_size)
df_train = pd.DataFrame(df_train, index=idx_train)

df_val = tokenizer.texts_to_sequences(df_val['sentence'])
df_val = pad_sequences(df_val, padding='post', maxlen=sent_size)
df_val = pd.DataFrame(df_val, index=idx_val)

vocab_size=len(tokenizer.word_index) + 1 #+1 for padding

embedding_dim = 200 

In [31]:
vocab_size

9156

In [32]:
 X_train.shape, y_train.shape, X_val.shape, y_val.shape

((1800, 119), (1800,), (200, 119), (200,))

#### Token to Sequence (embedding)

In [None]:
# history = model.fit( X_train, y_train,
#                     epochs=10,
#                     validation_data=(X_val, y_val),
#                     batch_size=batch_size)

In [None]:
def generator(df, batch_size):
    print( 'batch size =', batch_size)
    
    while True:  
        df_size = len(df)
        
        num_batches = df_size//batch_size # calculate the number of batches
        remaining_records= df_size %batch_size
        i=0
        for j in range(num_batches):
            tempdf = df.iloc[i*batch_size: (i+1)*batch_size,:]
            print (j,'\n',tempdf.shape)
            yield tempdf.iloc[:,:100], tempdf['label']
 
        
        # write the code for the remaining data points which are left after full batches
        if remaining_records!=0:
            tempdf = df.iloc[(i+1)*batch_size:,:]
            print ('here',j,'\n',tempdf.shape)
            yield tempdf.iloc[:,:100], tempdf['label']

# Develop Model

### Model Architecture : CNN Architecture

In [33]:
vocab_size, embedding_dim, sent_size,  X_train.shape, y_train.shape, X_val.shape, y_val.shape

(9156, 200, 119, (1800, 119), (1800,), (200, 119), (200,))

In [34]:
from keras.models import Sequential
from keras import layers


cnnmodel = Sequential()
cnnmodel.add(layers.Embedding(vocab_size, embedding_dim, input_length=sent_size))
cnnmodel.add(layers.Conv1D(128, 5, activation='relu'))
cnnmodel.add(layers.GlobalMaxPooling1D())
cnnmodel.add(layers.Dense(10, activation='relu'))
cnnmodel.add(layers.Dense(1, activation='sigmoid'))
cnnmodel.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

#Print summary of model
print(cnnmodel.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 119, 200)          1831200   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 115, 128)          128128    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 11        
Total params: 1,960,629
Trainable params: 1,960,629
Non-trainable params: 0
_________________________________________________________________
None


In [35]:
batch_size=100

# history = model.fit_generator( train_generator,
#                               steps_per_epoch=training_steps_per_epoch,
#                               epochs=10,
#                               validation_data=val_generator )


history = cnnmodel.fit( X_train, y_train,
                    epochs=10,
                    validation_data=(X_val, y_val),
                    batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model Architecture : RNN Architecture

In [36]:
#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

rnnmodel=Sequential()

#embedding layer
rnnmodel.add(layers.Embedding(vocab_size, embedding_dim, input_length=sent_size))

#lstm layer
rnnmodel.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
rnnmodel.add(GlobalMaxPooling1D())

#Dense Layer
rnnmodel.add(Dense(64,activation='relu')) 
rnnmodel.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
rnnmodel.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)  
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(rnnmodel.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 119, 200)          1831200   
_________________________________________________________________
lstm_1 (LSTM)                (None, 119, 128)          168448    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 2,007,969
Trainable params: 2,007,969
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# #get train & test data generator. Only for large dataset size, which can not be loaded in memeory.
# batch_size = 200
# training_steps_per_epoch = round(len(X_train) / batch_size)

# train_generator = generator(df_train, batch_size)
# val_generator   = generator(df_val, batch_size)

In [37]:
batch_size=100

# history = model.fit_generator( train_generator,
#                               steps_per_epoch=training_steps_per_epoch,
#                               epochs=10,
#                               validation_data=val_generator )

history = rnnmodel.fit( X_train, y_train,
                    epochs=10,
                    validation_data=(X_val, y_val),
                    batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Performance of model on Validation Data

In [38]:
def print_metrics(y_pred):
    threshold=0.5
    roc = np.round( metrics.roc_auc_score(y_val,y_pred), 2)
    
    y_pred1=[]
    for i in y_pred :  

        if i>threshold:
            y_pred1.append(1)
        else:
            y_pred1.append(0)

    acc = np.round( metrics.accuracy_score(y_val,y_pred1), 2)
    recall = np.round( metrics.recall_score(y_val,y_pred1), 2)
    precision = np.round( metrics.precision_score(y_val,y_pred1), 2)
    f1 = np.round( metrics.f1_score(y_val,y_pred1), 2)

    print("\nAccuracy : ", acc )
    print("Recall   : ", recall )
    print("Precision: ", precision )
    print("F1       : ", f1 )
    print("ROC      : ", roc )

In [39]:
prediction["CNN"] = list(np.reshape( cnnmodel.predict(X_val),-1))
prediction["RNN"] = list(np.reshape( rnnmodel.predict(X_val),-1))

In [40]:
print_metrics(prediction['CNN'])


Accuracy :  0.68
Recall   :  0.65
Precision:  0.7
F1       :  0.67
ROC      :  0.74


In [41]:
print_metrics(prediction['RNN'])


Accuracy :  0.68
Recall   :  0.7
Precision:  0.67
F1       :  0.68
ROC      :  0.74


# Embedding Transfer & CNN

In [None]:
# # turn a doc into clean tokens
# def clean_doc(doc, vocab):
#     # split into tokens by white space
#     tokens = doc.split()
#     # remove punctuation from each token
#     table = str.maketrans('', '', punctuation)
#     tokens = [w.translate(table) for w in tokens]
#     # filter out tokens not in vocab
#     tokens = [w for w in tokens if w in vocab]
#     tokens = ' '.join(tokens)
#     return tokens

# # load all docs in a directory
# def process_docs(directory, vocab, is_trian):
#     documents = list()
#     # walk through all files in the folder
#     for filename in listdir(directory):
#         # skip any reviews in the test set
#         if is_trian and filename.startswith('cv9'):
#             continue
#         if not is_trian and not filename.startswith('cv9'):
#             continue
#         # create the full path of the file to open
#         path = directory + '/' + filename
#         # load the doc
#         doc = load_doc(path)
#         # clean doc
#         tokens = clean_doc(doc, vocab)
#         # add to list
#         documents.append(tokens)
#     return documents

In [42]:
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
dim=300

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r', encoding="utf8")
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

def load_embedding(filename):
    # load embedding into memory, skip first line
    df = pd.read_csv(filename, encoding="utf8", sep=" ")
    embedding = dict()
    for record in range(len(df)):
        embedding[ df.iloc[record,0] ] = df.iloc[record,1:].to_numpy()
    return embedding


# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):   
    
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, dim))

    # step vocab, store vectors using the Tokenizer's integer mapping
    i=0
    for word in vocab:
        try:
            vector = embedding[word]
        except:
            vector = np.zeros(dim)

        if vector is not None:
            weight_matrix[i] = vector.reshape(1,-1)
        i+=1
    return weight_matrix

## <font color=red> CNN with ft_Wiki300 Finetuned Word Vector Embedding </font>

In [43]:
# load the vocabulary
vocab_filename = modelfolder_ft + '\wiki300_finetuned_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

filename=modelfolder_ft+ '\wiki300_finetuned.vec'
raw_embedding = load_embedding(filename)

# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

# create the embedding layer
embedding_layer = Embedding(vocab_size, dim, weights=[embedding_vectors], input_length=sent_size, trainable=False)

# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))

# model.add(MaxPooling1D(pool_size=2))
# model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X_train, y_train, epochs=10, verbose=2)


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 119, 300)          2746800   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 115, 128)          192128    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 11        
Total params: 2,940,229
Trainable params: 193,429
Non-trainable params: 2,746,800
_________________________________________________________________
None
Epoch 1/10
57/57 - 3s - loss: 0.6755 - accuracy: 0.5600
Epoch 2/10
57/57 - 3s - loss: 0.5454 - accu

<tensorflow.python.keras.callbacks.History at 0x231ca2e2be0>

In [44]:
# evaluate
loss, acc = model.evaluate(X_val, y_val, verbose=0)
print('Test Accuracy: %f' % (acc*100))

prediction["CNN_Wiki300"] = list(np.reshape( model.predict(X_val),-1))

print_metrics(prediction['CNN_Wiki300'])

Test Accuracy: 64.999998

Accuracy :  0.65
Recall   :  0.74
Precision:  0.63
F1       :  0.68
ROC      :  0.74


## <font color=red> IndicFT300 Finedtuned Vector </font>

In [45]:
# load the vocabulary
vocab_filename = modelfolder_ft_ind + '\indicnlpi300_finetuned_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

filename=modelfolder_ft_ind+ '\\indicnlp300_finetuned.vec'
raw_embedding = load_embedding(filename)

# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)

# create the embedding layer
embedding_layer = Embedding(vocab_size, dim, weights=[embedding_vectors], input_length=sent_size, trainable=False)

# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))

# model.add(MaxPooling1D(pool_size=2))
# model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X_train, y_train, epochs=10, verbose=2)


Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 119, 300)          2746800   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 115, 128)          192128    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 2,940,229
Trainable params: 193,429
Non-trainable params: 2,746,800
_________________________________________________________________
None
Epoch 1/10
57/57 - 3s - loss: 0.6879 - accuracy: 0.5606
Epoch 2/10
57/57 - 3s - loss: 0.6765 - accu

<tensorflow.python.keras.callbacks.History at 0x231d6134d00>

In [46]:
# evaluate
loss, acc = model.evaluate(X_val, y_val, verbose=0)
print('Test Accuracy: %f' % (acc*100))

prediction["CNN_IndicFT300"] = list(np.reshape( model.predict(X_val),-1))

print_metrics(prediction['CNN_IndicFT300'])

Test Accuracy: 66.000003

Accuracy :  0.66
Recall   :  0.65
Precision:  0.66
F1       :  0.66
ROC      :  0.71


### Save CNN, RNN Predictions Results to compare the models.

In [47]:
df_prediction = pd.DataFrame(prediction, columns=["CNN","RNN","CNN_Wiki300", "CNN_IndicFT300"], index=idx_val)
df_prediction.to_csv(resultsfolder + r'\model_predictions_NN.csv')

In [48]:
df_prediction

Unnamed: 0_level_0,CNN,RNN,CNN_Wiki300,CNN_IndicFT300
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5212,0.819261,0.943548,0.920691,0.364997
8028,0.930264,0.941898,0.755416,0.814725
2364,0.000632,0.020987,0.608887,0.390835
5805,0.002664,0.908897,0.897375,0.103974
5236,0.002150,0.029380,0.208650,0.414263
...,...,...,...,...
7171,0.000079,0.085806,0.005806,0.107279
8819,0.238204,0.974613,0.579415,0.884838
2686,0.108273,0.048052,0.826889,0.318257
8692,0.151846,0.030068,0.567009,0.723224


In [49]:
print_metrics(prediction['RNN'])
print_metrics(prediction['CNN_Wiki300'])
print_metrics(prediction['CNN_IndicFT300'])
print_metrics(prediction['CNN'])



Accuracy :  0.68
Recall   :  0.7
Precision:  0.67
F1       :  0.68
ROC      :  0.74

Accuracy :  0.65
Recall   :  0.74
Precision:  0.63
F1       :  0.68
ROC      :  0.74

Accuracy :  0.66
Recall   :  0.65
Precision:  0.66
F1       :  0.66
ROC      :  0.71

Accuracy :  0.68
Recall   :  0.65
Precision:  0.7
F1       :  0.67
ROC      :  0.74
