<a href="https://colab.research.google.com/github/chendingyan/NLP490H/blob/master/NLP_OffensEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialize Notebook
### The output file and dataset will be stored in the google drive



In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm 
import codecs
import random
import matplotlib.pyplot as plt
import pandas as pd



SEED = 234
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

train_path = 'drive/My Drive/data/OffensEval_task_data/offenseval-training-v1.tsv'
testA_path ='drive/My Drive/data/OffensEval_task_data/testset-taska.tsv'
testB_path = 'drive/My Drive/data/OffensEval_task_data/testset-taskb.tsv'
testC_path ='drive/My Drive/data/OffensEval_task_data/test_set_taskc.tsv'

print('loading glove6B word vector')
glove_path = 'drive/My Drive/data/glove.6B.300d.txt'
embeddings_index = {}
f = open(glove_path, encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))




loading glove6B word vector
Found 400000 word vectors.


## Data preprocessing
1. Remove stop word
2. convert to lower case and split


In [55]:
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download('wordnet')
stop = stopwords.words("english")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


def preprocess(df):
    print('-------Remove Stop Word--------')
    stopword_set = set(stopwords.words("english"))

    # convert to lower case and split 
    df.tweet = df.tweet.apply(lambda x: ' '.join([word.lower() for word in x.split() if word not in stopword_set]))

    # keep only words
    pat1 = r'@[A-Za-z0-9]+'
    pat2 =r'[^a-zA-Z\s]'
    pat3 =r"\bURL\b"
    combined_pat = r'|'.join((pat1, pat2,pat3))
    regex_pat = re.compile(combined_pat, flags=re.IGNORECASE)
    df.tweet = df.tweet.str.replace(regex_pat, '')

    # join the cleaned words in a list
    df.tweet.str.join("")
    return df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Classical classifier
1. Logistic Regression
2. Naive Bayes
3. SVM
4. Xgboost
5. Random Forest

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import bsr_matrix
from sklearn import svm

import numpy as np
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB


class Bag_of_word(object):
    
    def __init__(self,classifier,vectorizer, max_feature = 10000):
        if vectorizer =='tf_idf':
            self.vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
                                     stop_words = None, max_features = max_feature)
        else:
            self.vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
                                 stop_words = None, max_features = max_feature)
        self.model = classifier
        self.clf = None
        
    def train_clf(self, train_data, lab_data, C = 1.0):
        train_data_features = self.vectorizer.fit_transform(train_data)
        train_data_features = bsr_matrix(train_data_features)
        print('training data shape', train_data_features.shape)
        
        if self.model == 'logistic':
            print("Training the logistic regression...")
            self.clf = LogisticRegression(solver='lbfgs', penalty='l2', dual=False, tol=0.0001, C=C, fit_intercept=True,
                                         intercept_scaling=1.0, class_weight=None, random_state=None)
            self.clf = self.clf.fit(train_data_features, lab_data)
        
        elif self.model == 'naive_bayes':
            print("Training the Naive Bayes...")
            self.clf = MultinomialNB()
            self.clf = self.clf.fit(train_data_features, lab_data)
            
        elif self.model == 'svm':
            self.clf = svm.LinearSVC()
            self.clf = self.clf.fit(train_data_features, lab_data)
        
        elif self.model == 'xgboost':
            self.clf = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                            subsample=0.8, nthread=10, learning_rate=0.1)
            self.clf = self.clf.fit(train_data_features, lab_data)

        elif self.model =='random_forest':
            self.clf = RandomForestClassifier(n_jobs=-1)
            self.clf.fit(train_data_features, lab_data)

    def test_clf(self, test_data):
        test_data_features = self.vectorizer.transform(test_data)
        test_data_features = bsr_matrix(test_data_features)
    
        result = self.clf.predict(test_data_features)
        return result
    
    def validate_clf(self, train_data, lab_data, C = 1.0):
        train_data_features = self.vectorizer.fit_transform(train_data)
        train_data_features = bsr_matrix(train_data_features)
        lab_data = np.array(lab_data)
        
        print("start k-fold validate...")
        if self.model == 'logistic':
            clf = LogisticRegression(solver='lbfgs', penalty='l2', dual=False, tol=0.0001, C=C, fit_intercept=True,
                                         intercept_scaling=1.0, class_weight=None, random_state=None)
            cv = cross_val_score(clf, train_data_features, lab_data, cv=10, scoring='accuracy')
        
        elif self.model == 'naive_bayes':
            clf = MultinomialNB()
            cv = cross_val_score(clf, train_data_features, lab_data, cv=10, scoring='accuracy')
            
        elif self.model == 'svm':
            clf = svm.LinearSVC()
            cv = cross_val_score(clf, train_data_features, lab_data, cv=10, scoring='accuracy')
        
        elif self.model == 'xgboost':
            clf = XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                            subsample=0.8, nthread=10, learning_rate=0.1)
            cv = cross_val_score(clf, train_data_features, lab_data, cv=10, scoring='accuracy')

        elif self.model =='random_forest':
            clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
            cv = cross_val_score(clf, train_data_features, lab_data, cv=10, scoring='accuracy')
        return np.mean(cv)

### Tune the classsifier by Hyperparameter Grid Search

In [0]:
def svm_tune(x_train_tfv, y_train_a):
    svd = TruncatedSVD()

    # Standard Scaler
    scl = preprocessing.StandardScaler()

    svc = SVC()
    # SVM
    clf = pipeline.Pipeline([('svd',svd), ('scl',scl), ('svm',svc)])

    param_grid = {'svd__n_components':[120,180],
                 'svm__C':[1, 10],
                  'svm__kernel':('linear', 'rbf'),
                  'svm__gamma':['auto','scale']
                 }
    model = GridSearchCV(cv = 5, estimator=clf, param_grid=param_grid, scoring='accuracy',
                                         verbose=10, n_jobs=-1, iid=True, refit=True)
    model.fit(x_train_tfv, y_train_a) 

    print("Best score while tuning: %0.3f" % model.best_score_)

    return param_grid, model


#Logistic Regression
def logistic_tune(x_train_tfv, y_train_a):
    lr_model = LogisticRegression()

    # pipeline 
    clf = pipeline.Pipeline([('lr', lr_model)])
    param_grid = {
                         'lr__C': [0.1, 1.0, 10], 
                        'lr__penalty': ['l1', 'l2']}

    model = GridSearchCV(cv = 5, estimator=clf, param_grid=param_grid, scoring='accuracy',
                                     verbose=10, n_jobs=-1, iid=True, refit=True)
    model.fit(x_train_tfv, y_train_a) 

    print("Best score while tuning: %0.3f" % model.best_score_)
    
    return param_grid, model

# Naive Bayes
def naive_bayes_tune(x_train_tfv, y_train_a):
    nb_model = MultinomialNB()

    clf = pipeline.Pipeline([('nb', nb_model)])

    param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

    # （Grid Search Model）
    model = GridSearchCV(cv = 5,estimator=clf, param_grid=param_grid, scoring='accuracy',
                                     verbose=10, n_jobs=-1, iid=True, refit=True,return_train_score=True)

    model.fit(x_train_tfv, y_train_a) 
    print("Best score while tuning: %0.3f" % model.best_score_)
    
    return param_grid, model

# Xgboost
def xgboost_tune(x_train_tfv, y_train_a):
    
    xgb_model = XGBClassifier()
    clf = pipeline.Pipeline([('xgb', xgb_model)])
    param_grid = {'learning_rate' :[0.1,0.01,0.001],
                        'n_estimators':[200,400,600,800,1000],
                        'max_depth':[3,5,7],
                        'min_child_weight': [1,3,5]}
    
    model = GridSearchCV(cv = 5, estimator=clf, param_grid=param_grid, scoring='accuracy',
                                     verbose=10, n_jobs=-1, iid=True, refit=True,return_train_score=True)

    model.fit(x_train_tfv.tocsc(), y_train_a) 
    print("Best score while tuning: %0.3f" % model.best_score_)
    
    return param_grid, model


## Deep Neural Network 
We build 5 NN, which are:
1.GRU 2. LSTM 3. CNN 4. CNN2d 5. Extended TextCNN
We can train it using 'Adam', 'SGD' or 'Rmsprop' optimizer

In [0]:

from keras.models import Sequential, Model
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D,Dropout,Convolution2D
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.callbacks import EarlyStopping
from keras.layers.merge import concatenate
from keras import optimizers
from sklearn.model_selection import train_test_split, GridSearchCV
from keras import optimizers
from sklearn.utils import class_weight
import os


class Deep_Neural_Network(object):
    def __init__(self, data, label, model_type, embedding_index,  metrics='accuracy'):
        train_data, val_data, train_label, val_label = train_test_split(data, label, test_size=0.2,stratify=label,random_state=SEED,shuffle=True)
        token = text.Tokenizer(num_words=None)
        token.fit_on_texts(train_data)
        self.token = token
        x_train_seq = token.texts_to_sequences(train_data)
        x_val_seq = token.texts_to_sequences(val_data)
        self.max_len = len(max(data,key=lambda x:len(x)))
        x_train_pad = sequence.pad_sequences(x_train_seq, maxlen=self.max_len)
        x_val_pad = sequence.pad_sequences(x_val_seq, maxlen=self. max_len)
        self.train_data = x_train_pad
        self.val_data = x_val_pad
        if  np.array_equal(train_label, train_label.astype(bool)):
            self.num_classes = 1
            self.activation = 'sigmoid'
            self.loss  ='binary_crossentropy'
            self.train_label = train_label
            self.val_label = val_label
        else:
            self.num_classes = len(np.unique(train_label))
            self.activation = 'softmax'
            self.loss= 'categorical_crossentropy'
            self.train_label = np_utils.to_categorical(train_label)
            self.val_label = np_utils.to_categorical(val_label)
        self.metrics = metrics
        self.word_index =token.word_index
        self.weight = self.embedding_matrix(embedding_index)
        self.optim = 'adam'
        if model_type == 'gru':
            self.model = self.gru_model()
        elif model_type == 'lstm':
            self.model = self.lstm_model()
        elif model_type == 'cnn':
            self.model = self.cnn_model()
        elif model_type == 'etextcnn':
            self.model = self.etextcnn_model()
        elif model_type =='cnn2d':
            self.model = self.CNN_2D()
            
            
    def set_optimizer(self, optimizer='adam', lr=0.01, decay=5e-3):
        if optimizer =='sgd':
            self.optim = optimizers.SGD(lr, decay=decay)
        elif optimizer =='adam':
            self.optim = optimizers.Adam(lr,decay=decay)
        elif optimizer =='rmsprop':
            self.optim = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)

    def embedding_matrix(self,embedding_index):
        embedding_matrix = np.zeros((len(self.word_index) + 1, 300))
        for word, i in tqdm(self.word_index.items()):
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
        
   
    
    def lstm_model(self):
        lstm_model = Sequential()
        lstm_model.add(Embedding(len(self.word_index) + 1, 300, weights=[self.weight],  input_length=self.max_len, trainable=False))
        lstm_model.add(SpatialDropout1D(0.3))
        lstm_model.add(Bidirectional(LSTM(100, dropout=0.3, recurrent_dropout=0.3)))
        lstm_model.add(Dense(512, activation='relu'))
        lstm_model.add(Dropout(0.3))
        lstm_model.add(Dense(256, activation='relu'))
        lstm_model.add(Dropout(0.3))
        lstm_model.add(Dense(self.num_classes, activation=self.activation))
        
        lstm_model.compile(loss=self.loss, optimizer=self.optim, metrics=[self.metrics])
        lstm_model.summary()
        return lstm_model
    
    def gru_model(self):
        gru_model = Sequential()
        gru_model.add(Embedding(len(self.word_index) + 1, 300, weights=[self.weight],  input_length=self.max_len, trainable=False))
        gru_model.add(SpatialDropout1D(0.3))
        gru_model.add(GRU(128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
        gru_model.add(GRU(64, dropout=0.3, recurrent_dropout=0.3))
        gru_model.add(Dense(256, activation='relu'))
        gru_model.add(Dropout(0.3))
        gru_model.add(Dense(128, activation='relu'))
        gru_model.add(Dropout(0.3))
        gru_model.add(Dense(self.num_classes, activation=self.activation))

        gru_model.compile(loss=self.loss, optimizer=self.optim, metrics=[self.metrics])
        gru_model.summary()
        return gru_model
    
    
    def cnn_model(self):
        cnn_model = Sequential()
        cnn_model.add(Embedding(len(self.word_index) + 1,300,weights=[self.weight],input_length=self.max_len, trainable=False))
        cnn_model.add(Conv1D(64, 3, padding='same'))
        cnn_model.add(MaxPooling1D(3,3,padding='same'))
        cnn_model.add(Conv1D(32, 3, padding='same'))
        cnn_model.add(MaxPooling1D(3,3,padding='same'))
        cnn_model.add(Conv1D(16, 3, padding='same'))
        cnn_model.add(Flatten())
        cnn_model.add(Dropout(0.3))
        cnn_model.add(BatchNormalization()) 
        cnn_model.add(Dense(256,activation='relu'))
        cnn_model.add(Dropout(0.1))
        cnn_model.add(Dense(self.num_classes, activation=self.activation))
        
        cnn_model.compile(loss=self.loss, optimizer=self.optim, metrics=[self.metrics])
        cnn_model.summary()
        return cnn_model
    
    def CNN_2D(self):
        cnn2d_model = Sequential()
        cnn2d_model.add(Embedding(len(self.word_index) + 1,50,input_length=self.max_len, trainable=False))
        cnn2d_model.add(Reshape((50,self.max_len,1)))
        cnn2d_model.add(Convolution2D(64,(1,5),activation="relu"))
        cnn2d_model.add(Dropout(0.9))
        cnn2d_model.add(Convolution2D(32,(2,3),activation="relu"))
        cnn2d_model.add(Dropout(0.8))
        cnn2d_model.add(Convolution2D(16,(2,2),activation="relu"))
        cnn2d_model.add(Dropout(0.7))
        cnn2d_model.add(Flatten())
        cnn2d_model.add(Dense(self.num_classes, activation=self.activation))
        cnn2d_model.compile(loss=self.loss, optimizer=self.optim, metrics=[self.metrics])
        cnn2d_model.summary()
        return cnn2d_model    
    
    def etextcnn_model(self):
        main_input = Input(shape=(self.max_len,), dtype='float64')
        embedder = Embedding(len(self.word_index) + 1, 300, input_length = self.max_len, weights = [self.weight], trainable = False)
        embed = embedder(main_input)
        # cnn1，kernel_size = 3
        conv1_1 = Conv1D(64, 3, padding='same')(embed)
        bn1_1 = BatchNormalization()(conv1_1)
        relu1_1 = Activation('relu')(bn1_1)
        drop1_1 = Dropout(0.3)(relu1_1)
        conv1_2 = Conv1D(32, 3, padding='same')(drop1_1)
        bn1_2 = BatchNormalization()(conv1_2)
        relu1_2 = Activation('relu')(bn1_2)
        drop1_2 = Dropout(0.3)(relu1_2)
        cnn1 = MaxPooling1D(pool_size=4)(drop1_1)
        # cnn2，kernel_size = 4
        conv2_1 = Conv1D(64, 4, padding='same')(embed)
        bn2_1 = BatchNormalization()(conv2_1)
        relu2_1 = Activation('relu')(bn2_1)
        drop2_1 = Dropout(0.3)(relu2_1)
        conv2_2 = Conv1D(32, 4, padding='same')(drop2_1)
        bn2_2 = BatchNormalization()(conv2_2)
        relu2_2 = Activation('relu')(bn2_2)
        drop2_2 = Dropout(0.3)(relu2_2)
        cnn2 = MaxPooling1D(pool_size=4)(drop2_2)
        # cnn3，kernel_size = 5
        conv3_1 = Conv1D(64, 5, padding='same')(embed)
        bn3_1 = BatchNormalization()(conv3_1)
        relu3_1 = Activation('relu')(bn3_1)
        drop3_1 = Dropout(0.3)(relu3_1)
        conv3_2 = Conv1D(32, 5, padding='same')(drop3_1)
        bn3_2 = BatchNormalization()(conv3_2)
        relu3_2 = Activation('relu')(bn3_2)
        drop3_2 = Dropout(0.3)(relu3_2)
        cnn3 = MaxPooling1D(pool_size=4)(drop3_2)
        # Combine three block
        cnn = concatenate([cnn1,cnn2,cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.5)(flat)
        fc = Dense(128)(drop)
        bn = BatchNormalization()(fc)

        main_output = Dense(self.num_classes, activation=self.activation)(bn)
        etextcnn_model = Model(inputs = main_input, outputs = main_output)                
        etextcnn_model.compile(loss=self.loss, optimizer=self.optim, metrics=[self.metrics])
        etextcnn_model.summary()
        return etextcnn_model
    

    def train(self,epochs=100,batch_size=256):
        earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
#         class_weights = class_weight.compute_class_weight('balanced', np.unique(self.train_label), self.train_label)
#         class_weight_dict = dict(enumerate(class_weights))
        self.model.fit(self.train_data, self.train_label, batch_size=batch_size, epochs=epochs, verbose=1, 
                       validation_data=(self.val_data, self.val_label),callbacks=[earlystop])  

    def predict(self, test_data):
        test_seq = self.token.texts_to_sequences(test_data)
        test_pad =sequence.pad_sequences(test_seq, maxlen=self.max_len)
        prediction =self.model.predict(test_pad)
#         return prediction.argmax(axis=1)
        if self.num_classes != 3:
          return np.where(prediction >= 0.5, 1, 0)
        elif self.num_classes == 3:
          return prediction.argmax(axis=1)

        
    def visualize(self):
        from keras.utils import plot_model
        plot_model(self.model, to_file='drive/My Drive/data/lstm_model_plot.png', show_shapes=True, show_layer_names=True)
        
    def save(self,filename):
        self.model.save(os.path.join('drive/My Drive/data/nlp_output/',filename))

# Task A

## Split dataset for task A

In [59]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load training data
df = pd.read_csv(train_path,sep='\t')
df = preprocess(df)
lbl = LabelEncoder()
y_a = lbl.fit_transform(df.subtask_a.values)
x_a = df.tweet.values

# load test data
df_a = pd.read_csv(testA_path,sep='\t')
df_a = preprocess(df_a)
x_test_a = df_a.tweet.values


-------Remove Stop Word--------
-------Remove Stop Word--------


## Task A - classical classifiers

### tf-idf word embedding

In [0]:
# TF_IDF
clf = Bag_of_word('logistic','tf_idf')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','tf_idf')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','tf_idf')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','tf_idf')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','tf_idf')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.756797583081571
start k-fold validate...
mean cross validation for Naive_Bayes is 0.7241691842900302
start k-fold validate...
mean cross validation for Xgboost is 0.7580815709969788
start k-fold validate...
mean cross validation for Random Forest is 0.7671450151057403
start k-fold validate...
mean cross validation for SVM is 0.7512839879154078


In [0]:
clf = Bag_of_word('random_forest','tf_idf')
clf.train_clf(x_a,y_a)
prediction = clf.test_clf(x_test_a)
prediction =lbl.inverse_transform(prediction)
out = pd.DataFrame({'id':df_a.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_a_rf.csv',header=None, index=None)
out.head()


training data shape (13240, 10000)


Unnamed: 0,id,label
0,15923,OFF
1,27014,NOT
2,30530,NOT
3,13876,NOT
4,60133,NOT


### Word count embedding

In [0]:
# Word Count
clf = Bag_of_word('logistic','word_count')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','word_count')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','word_count')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','word_count')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','word_count')
cv = clf.validate_clf(x_a, y_a)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.7614048338368581
start k-fold validate...
mean cross validation for Naive_Bayes is 0.7424471299093656
start k-fold validate...
mean cross validation for Xgboost is 0.7624622356495468
start k-fold validate...
mean cross validation for Random Forest is 0.7583836858006043
start k-fold validate...
mean cross validation for SVM is 0.7370090634441089


In [0]:
clf = Bag_of_word('xgboost','word_count')
clf.train_clf(x_a,y_a)
prediction = clf.test_clf(x_test_a)
prediction =lbl.inverse_transform(prediction)
out = pd.DataFrame({'id':df_a.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_a_xgb.csv',header=None, index=None)
out.head()

training data shape (13240, 10000)


Unnamed: 0,id,label
0,15923,OFF
1,27014,NOT
2,30530,NOT
3,13876,NOT
4,60133,NOT


## Task A - Neural network classifiers

### GRU
- rmsprop optimizer
- val_acc: 0.7776

In [27]:
model= Deep_Neural_Network(x_a,y_a,'gru',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=256)
prediction = model.predict(x_test_a)
model.save('task_a_gru.h5')

prediction =lbl.inverse_transform(prediction)
out = pd.DataFrame({'id':df_a.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_a_gru.csv',header=None, index=None)
out.head()

100%|██████████| 17766/17766 [00:00<00:00, 480094.87it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 280, 300)          5330100   
_________________________________________________________________
spatial_dropout1d_4 (Spatial (None, 280, 300)          0         
_________________________________________________________________
gru_7 (GRU)                  (None, 280, 128)          164736    
_________________________________________________________________
gru_8 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dense_10 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               32896     
__________

Unnamed: 0,id,label
0,15923,OFF
1,27014,NOT
2,30530,NOT
3,13876,NOT
4,60133,NOT


### Extended TextCNN
- SGD, lr = 1e-7
- val_acc: 0.7647

In [0]:
model= Deep_Neural_Network(x_a,y_a,'etextcnn',embeddings_index)
model.set_optimizer('sgd',lr=0.0000001)
model.train(epochs=20,batch_size=256)

100%|██████████| 17778/17778 [00:00<00:00, 356294.28it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 280)          0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 280, 300)     5333700     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_30 (Conv1D)              (None, 280, 64)      76864       embedding_28[0][0]               
__________________________________________________________________________________________________
conv1d_32 (Conv1D)              (None, 280, 64)      96064       embedding_28[0][0]               
__________________________________________________________________________________________________
batch_norm

### LSTM
- Rmsprop
- val_acc: 0.7636

In [28]:
model= Deep_Neural_Network(x_a,y_a,'lstm',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=512)
prediction = model.predict(x_test_a)
model.save('task_a_lstm.h5')

prediction =lbl.inverse_transform(prediction)
out = pd.DataFrame({'id':df_a.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_a_lstm.csv',header=None, index=None)
out.head()

100%|██████████| 17766/17766 [00:00<00:00, 448443.15it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 280, 300)          5330100   
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 280, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_13 (Dense)             (None, 512)               102912    
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
__________

Unnamed: 0,id,label
0,15923,NOT
1,27014,NOT
2,30530,NOT
3,13876,NOT
4,60133,NOT


### LSTM
- SGD, lr = 1e-3
- val_acc: 0.7677

In [60]:
model= Deep_Neural_Network(x_a,y_a,'lstm',embeddings_index)
model.set_optimizer('sgd', lr = 0.001)
model.train(epochs=20,batch_size=512)
# prediction = model.predict(x_test_a)
# prediction


100%|██████████| 17766/17766 [00:00<00:00, 435457.98it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 280, 300)          5330100   
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 280, 300)          0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_35 (Dense)             (None, 512)               102912    
_________________________________________________________________
dropout_32 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_33 (Dropout)         (None, 256)               0         
__________

### CNN2d
- Adam
- val_acc: 0.6677

In [0]:
model= Deep_Neural_Network(x_a,y_a,'cnn2d',embeddings_index)
# model.set_optimizer('sgd',lr=0.005)
model.train(epochs=20,batch_size=128)

100%|██████████| 17766/17766 [00:00<00:00, 410030.13it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 280, 50)           888350    
_________________________________________________________________
reshape_1 (Reshape)          (None, 50, 280, 1)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 50, 276, 64)       384       
_________________________________________________________________
dropout_3 (Dropout)          (None, 50, 276, 64)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 49, 274, 32)       12320     
_________________________________________________________________
dropout_4 (Dropout)          (None, 49, 274, 32)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 48, 273, 16)       2064      
__________

In [31]:
model= Deep_Neural_Network(x_a,y_a,'cnn',embeddings_index)
model.set_optimizer('sgd', lr = 0.001)
model.train(epochs=20,batch_size=128)
prediction=model.predict(x_test_a)
model.save('task_a_cnn.h5')
prediction =lbl.inverse_transform(prediction)
out = pd.DataFrame({'id':df_a.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_a_cnn.csv',header=None, index=None)
out.head()

100%|██████████| 17766/17766 [00:00<00:00, 383358.14it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 280, 300)          5330100   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 280, 64)           57664     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 94, 64)            0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 94, 32)            6176      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 32, 16)            1552      
_________________________________________________________________
flatten_3 (Flatten)          (None, 512)               0         
__________

Unnamed: 0,id,label
0,15923,OFF
1,27014,NOT
2,30530,NOT
3,13876,NOT
4,60133,NOT


# Task B

## Split dataset for task B

In [61]:
lbl = LabelEncoder()
y = lbl.fit_transform(df.subtask_a.values)
x = df.tweet.values
taskb_idx= y==1
y_b =df.subtask_b.values[taskb_idx]
y_b = np.where(y_b=='TIN',1,0)
x_b = x[taskb_idx]

# load test data
df_b = pd.read_csv(testB_path,sep='\t')
df_b = preprocess(df_b)
x_test_b = df_b.tweet.values



-------Remove Stop Word--------


## Task B - classical classifiers

### tf-idf word embedding

In [0]:
# TF_IDF
clf = Bag_of_word('logistic','tf_idf')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','tf_idf')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','tf_idf')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','tf_idf')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','tf_idf')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.8811405181938863
start k-fold validate...
mean cross validation for Naive_Bayes is 0.8809106639825807
start k-fold validate...
mean cross validation for Xgboost is 0.873188005995523
start k-fold validate...
mean cross validation for Random Forest is 0.876824383719112
start k-fold validate...
mean cross validation for SVM is 0.8752293306360993


### Word count embedding

In [0]:
# Word Count
clf = Bag_of_word('logistic','word_count')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','word_count')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','word_count')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','word_count')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','word_count')
cv = clf.validate_clf(x_b, y_b)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.8688630940054629
start k-fold validate...
mean cross validation for Naive_Bayes is 0.8450060821595153
start k-fold validate...
mean cross validation for Xgboost is 0.8779550690297517
start k-fold validate...
mean cross validation for Random Forest is 0.8681900509910596
start k-fold validate...
mean cross validation for SVM is 0.8284089089490978


## Task B - Neural Network Classifiers

### LSTM
- rmsprop
- val_acc: 0.8807

In [33]:
model= Deep_Neural_Network(x_b,y_b,'lstm',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=128)
model.save('task_b_lstm.h5')
prediction = model.predict(x_test_b)
prediction = np.where(prediction==0, 'UNT','TIN')
prediction = prediction.flatten()
out = pd.DataFrame({'id':df_b.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_b_lstm.csv',header=None, index=None)
out.head()

100%|██████████| 9290/9290 [00:00<00:00, 503223.31it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 280, 300)          2787300   
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 280, 300)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_22 (Dense)             (None, 512)               102912    
_________________________________________________________________
dropout_17 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_18 (Dropout)         (None, 256)               0         
__________

Exception: ignored

### GRU
- Adam, lr = 1e-6
- val_acc: 0.8807

In [0]:
model= Deep_Neural_Network(x_b,y_b,'gru',embeddings_index)
model.set_optimizer('adam', lr = 0.000001)
model.train(epochs=20,batch_size=256)

100%|██████████| 9290/9290 [00:00<00:00, 380499.82it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 280, 300)          2787300   
_________________________________________________________________
spatial_dropout1d_11 (Spatia (None, 280, 300)          0         
_________________________________________________________________
gru_7 (GRU)                  (None, 280, 128)          164736    
_________________________________________________________________
gru_8 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dense_37 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_32 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 128)               32896     
__________

### CNN
- adam, lr = 5e-6
- val_acc: 0.8057

In [48]:
model= Deep_Neural_Network(x_b,y_b,'cnn',embeddings_index)
model.set_optimizer('adam',lr=0.000005)
model.train(epochs=20,batch_size=256)
model.save('task_b_cnn.h5')
prediction = model.predict(x_test_b)
prediction =np.where(prediction==0, 'UNT','TIN')
prediction = prediction.flatten()
out = pd.DataFrame({'id':df_b.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_b_cnn.csv',header=None, index=None)
out.head()

100%|██████████| 9290/9290 [00:00<00:00, 459022.99it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 280, 300)          2787300   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 280, 64)           57664     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 94, 64)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 94, 32)            6176      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 32, 16)            1552      
_________________________________________________________________
flatten_4 (Flatten)          (None, 512)               0         
__________

Unnamed: 0,id,label
0,15923,TIN
1,60133,TIN
2,83681,UNT
3,65507,TIN
4,12588,TIN


### LSTM
- SGD, lr = 1e-5
- val_acc: 0.8807

In [49]:
model= Deep_Neural_Network(x_b,y_b,'lstm',embeddings_index)
model.set_optimizer('sgd', lr = 0.00001)
model.train(epochs=20,batch_size=512)
model.save('task_b_lstm.h5')
prediction = model.predict(x_test_b)
prediction =np.where(prediction==0, 'UNT','TIN')
prediction = prediction.flatten()
out = pd.DataFrame({'id':df_b.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_b_lstm.csv',header=None, index=None)
out.head()

100%|██████████| 9290/9290 [00:00<00:00, 314795.60it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 280, 300)          2787300   
_________________________________________________________________
spatial_dropout1d_7 (Spatial (None, 280, 300)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_27 (Dense)             (None, 512)               102912    
_________________________________________________________________
dropout_21 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_22 (Dropout)         (None, 256)               0         
__________

Unnamed: 0,id,label
0,15923,TIN
1,60133,TIN
2,83681,TIN
3,65507,TIN
4,12588,TIN


### Extended TextCNN
- SGD, lr = 1e-7
- val_acc: 0.7677

In [50]:
model= Deep_Neural_Network(x_a,y_a,'etextcnn',embeddings_index)
model.set_optimizer('sgd',lr=0.0000001)
model.train(epochs=20,batch_size=256)
model.save('task_b_etextcnn.h5')
prediction = model.predict(x_test_b)
prediction =np.where(prediction==0, 'UNT','TIN')
prediction = prediction.flatten()
out = pd.DataFrame({'id':df_b.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_b_etextcnn.csv',header=None, index=None)
out.head()

100%|██████████| 17766/17766 [00:00<00:00, 435631.09it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 280)          0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 280, 300)     5330100     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 280, 64)      76864       embedding_12[0][0]               
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 280, 64)      96064       embedding_12[0][0]               
__________________________________________________________________________________________________
batch_norm

Unnamed: 0,id,label
0,15923,TIN
1,60133,UNT
2,83681,TIN
3,65507,UNT
4,12588,TIN


### CNN2d
- SGD, lr = 1e-6
- val_acc: 0.8807

In [0]:
model= Deep_Neural_Network(x_b,y_b,'cnn2d',embeddings_index)
model.set_optimizer('sgd',lr=0.000001)
model.train(epochs=20,batch_size=256)

100%|██████████| 9396/9396 [00:00<00:00, 326028.56it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 280, 50)           469850    
_________________________________________________________________
reshape_9 (Reshape)          (None, 50, 280, 1)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 50, 276, 64)       384       
_________________________________________________________________
dropout_79 (Dropout)         (None, 50, 276, 64)       0         
_________________________________________________________________
conv2d_20 (Conv2D)           (None, 49, 274, 32)       12320     
_________________________________________________________________
dropout_80 (Dropout)         (None, 49, 274, 32)       0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 48, 273, 16)       2064      
__________

# Task C

## Split dataset for task C

In [62]:

taskc_idx= y_b==1
y_c =df.subtask_c.values[taskb_idx][taskc_idx]
print(y_c)
lbl_enc = LabelEncoder()
y_c = lbl_enc.fit_transform(y_c)
print(y_c)
x_c = x_b[taskc_idx]
x_train_c, x_val_c, y_train_c,y_val_c =train_test_split(x_c,y_c,stratify=y_c,random_state=SEED,test_size=0.1,shuffle=True)

# load test data
df_c = pd.read_csv(testC_path,sep='\t')
df_c = preprocess(df_c)
x_test_c = df_c.tweet.values

['IND' 'OTH' 'GRP' ... 'GRP' 'IND' 'OTH']
[1 2 0 ... 0 1 2]
-------Remove Stop Word--------


## Task C - classical classifiers

### tf-idf word embedding

In [0]:
# TF_IDF
clf = Bag_of_word('logistic','tf_idf')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','tf_idf')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','tf_idf')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','tf_idf')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','tf_idf')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.6880688322016444
start k-fold validate...
mean cross validation for Naive_Bayes is 0.6320925215447748
start k-fold validate...
mean cross validation for Xgboost is 0.6955464918630865
start k-fold validate...
mean cross validation for Random Forest is 0.6978760485585704
start k-fold validate...
mean cross validation for SVM is 0.6854935587387286


### Word count embedding

In [0]:
# Word Count
clf = Bag_of_word('logistic','word_count')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Logistic regression is', cv)

clf = Bag_of_word('naive_bayes','word_count')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Naive_Bayes is', cv)

clf = Bag_of_word('xgboost','word_count')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Xgboost is', cv)

clf = Bag_of_word('random_forest','word_count')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for Random Forest is', cv)

clf = Bag_of_word('svm','word_count')
cv = clf.validate_clf(x_c, y_c)
print('mean cross validation for SVM is', cv)

start k-fold validate...
mean cross validation for Logistic regression is 0.6898750294565582
start k-fold validate...
mean cross validation for Naive_Bayes is 0.6656120496456708
start k-fold validate...
mean cross validation for Xgboost is 0.6940167256266907
start k-fold validate...
mean cross validation for Random Forest is 0.70175546099957
start k-fold validate...
mean cross validation for SVM is 0.6591727019020694


In [0]:
clf = Bag_of_word('random_forest','word_count')
clf.train_clf(x_c,y_c)
prediction = clf.test_clf(x_test_c)
prediction =lbl_enc.inverse_transform(prediction)
out = pd.DataFrame({'id':df_c.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_c_rf.csv',header=None, index=None)
out.head()

training data shape (3876, 9910)


Unnamed: 0,id,label
0,15923,IND
1,60133,IND
2,83681,IND
3,65507,IND
4,34263,IND


## Task C - Neural Network Classifiers

### LSTM
- rmsprop
- val_acc: 0.7113

In [52]:
model= Deep_Neural_Network(x_c,y_c,'lstm',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=128)
prediction = model.predict(x_test_c)
model.save('task_c_lstm.h5')

prediction =lbl_enc.inverse_transform(prediction)
out = pd.DataFrame({'id':df_c.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_c_lstm.csv',header=None, index=None)
out.head()

100%|██████████| 8689/8689 [00:00<00:00, 333806.33it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 280, 300)          2607000   
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 280, 300)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               320800    
_________________________________________________________________
dense_32 (Dense)             (None, 512)               102912    
_________________________________________________________________
dropout_30 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_31 (Dropout)         (None, 256)               0         
__________

Unnamed: 0,id,label
0,15923,GRP
1,60133,GRP
2,83681,IND
3,65507,IND
4,34263,IND


### GRU
- rmsprop
- val_acc: 0.7101

In [63]:
model= Deep_Neural_Network(x_c,y_c,'gru',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=256)
prediction = model.predict(x_test_c)
model.save('task_c_gru.h5')

prediction = lbl_enc.inverse_transform(prediction)
out = pd.DataFrame({'id':df_c.id.values,'label':prediction})
out.to_csv('drive/My Drive/data/nlp_output/task_c_gru.csv',header=None, index=None)
out.head()

100%|██████████| 8689/8689 [00:00<00:00, 301202.58it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 280, 300)          2607000   
_________________________________________________________________
spatial_dropout1d_10 (Spatia (None, 280, 300)          0         
_________________________________________________________________
gru_9 (GRU)                  (None, 280, 128)          164736    
_________________________________________________________________
gru_10 (GRU)                 (None, 64)                37056     
_________________________________________________________________
dense_38 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_34 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 128)               32896     
__________

Unnamed: 0,id,label
0,15923,GRP
1,60133,GRP
2,83681,IND
3,65507,IND
4,34263,IND


### CNN
- Adam, lr = 1e-6
- val_acc: 0.6598

In [0]:
model= Deep_Neural_Network(x_c,y_c,'cnn',embeddings_index)
model.set_optimizer('adam',lr=0.00001)
model.train(epochs=20,batch_size=128)

100%|██████████| 8689/8689 [00:00<00:00, 373600.01it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 280, 300)          2607000   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 280, 64)           57664     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 94, 64)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 94, 32)            6176      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 32, 32)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 32, 16)            1552      
_________________________________________________________________
flatten_2 (Flatten)          (None, 512)               0         
__________

### CNN2d
- rmsprop
- val_acc: 0.6211

In [0]:
1model= Deep_Neural_Network(x_c,y_c,'cnn2d',embeddings_index)
model.set_optimizer('rmsprop')
model.train(epochs=20,batch_size=256)

100%|██████████| 8689/8689 [00:00<00:00, 300988.65it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 280, 50)           434500    
_________________________________________________________________
reshape_4 (Reshape)          (None, 50, 280, 1)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 50, 276, 64)       384       
_________________________________________________________________
dropout_39 (Dropout)         (None, 50, 276, 64)       0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 49, 274, 32)       12320     
_________________________________________________________________
dropout_40 (Dropout)         (None, 49, 274, 32)       0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 48, 273, 16)       2064      
__________

### Extended TextCNN
- Adam, lr = 1e-5
- val_acc: 0.6959

In [0]:
model= Deep_Neural_Network(x_c,y_c,'etextcnn',embeddings_index)
model.set_optimizer('adam',lr=0.00001)
model.train(epochs=20,batch_size=256)


100%|██████████| 8689/8689 [00:00<00:00, 290689.37it/s]


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 280)          0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 280, 300)     2607000     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 280, 64)      76864       embedding_12[0][0]               
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 280, 64)      96064       embedding_12[0][0]               
__________________________________________________________________________________________________
batch_norm

### GRU
- Adam, lr = 1e-3
- val_acc: 0.7113

In [0]:
model= Deep_Neural_Network(x_c,y_c,'gru',embeddings_index)
model.set_optimizer('adam', lr= 0.001)
model.train(epochs=20,batch_size=256)

100%|██████████| 8689/8689 [00:00<00:00, 315649.91it/s]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 280, 300)          2607000   
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 280, 300)          0         
_________________________________________________________________
gru_11 (GRU)                 (None, 280, 128)          164736    
_________________________________________________________________
gru_12 (GRU)                 (None, 64)                37056     
_________________________________________________________________
dense_39 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_49 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 128)               32896     
__________