In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from collections import  Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("FinalDataset.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column names in file order, and the position of the "finalCleanText" column.
# Later cells treat every column from this position onward as model input.
cols = list(df.columns)
index = cols.index("finalCleanText")

In [None]:
# Collect the row labels that have a missing value in any label column
# (STANCE / OPINION / POL) or in any column from "finalCleanText" onward.
nanIndex = []
for label_col in ("STANCE", "OPINION", "POL"):
    nanIndex.extend(df.index[df[label_col].isna()])
for feature_col in cols[index:]:
    print("checking", feature_col)
    nanIndex.extend(df.index[df[feature_col].isna()])
# A row can be flagged by several columns; keep each label once.
nanIndex = set(nanIndex)
print("Total nan value found", len(nanIndex))

In [None]:
# Remove every flagged row, then renumber the remaining rows 0..n-1.
df = df.drop(index=nanIndex).reset_index(drop=True)

In [None]:
# Features: every column from "finalCleanText" onward, with the text column
# moved to the end under the name "cleaned tweets".
X = df.iloc[:, index:].copy()
X["cleaned tweets"] = X.pop("finalCleanText")
# Target: a single-column frame holding the STANCE label.
Y = df[["STANCE"]].copy()

In [None]:
# Preview the feature frame.
X.head()

In [None]:
# Preview the target frame.
Y.head()

In [None]:
# 75/25 train/test split with a fixed seed. Y.values is passed as a 2-D array
# (one column), so y_train / y_test keep that (n, 1) shape.
# Stratified splitting is currently disabled (see the commented-out argument).
X_train, X_test, y_train, y_test = train_test_split(X,Y.values, test_size=0.25, random_state = 20)
                                                    #,stratify=Y.values)

In [None]:
# Class counts in the training labels (flattened to 1-D first).
Counter(list(y_train.reshape(-1)))

In [None]:
# Class counts over the full cleaned dataset, for comparison with the splits.
Counter(df["STANCE"])

In [None]:
# Class counts in the test labels (flattened to 1-D first).
Counter(list(y_test.reshape(-1)))

In [None]:
# Renumber both splits 0..n-1; the later column-wise concat with the TF-IDF
# frames aligns on the index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [None]:
# Inspect the Python type of the first training text value.
type(list(X_train['cleaned tweets'].values)[0])

In [None]:
# x_train  = [str(text[0]) for text in X_train]
# x_test  = [str(text[0]) for text in X_test]

In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the
# test texts so both matrices share the same columns.
tfidf = TfidfVectorizer(
    use_idf=True,
    min_df=0.01,
    max_df=0.9,
)
x_train = tfidf.fit_transform(X_train['cleaned tweets'].tolist())
x_test = tfidf.transform(X_test['cleaned tweets'].tolist())

In [None]:
# Convert the TF-IDF outputs to dense arrays via toarray().
x_train = x_train.toarray()
x_test = x_test.toarray()

In [None]:
# Wrap the dense TF-IDF arrays in DataFrames whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = list(tfidf.get_feature_names())
x_train = pd.DataFrame(data=x_train, columns=feature_names)
x_test = pd.DataFrame(data=x_test, columns=feature_names)

In [None]:
# Drop the text column from both splits, leaving only the other feature columns
# in X_train / X_test.
del X_train['cleaned tweets']
del X_test['cleaned tweets']

In [None]:
# Final design matrices: the remaining feature columns followed by the TF-IDF
# term columns, joined column-wise (rows are aligned on the index).
x_train = pd.concat([X_train,x_train],axis=1)
x_test = pd.concat([X_test,x_test],axis=1)

In [None]:
# Guard against a TF-IDF term colliding with an existing feature column name.
assert x_train.columns.is_unique
print("This is to check all x_train columns are unique")

In [None]:
# Train and test frames must expose identical columns in identical order.
assert(list(x_train.columns) == list(x_test.columns))
column = list(x_train.columns)
# NOTE(review): this rebinds `index`, which an earlier cell set to the position
# of "finalCleanText" in `cols`; re-running those earlier cells after this one
# would slice from the wrong column. Consider a distinct name.
index = column.index('Positive_Score')
# Displayed value: the columns that precede 'Positive_Score' (scaled in the next cell).
column[:index]

In [None]:
from sklearn import preprocessing
# Min-max scale each column that precedes 'Positive_Score'. Every scaler is
# fitted on the training column only and then applied to both splits.
for feature in column[:index]:
    train_col = x_train[feature].values.reshape((-1, 1))
    test_col = x_test[feature].values.reshape((-1, 1))
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(train_col)
    x_train[feature] = scaler.transform(train_col)
    x_test[feature] = scaler.transform(test_col)

In [None]:
def smooth_labels(labels, factor=0.1):
    """Return a label-smoothed copy of a 2-D array of one-hot targets.

    Each row is scaled by ``1 - factor`` and ``factor / n_classes`` is added to
    every entry, where ``n_classes`` is ``labels.shape[1]``.

    Parameters
    ----------
    labels : array-like of shape (n_samples, n_classes)
        One-hot (or probability) targets. The input is NOT modified.
    factor : float, default 0.1
        Amount of probability mass spread uniformly over the classes.

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as ``labels``.
    """
    # Work on a float view/copy: the previous in-place `*=` / `+=` silently
    # overwrote the caller's array and failed outright on integer arrays.
    labels = np.asarray(labels, dtype=float)
    num_classes = labels.shape[1]
    # The arithmetic below allocates a new array, so the input stays untouched.
    return labels * (1 - factor) + (factor / num_classes)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import keras

In [None]:
# Number of input features = number of columns in the training matrix.
inputSize = x_train.shape[1]

In [None]:
# Replace the DataFrames with their underlying NumPy arrays before model fitting.
# NOTE(review): not re-runnable as-is; on a second run x_train is already an
# ndarray, which has no `.values` attribute.
x_train = x_train.values
x_test = x_test.values

In [None]:
# Baseline feed-forward classifier: two ReLU hidden layers (64 and 32 units)
# followed by a 3-way softmax output.
model = Sequential([
    Dense(units=64, activation='relu', input_dim=inputSize),
    Dense(units=32, activation='relu'),
    Dense(units=3, activation='softmax'),
])

In [None]:
def oneHot(y_inp, num_classes=3):
    """One-hot encode a column of integer class labels.

    Parameters
    ----------
    y_inp : sequence of length-1 sequences (e.g. an (n, 1) array)
        ``y_inp[i][0]`` is the class id of sample ``i``; it is cast with ``int``.
    num_classes : int, default 3
        Width of the encoding. The default keeps the original 3-class behaviour.

    Returns
    -------
    numpy.ndarray of shape (len(y_inp), num_classes)
        Float array with a single 1 per row at the label's position.
    """
    y = np.zeros((len(y_inp), num_classes))
    for row, label in enumerate(y_inp):
        y[row, int(label[0])] = 1
    return y

In [None]:
# One-hot encode the train and test labels for the softmax / cross-entropy model.
y_train_hot = oneHot(y_train)
y_test_hot = oneHot(y_test)

In [None]:
def oneHotToCategorical(y_hot):
    """Convert a 2-D array of one-hot rows or class scores to class indices.

    Returns a list holding, for each row of ``y_hot``, the position of that
    row's largest entry.
    """
    # Unpacking the shape also enforces that the input is exactly 2-D.
    n_rows, _n_cols = y_hot.shape
    categorical = []
    for row_idx in range(n_rows):
        categorical.append(y_hot[row_idx].argmax())
    return categorical

In [None]:
def createConfusionMatrix(y_true_c,y_pred_c,classifier_name):
    """Plot the 3-class stance confusion matrix as a heatmap and save it as a PNG.

    Parameters
    ----------
    y_true_c : 2-D array
        One-hot ground-truth rows.
    y_pred_c : 2-D array
        Per-class scores (e.g. softmax output); the arg-max of each row is
        taken as the predicted class.
    classifier_name : str
        Appended to the plot title and used in the output file name.

    Side effects
    ------------
    Writes ``graphs/Confusion Matrix of <classifier_name>.png``, creating the
    ``graphs`` directory when it does not exist. Returns ``None``.
    """
    import os

    mapping = {2:"Anti Govt",0:"Neutral",1:"Pro Govt"}
    # Axis order follows the dict's insertion order: Anti Govt, Neutral, Pro Govt.
    labels = list(mapping.values())
    y_true = [mapping[i] for i in oneHotToCategorical(y_true_c)]
    y_pred = [mapping[i] for i in oneHotToCategorical(y_pred_c)]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    title = "Confusion Matrix of "+ classifier_name
    fig, ax = plt.subplots()
    # fmt="d": the cells are integer counts; the default format switches to
    # scientific notation for larger values.
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels)
    # savefig does not create missing directories.
    os.makedirs("graphs", exist_ok=True)
    fig.savefig("graphs/"+title+".png",bbox_inches='tight')
    return

In [None]:
# Use the public `Adam` class with `learning_rate`: the lowercase
# `keras.optimizers.adam` alias and the `lr` argument are legacy spellings that
# newer Keras releases no longer provide.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
EPOCHS = 6
# validation_data is documented as a tuple (x_val, y_val); a list is not
# handled consistently across Keras versions.
history = model.fit(
    x_train,
    y_train_hot,
    epochs=EPOCHS,
    batch_size=50,
    validation_data=(x_test, y_test_hot),
)

In [None]:
# Evaluate the compiled loss and the accuracy metric on the held-out test split.
model.evaluate(x_test,y_test_hot)

In [None]:
# List the metric names recorded during training (used as keys in the plotting cells).
print(history.history.keys())

In [None]:
import matplotlib.pyplot as plt
# Per-epoch training and validation loss recorded by model.fit.
loss_train = history.history['loss']
loss_val = history.history['val_loss']
# Epoch numbers 1..EPOCHS.
# NOTE(review): not referenced by the visible plotting helper, which builds its own x axis.
epochs = range(1,EPOCHS+1)

In [None]:
def createGraph(acc_train,acc_train_label,acc_val,acc_val_label,x_label,y_label,title):
    """Plot a training curve and a validation curve on one figure, save it, and show it.

    Parameters
    ----------
    acc_train, acc_val : sequences of numbers
        Per-epoch values of the two series (loss, accuracy, ...). They may
        have different lengths.
    acc_train_label, acc_val_label : str
        Legend labels for the two series.
    x_label, y_label : str
        Axis labels.
    title : str
        Plot title; also used as the output file name.

    Side effects
    ------------
    Writes ``graphs/<title>.png`` (creating ``graphs`` if necessary) and
    displays the figure via ``plt.show()``.
    """
    import os

    # Give each series its own 1-based x positions. A single x axis sized to the
    # longer series makes plt.plot raise when the two lengths differ.
    x_train_pos = list(range(1, len(acc_train) + 1))
    x_val_pos = list(range(1, len(acc_val) + 1))
    plt.plot(x_train_pos, acc_train, 'o-', label=acc_train_label)
    plt.plot(x_val_pos, acc_val, 'o-', label=acc_val_label)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)
    # savefig does not create missing directories.
    os.makedirs('graphs', exist_ok=True)
    plt.savefig('graphs/'+title+'.png',bbox_inches='tight')
    plt.show()

In [None]:
# Loss curves for the baseline network.
createGraph(
    loss_train, 'Training Loss',
    loss_val, 'Validation Loss',
    x_label='Epochs',
    y_label='Loss',
    title='Training and Validation Loss Using Vanilla NN',
)

In [None]:
# Accuracy curves for the baseline network.
acc_train = history.history['accuracy']
acc_val = history.history['val_accuracy']
createGraph(
    acc_train, 'Training Accuracy',
    acc_val, 'Validation Accuracy',
    x_label='Epochs',
    y_label='Accuracy',
    title='Training and Validation Accuracy Using Vanilla NN',
)

In [None]:
# Confusion matrix of the baseline network's predictions on the test split.
createConfusionMatrix(y_test_hot,model.predict(x_test),"Vanilla NN")

In [None]:
# Smoothed training targets (default factor 0.1) used to fit the second model.
y_train_hot_smooth = smooth_labels(y_train_hot)

In [None]:
# Display the smoothed targets for inspection.
y_train_hot_smooth

In [None]:
from keras.layers import Dropout
# Second network: same layout as the baseline, plus 10% dropout after the
# first hidden layer.
model = Sequential([
    Dense(units=64, activation='relu', input_dim=inputSize),
    Dropout(rate=0.10),
    Dense(units=32, activation='relu'),
    Dense(units=3, activation='softmax'),
])

In [None]:
from keras.losses import CategoricalCrossentropy
# The training targets passed to fit() are already smoothed by smooth_labels(),
# so the loss must not smooth them a second time. The previous
# label_smoothing=0.1 here applied the smoothing twice.
# `Adam(learning_rate=...)` replaces the legacy `adam(lr=...)` spelling, which
# newer Keras releases no longer provide.
model.compile(
    loss=CategoricalCrossentropy(),
    metrics=['accuracy'],
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
EPOCHS = 6
# Train on the smoothed targets and validate on the plain one-hot test labels.
# validation_data is documented as a tuple (x_val, y_val); a list is not
# handled consistently across Keras versions.
history = model.fit(
    x_train,
    y_train_hot_smooth,
    epochs=EPOCHS,
    batch_size=50,
    validation_data=(x_test, y_test_hot),
)

In [None]:
import matplotlib.pyplot as plt
# Per-epoch training and validation loss of the second (label-smoothing) run.
loss_train = history.history['loss']
loss_val = history.history['val_loss']
# Epoch numbers 1..EPOCHS.
# NOTE(review): not referenced by the visible plotting helper, which builds its own x axis.
epochs = range(1,EPOCHS+1)

In [None]:
# Loss curves for the label-smoothing run.
createGraph(
    loss_train, 'Training Loss',
    loss_val, 'Validation loss',
    x_label='Epochs',
    y_label='Loss',
    title='Training and Validation Loss Using Vanilla NN with Label Smoothing',
)

In [None]:
# Accuracy curves for the label-smoothing run.
acc_train = history.history['accuracy']
acc_val = history.history['val_accuracy']
createGraph(
    acc_train, 'Training Accuracy',
    acc_val, 'Validation Accuracy',
    x_label='Epochs',
    y_label='Accuracy',
    title='Training and Validation Accuracy Using Vanilla NN with Label Smoothing',
)

In [None]:
# Confusion matrix of the label-smoothing network's predictions on the test split.
createConfusionMatrix(y_test_hot,model.predict(x_test),"Vanilla NN with Label Smoothing")