In [1]:
import pandas as pd
import numpy as np

import csv 
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from collections import Counter
import fasttext
import re
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

random_state = 123

# Loading Data

In [2]:
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [3]:
train_data = df_train.copy()
test_data = df_test.copy()

In [4]:
train_data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)
test_data.drop(columns=['Unnamed: 0'],axis=1,inplace=True)

In [5]:
#fasttext required
train_data['Cleaned_Review'] = train_data['Cleaned_Review'].astype(str)
test_data['Cleaned_Review'] = test_data['Cleaned_Review'].astype(str)

#fasttext required: change category to word type
train_data['Category'] = np.where((train_data['Category'] == 1),'Positive','Negative')
test_data['Category'] = np.where((test_data['Category'] == 1),'Positive','Negative')

#fasttext required: Prefixing each row of the category column with '__label__'
train_data['labels_text']= train_data.iloc[:, 2].apply(lambda x: '__label__' + x)
test_data['labels_text'] = test_data.iloc[:, 2].apply(lambda x: '__label__' + x)

# Training and Cross Validation

In [6]:
training_file = open('train.txt','w')
training_file.writelines(train_data.labels_text + '\n')
training_file.close()

In [16]:
def iteration_search(X, Y, Z, k, lr, ngram_type, epoch):

    results = []
    for lr_val in lr:
        for num_val in ngram_type:
            for epoch_val in epoch:  
               
                kf = KFold(n_splits=k, shuffle=True)
                fold_results = []
          
                for train_index, test_index in kf.split(X):

                    training_file = open('train_cv.txt','w')
                    training_file.writelines(Z[train_index] + '\n')
                    training_file.close()

                    model = fasttext.train_supervised('train_cv.txt',
                                          lr=lr_val,
                                          wordNgrams=num_val,
                                          epoch=epoch_val)

                    pred = model.predict(X[test_index].tolist())
                    pred = pd.Series(pred[0]).apply(lambda x: re.sub('__label__', '', x[0]))

                    fold_results.append(accuracy_score(Y[test_index], pred.values))

                mean_acc = pd.Series(fold_results).mean()
                results.append([lr_val, num_val, epoch_val, mean_acc])         

    results = pd.DataFrame(results)
    results.columns = ['lr','ngram_type','epoch','mean_acc']
    return(results)

In [17]:
# 10-fold CV
results = iteration_search (X = train_data.Cleaned_Review, 
                     Y = train_data.Category,
                     Z = train_data.labels_text,
                     k = 10, 
                     lr = [0.1, 0.2, 0.3],
                     ngram_type = [1,2,3],
                     epoch = [15,17,20])

In [21]:
results.sort_values('mean_acc').tail(3)

Unnamed: 0,lr,ngram_type,epoch,mean_acc
13,0.2,2,17,0.566075
25,0.3,3,17,0.567329
5,0.1,2,20,0.56786


In [22]:
train_data['Category']= train_data['Category'].apply(lambda x: '__label__' + x)

In [23]:
train_data[['Category', 'Cleaned_Review']].to_csv('train_1.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

# Testing and Vote

In [24]:
%%time
classifier1 = fasttext.train_supervised('train_1.txt', lr=0.20, wordNgrams=2, epoch=17)
classifier2 = fasttext.train_supervised('train_1.txt', lr=0.30, wordNgrams=3, epoch=17)
classifier3 = fasttext.train_supervised('train_1.txt', lr=0.10, wordNgrams=2, epoch=20)

CPU times: user 15.9 s, sys: 915 ms, total: 16.8 s
Wall time: 4.04 s


In [25]:
%%time
predictions1 = classifier1.predict(test_data.Cleaned_Review.tolist())
predictions2 = classifier2.predict(test_data.Cleaned_Review.tolist())
predictions3 = classifier3.predict(test_data.Cleaned_Review.tolist())

CPU times: user 171 ms, sys: 25.8 ms, total: 197 ms
Wall time: 178 ms


In [26]:
# Combine predictions
majority_vote = np.array([])
for i in range(len(predictions1[0])):
    majority_vote = np.append(majority_vote, Counter([predictions1[0][i][0],
                                                   predictions2[0][i][0],
                                                   predictions3[0][i][0]]).most_common(1)[0][0])

In [27]:
prediction = pd.DataFrame({'id': test_data.index, 
                       'Category': pd.Series(majority_vote)})
prediction.Category = prediction.Category.apply(lambda x: re.sub('__label__', '', x))

In [28]:
groundtruth = test_data.copy()
groundtruth['id'] = groundtruth.index
groundtruth.Category = groundtruth.Category.apply(lambda x: re.sub('__label__', '', x))

In [29]:
precision, recall, fscore, support= score(groundtruth['Category'], prediction['Category'])
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1 score: {}'.format(fscore))
print('Support: {}'.format(support))
accuracy_score(groundtruth.Category, prediction['Category'])

Precision: [0.93911249 0.55653451]
Recall: [0.75082508 0.8652968 ]
F1 score: [0.8344796  0.67739053]
Support: [1212  438]


0.7812121212121212