In [1]:
import numpy as np
import pandas as pd
import sklearn
import re

# Read trainig and testing files

In [2]:
train = pd.read_csv('data/Corona_NLP_train.csv', encoding='latin1')
test = pd.read_csv('data/Corona_NLP_test.csv', encoding='latin1')

train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


# Encoding classes

In [3]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train['encoded_sentiment'] = encoder.fit_transform(train['Sentiment'])
test['encoded_sentiment'] = encoder.transform(test['Sentiment'])

In [4]:
train['OriginalTweet'] = train['OriginalTweet'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))
test['OriginalTweet'] = test['OriginalTweet'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))

In [5]:
from sklearn.model_selection import train_test_split

xtrain, xval, ytrain, yval = train_test_split(train['OriginalTweet'], train['encoded_sentiment'], test_size = 0.2)
xtest, ytest = test['OriginalTweet'], test['encoded_sentiment']

# Encode words to embeddings

In [6]:
import fasttext

In [7]:
ft = fasttext.load_model('cc.en.300.bin')



In [8]:
xtrain = np.array(xtrain.apply(ft.get_sentence_vector).to_list())
xval = np.array(xval.apply(ft.get_sentence_vector).to_list())
xtest = np.array(xtest.apply(ft.get_sentence_vector).to_list())

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [10]:
clf = RandomForestClassifier(300)
clf.fit(xtrain, ytrain)

RandomForestClassifier(n_estimators=300)

In [11]:
clf.score(xval, yval)

0.4363459669582119

In [12]:
clf.score(xtest, ytest)

0.3770405476566614

# Gaussian Naive Bayes

In [13]:
clf = GaussianNB()
clf.fit(xtrain, ytrain)

GaussianNB()

In [14]:
clf.score(xval, yval)

0.3332118561710398

In [15]:
clf.score(xtest, ytest)

0.3246445497630332

# SVM

In [16]:
from sklearn.svm import SVC

In [19]:
clf = SVC(C=5)
clf.fit(xtrain, ytrain)

SVC(C=5)

In [20]:
clf.score(xval, yval)

0.531584062196307

In [21]:
clf.score(xtest, ytest)

0.5115850447604002

# Xgboost

In [23]:
import xgboost as xgb

In [29]:
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 5,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4,
}

dtrain = xgb.DMatrix(xtrain, ytrain)
dval = xgb.DMatrix(xval, yval)
dtest = xgb.DMatrix(xtest, ytest)
num_rounds = 200
eval_list = [(dtrain, 'train'), (dval, 'val'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_rounds, eval_list)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-merror:0.60079	val-merror:0.66181	test-merror:0.67378
[1]	train-merror:0.56413	val-merror:0.63933	test-merror:0.65640
[2]	train-merror:0.54299	val-merror:0.62306	test-merror:0.65034
[3]	train-merror:0.53300	val-merror:0.61783	test-merror:0.64771
[4]	train-merror:0.52091	val-merror:0.61467	test-merror:0.64007
[5]	train-merror:0.51271	val-merror:0.61309	test-merror:0.64007
[6]	train-merror:0.50372	val-merror:0.60933	test-merror:0.63639
[7]	train-merror:0.49677	val-merror:0.60398	test-merror:0.62770
[8]	train-merror:0.49172	val-merror:0.60119	test-merror:0.63138
[9]	train-merror:0.48346	val-merror:0.60034	test-merror:0.63138
[10]	train-merror:0.47894	val-merror:0.59451	test-merror:0.62849
[11]	train-me

[121]	train-merror:0.16574	val-merror:0.53316	test-merror:0.55608
[122]	train-merror:0.16377	val-merror:0.53134	test-merror:0.55608
[123]	train-merror:0.16216	val-merror:0.53122	test-merror:0.55766
[124]	train-merror:0.16006	val-merror:0.53122	test-merror:0.56003
[125]	train-merror:0.15845	val-merror:0.53049	test-merror:0.55977
[126]	train-merror:0.15733	val-merror:0.53061	test-merror:0.55714
[127]	train-merror:0.15587	val-merror:0.53025	test-merror:0.55766
[128]	train-merror:0.15484	val-merror:0.53195	test-merror:0.55634
[129]	train-merror:0.15359	val-merror:0.52952	test-merror:0.55398
[130]	train-merror:0.15216	val-merror:0.53000	test-merror:0.55292
[131]	train-merror:0.15058	val-merror:0.52733	test-merror:0.55345
[132]	train-merror:0.14943	val-merror:0.52843	test-merror:0.55266
[133]	train-merror:0.14785	val-merror:0.52685	test-merror:0.55503
[134]	train-merror:0.14667	val-merror:0.52758	test-merror:0.55477
[135]	train-merror:0.14497	val-merror:0.52697	test-merror:0.55266
[136]	trai

In [30]:
from sklearn.metrics import accuracy_score

dval = xgb.DMatrix(xval)
ans = model.predict(dval)
print('validation accuracy: ', accuracy_score(yval, ans))

dval = xgb.DMatrix(xval)
ans = model.predict(dtest)
print('testing accuracy: ', accuracy_score(ytest, ans))

validation accuracy:  0.48603012633624876
testing accuracy:  0.46261190100052657
