In [1]:
import sys

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

from NN_utils import *
# The project root must be on the path before the sibling modules are imported.
sys.path.insert(0, "../")
from testsets import *
import evaluation

Using TensorFlow backend.


In [3]:
def evaluate(model, vectorizer, test_data_file):
    """Score a fitted classifier on one test set and print its metrics.

    Args:
        model: Fitted estimator exposing ``predict`` and ``score``.
        vectorizer: Fitted vectorizer whose ``transform`` turns the raw test
            samples into the feature matrix the model expects.
        test_data_file: Indexable pair. Element 0 is handed to ``csv_to_np``;
            element 1 is forwarded unchanged to the ``evaluation`` helpers.

    Prints the test accuracy as a percentage, then delegates to
    ``evaluation.evaluate`` and ``evaluation.confusion`` using the model's
    class name as the run label.
    """
    ids, samples, gold_labels = csv_to_np(test_data_file[0])
    features = vectorizer.transform(samples)

    # Map every sample id (as a string) to its predicted sentiment.
    pred_dict = {
        str(ids[idx]): label_to_sentiment(label)
        for idx, label in enumerate(model.predict(features))
    }

    accuracy = model.score(features, gold_labels)
    print("Test accuracy = " + str(accuracy*100) + "%")

    model_name = str(type(model).__name__)
    evaluation.evaluate(pred_dict, test_data_file[1], model_name)
    evaluation.confusion(pred_dict, test_data_file[1], model_name)

In [4]:
def test_model(model, vectorizer):
    """Run ``evaluate`` for ``model`` on every entry of the global ``testsets``.

    Args:
        model: Fitted estimator, passed straight through to ``evaluate``.
        vectorizer: Fitted vectorizer, passed straight through to ``evaluate``.
    """
    for test_data_file in testsets:
        evaluate(model, vectorizer, test_data_file)

In [5]:
'''Load data'''
# Locations of the pre-processed training and development CSV files.
train_data_file = "../data/processed/csv/proc-twitter-training-data.csv"
dev_data_file = "../data/processed/csv/proc-twitter-dev-data.csv"

# csv_to_np yields three values per file; the first one is not needed here
# (evaluate() unpacks it as the sample ids -- presumably the same here).
_train_ids, X_train, Y_train = csv_to_np(train_data_file)
_dev_ids, X_dev, Y_dev = csv_to_np(dev_data_file)

In [6]:
'''TFIDF features extraction'''
# Unigram + bigram TF-IDF features; the vectorizer is fitted on the training
# split only and then reused, unchanged, for the dev and test splits.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# NOTE: X_train / X_dev are rebound from the loaded samples to feature
# matrices, so re-running this cell requires re-running the load cell first.
X_train = vectorizer.fit_transform(X_train)
X_dev = vectorizer.transform(X_dev)

In [7]:
# Sanity check: dimensions of the TF-IDF training matrix produced by
# vectorizer.fit_transform (rows = training samples, columns = features).
print(X_train.shape)

(45101, 378045)


In [9]:
'''GridSearchCV'''
# Search over the SGD loss function and the regularisation strength alpha
# using 5-fold cross-validation on the training split.
# NOTE(review): scikit-learn >= 1.1 renames the 'log' loss to 'log_loss'
# (and 1.3 removes 'log') -- update the grid if the environment is upgraded.
grid_params = {'loss':['hinge', 'log', 'perceptron'], 'alpha': [1e-6, 5e-6, 0.00001, 0.00005, 0.0001, 0.0005]}

# random_state pins both the per-epoch shuffling and the held-out split that
# early_stopping=True carves from the training data; without it every run
# can select different "best" parameters and report different scores.
sgd = SGDClassifier(early_stopping=True, max_iter=1000, tol=1e-3, random_state=0)
gridsearch = GridSearchCV(sgd, grid_params, cv=5, verbose=True, n_jobs=-1).fit(X_train, Y_train)

# After fitting, GridSearchCV scores/predicts with the refitted best estimator.
print("GridSearchCV training accuracy: ", gridsearch.score(X_train, Y_train))
print("GridSearchCV validation accuracy: ", gridsearch.score(X_dev, Y_dev))
print("Best parameters found: ", gridsearch.best_params_)
test_model(gridsearch, vectorizer)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   12.4s finished


GridSearchCV training accuracy:  0.9627946165273498
GridSearchCV validation accuracy:  0.6505
Best parameters found:  {'alpha': 1e-05, 'loss': 'hinge'}
Test accuracy = 66.10025488530161%
../data/processed/txt/proc-twitter-test1.txt (GridSearchCV):
MacroF1: 0.573
            positive  negative  neutral
positive    0.674     0.074     0.251     
negative    0.116     0.713     0.172     
neutral     0.241     0.144     0.614     

Test accuracy = 65.51538046411225%
../data/processed/txt/proc-twitter-test2.txt (GridSearchCV):
MacroF1: 0.583
            positive  negative  neutral
positive    0.702     0.058     0.240     
negative    0.120     0.676     0.204     
neutral     0.375     0.093     0.532     

Test accuracy = 63.30390920554855%
../data/processed/txt/proc-twitter-test3.txt (GridSearchCV):
MacroF1: 0.526
            positive  negative  neutral
positive    0.687     0.072     0.240     
negative    0.185     0.595     0.220     
neutral     0.304     0.138     0.558     



In [10]:
'''LGR'''
# Multinomial logistic regression trained with the L-BFGS solver on the
# TF-IDF features; the seed is fixed for repeatable results.
LGR = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=0)
LGR.fit(X_train, Y_train)

lgr_train_accuracy = LGR.score(X_train, Y_train)
print("LGR Training accuracy: ", lgr_train_accuracy)
lgr_dev_accuracy = LGR.score(X_dev, Y_dev)
print("LGR Validation accuracy: ", lgr_dev_accuracy)

# Report accuracy, macro-F1 and the confusion matrix on every test set.
test_model(LGR, vectorizer)



LGR Training accuracy:  0.9126183454912308
LGR Validation accuracy:  0.6575
Test accuracy = 64.17445482866043%
../data/processed/txt/proc-twitter-test1.txt (LogisticRegression):
MacroF1: 0.489
            positive  negative  neutral
positive    0.662     0.085     0.254     
negative    0.087     0.819     0.094     
neutral     0.240     0.172     0.588     

Test accuracy = 65.94711279007016%
../data/processed/txt/proc-twitter-test2.txt (LogisticRegression):
MacroF1: 0.502
            positive  negative  neutral
positive    0.697     0.075     0.227     
negative    0.122     0.776     0.102     
neutral     0.349     0.110     0.541     

Test accuracy = 62.21101303068516%
../data/processed/txt/proc-twitter-test3.txt (LogisticRegression):
MacroF1: 0.456
            positive  negative  neutral
positive    0.673     0.093     0.234     
negative    0.135     0.708     0.156     
neutral     0.307     0.152     0.541     



In [14]:
'''Perceptron'''
# Keep the fitted model under its own name. Assigning it to `Perceptron`
# would overwrite the imported class, so running this cell a second time
# would fail with "TypeError: 'Perceptron' object is not callable".
perceptron_clf = Perceptron().fit(X_train, Y_train)
print("Perceptron Training accuracy: ", perceptron_clf.score(X_train, Y_train))
print("Perceptron Validation accuracy: ", perceptron_clf.score(X_dev, Y_dev))

# evaluate() labels its reports with type(model).__name__, so the printed
# model name is still "Perceptron".
test_model(perceptron_clf, vectorizer)



Perceptron Training accuracy:  0.9986031351854726
Perceptron Validation accuracy:  0.6325
Test accuracy = 64.62758425375247%
../data/processed/txt/proc-twitter-test1.txt (Perceptron):
MacroF1: 0.568
            positive  negative  neutral
positive    0.665     0.070     0.265     
negative    0.122     0.658     0.220     
neutral     0.253     0.146     0.601     

Test accuracy = 65.62331354560172%
../data/processed/txt/proc-twitter-test2.txt (Perceptron):
MacroF1: 0.588
            positive  negative  neutral
positive    0.701     0.056     0.243     
negative    0.189     0.622     0.189     
neutral     0.368     0.091     0.541     

Test accuracy = 61.412358133669606%
../data/processed/txt/proc-twitter-test3.txt (Perceptron):
MacroF1: 0.517
            positive  negative  neutral
positive    0.656     0.076     0.267     
negative    0.227     0.492     0.281     
neutral     0.306     0.138     0.556     



In [15]:
'''LinearSVC'''
# Keep the fitted model under its own name. Assigning it to `LinearSVC`
# would overwrite the imported class, so running this cell a second time
# would fail with "TypeError: 'LinearSVC' object is not callable".
# random_state fixes the solver's internal shuffling so that the reported
# scores are repeatable, in line with the seeded logistic-regression cell.
linear_svc_clf = LinearSVC(random_state=0).fit(X_train, Y_train)
print("LinearSVC Training accuracy: ", linear_svc_clf.score(X_train, Y_train))
print("LinearSVC Validation accuracy: ", linear_svc_clf.score(X_dev, Y_dev))

# evaluate() labels its reports with type(model).__name__, so the printed
# model name is still "LinearSVC".
test_model(linear_svc_clf, vectorizer)

LinearSVC Training accuracy:  0.9990244118755681
LinearSVC Validation accuracy:  0.657
Test accuracy = 66.10025488530161%
../data/processed/txt/proc-twitter-test1.txt (LinearSVC):
MacroF1: 0.559
            positive  negative  neutral
positive    0.678     0.072     0.250     
negative    0.090     0.783     0.127     
neutral     0.238     0.156     0.606     

Test accuracy = 65.51538046411225%
../data/processed/txt/proc-twitter-test2.txt (LinearSVC):
MacroF1: 0.566
            positive  negative  neutral
positive    0.695     0.065     0.240     
negative    0.096     0.747     0.157     
neutral     0.367     0.097     0.536     

Test accuracy = 63.26187473728457%
../data/processed/txt/proc-twitter-test3.txt (LinearSVC):
MacroF1: 0.518
            positive  negative  neutral
positive    0.677     0.077     0.246     
negative    0.161     0.655     0.185     
neutral     0.309     0.139     0.553     



In [None]:
'''PLOT TFIDF IN 2D - experiment'''
# X_train is the sparse TF-IDF matrix (45101 x 378045 in the recorded run).
# Converting it to a dense array for PCA would need on the order of 136 GB
# as float64, so project it with TruncatedSVD instead, which works on the
# sparse matrix directly. Unlike PCA it does not centre the data first.
svd = TruncatedSVD(n_components=2, random_state=0)
data2D = svd.fit_transform(X_train)

fig, ax = plt.subplots(1, 1)
# assumes Y_train holds numeric class labels usable as colour values -- TODO confirm
ax.scatter(data2D[:, 0], data2D[:, 1], c=Y_train)
ax.set_title('2D TFIDF plot')
ax.set_xlabel('SVD component 1')
ax.set_ylabel('SVD component 2')
plt.show()