In [None]:
#######################################################
##### STRIP TWEET #####################################
#######################################################
# Load column 3 of the training and test TSV files and pass each list through
# the project's clean_corpus helper.

from utils import clean_corpus
import pandas as pd

# Both files are tab-separated and have no header row, so columns are
# addressed by integer position.
TSV_READ_OPTIONS = {"sep": '\t', "header": None}

# --- training set ---
df = pd.read_csv("train2017.tsv", **TSV_READ_OPTIONS)
train_corpus = df[3].tolist()
clean_train_corpus = clean_corpus(train_corpus)

# --- test set (`df` is rebound to the test frame from here on) ---
df = pd.read_csv("test2017.tsv", **TSV_READ_OPTIONS)
test_corpus = df[3].tolist()
clean_test_corpus = clean_corpus(test_corpus)

In [None]:
#######################################################
##### TOKENIZATION ####################################
#######################################################
# Tokenize and lemmatize the cleaned corpora, then rebuild one
# space-separated string per tweet for the text vectorizers.

from utils import tokenize, lemmatize


def _join_tokens(tweet):
    """Return the tokens of one tweet as a single space-separated string."""
    # str() is applied to every token, so non-string tokens are accepted too.
    return " ".join(map(str, tweet))


train_tokens = tokenize(clean_train_corpus)
test_tokens = tokenize(clean_test_corpus)

# The *_tweets lists are reused by the embedding and lexicon cells below.
train_tweets = lemmatize(train_tokens)
test_tweets = lemmatize(test_tokens)

final_train_corpus = [_join_tokens(tweet) for tweet in train_tweets]
final_test_corpus = [_join_tokens(tweet) for tweet in test_tweets]

In [None]:
# BAG-OF-WORDS VECTORIZATION
# The vocabulary is learned from the training corpus only; the test corpus is
# then projected onto that same vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
from utils import save_to_pickle

vectorizer = CountVectorizer()

# Fit + transform on the training split, and persist the result right away.
X_BOW_train = vectorizer.fit_transform(final_train_corpus)
save_to_pickle('X_BOW_train', X_BOW_train)

# Transform only (no re-fit) on the test split.
X_BOW_test = vectorizer.transform(final_test_corpus)
save_to_pickle('X_BOW_test', X_BOW_test)

In [None]:
# TF-IDF VECTORIZATION
# Same scheme as the bag-of-words cell: fit on the training corpus, then reuse
# the fitted vectorizer for the test corpus.
# Note: `vectorizer` is rebound here and no longer refers to the CountVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import save_to_pickle

vectorizer = TfidfVectorizer()

# Fit + transform on the training split, and persist the result right away.
X_TFIDF_train = vectorizer.fit_transform(final_train_corpus)
save_to_pickle('X_TFIDF_train', X_TFIDF_train)

# Transform only (no re-fit) on the test split.
X_TFIDF_test = vectorizer.transform(final_test_corpus)
save_to_pickle('X_TFIDF_test', X_TFIDF_test)

In [None]:
# There are 3 options: word2vec, word2vec in sklearn, doc2vec.
# This cell uses gensim's Word2Vec on the lemmatized token lists.

from gensim.models import Word2Vec
from utils import create_word_embeddings, save_to_pickle

# Settings shared by both models; `size` is the length of each word vector (200).
W2V_OPTIONS = dict(size=200, window=5, min_count=1, workers=4)

# --- train_tweets ---
model_train = Word2Vec(train_tweets, **W2V_OPTIONS)
# NOTE(review): the constructor is already given the corpus, so this looks
# like an additional training pass - confirm that it is intended.
model_train.train(
    train_tweets,
    total_examples=model_train.corpus_count,
    epochs=model_train.epochs,
)

X_W2V_embeddings_train = create_word_embeddings(train_tweets, model_train)
save_to_pickle('X_W2V_embeddings_train', X_W2V_embeddings_train)

# --- test_tweets ---
# NOTE(review): a second, independent model is trained on the test tweets only,
# and the test embeddings are built from it. Vectors from two separately
# trained models are presumably not comparable, which would affect any
# classifier fitted on the train embeddings - verify whether model_train
# should be reused here (depends on how create_word_embeddings treats unseen words).
model_test = Word2Vec(test_tweets, **W2V_OPTIONS)
model_test.train(
    test_tweets,
    total_examples=model_test.corpus_count,
    epochs=model_test.epochs,
)

X_W2V_embeddings_test = create_word_embeddings(test_tweets, model_test)
save_to_pickle('X_W2V_embeddings_test', X_W2V_embeddings_test)

In [None]:
# Doc2vec is similar to word2vec, but it is better suited to phrases: it
# vectorizes whole phrases instead of single words. For example:
#   1. Manos leaves the office every day at 18:00 to catch his train.
#   2. This season is called Fall, because leaves fall from the trees.
# This way the same word used in different contexts can be told apart: the
# word "leaves" ends up with a different representation in each of the two
# sentences above.

from utils import create_doc_embeddings, save_to_pickle


def _rows_to_lists(embeddings):
    """Call `.tolist()` on every row of `embeddings`, addressed by position."""
    return [embeddings[row].tolist() for row in range(len(embeddings))]


# NOTE(review): the helper is called once per split; whether the two calls
# share a model cannot be seen from this cell - verify in utils.
X_embeddings_array_train = create_doc_embeddings(final_train_corpus)
X_embeddings_array_test = create_doc_embeddings(final_test_corpus)

# Rows are converted with .tolist() before being pickled.
X_D2V_embeddings_train = _rows_to_lists(X_embeddings_array_train)
save_to_pickle('X_D2V_embeddings_train', X_D2V_embeddings_train)

X_D2V_embeddings_test = _rows_to_lists(X_embeddings_array_test)
save_to_pickle('X_D2V_embeddings_test', X_D2V_embeddings_test)

In [None]:
from utils import save_to_pickle, load_from_pickle, add_characteristics

import pandas as pd
import numpy as np
import csv

# Lexicon files, each paired with the extra read_csv options it needs.
# Two of them are read with the python engine and quoting disabled.
_NO_QUOTING = {"engine": "python", "quoting": csv.QUOTE_NONE}
LEXICON_SOURCES = [
    ("lexica/affin/affin.txt", {}),
    ("lexica/emotweet/valence_tweet.txt", {}),
    ("lexica/generic/generic.txt", _NO_QUOTING),
    ("lexica/nrc/val.txt", _NO_QUOTING),
    ("lexica/nrctag/val.txt", {}),
]

lexica_df = [
    pd.read_csv(path, sep='\t', header=None, **extra_options)
    for path, extra_options in LEXICON_SOURCES
]
# One dict per lexicon: each value of column 0 maps to the list of the
# remaining values in its row.
lexica = [frame.set_index(0).T.to_dict('list') for frame in lexica_df]

characteristics_train = add_characteristics(lexica, train_tweets)
characteristics_test = add_characteristics(lexica, test_tweets)


def _append_characteristics(base_name, plus_name, characteristics):
    """Load pickled embeddings, append the lexicon columns, and pickle the result.

    Returns the loaded embeddings and the widened array, in that order.
    """
    base = load_from_pickle(base_name)
    # axis=1: the lexicon characteristics become extra columns of every row.
    widened = np.concatenate((base, characteristics), axis=1)
    save_to_pickle(plus_name, widened)
    return base, widened


# Word2vec embeddings + lexicon characteristics.
X_W2V_embeddings_train, X_W2Vplus_embeddings_train = _append_characteristics(
    'X_W2V_embeddings_train', 'X_W2Vplus_embeddings_train', characteristics_train)
X_W2V_embeddings_test, X_W2Vplus_embeddings_test = _append_characteristics(
    'X_W2V_embeddings_test', 'X_W2Vplus_embeddings_test', characteristics_test)

# Doc2vec embeddings + lexicon characteristics.
X_D2V_embeddings_train, X_D2Vplus_embeddings_train = _append_characteristics(
    'X_D2V_embeddings_train', 'X_D2Vplus_embeddings_train', characteristics_train)
X_D2V_embeddings_test, X_D2Vplus_embeddings_test = _append_characteristics(
    'X_D2V_embeddings_test', 'X_D2Vplus_embeddings_test', characteristics_test)

In [1]:
##########################
### CONSTRUCT Y_LABELS ###
##########################
import pandas as pd

# Integer codes for the sentiment strings: "positive" -> 2, "negative" -> 0.
# Every other value (including anything unexpected) falls back to 1.
_SENTIMENT_CODES = {"positive": 2, "negative": 0}
_FALLBACK_CODE = 1


def _encode_sentiments(sentiments):
    """Return the integer code of every sentiment value, in the same order."""
    return [_SENTIMENT_CODES.get(value, _FALLBACK_CODE) for value in sentiments]


# Training labels: sentiments are in column 2 of the training file.
df = pd.read_csv("train2017.tsv", sep='\t', header=None)
y_train_labels = _encode_sentiments(df[2].tolist())

# Test labels: sentiments are in column 1 of the separate label file.
df = pd.read_csv("y_test_labels.tsv", sep='\t', header=None)
y_test_labels = _encode_sentiments(df[1].tolist())

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# Bag-of-words matrices pickled by the vectorization cell.
X_train = load_from_pickle('X_BOW_train')
X_test = load_from_pickle('X_BOW_test')

# Both classifiers receive exactly the same positional arguments.
bow_inputs = (X_train, X_test, y_train_labels, y_test_labels)

scoreBOW_KNN = knn_classification(*bow_inputs)  # BOW - KNN CLASSIFICATION
scoreBOW_SVM = svm_classification(*bow_inputs)  # BOW - SVM CLASSIFICATION

# TODO: move the scores into one big table!
print(scoreBOW_KNN, scoreBOW_SVM)

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# TF-IDF matrices pickled by the vectorization cell. The classifiers below use
# these loaded copies, so this cell does not depend on X_TFIDF_train /
# X_TFIDF_test still being alive in the kernel.
X_train = load_from_pickle('X_TFIDF_train')
X_test = load_from_pickle('X_TFIDF_test')

#TFIDF - KNN CLASSIFICATION
scoreTFIDF_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#TFIDF - SVM CLASSIFICATION
scoreTFIDF_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreTFIDF_KNN, scoreTFIDF_SVM)

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# Doc2vec embeddings pickled by the doc2vec cell.
X_train = load_from_pickle('X_D2V_embeddings_train')
X_test = load_from_pickle('X_D2V_embeddings_test')

# Both classifiers receive exactly the same positional arguments.
d2v_inputs = (X_train, X_test, y_train_labels, y_test_labels)

scoreD2V_KNN = knn_classification(*d2v_inputs)  # DOC2VEC - KNN CLASSIFICATION
scoreD2V_SVM = svm_classification(*d2v_inputs)  # DOC2VEC - SVM CLASSIFICATION

print(scoreD2V_KNN, scoreD2V_SVM)

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# Doc2vec embeddings widened with the lexicon characteristics.
X_train = load_from_pickle('X_D2Vplus_embeddings_train')
X_test = load_from_pickle('X_D2Vplus_embeddings_test')

# Both classifiers receive exactly the same positional arguments.
d2vplus_inputs = (X_train, X_test, y_train_labels, y_test_labels)

# DOC2VEC+features - KNN CLASSIFICATION
scoreD2Vplus_KNN = knn_classification(*d2vplus_inputs)
# DOC2VEC+features - SVM CLASSIFICATION
scoreD2Vplus_SVM = svm_classification(*d2vplus_inputs)

print(scoreD2Vplus_KNN, scoreD2Vplus_SVM)

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# Word2vec embeddings pickled by the word2vec cell.
X_train = load_from_pickle('X_W2V_embeddings_train')
X_test = load_from_pickle('X_W2V_embeddings_test')

# Both classifiers receive exactly the same positional arguments.
w2v_inputs = (X_train, X_test, y_train_labels, y_test_labels)

scoreW2V_KNN = knn_classification(*w2v_inputs)  # WORD2VEC - KNN CLASSIFICATION
scoreW2V_SVM = svm_classification(*w2v_inputs)  # WORD2VEC - SVM CLASSIFICATION

print(scoreW2V_KNN, scoreW2V_SVM)

In [None]:
from utils import load_from_pickle, knn_classification, svm_classification

# Word2vec embeddings widened with the lexicon characteristics.
X_train = load_from_pickle('X_W2Vplus_embeddings_train')
X_test = load_from_pickle('X_W2Vplus_embeddings_test')

# Both classifiers receive exactly the same positional arguments.
w2vplus_inputs = (X_train, X_test, y_train_labels, y_test_labels)

# WORD2VEC+features - KNN CLASSIFICATION
scoreW2Vplus_KNN = knn_classification(*w2vplus_inputs)
# WORD2VEC+features - SVM CLASSIFICATION
scoreW2Vplus_SVM = svm_classification(*w2vplus_inputs)

print(scoreW2Vplus_KNN, scoreW2Vplus_SVM)

In [None]:
#vectorization = ['BOW', 'BOW', 'TFIDF', 'TFIDF', 'D2V', 'D2V', 'D2V+', 'D2V+', 'W2V', 'W2V', 'W2V+', 'W2V+']
#classifiers = ['KNN', 'SVM', 'KNN', 'SVM', 'KNN', 'SVM', 'KNN', 'SVM', 'KNN', 'SVM', 'KNN', 'SVM']
#accuracy_scores = []

In [2]:
#################
###ROUND ROBIN###
#################
# For every pair of sentiment classes, call create_posteriors with the
# training rows of just those two classes, plus the full train and test text.

from utils import create_posteriors, knn_classification

import pandas as pd
import numpy as np

# NOTE(review): train_tweets / test_tweets are rebound here to the raw text of
# column 3; the tokenization cell binds the same names to lemmatized tweets.
# Re-running an earlier cell that expects the lemmatized form after this one
# will see the raw text instead.
df_train = pd.read_csv("train2017.tsv", sep='\t', header=None)
train_tweets = df_train[3].tolist()
df_test = pd.read_csv("test2017.tsv", sep='\t', header=None)
test_tweets = df_test[3].tolist()

sent_map = {"positive":2, "neutral":1, "negative":0}

# The trailing positional argument 1 is passed through unchanged in every
# call; its meaning is defined by create_posteriors in utils - TODO document.

# positive vs negative
pos_neg_train = df_train.loc[df_train[2].isin(["positive", "negative"])].copy()
pos_neg_labels = pos_neg_train[2].map(sent_map).tolist() #sentiments
pos_neg_posteriors = create_posteriors(
    pos_neg_train[3].tolist(), train_tweets, test_tweets, pos_neg_labels, 1)

# positive vs neutral
pos_neu_train = df_train.loc[df_train[2].isin(["positive", "neutral"])].copy()
pos_neu_labels = pos_neu_train[2].map(sent_map).tolist()
pos_neu_posteriors = create_posteriors(
    pos_neu_train[3].tolist(), train_tweets, test_tweets, pos_neu_labels, 1)

# negative vs neutral
neg_neu_train = df_train.loc[df_train[2].isin(["negative", "neutral"])].copy()
neg_neu_labels = neg_neu_train[2].map(sent_map).tolist()
neg_neu_posteriors = create_posteriors(
    neg_neu_train[3].tolist(), train_tweets, test_tweets, neg_neu_labels, 1)

# TODO: save these structures to a pickle file!



In [3]:
from utils import knn_classification


def _stack_pairwise_posteriors(split):
    """Build one 6-value row per sample of `split` ('train' or 'test').

    Each row holds entries 0 and 1 of the sample's posterior from the
    pos/neg, pos/neu and neg/neu results, in that order.
    """
    pairwise = (
        pos_neg_posteriors[split],
        pos_neu_posteriors[split],
        neg_neu_posteriors[split],
    )
    # The row count is taken from the pos/neg result; the other two are
    # indexed with the same positions.
    return [
        [rows[i][col] for rows in pairwise for col in (0, 1)]
        for i in range(len(pairwise[0]))
    ]


test_data = _stack_pairwise_posteriors('test')
train_data = _stack_pairwise_posteriors('train')

# y_train_labels / y_test_labels come from the CONSTRUCT Y_LABELS cell.
scoreRR_KNN = knn_classification(train_data, test_data, y_train_labels, y_test_labels)
print(scoreRR_KNN)

0.45791273200911753
