In [None]:
#######################################################
##### STRIP TWEET #####################################
#######################################################

from utils import clean_corpus
import pandas as pd

# Training set: take column 3 of the headerless TSV and pass it through
# utils.clean_corpus.
df = pd.read_csv("train2017.tsv", header=None, sep='\t')
train_corpus = df[3].tolist()
clean_train_corpus = clean_corpus(train_corpus)

# Test set: same steps. Note that `df` is rebound to the test frame here.
df = pd.read_csv("test2017.tsv", header=None, sep='\t')
test_corpus = df[3].tolist()
clean_test_corpus = clean_corpus(test_corpus)

In [None]:
#######################################################
##### TOKENIZATION ####################################
#######################################################

from utils import tokenize, lemmatize

# Training tweets: tokenize -> lemmatize -> re-join each tweet into one
# space-separated string (every item is converted with str() first).
train_tokens = tokenize(clean_train_corpus)
train_tweets = lemmatize(train_tokens)
final_train_corpus = [" ".join(map(str, tweet)) for tweet in train_tweets]

# Test tweets: identical pipeline.
test_tokens = tokenize(clean_test_corpus)
test_tweets = lemmatize(test_tokens)
final_test_corpus = [" ".join(map(str, tweet)) for tweet in test_tweets]

In [None]:
#BAG-OF-WORDS VECTORIZATION
from sklearn.feature_extraction.text import CountVectorizer
from utils import save_to_pickle

# Fit the bag-of-words vocabulary on the training corpus only, then reuse the
# fitted vectorizer to encode the test corpus.
vectorizer = CountVectorizer()
X_BOW_train = vectorizer.fit_transform(final_train_corpus)
X_BOW_test = vectorizer.transform(final_test_corpus)

# Persist both matrices for the classification cells.
save_to_pickle('X_BOW_train', X_BOW_train)
save_to_pickle('X_BOW_test', X_BOW_test)

In [None]:
#TF-IDF VECTORIZATION
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training corpus only, then reuse the fitted vectorizer to
# encode the test corpus. `vectorizer` is rebound here (it previously held the
# CountVectorizer).
vectorizer = TfidfVectorizer()
X_TFIDF_train = vectorizer.fit_transform(final_train_corpus)
X_TFIDF_test = vectorizer.transform(final_test_corpus)

# Persist both matrices for the classification cells.
save_to_pickle('X_TFIDF_train', X_TFIDF_train)
save_to_pickle('X_TFIDF_test', X_TFIDF_test)

In [None]:
#we have 3 options: word2vec, word2vec in sklearn, doc2vec

from gensim.models import Word2Vec
from utils import create_word_embeddings

# Word2Vec embeddings for the training tweets (vector size 200).
# The train() calls previously read `model.corpus_count` / `model.epochs`, but
# no top-level `model` exists, so they raised NameError on a fresh kernel.
# Each call now reads the attributes of the model it is actually training.
# NOTE(review): per the gensim docs, passing the sentences to the constructor
# already trains the model, so the explicit train() is an additional run.
model_train = Word2Vec(train_tweets, size=200, window=5, min_count=1, workers=4)
model_train.train(train_tweets,
                  total_examples=model_train.corpus_count,
                  epochs=model_train.epochs)

X_W2V_embeddings_train = create_word_embeddings(train_tweets, model_train)
save_to_pickle('X_W2V_embeddings_train', X_W2V_embeddings_train)

# Word2Vec embeddings for the test tweets.
# NOTE(review): a separate model is trained on the test set, so train and test
# vectors do not share one embedding space -- confirm this is intended.
model_test = Word2Vec(test_tweets, size=200, window=5, min_count=1, workers=4)
model_test.train(test_tweets,
                 total_examples=model_test.corpus_count,
                 epochs=model_test.epochs)

X_W2V_embeddings_test = create_word_embeddings(test_tweets, model_test)
save_to_pickle('X_W2V_embeddings_test', X_W2V_embeddings_test)

In [None]:
#Doc2vec is similar to word2vec, but it is more appropriate for phrases (it vectorizes whole phrases instead of single words). E.g.
#1. Manos leaves the office every day at 18:00 to catch his train.
#2. This season is called Fall, because leaves fall from the trees.
#In this way we can capture the difference between the same word used in different contexts. For example, we now have a
#different representation of the word “leaves” in each of the above two sentences.

#IMPORTANT: the parameters used here are arbitrary; we have to pay attention to selecting the right ones.

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

def word_embeddings(final_corpus):
    """Train a Doc2Vec model on ``final_corpus`` and return its document vectors.

    Every document is lower-cased, split with NLTK's ``word_tokenize`` and
    tagged with its position in the corpus (as a string).

    Parameters
    ----------
    final_corpus : iterable of str
        One string per document.

    Returns
    -------
    The trained model's ``docvecs`` object; the callers in this notebook index
    it by integer position and call ``len()`` on it.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(text.lower()), tags=[str(index)])
        for index, text in enumerate(final_corpus)
    ]
    # The full tagged corpus is no longer printed: it flooded the cell output
    # with every tokenized document.

    model = Doc2Vec(size=200,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    model.build_vocab(tagged_data)

    # Repeat the training call `max_epoch` times; each call runs `model.epochs`
    # passes (the deprecated `model.iter` alias is replaced, matching the
    # Word2Vec cell of this notebook).
    max_epoch = 20
    for _ in range(max_epoch):
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)

    return model.docvecs

# `pickle` is imported here because no earlier cell imports it; without this
# the cell raises NameError after Restart Kernel -> Run All.
import pickle

X_embeddings_array_train = word_embeddings(final_train_corpus)
X_embeddings_array_test = word_embeddings(final_test_corpus)

# Convert each document vector to a plain Python list, then persist the result.
X_D2V_embeddings_train = [
    X_embeddings_array_train[i].tolist()
    for i in range(len(X_embeddings_array_train))
]
with open('X_D2V_embeddings_train.pickle', 'wb') as handle:
    pickle.dump(X_D2V_embeddings_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

X_D2V_embeddings_test = [
    X_embeddings_array_test[i].tolist()
    for i in range(len(X_embeddings_array_test))
]
with open('X_D2V_embeddings_test.pickle', 'wb') as handle:
    pickle.dump(X_D2V_embeddings_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#Add characteristics to embeddings
import pandas as pd
import numpy as np
import csv
from statistics import mean

# Raw lexicon tables: headerless, tab-separated files. Two of them are read
# with the python engine and quoting disabled.
lexica_df = [
    pd.read_csv("lexica/affin/affin.txt", header=None, sep='\t'),
    pd.read_csv("lexica/emotweet/valence_tweet.txt", header=None, sep='\t'),
    pd.read_csv("lexica/generic/generic.txt", header=None, sep='\t',
                engine="python", quoting=csv.QUOTE_NONE),
    pd.read_csv("lexica/nrc/val.txt", header=None, sep='\t',
                engine="python", quoting=csv.QUOTE_NONE),
    pd.read_csv("lexica/nrctag/val.txt", header=None, sep='\t'),
]

# One dict per lexicon: column 0 becomes the key, the remaining column values
# of that row become the (list) value.
lexica = [table.set_index(0).T.to_dict('list') for table in lexica_df]

def _valence_summary(sentiments):
    """Return (max, min, mean, first-half mean, second-half mean) of ``sentiments``."""
    if not sentiments:
        # An empty tweet has no valences; report neutral zeros instead of
        # letting max()/min()/mean() raise on the empty list.
        return (0, 0, 0, 0, 0)

    if len(sentiments) > 1:
        middle = len(sentiments) // 2
        average_half1 = mean(sentiments[:middle])
        average_half2 = mean(sentiments[middle:])
    else:
        # NOTE(review): for a single-token tweet the original code stores the
        # token COUNT (1) as the first-half average and 0 as the second-half
        # average. Kept unchanged so existing features stay identical --
        # confirm whether the token's own valence was intended.
        average_half1 = len(sentiments)
        average_half2 = 0

    return (max(sentiments), min(sentiments), mean(sentiments),
            average_half1, average_half2)


def add_characteristics(tweets, lexicons=None):
    """Build hand-crafted features for every tweet.

    For each tweet the feature row is the token count followed by, for every
    lexicon, five values: maximum valence, minimum valence, mean valence, mean
    of the first half of the tokens and mean of the second half. A token that
    is missing from a lexicon counts as 0.

    Parameters
    ----------
    tweets : iterable of sequences of tokens
    lexicons : list of dict, optional
        Each dict maps a token to a list whose first element is its valence.
        Defaults to the notebook-level ``lexica``.

    Returns
    -------
    list of list
        One feature row per tweet, each of length ``1 + 5 * len(lexicons)``.

    The old trailing ``print(tweet)`` was removed: it ran once, after the loop,
    and raised NameError when ``tweets`` was empty.
    """
    if lexicons is None:
        lexicons = lexica

    characteristics = []
    for tokens in tweets:
        features = [len(tokens)]  # length of the tweet in tokens
        for lexicon in lexicons:
            tweet_sentiments = [lexicon.get(token, [0])[0] for token in tokens]
            features.extend(_valence_summary(tweet_sentiments))
        characteristics.append(features)

    return characteristics

# `pickle` is imported here because no earlier cell imports it; without this
# the cell raises NameError after Restart Kernel -> Run All.
import pickle

characteristics_train = add_characteristics(train_tweets)
print("next one")  # shows that processing moved on to the next set
characteristics_test = add_characteristics(test_tweets)

# For each saved embedding matrix: load it, append the hand-crafted features
# column-wise, and save the result under the "...plus..." name. The input file
# is closed before the output file is opened (the nested `with` blocks used to
# reuse the name `handle` for both).

with open('X_W2V_embeddings_train.pickle', 'rb') as handle:
    X_W2V_embeddings_train = pickle.load(handle)
X_W2Vplus_embeddings_train = np.concatenate((X_W2V_embeddings_train, characteristics_train), axis=1)
with open('X_W2Vplus_embeddings_train.pickle', 'wb') as handle:
    pickle.dump(X_W2Vplus_embeddings_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('X_W2V_embeddings_test.pickle', 'rb') as handle:
    X_W2V_embeddings_test = pickle.load(handle)
X_W2Vplus_embeddings_test = np.concatenate((X_W2V_embeddings_test, characteristics_test), axis=1)
with open('X_W2Vplus_embeddings_test.pickle', 'wb') as handle:
    pickle.dump(X_W2Vplus_embeddings_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('X_D2V_embeddings_train.pickle', 'rb') as handle:
    X_D2V_embeddings_train = pickle.load(handle)
X_D2Vplus_embeddings_train = np.concatenate((X_D2V_embeddings_train, characteristics_train), axis=1)
with open('X_D2Vplus_embeddings_train.pickle', 'wb') as handle:
    pickle.dump(X_D2Vplus_embeddings_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('X_D2V_embeddings_test.pickle', 'rb') as handle:
    X_D2V_embeddings_test = pickle.load(handle)
X_D2Vplus_embeddings_test = np.concatenate((X_D2V_embeddings_test, characteristics_test), axis=1)
with open('X_D2Vplus_embeddings_test.pickle', 'wb') as handle:
    pickle.dump(X_D2Vplus_embeddings_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
####################
#CONSTRUCT Y_LABELS#
####################
import pandas as pd
import pickle

# Numeric class ids used by the classifiers:
#   "negative" -> 0, "positive" -> 2, every other value -> 1.

# Training labels come from column 2 of the training TSV.
df = pd.read_csv("train2017.tsv", header=None, sep='\t')
y_train_labels = [
    2 if sentiment == "positive" else 0 if sentiment == "negative" else 1
    for sentiment in df[2].tolist()
]

# Test labels come from column 1 of a separate TSV.
df = pd.read_csv("y_test_labels.tsv", header=None, sep='\t')
y_test_labels = [
    2 if sentiment == "positive" else 0 if sentiment == "negative" else 1
    for sentiment in df[1].tolist()
]



In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_BOW_train')
X_test = load_from_pickle('X_BOW_test')

#BOW - KNN CLASSIFICATION
scoreBOW_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#BOW - SVM CLASSIFICATION
scoreBOW_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreBOW_KNN, scoreBOW_SVM)  # TODO: collect the scores into one big table

In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_TFIDF_train')
X_test = load_from_pickle('X_TFIDF_test')

# The classifiers now receive the matrices loaded above. Previously they were
# given the in-memory X_TFIDF_train / X_TFIDF_test, which only exist if the
# vectorization cell ran in the same kernel session.

#TFIDF - KNN CLASSIFICATION
scoreTFIDF_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#TFIDF - SVM CLASSIFICATION
scoreTFIDF_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreTFIDF_KNN, scoreTFIDF_SVM)

In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_D2V_embeddings_train')
X_test = load_from_pickle('X_D2V_embeddings_test')

#DOC2VEC - KNN CLASSIFICATION
scoreD2V_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#DOC2VEC - SVM CLASSIFICATION
scoreD2V_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreD2V_KNN, scoreD2V_SVM)

In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_D2Vplus_embeddings_train')
X_test = load_from_pickle('X_D2Vplus_embeddings_test')

#DOC2VEC+features - KNN CLASSIFICATION
scoreD2Vplus_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#DOC2VEC+features - SVM CLASSIFICATION
scoreD2Vplus_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreD2Vplus_KNN, scoreD2Vplus_SVM)

In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_W2V_embeddings_train')
X_test = load_from_pickle('X_W2V_embeddings_test')

#WORD2VEC - KNN CLASSIFICATION
scoreW2V_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#WORD2VEC - SVM CLASSIFICATION
scoreW2V_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreW2V_KNN, scoreW2V_SVM)

In [None]:
# `load_from_pickle` was called without ever being imported (NameError on a
# fresh kernel). It is assumed to live in utils next to `save_to_pickle` --
# TODO confirm.
from utils import knn_classification, svm_classification, load_from_pickle

X_train = load_from_pickle('X_W2Vplus_embeddings_train')
X_test = load_from_pickle('X_W2Vplus_embeddings_test')

#WORD2VEC+features - KNN CLASSIFICATION
scoreW2Vplus_KNN = knn_classification(X_train, X_test, y_train_labels, y_test_labels)
#WORD2VEC+features - SVM CLASSIFICATION
scoreW2Vplus_SVM = svm_classification(X_train, X_test, y_train_labels, y_test_labels)

print(scoreW2Vplus_KNN, scoreW2Vplus_SVM)

In [None]:
#############
#ROUND ROBIN#
#############

######################
#CONSTRUCT TRAIN_SETS#
######################
# import pandas as pd
# import pickle

# df = pd.read_csv("small_train.tsv", sep='\t', header=None)
# y_train_labels = df[2].tolist() #sentiments
# train_tweets = df[3].tolist() #tweets

# df = pd.read_csv("small_test.tsv", sep='\t', header=None)
# test_tweets = df[3].tolist() #tweets

# pos_neg_trainset = pd.DataFrame()
# pos_neg_trainset = df.loc[(df[2] == "positive") | (df[2] == "negative")].copy()
# clean_pos_neg_corpus = clean_corpus(pos_neg_trainset[3].tolist())
# train_pos_neg_tokens = tokenize(clean_pos_neg_corpus)
# train_pos_neg_tweets = lemmatize(train_pos_neg_tokens)


# pos_neu_trainset = pd.DataFrame()
# pos_neu_trainset = df.loc[(df[2] == "positive") | (df[2] == "neutral")].copy()
# clean_pos_neu_corpus = clean_corpus(pos_neg_trainset[3].tolist())
# train_pos_neu_tokens = tokenize(clean_pos_neu_corpus)
# train_pos_neu_tweets = lemmatize(train_pos_neu_tokens)

# neg_neu_trainset = pd.DataFrame()
# neg_neu_trainset = df.loc[(df[2] == "negative") | (df[2] == "neutral")].copy()
# clean_neg_neu_corpus = clean_corpus(pos_neg_trainset[3].tolist())
# train_neg_neu_tokens = tokenize(clean_neg_neu_corpus)
# train_neg_neu_tweets = lemmatize(train_neg_neu_tokens)


#         knn = KNeighborsClassifier(n_neighbors=10)
#         knn.fit(train_neg_neu_tweets, neg_neu_trainset[2].tolist())
#         y_pred_train = knn.predict(train_neg_neu_tweets)  #predict proba
#         y_pred_test = knn.predict(test_tweets) #predict proba
#         print(metrics.accuracy_score(y_test_labels, y_pred))
#==================================================================================
# for n, value in enumerate(y_train_labels):
#     if value == "positive":
#         y_train_labels[n] = 2
        
#     elif value =="negative":
#         y_train_labels[n] = 0
#     else:
#         y_train_labels[n] =1

# df = pd.read_csv("y_test_labels.tsv", sep='\t', header=None)
# y_test_labels = df[1].tolist() #sentiments


# for n, value in enumerate(y_test_labels):
#     if value == "positive":
#         y_test_labels[n] = 2
#     elif value =="negative":
#         y_test_labels[n] = 0
#     else:
#         y_test_labels[n] =1