In [18]:
#######################################################
##### STRIP TWEET #####################################
#######################################################

import pandas as pd
import re
import string

#READ TRAINING SET
df = pd.read_csv("train2017.tsv", sep='\t', header=None)
train_corpus = df[3].tolist()

#READ TEST SET
df = pd.read_csv("test2017.tsv", sep='\t', header=None)
test_corpus = df[3].tolist()

translate_table = dict((ord(char), None) for char in string.punctuation)   

def clean_corpus(corpus):
    clean_corpus = []
    for tweet in corpus:
        tweet = re.sub(r"http\S+", "", tweet) #remove link
        tweet = ' '.join([word for word in tweet.split(' ')  if not word.startswith('@')])
        tweet = tweet.translate(translate_table) #remove symbols 
        tweet = tweet.lower()
        clean_corpus.append(tweet)
    return clean_corpus
    
clean_train_corpus = clean_corpus(train_corpus)
clean_test_corpus = clean_corpus(test_corpus)

In [19]:
#######################################################
##### TOKENIZATION ####################################
#######################################################

from nltk import word_tokenize

def tokenize(clean_corpus):
    tokens = []
    for tweet in clean_corpus:
        token = []
        token = word_tokenize(tweet)
        tokens.append(token)
    return tokens

train_tokens = tokenize(clean_train_corpus)
test_tokens = tokenize(clean_test_corpus)

In [37]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import  WordNetLemmatizer
# Lemmatize with POS Tag

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

def lemmatize(tokens):
    tweets = []
    for token_list in tokens:
        lemmatized = []
        for word in token_list:
            lemmatized.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
        tweets.append(lemmatized)
    return tweets
    
train_tweets = lemmatize(train_tokens)
test_tweets = lemmatize(test_tokens)

KeyboardInterrupt: 

In [21]:
final_train_corpus = []
for tweet in train_tweets:
    final_train_corpus.append(" ".join(str(word) for word in tweet))

final_test_corpus = []
for tweet in test_tweets:
    final_test_corpus.append(" ".join(str(word) for word in tweet))

In [22]:
#BAG-OF-WORDS VECTORIZATION
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()

# X_BOW_train = vectorizer.fit_transform(final_train_corpus)
# with open('X_BOW_train.pickle','wb') as handle: 
#     pickle.dump(X_BOW_train,handle,protocol = pickle.HIGHEST_PROTOCOL)

# X_BOW_test = vectorizer.transform(final_test_corpus)
# with open('X_BOW_test.pickle','wb') as handle: 
#     pickle.dump(X_BOW_test,handle,protocol = pickle.HIGHEST_PROTOCOL)

In [23]:
#TF-IDF VECTORIZATION
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer()

# X_TFIDF_train = vectorizer.fit_transform(final_train_corpus)
# with open('X_TFIDF_train.pickle','wb') as handle: 
#     pickle.dump(X_TFIDF_train,handle,protocol = pickle.HIGHEST_PROTOCOL)

# X_TFIDF_test = vectorizer.transform(final_test_corpus)
# with open('X_TFIDF_test.pickle','wb') as handle: 
#     pickle.dump(X_TFIDF_test,handle,protocol = pickle.HIGHEST_PROTOCOL)

In [42]:
#we have 3 options: word2vec, word2vec in sklearn, doc2vec
from gensim.test.utils import common_texts
import numpy

#IMPORT WORD2VEC
from gensim.models import Word2Vec
#nested_corpus = [final_train_corpus[i:i+1] for i in range(0, len(final_train_corpus), 1)]
model = Word2Vec(train_tweets, size=200, window=5, min_count=1, workers=4) #size of vector is 200
model.train(train_tweets, total_examples=model.corpus_count,epochs=model.epochs)  # train word vectors
#print(model['gas'])

X_W2V_embeddings_train=[]

for tweet in train_tweets:
    a = numpy.array(model[tweet[0]])
    for word in tweet[1:]:
        #print(word)
        a = a + numpy.array(model[word])
    a = a / len(tweet)
    X_W2V_embeddings_train.append(a)
#print(X_embeddings_train[1])
with open('X_W2V_embeddings_train.pickle','wb') as handle: 
    pickle.dump(X_W2V_embeddings_train,handle,protocol = pickle.HIGHEST_PROTOCOL)

model = Word2Vec(test_tweets, size=200, window=5, min_count=1, workers=4) #size of vector is 200
model.train(test_tweets, total_examples=model.corpus_count,epochs=model.epochs)  # train word vectors

X_W2V_embeddings_test=[]

for tweet in test_tweets:
    a = numpy.array(model[tweet[0]])
    for word in tweet[1:]:
        #print(word)
        a = a + numpy.array(model[word])
    a = a / len(tweet)
    X_W2V_embeddings_test.append(a)

with open('X_W2V_embeddings_test.pickle','wb') as handle: 
    pickle.dump(X_W2V_embeddings_test,handle,protocol = pickle.HIGHEST_PROTOCOL)


  from ipykernel import kernelapp as app


In [8]:
#Doc2vec is equal with word2vec but is more appropriate for phrases (vectorize phrases instead of words). I.e 
#1.Manos leaves the office every day at 18:00 to catch his train
#2. This season is called Fall, because leaves fall from the trees.
#In this way we can capture the difference between the same word used in a different context. For example we now have a
#different representation of the word “leaves” in the above two sentences

#IMPORTANT: parameteres in my case is random, we have to pay attention to select the right ones

from gensim.models import Word2Vec
nested_corpus = [final_train_corpus[i:i+1] for i in range(0, len(final_train_corpus), 1)]
model = Word2Vec(train_tokens, size=200, window=5, min_count=1, workers=4) #size of vector is 200
model.train(train_tokens, total_examples=model.corpus_count,epochs=model.epochs)  # train word vectors

#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

def word_embeddings(final_corpus):
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(final_corpus)]
    print(tagged_data)
    model = Doc2Vec(size=200,
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
    model.build_vocab(tagged_data)
    max_epoch = 20
    for epoch in range(max_epoch):
        model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    X_embeddings_array = model.docvecs
    return X_embeddings_array

#convert array to list

X_embeddings_array_train = word_embeddings(final_train_corpus)
X_embeddings_array_test = word_embeddings(final_test_corpus)

X_D2V_embeddings_train=[]
for i in range(len(X_embeddings_array_train)):
    X_D2V_embeddings_train.append(X_embeddings_array_train[i].tolist())
    
with open('X_D2V_embeddings_train.pickle','wb') as handle: 
    pickle.dump(X_D2V_embeddings_train,handle,protocol = pickle.HIGHEST_PROTOCOL)
    
X_D2V_embeddings_test=[]
for i in range(len(X_embeddings_array_test)):
    X_D2V_embeddings_test.append(X_embeddings_array_test[i].tolist())
    
with open('X_D2V_embeddings_test.pickle','wb') as handle: 
    pickle.dump(X_D2V_embeddings_test,handle,protocol = pickle.HIGHEST_PROTOCOL)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[TaggedDocument(words=['arianagrande', 'ari', 'by', 'ariana', 'grande', '80', 'full', 'singer', 'actress'], tags=['0']), TaggedDocument(words=['ariana', 'grande', 'kiis', 'fm', 'yours', 'truly', 'cd', 'listen', 'party', 'in', 'burbank', 'arianagrande'], tags=['1']), TaggedDocument(words=['ariana', 'grande', 'white', 'house', 'easter', 'egg', 'roll', 'in', 'washington', 'arianagrande'], tags=['2']), TaggedDocument(words=['cd', 'music', 'ariana', 'grande', 'sweet', 'like', 'candy', '34', 'oz', '100', 'ml', 'seal', 'in', 'box', '100', 'authenic', 'new'], tags=['3']), TaggedDocument(words=['side', 'to', 'side', 'sidetoside', 'arianagrande', 'musically', 'comunidadgay', 'lgbt', 'lotb'], tags=['4']), TaggedDocument(words=['hairspray', 'live', 'preview', 'at', 'the', 'macys', 'thanksgiving', 'day', 'parade', 'arianagrande', 'televisionnbc'], tags=['5']), TaggedDocument(words=['lindsaylohan', 'be', 'feel', 'thankful', 'after', 'blasting', 'arianagrande', 'for', 'wear', 'toomuch'], tags=['6']),

In [None]:
#Add characteristics to embeddings
import pandas as pd
import csv
lexica =[pd.read_csv("lexica/affin/affin.txt", sep='\t', header=None),
              pd.read_csv("lexica/emotweet/valence_tweet.txt", sep='\t', header=None),
              pd.read_csv("lexica/generic/generic.txt", sep='\t', engine="python" ,quoting=csv.QUOTE_NONE,header=None),
              pd.read_csv("lexica/nrc/val.txt", sep='\t', engine="python",quoting=csv.QUOTE_NONE, header=None),
              pd.read_csv("lexica/nrctag/val.txt", sep='\t', header=None)]

def add_characteristics(tweets):
    characteristics = [[] for i in range(len(tweets))]
    for tweet in range(len(tweets)):   #for each tweet
        characteristics[tweet].append(len(tweets[tweet]))  #length of each tweet               
        entire_tweet = tweets[tweet]
        first_half= entire_tweet[:len(entire_tweet)//2]
        second_half= entire_tweet[len(entire_tweet)//2:]
        for df in lexica:         #for each lexicon
            if not df[df[0].isin(tweets[tweet])].empty: #search for tweets' tokens in lexicon
                max_valence = df[df[0].isin(tweets[tweet])][1].max() #get average sentiment for this tweet
                min_valence = df[df[0].isin(tweets[tweet])][1].min()
                average_tweet_sentiment = df[df[0].isin(tweets[tweet])][1].mean() #get average sentiment for this tweet
                characteristics[tweet].append(max_valence)
                characteristics[tweet].append(min_valence)
                characteristics[tweet].append(average_tweet_sentiment)
            else:  # add two values to keep all the vectors the same size
                characteristics[tweet].append(0)   #if is zero should we add mean or it will affect the vector
                characteristics[tweet].append(0)   #if is zero should we add mean or it will affect the vector
                characteristics[tweet].append(0)   #if is zero should we add mean or it will affect the vector
                
            if not df[df[0].isin(first_half)].empty: #search for tweets' tokens in lexicon
                average_tweet_sentiment = df[df[0].isin(first_half)][1].mean() #get average sentiment for this tweet
                characteristics[tweet].append(average_tweet_sentiment)
            else:  # add two values to keep all the vectors the same size
                characteristics[tweet].append(0)   #if is zero should we add mean or it will affect the vector
            if not df[df[0].isin(second_half)].empty: #search for tweets' tokens in lexicon
                average_tweet_sentiment = df[df[0].isin(second_half)][1].mean() #get average sentiment for this tweet
                characteristics[tweet].append(average_tweet_sentiment)
            else:  # add two values to keep all the vectors the same size
                characteristics[tweet].append(0)   #if is zero should we add mean or it will affect the vector
    return characteristics



characteristics_train = add_characteristics(train_tweets)
characteristics_test = add_characteristics(test_tweets) 

with open('X_W2V_embeddings_train.pickle','rb') as handle: 
    X_W2V_embeddings_train = pickle.load(handle)
    X_W2Vplus_embeddings_train = X_W2V_embeddings_train + characteristics_train
    with open('X_W2Vplus_embeddings_train.pickle','wb') as handle: 
        pickle.dump(X_W2Vplus_embeddings_train,handle,protocol = pickle.HIGHEST_PROTOCOL)
    
with open('X_W2V_embeddings_test.pickle','rb') as handle: 
    X_W2V_embeddings_test = pickle.load(handle)
    X_W2Vplus_embeddings_test = X_W2V_embeddings_test + characteristics_test
    with open('X_W2Vplus_embeddings_test.pickle','wb') as handle: 
        pickle.dump(X_W2Vplus_embeddings_test,handle,protocol = pickle.HIGHEST_PROTOCOL)
        
with open('X_D2V_embeddings_train.pickle','rb') as handle: 
    X_D2V_embeddings_train = pickle.load(handle)
    X_D2Vplus_embeddings_train = X_D2V_embeddings_train + characteristics_train
    with open('X_D2Vplus_embeddings_train.pickle','wb') as handle: 
        pickle.dump(X_D2Vplus_embeddings_train,handle,protocol = pickle.HIGHEST_PROTOCOL)

with open('X_D2V_embeddings_test.pickle','rb') as handle: 
    X_D2V_embeddings_test = pickle.load(handle)
    X_D2Vplus_embeddings_test = X_D2V_embeddings_test + characteristics_test
    with open('X_D2Vplus_embeddings_test.pickle','wb') as handle: 
        pickle.dump(X_D2Vplus_embeddings_test,handle,protocol = pickle.HIGHEST_PROTOCOL)

# for i in range(len(X_embeddings_train)):
#     print(X_embeddings_train[i])


        

        


In [3]:
####################
#CONSTRUCT Y_LABELS#
####################
import pandas as pd
import pickle

df = pd.read_csv("train2017.tsv", sep='\t', header=None)
y_train_labels = df[2].tolist() #sentiments


for n, value in enumerate(y_train_labels):
    if value == "positive":
        y_train_labels[n] = 2
    elif value =="negative":
        y_train_labels[n] = 0
    else:
        y_train_labels[n] =1

df = pd.read_csv("y_test_labels.tsv", sep='\t', header=None)
y_test_labels = df[1].tolist() #sentiments


for n, value in enumerate(y_test_labels):
    if value == "positive":
        y_test_labels[n] = 2
    elif value =="negative":
        y_test_labels[n] = 0
    else:
        y_test_labels[n] =1


In [4]:
#LOAD PICKLE FILES
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import pickle
with open('X_BOW_train.pickle','rb') as handle: 
    X_BOW_train = pickle.load(handle)
    with open('X_BOW_test.pickle','rb') as handle: 
        X_BOW_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(X_BOW_train, y_train_labels)
        y_pred = knn.predict(X_BOW_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.45791273200911753


In [8]:
with open('X_TFIDF_train.pickle','rb') as handle: 
    X_TFIDF_train = pickle.load(handle)
    with open('X_TFIDF_test.pickle','rb') as handle: 
        X_TFIDF_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(X_TFIDF_train, y_train_labels)
        y_pred = knn.predict(X_TFIDF_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.42461738847281016


In [13]:
with open('X_D2V_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_D2V_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=10)
        knn.fit(X_embeddings_train, y_train_labels)
        y_pred = knn.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.4725659394334093


In [None]:
with open('X_D2Vplus_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_D2Vplus_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=10)
        knn.fit(X_embeddings_train, y_train_labels)
        y_pred = knn.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

In [44]:
with open('X_W2V_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_W2V_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=10)
        knn.fit(X_embeddings_train, y_train_labels)
        y_pred = knn.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.4833116248778899


In [None]:
with open('X_W2Vplus_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_W2Vplus_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        knn = KNeighborsClassifier(n_neighbors=10)
        knn.fit(X_embeddings_train, y_train_labels)
        y_pred = knn.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

In [16]:
from sklearn import svm
from sklearn import metrics

from sklearn.metrics import f1_score

with open('X_BOW_train.pickle','rb') as handle: 
    X_BOW_train = pickle.load(handle)
    with open('X_BOW_test.pickle','rb') as handle: 
        X_BOW_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_BOW_train, y_train_labels)
        y_pred = clf.predict(X_BOW_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))
        print(f1_score(y_test_labels, y_pred, average=None))


0.5060240963855421
[0.00551793 0.66126744 0.33396999]


In [12]:
from sklearn import svm
from sklearn import metrics

with open('X_TFIDF_train.pickle','rb') as handle: 
    X_TFIDF_train = pickle.load(handle)
    with open('X_TFIDF_test.pickle','rb') as handle: 
        X_TFIDF_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_TFIDF_train, y_train_labels)
        y_pred = clf.predict(X_TFIDF_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.48762618039726474


In [14]:
from sklearn import svm
from sklearn import metrics

with open('X_D2V_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_D2V_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_embeddings_train, y_train_labels)
        y_pred = clf.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.48811462064474115


In [None]:
from sklearn import svm
from sklearn import metrics

with open('X_D2Vplus_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_D2Vplus_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_embeddings_train, y_train_labels)
        y_pred = clf.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

In [43]:
from sklearn import svm
from sklearn import metrics

with open('X_W2V_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_W2V_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_embeddings_train, y_train_labels)
        y_pred = clf.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

0.4833116248778899


In [None]:
from sklearn import svm
from sklearn import metrics

with open('X_W2Vplus_embeddings_train.pickle','rb') as handle: 
    X_embeddings_train = pickle.load(handle)
    with open('X_W2Vplus_embeddings_test.pickle','rb') as handle: 
        X_embeddings_test = pickle.load(handle)
        clf = svm.SVC(gamma='scale')
        clf.fit(X_embeddings_train, y_train_labels)
        y_pred = clf.predict(X_embeddings_test)
        print(metrics.accuracy_score(y_test_labels, y_pred))

In [5]:
#################
#KNN CLASSIFIER##
#################

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn import metrics

# print(len(X_BOW_train))
# print(len(y_labels))
# knn = KNeighborsClassifier(n_neighbors=1)
# knn.fit(X_BOW_train , y_labels)
# y_pred = knn.predict(X_BOW_test)
# print(metrics.accuracy_score(y_labels, y_pred))

# knn.fit(X_TFIDF_train , y_labels)
# y_pred = knn.predict(X_TFIDF_test)
# print(metrics.accuracy_score(y_labels, y_pred))

# knn.fit(X_embeddings_train , y_labels)
# y_pred = knn.predict(X_embeddings_test)
# print(metrics.accuracy_score(y_labels, y_pred))


In [6]:
#################
#SVM CLASSIFIER##
#################

# from sklearn import svm
# from sklearn import metrics

# print(len(X_BOW_train.toarray()))
# print(len(y_labels))
# clf = svm.SVC(gamma='scale')
# clf.fit(X_BOW_train, y_labels)
# y_pred = clf.predict(X_BOW_test.toarray())
# print(metrics.accuracy_score(y_labels, y_pred))

# clf = svm.SVC(gamma='scale')
# clf.fit(X_TFIDF_train, y_labels)
# y_pred = clf.predict(X_TFIDF_test.toarray())
# print(metrics.accuracy_score(y_labels, y_pred))

# clf = svm.SVC(gamma='scale')
# clf.fit(X_embeddings_train, y_labels)
# y_pred = clf.predict(X_embeddings_test)
# print(metrics.accuracy_score(y_labels, y_pred))

In [7]:
#round robin 
