# Imports

In [1]:
import warnings
# Install a global "ignore" filter: every warning raised for the rest of the session is hidden.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

#### Standard Imports

In [2]:
import pandas as pd
import numpy as np

#### Import NLTK to work with language data

In [3]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

#### Import scikit-learn to do machine learning

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

#### Import gensim for word embeddings

In [5]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import FastText

# Data loading and preprocessing

#### Loading

Load the data into two dataframes: one containing the positive tweets, the other containing the negative tweets.

In [6]:
# Relative path of the folder holding the tweet text files
datafolder = "data/"

In [7]:
def load_tweets(path):
    """Read a text file holding one tweet per line into a single-column dataframe.

    The file is read line by line rather than through the pandas CSV parser:
    recent pandas versions reject ``sep='\\n'``, and the CSV parser would also
    treat double quotes inside a tweet as field quoting and turn tweets such as
    ``NA`` or ``null`` into missing values.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line, in a column named ``'Tweet'``.
    """
    with open(path, encoding='utf-8') as tweet_file:
        lines = [line.rstrip('\n') for line in tweet_file]
    # Blank lines carry no tweet; skip them as the CSV reader did by default
    return pd.DataFrame({'Tweet': [line for line in lines if line]})

# The tweets are provided in .txt files with one tweet per line
pos_tweets = load_tweets(datafolder + 'train_pos.txt')
neg_tweets = load_tweets(datafolder + 'train_neg.txt')

#### Preprocessing

Following the format expected by *crowdAI*, a tweet with positive sentiment corresponds to a prediction of `1` and a tweet with negative sentiment corresponds to a prediction of `-1`.

In [8]:
# Label every tweet with its sentiment: 1 for the positive dataframe, -1 for the negative one
for frame, label in ((pos_tweets, 1), (neg_tweets, -1)):
    frame['Prediction'] = label

We now concatenate our two dataframes to obtain a single one containing both positive and negative sentiment tweets.

In [9]:
# Stack positive and negative tweets into one dataframe. `ignore_index=True` renumbers the
# rows 0..n-1; without it every index label would appear twice (once per source dataframe),
# which makes label-based lookups and index-aligned assignments ambiguous.
all_tweets = pd.concat((pos_tweets, neg_tweets), ignore_index=True)
all_tweets.head(3)

Unnamed: 0,Tweet,Prediction
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1


We define a function that preprocesses a tweet and returns its list of tokens.

In [10]:
def preprocess(tweet):
    """Split a raw tweet into a list of tokens with NLTK's Twitter-aware tokenizer."""
    return TweetTokenizer().tokenize(tweet)

We apply the preprocessing function implemented above

In [11]:
# Tokenize every tweet and keep the token lists next to the raw text
all_tweets['Tweet Cleaned'] = all_tweets['Tweet'].apply(preprocess)

## Machine Learning

We start by separating our data into train and test subsets.

In [12]:
# We use scikit-learn's 'train_test_split', which splits arrays into random train and test subsets.
# The split is seeded so that re-running the notebook yields the same subsets and the
# scores reported below stay comparable between runs.
test_size = 0.15 # train subset: 85%, test subset: 15%
split_seed = 12 # same seed value as the one given to the MLP classifier further down
x_tr, x_te, y_tr, y_te = train_test_split(
    all_tweets['Tweet Cleaned'],
    all_tweets['Prediction'],
    test_size=test_size,
    random_state=split_seed,
)

A Doc2Vec model expects a list of `TaggedDocument` as input; FastText only needs the token list of each document, which we extract right after.

In [13]:
# Wrap each training tweet in a TaggedDocument tagged with its position in the training set
documents = []
for position, token_list in enumerate(x_tr):
    documents.append(TaggedDocument(words=token_list, tags=[position]))
documents[:2]

[TaggedDocument(words=['<user>', 'same', 'with', 'this', 'girl', '!', 'i', 'mean', '25', 'minutes', 'isnt', 'too', 'long', 'distance', 'but', 'is', 'when', 'you', "don't", 'have', 'a', 'car', '!', 'gonna', 'miss', 'you', 'next', 'year', '!'], tags=[0]),
 TaggedDocument(words=['tool', 'steel', 'a2', 'rectangular', 'bar', ',', 'oversized', ',', 'astm', 'a', '-681-07', ',', '5/32', '"', 'thick', ',', '6', '"', 'width', ',', '18', '"', 'length', '(', 'pack', 'of', '10', '<url>'], tags=[1])]

We extract the token list of each document to obtain a list of token lists (one per tweet)

In [14]:
# Keep only the words of each tagged document (one token list per tweet)
tokens = []
for tagged in documents:
    tokens.append(tagged.words)
tokens[1][:10]

['tool',
 'steel',
 'a2',
 'rectangular',
 'bar',
 ',',
 'oversized',
 ',',
 'astm',
 'a']

https://radimrehurek.com/gensim/models/fasttext.html

We use FastText, the library created by Facebook for learning word embeddings and for text classification.

In [15]:
# Number of worker threads used to train the model (faster training on multicore machines)
workers = 4
# The model keeps gensim's defaults for every other hyper-parameter. A larger configuration
# (300-dimensional vectors, 60 epochs) was sketched here earlier but never passed to the
# constructor; its `iter = 60` assignment was removed because it shadowed Python's
# built-in `iter()` for the rest of the session without being used.
model = FastText(workers=workers)

In [16]:
# Display the model object created in the previous cell
model

<gensim.models.fasttext.FastText at 0x1a325e2f98>

We now build the vocabulary from our list of tokens and then train the model.

In [17]:
# Build the model vocabulary from the tokenized training tweets
model.build_vocab(tokens) # we pass it our list of tokens

In [18]:
# Training settings are read back from the model itself
total_examples = model.corpus_count  # count of sentences
epochs = model.epochs  # number of iterations (epochs) over the corpus
model.train(
    tokens,
    epochs=epochs,
    total_examples=total_examples,
)

We define a function that computes the vector associated to each tweet.
For every tweet it applies the model defined above to each word the tweet contains and then takes the average.

In [19]:
def vectorize(tweet, model):
    """Return the average of the word vectors of a tokenized tweet.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet.
    model : embedding model
        Either an object exposing its word vectors as ``model.wv`` (as gensim
        models do), or any mapping-like object supporting ``token in model``
        and ``model[token]``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens known to the model.
        When no token is known (for instance an empty tweet), a zero vector of
        length ``vector_size`` is returned, so every tweet yields a vector of
        the same shape. If the vectors object has no ``vector_size``
        attribute, that fallback is an empty array.
    """
    # `word in model` / `model[word]` are deprecated in gensim 3.x and removed in 4.x;
    # the supported lookup goes through the keyed vectors stored in `model.wv`.
    vectors = getattr(model, 'wv', model)
    known = [vectors[token] for token in tweet if token in vectors]
    if not known:
        # np.mean([]) would give a scalar NaN, which cannot be stacked with the
        # other tweet vectors into a feature matrix.
        return np.zeros(getattr(vectors, 'vector_size', 0))
    return np.mean(known, axis=0)

We call the function above to compute the vector associated with each tweet, by applying our FastText model

In [20]:
# One averaged vector per tweet, for the training and the test subsets respectively
res_tr = [vectorize(tweet, model) for tweet in x_tr]
res_te = [vectorize(tweet, model) for tweet in x_te]

### Logistic Regression

We start with Logistic Regression

In [21]:
# Logistic regression using the 'lbfgs' solver, limited to 150 iterations
lr = LogisticRegression(
    max_iter=150,
    solver='lbfgs',
)

In [22]:
# Fit the logistic regression on the averaged training tweet vectors and their labels
lr.fit(res_tr, y_tr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=150, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
# Score the fitted model on the held-out test vectors (scikit-learn classifiers report mean accuracy)
lr.score(res_te, y_te)

0.7569912884309007

### MultiLayer Perceptrons

In [27]:
# Multilayer perceptron with two hidden layers (64 and 6 units), trained with SGD; seeded for repeatability
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 6),
    solver='sgd',
    alpha=1e-5,
    random_state=12,
)

In [28]:
# Fit the multilayer perceptron on the averaged training tweet vectors and their labels
mlp.fit(res_tr, y_tr)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(64, 6), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=12, shuffle=True, solver='sgd', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [26]:
# Score the fitted perceptron on the held-out test vectors (scikit-learn classifiers report mean accuracy)
mlp.score(res_te, y_te)

0.7984475102538897