## Imports

In [3]:
import pandas as pd
import numpy as np
import nltk
import string
import re

from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import FastText

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import  Normalizer

## First version - exploring

### Load the tweets

In [4]:
# Read one tweet per physical line with plain file I/O rather than a CSV parser.
# A CSV reader applies quoting rules (a tweet that starts with a double quote can
# swallow the following lines into one row) and converts strings such as "nan" or
# "null" into missing values; neither is wanted for free text.
with open("./data/train_pos_full.txt", encoding="utf-8") as pos_file:
    pos_lines = (line.rstrip("\r\n") for line in pos_file)
    # filter(None, ...) drops empty lines; zip() with a bounded range stops after
    # the first 600000 tweets without reading the rest of the file.
    imported_pos_tweets = pd.DataFrame(
        {"Tweet": [tweet for _, tweet in zip(range(600000), filter(None, pos_lines))]}
    )

In [5]:
# Label every tweet from the positive file with sentiment +1.
imported_pos_tweets["Sentiment"] = 1

In [6]:
# Read one tweet per physical line with plain file I/O rather than a CSV parser.
# A CSV reader applies quoting rules (a tweet that starts with a double quote can
# swallow the following lines into one row) and converts strings such as "nan" or
# "null" into missing values; neither is wanted for free text.
with open("./data/train_neg_full.txt", encoding="utf-8") as neg_file:
    neg_lines = (line.rstrip("\r\n") for line in neg_file)
    # filter(None, ...) drops empty lines; zip() with a bounded range stops after
    # the first 600000 tweets without reading the rest of the file.
    imported_neg_tweets = pd.DataFrame(
        {"Tweet": [tweet for _, tweet in zip(range(600000), filter(None, neg_lines))]}
    )
# Label every tweet from the negative file with sentiment -1.
imported_neg_tweets["Sentiment"] = -1

In [7]:
# Stack positive and negative tweets into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every label would appear twice (once per
# source frame), which makes label-based lookups such as .loc[0] ambiguous.
df_tweets = pd.concat((imported_pos_tweets, imported_neg_tweets), ignore_index=True)

In [8]:
# Preview the first rows of the combined frame.
df_tweets.head()

Unnamed: 0,Tweet,Sentiment
0,<user> i dunno justin read my mention or not ....,1
1,"because your logic is so dumb , i won't even c...",1
2,<user> just put casper in a box ! looved the...,1
3,<user> <user> thanks sir > > don't trip lil ma...,1
4,visiting my brother tmr is the bestest birthda...,1


### Define additional features beyond the words themselves

In [9]:
# English stop-word list from NLTK; add_features uses it to separate content
# words from filler words.
stop = stopwords.words('english')
def _tweet_features(tweet, stop_set):
    """Return the six numeric features of one tweet as a tuple (single pass)."""
    tokens = tweet.split()
    content_tokens = [token for token in tokens if token not in stop_set]
    if content_tokens:
        avg_length = float(np.mean([len(token) for token in content_tokens]))
    else:
        # No non-stop-word tokens: define the average length as 0 instead of NaN.
        avg_length = 0.0
    return (
        sum(1 for token in tokens if token.startswith('#')),
        len(tweet),
        len(tokens),
        tokens.count('!'),
        len(content_tokens),
        avg_length,
    )


def add_features(df_tweets, stop_words=None):
    """Add hand-crafted numeric feature columns to a tweet frame.

    The frame is modified in place and also returned, so both
    ``add_features(df)`` and ``df = add_features(df)`` work.

    Parameters
    ----------
    df_tweets : pandas.DataFrame
        Must contain a string column ``'Tweet'``.
    stop_words : iterable of str, optional
        Words to treat as stop words. Defaults to the module-level ``stop``
        list.

    Returns
    -------
    pandas.DataFrame
        ``df_tweets`` with these columns added (tokens are whitespace-separated):
        ``n_hashtags`` (tokens starting with ``#``), ``n_chars``, ``n_words``,
        ``n_exclamation`` (tokens equal to ``!``), ``n_non_stopwords`` and
        ``avg_word_length`` (mean length of non-stop-word tokens, 0 if none).
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); scanning a list for every token is not.
    stop_set = frozenset(stop_words)

    rows = [_tweet_features(tweet, stop_set) for tweet in df_tweets['Tweet']]
    column_names = (
        'n_hashtags',
        'n_chars',
        'n_words',
        'n_exclamation',
        'n_non_stopwords',
        'avg_word_length',
    )
    # Plain lists are assigned positionally, so the frame's index is irrelevant.
    for position, column_name in enumerate(column_names):
        df_tweets[column_name] = [row[position] for row in rows]
    return df_tweets

# Attach the numeric feature columns to the full tweet frame.
df_tweets = add_features(df_tweets)

### Preprocessing the tweets

In [45]:
# Created once and reused for every call instead of being rebuilt per tweet.
_TWEET_TOKENIZER = TweetTokenizer()
# Matches any character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def tokenize(tweet, strip_punctuation=False):
    """Split a tweet into tokens with NLTK's TweetTokenizer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    strip_punctuation : bool, default False
        If True, remove every non-word, non-whitespace character before
        tokenizing. The default keeps punctuation, matching what this function
        has always produced: the earlier ``tweet.replace('[^\\w\\s]', '')`` call
        was a literal (non-regex) replace and never changed the text.

    Returns
    -------
    list of str
        The tokens, in order.
    """
    if strip_punctuation:
        tweet = _PUNCTUATION_RE.sub('', tweet)
    return _TWEET_TOKENIZER.tokenize(tweet)

In [46]:
# Tokenize every tweet; the token lists are what the embedding model is trained on.
df_tweets["Tokenized Tweets"] = df_tweets['Tweet'].map(tokenize)

In [47]:
# Preview the frame again, now including the tokenized column.
df_tweets.head()

Unnamed: 0,Tweet,Sentiment,Tokenized Tweets
0,<user> i dunno justin read my mention or not ....,1,"[<user>, i, dunno, justin, read, my, mention, ..."
1,"because your logic is so dumb , i won't even c...",1,"[because, your, logic, is, so, dumb, ,, i, won..."
2,<user> just put casper in a box ! looved the...,1,"[<user>, just, put, casper, in, a, box, !, loo..."
3,<user> <user> thanks sir > > don't trip lil ma...,1,"[<user>, <user>, thanks, sir, >, >, don't, tri..."
4,visiting my brother tmr is the bestest birthda...,1,"[visiting, my, brother, tmr, is, the, bestest,..."


### Create test and train data

In [51]:
# Hold out 20% of the tweets for evaluation. A fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    df_tweets["Tokenized Tweets"],
    df_tweets["Sentiment"],
    test_size=0.2,
    random_state=42,
)

### First Model - FastText

In [52]:
# Wrap each training tweet's tokens in a TaggedDocument, tagged with its
# position in xtrain (as a string).
tagged_data = [TaggedDocument(words=tweet, tags=[str(i)]) for i,tweet in enumerate(xtrain)]

In [53]:
# Training corpus for the embedding model: the token list of every tagged document.
words = [document.words for document in tagged_data]

In [54]:
# FastText model with 300-dimensional vectors, 4 worker threads and 60 training iterations.
# NOTE(review): `size` and `iter` look like gensim 3.x argument names — confirm the pinned gensim version.
fast = FastText(size=300, workers=4, iter=60)

In [55]:
# Build the vocabulary from the training tweets only (words comes from xtrain).
fast.build_vocab(words)

In [56]:
# Train on the same corpus, using the example count and epoch count stored on the model.
fast.train(words, total_examples=fast.corpus_count, epochs=fast.epochs)

In [57]:
def tweet2vec(tweet, w2v, dim=300):
    """Embed a tokenized tweet as the mean of its known word vectors.

    Parameters
    ----------
    tweet : sequence of str
        Tokens of one tweet.
    w2v : embedding model or mapping
        Anything supporting ``word in w2v`` and ``w2v[word]``. If the object has
        a ``wv`` attribute (as gensim models do), that attribute is used for the
        lookups.
    dim : int, default 300
        Length of the zero vector returned when no token can be embedded. It
        should match the dimensionality of ``w2v``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the tokens found in ``w2v``, or ``np.zeros(dim)``
        if the tweet is empty or none of its tokens is known. Always a vector,
        never NaN, so the results can be stacked into a feature matrix.
    """
    vectors = getattr(w2v, 'wv', w2v)
    known = [vectors[word] for word in tweet if word in vectors]
    if not known:
        # Covers both the empty tweet and the all-out-of-vocabulary tweet; the
        # latter used to hit np.mean([]) and return a scalar NaN.
        return np.zeros(dim)
    return np.mean(known, axis=0)

In [58]:
# Embed every training tweet with the trained FastText model.
vectors_train = [tweet2vec(tweet,fast) for tweet in xtrain]



In [59]:
# Embed every held-out tweet with the same model.
vectors_test = [tweet2vec(tweet,fast) for tweet in xtest]



#### Results using Logistic Regression:

In [60]:
# Baseline classifier on the tweet embeddings.
clf = LogisticRegression(solver='lbfgs')

In [61]:
# Fit on the training embeddings and labels.
clf.fit(vectors_train,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [62]:
# Score the fitted model on the held-out split.
clf.score(vectors_test,ytest)

0.8005541666666667

#### Results using a Multilayer Perceptron:

In [63]:
# MLP with two hidden layers (64 and 6 units), alpha=1e-5 and a fixed random_state
# so the fit is repeatable.
clf2 = MLPClassifier(alpha = 1e-5, solver = 'lbfgs', hidden_layer_sizes = (64,6), random_state = 12)

In [64]:
# Fit on the training embeddings and labels.
clf2.fit(vectors_train,ytrain)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(64, 6), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=12, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [65]:
# Score the fitted model on the held-out split.
clf2.score(vectors_test,ytest)

0.8230416666666667

In [174]:
# Persist both fitted classifiers to files in the working directory.
# NOTE(review): `joblib` is imported from `sklearn.externals` in this notebook; newer
# scikit-learn releases no longer ship that module — confirm the pinned version.
joblib.dump(clf, 'logistic.joblib')
joblib.dump(clf2, 'mlp.joblib')

['mlp.joblib']

## Second model - Pipeline Scikit-Learn

### Create selectors for the different features

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one column from the incoming data.

    ``transform`` indexes with the bare key (``X[key]``), so for a DataFrame the
    result is one-dimensional. Intended for text columns that are passed on to
    a vectorizer.
    """

    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def transform(self, X):
        column_name = self.key
        return X[column_name]

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one column while keeping it two-dimensional.

    ``transform`` indexes with a one-element list (``X[[key]]``), so for a
    DataFrame the result is a single-column frame. Intended for numeric columns
    that are passed on to a scaler.
    """

    def __init__(self, key):
        # Name of the column to extract.
        self.key = key

    def transform(self, X):
        column_names = [self.key]
        return X[column_names]

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

### Create the pipeline

In [41]:
def _scaled_numeric_column(column):
    """Build the sub-pipeline shared by all numeric features: select, then scale."""
    return Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler()),
    ])


# TF-IDF over 1- to 3-grams of the raw tweet text, tokenized with `tokenize`.
text = Pipeline([
    ('selector', TextSelector(key='Tweet')),
    ('tfidf', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3))),
])

# One standardised sub-pipeline per numeric column.
n_chars = _scaled_numeric_column('n_chars')
n_words = _scaled_numeric_column('n_words')
n_exclamation = _scaled_numeric_column('n_exclamation')
n_non_stopwords = _scaled_numeric_column('n_non_stopwords')
avg_word_length = _scaled_numeric_column('avg_word_length')
n_hashtags = _scaled_numeric_column('n_hashtags')

# Side-by-side concatenation of the six scaled columns.
feats_number = FeatureUnion([
    ('n_chars', n_chars),
    ('n_words', n_words),
    ('n_exclamation', n_exclamation),
    ('n_non_stopword', n_non_stopwords),
    ('avg_word_length', avg_word_length),
    ('n_hashtags', n_hashtags),
])

# Normalise the numeric block, then expand it with polynomial terms up to degree 5.
feats_poly = Pipeline([
    ('feats_number', feats_number),
    ('norm', Normalizer()),
    ('poly', PolynomialFeatures(degree=5)),
])

# Text features plus the polynomial numeric features.
feats = FeatureUnion([
    ('text', text),
    ('feats_number', feats_poly),
])

# NOTE(review): the 'feats' step below is fed `text`, not the combined `feats`
# union, so the numeric features never reach the classifier — confirm whether
# that is deliberate.
pipeline = Pipeline([
    ('feats', text),
    ('classifier', LinearSVC()),
])

In [42]:
# Hold out 20% of the rows for evaluation; the selectors inside the pipeline pick
# the columns they need from the full frame. A fixed random_state makes the split
# and the reported score reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    df_tweets,
    df_tweets["Sentiment"],
    test_size=0.2,
    random_state=42,
)

In [43]:
# Fit the pipeline on the training split, then score it on the held-out split.
pipeline.fit(x_train, y_train)
pipeline.score(x_test, y_test)

0.8707125

**Full training**

In [45]:
# Use every labelled tweet as training data for the final fit.
x_train, y_train = df_tweets, df_tweets['Sentiment']
# NOTE(review): this path ("test_data.txt") differs from the "./data/test_data.txt"
# used further down in this notebook — confirm which location is intended.
x_test = pd.read_table("test_data.txt", names =['Tweet'], sep = "\t", header = None)
# Compute the same numeric feature columns for the unlabelled tweets.
x_test = add_features(x_test)

# Clean classification

## Import clean

In [1]:
import preprocessing, word2vec, pipeline, helpers, cross_validation, plot
from sklearn.model_selection import GridSearchCV

## Load clean

In [2]:
# Load the labelled tweets through the project's preprocessing module, which also
# returns two sizes (presumably the number of positive and negative tweets — verify).
tweets, size_pos, size_neg = preprocessing.load_train_tweets("./data/train_pos.txt", "./data/train_neg.txt")

# presumably builds the label vector from the two class sizes — verify in preprocessing.predictions
pred = preprocessing.predictions(size_pos, size_neg)
# Unlabelled test tweets together with their ids.
x_test, ids_test = preprocessing.load_test_tweets('./data/test_data.txt')

## Create classifier

In [None]:
# Build the classifier; here `pipeline` is the project module imported in this section.
clf = pipeline.pipeline_model()

## Perform cross-validation

In [None]:
# Run the project's repeated cross-validation helper and plot its scores.
# NOTE(review): n=5, k=5 presumably means 5 repetitions of 5-fold CV — confirm in cross_validation.
cross_val_scores, stds = cross_validation.n_kfold_cross_validation(tweets, pred, clf, n=5, k=5)
plot.plot_cross_validation(cross_val_scores, stds)

# Print the helper's summary of the collected scores.
cross_validation.display_info(cross_val_scores)

## Finding the best parameters

In [None]:
# Candidate values for parameter C of the pipeline step named 'SVC'.
# NOTE(review): the step name is defined inside pipeline.pipeline_model, which is not
# visible here — confirm it is really called 'SVC'.
param_grid = {'SVC__C': [0.1, 0.6, 1.1, 1.6, 2.1]}
# 5-fold grid search with n_jobs=-1; training scores are kept alongside the validation scores.
best = GridSearchCV(clf, param_grid, cv=5, return_train_score=True, n_jobs=-1)
best.fit(tweets, pred)
# Show the best parameter combination found.
print(best.best_params_)