In [208]:
import csv
import string

import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [185]:
# Load the positive tweets into a single 'Tweet' column (presumably one tweet
# per line -- confirm against the data file).
# quoting=csv.QUOTE_NONE: treat '"' as ordinary text; with the default quote
#   handling a tweet that starts with '"' opens a quoted field that can absorb
#   the following lines and silently merge several tweets into one row.
# na_filter=False: keep tweets such as "NA" or "null" as strings instead of NaN,
#   which the tokenizer could not process.
imported_pos_tweets = pd.read_csv(
    "train_pos_full.txt",
    names=['Tweet'],
    delimiter="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
    na_filter=False,
)

In [186]:
# Label every positive tweet with sentiment +1.
imported_pos_tweets = imported_pos_tweets.assign(Sentiment=1)

In [187]:
# Load the negative tweets into a single 'Tweet' column (presumably one tweet
# per line -- confirm against the data file).
# quoting=csv.QUOTE_NONE: treat '"' as ordinary text; with the default quote
#   handling a tweet that starts with '"' opens a quoted field that can absorb
#   the following lines and silently merge several tweets into one row.
# na_filter=False: keep tweets such as "NA" or "null" as strings instead of NaN,
#   which the tokenizer could not process.
imported_neg_tweets = pd.read_csv(
    "train_neg_full.txt",
    names=['Tweet'],
    delimiter="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
    na_filter=False,
)
# Label every negative tweet with sentiment -1.
imported_neg_tweets["Sentiment"] = -1

In [188]:
# Stack both classes into one frame. ignore_index=True gives every row a fresh,
# unique label; without it each input keeps its own labels, so the same label
# appears twice and label-based selection becomes ambiguous.
df_tweets = pd.concat((imported_pos_tweets, imported_neg_tweets), ignore_index=True)

In [189]:
# Shared NLTK helpers, created once and reused for every tweet.
stemmer = PorterStemmer()
tokenizer = TweetTokenizer()


def tokenize(tweet):
    """Split a raw tweet into tokens and return the Porter stem of each one.

    Stop-word removal is intentionally not applied here.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list
        The stemmed tokens, in their original order.
    """
    return [stemmer.stem(raw_token) for raw_token in tokenizer.tokenize(tweet)]

In [192]:
# Run the tokenizer over every tweet and store the token lists in a new column.
tokenized_tweets = df_tweets["Tweet"].map(tokenize)
df_tweets["Tokenized Tweets"] = tokenized_tweets

In [193]:
# Hold out 20% of the tweets for evaluation. The fixed random_state makes the
# train/test partition identical on every run, so a Restart & Run All evaluates
# on the same held-out tweets instead of a fresh random sample.
xtrain, xtest, ytrain, ytest = train_test_split(
    df_tweets["Tokenized Tweets"],
    df_tweets["Sentiment"],
    test_size=0.2,
    random_state=42,
)

In [194]:
# Wrap each training tweet in a TaggedDocument whose single tag is the tweet's
# position within xtrain, rendered as a string.
tagged_data = [
    TaggedDocument(words=token_list, tags=[str(position)])
    for position, token_list in enumerate(xtrain)
]

In [195]:
# Token lists only (tags dropped): this is the corpus handed to Word2Vec.
words = [document.words for document in tagged_data]

In [196]:
# Untrained embedding model; vocabulary and training follow in later cells.
# The size of 80 must agree with the vector length assumed by tweet2vec.
word2vec = Word2Vec(
    size=80,
    iter=20,
    workers=4,
)

In [197]:
# Build the model vocabulary from the training-tweet token lists. This must run
# before train(), which reads word2vec.corpus_count.
word2vec.build_vocab(words)

In [198]:
# Train the embeddings on the training tweets, using the example count and the
# number of passes already stored on the model.
word2vec.train(
    words,
    total_examples=word2vec.corpus_count,
    epochs=word2vec.iter,
)

  if __name__ == '__main__':


(461317361, 628442260)

In [199]:
def tweet2vec(tweet,w2v):
    """Embed a tokenized tweet as the sum of its word vectors.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet.
    w2v : trained Word2Vec model, or any mapping from token to vector
        If the object has a ``wv`` attribute, lookups go through it (this
        avoids indexing the model object itself, which gensim deprecated);
        otherwise ``w2v`` is indexed directly.

    Returns
    -------
    numpy.ndarray
        1-D float vector. Its length is the embedding's ``vector_size`` when
        that attribute is available, else 80. Tokens missing from the
        vocabulary contribute nothing, so an empty or fully out-of-vocabulary
        tweet yields the zero vector.
    """
    lookup = getattr(w2v, "wv", w2v)
    # Take the dimensionality from the embedding rather than hard-coding it, so
    # the function keeps working if the model is retrained with another size.
    dim = (
        getattr(w2v, "vector_size", None)
        or getattr(lookup, "vector_size", None)
        or 80
    )
    vector = np.zeros(dim)
    for word in tweet:
        try:
            vector += np.asarray(lookup[word]).reshape(dim)
        except KeyError:
            # Token not in the vocabulary: skip it.
            continue
    return vector

In [200]:
# One summed-embedding feature vector per training tweet.
vectors_train = [
    tweet2vec(token_list, word2vec)
    for token_list in xtrain
]



In [201]:
# One summed-embedding feature vector per held-out tweet, using the same
# embedding model as the training features.
vectors_test = [
    tweet2vec(token_list, word2vec)
    for token_list in xtest
]



In [202]:
# Baseline classifier: logistic regression fitted with the L-BFGS solver.
clf = LogisticRegression(
    solver="lbfgs",
)

In [203]:
# Fit the baseline on the training tweet vectors and their sentiment labels.
clf.fit(X=vectors_train, y=ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [204]:
# Score the baseline on the held-out tweets.
clf.score(X=vectors_test, y=ytest)

0.77625594923321

In [205]:
# Second model: a small multi-layer perceptron with hidden layers of 32 and 4
# units, a fixed seed, and the L-BFGS solver.
clf2 = MLPClassifier(
    hidden_layer_sizes=(32, 4),
    solver="lbfgs",
    alpha=1e-5,
    random_state=1,
)

In [206]:
# Fit the MLP on the same training features and labels as the baseline.
clf2.fit(X=vectors_train, y=ytrain)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(32, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [207]:
# Score the MLP on the held-out tweets.
clf2.score(X=vectors_test, y=ytest)

0.8017512101859008

In [209]:
# Persist both fitted classifiers to the working directory.
# NOTE(review): the Word2Vec model is not saved in this cell; scoring new tweets
# with these files needs the same embeddings -- confirm it is stored elsewhere.
logistic_path = 'logistic.joblib'
mlp_path = 'mlp.joblib'
joblib.dump(clf, logistic_path)
joblib.dump(clf2, mlp_path)

['mlp.joblib']