In [1]:
%load_ext autoreload
%autoreload 2
from helpers import export_predictions
from feature_repr import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from helpers import build_poly
from sklearn.preprocessing import StandardScaler
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.neural_network import MLPClassifier



In [7]:
def generate_data():
    """Assemble the training matrix, training labels and test features.

    Loads the embeddings, the tweets and the vocabulary, builds a
    ``feature_representation_v2`` representation for the positive, negative
    and test tweets, then joins each representation with its lexicon features
    through ``concatenate_features``.  A progress message is printed after
    every stage.  All loaders and feature helpers are defined outside this
    notebook.

    Returns:
        tuple: ``(X, Y, test_tweets_features)``.  ``X`` stacks the positive
        feature rows above the negative ones, ``Y`` holds ``1`` for every
        positive row followed by ``-1`` for every negative row, and
        ``test_tweets_features`` is the concatenated feature set of the test
        tweets.
    """
    embeddings = load_embeddings()
    print("Embeddings loaded")
    # NOTE(review): the meaning of the `False` flag is defined by load_tweets — confirm.
    positive_tweets, negative_tweets, test_tweets = load_tweets(False)
    print("Tweets loaded")
    vocab_dict = load_vocab()
    print("Vocabulary loaded")

    # Same representation for all three tweet sets, built in the original order.
    pos_repr, neg_repr, test_repr = [
        feature_representation_v2(embeddings, tweets, vocab_dict)
        for tweets in (positive_tweets, negative_tweets, test_tweets)
    ]
    print("First feature representation achieved")

    pos_lexicon, neg_lexicon, test_lexicon = load_lexicon_features()
    print("Lexicon features loaded")

    pos_tweets_features = concatenate_features(pos_repr, pos_lexicon)
    neg_tweets_features = concatenate_features(neg_repr, neg_lexicon)
    test_tweets_features = concatenate_features(test_repr, test_lexicon)

    n_pos = pos_tweets_features.shape[0]
    n_neg = neg_tweets_features.shape[0]
    X = np.vstack((pos_tweets_features, neg_tweets_features))
    # Labels follow the row order of X: positives first, then negatives.
    Y = np.hstack((np.ones(n_pos), -np.ones(n_neg)))

    return X, Y, test_tweets_features

In [8]:
# Build the embedding + lexicon feature matrix, its +1/-1 labels and the test-set features.
lucas_X, Y, test_X = generate_data ()

Embeddings loaded
Tweets loaded
Vocabulary loaded
First feature representation achieved
Lexicon features loaded


In [2]:
def generateCharList(paths=('train_pos.txt', 'train_neg.txt'), encoding='utf8'):
    """Count how often each character occurs across the training tweet files.

    Every character of every line is counted, including the trailing newline
    of each line.  Keys appear in the returned dict in first-seen order,
    reading the files in the order given.

    Args:
        paths: Files to scan, in order.  Defaults to the positive then the
            negative training file, which is what the original code read.
        encoding: Text encoding used to open each file.  Fixed to UTF-8 by
            default so the counts do not depend on the platform's locale and
            agree with how generateVectors reads the same files.

    Returns:
        dict: Maps each character to its total number of occurrences.
    """
    chars = {}

    for path in paths:
        # `with` closes the file even if reading or decoding fails part-way.
        with open(path, encoding=encoding) as fp:
            for line in fp:
                for char in line:
                    chars[char] = chars.get(char, 0) + 1

    return chars

In [3]:
# Character frequencies over both training files.
characters = generateCharList()

# Keep characters seen at least 5 times, except any seen exactly 200000 times.
# NOTE(review): 200000 equals the row count reported for the stacked feature
# matrices, so this presumably drops a once-per-tweet character such as the
# line terminator — confirm, and consider naming the constant.
characters = {
    char: count
    for char, count in characters.items()
    if count >= 5 and count != 200000
}

In [4]:
# Give every retained character its own column index (0, 1, 2, ...),
# following the iteration order of `characters`.
chars = {char: index for index, char in enumerate(characters)}

In [10]:
# Sentiment analyzer from nltk.sentiment.
# NOTE(review): no executable statement in the visible cells calls `analyzer`; confirm it is still needed.
analyzer = SentimentIntensityAnalyzer ()

def tweetToVector(tweet, chars):
    """Turn a tweet into a character-count feature vector.

    Args:
        tweet: The tweet text; it is iterated character by character.
        chars: Mapping from character to column index.  Indices are expected
            to lie in ``range(len(chars))``.

    Returns:
        list: ``len(chars) + 2`` integers.  Entry ``chars[c]`` is the number
        of times character ``c`` occurs in ``tweet``; characters missing from
        ``chars`` are ignored.  The final two entries are always 0: they were
        meant for the word count and the mean word length, but that code (and
        the sentiment polarity features) was disabled, so the slots are only
        kept to preserve the vector width.
    """
    # Two extra trailing slots are reserved (see docstring); they stay 0.
    vector = [0] * (len(chars) + 2)

    for char in tweet:
        if char in chars:
            vector[chars[char]] += 1

    return vector

In [11]:
def generateVectors(chars):
    """Vectorise every training tweet and build the matching label array.

    Reads ``train_pos.txt`` and then ``train_neg.txt`` (UTF-8), one tweet per
    line, and converts each line with ``tweetToVector``.

    Args:
        chars: Character-to-column-index mapping forwarded to
            ``tweetToVector``.

    Returns:
        tuple: ``(Y, X)`` — note the order, labels first.  ``Y`` is an array
        with ``1`` for each line of the positive file followed by ``-1`` for
        each line of the negative file; ``X`` is the array of the
        corresponding vectors in the same row order.
    """
    X = []
    Y = []

    # Positive file first, then negative, so rows keep the original order.
    for path, label in (('train_pos.txt', 1), ('train_neg.txt', -1)):
        # `with` closes the file even if vectorising a line raises.
        with open(path, encoding='utf8') as fp:
            for line in fp:
                X.append(tweetToVector(line, chars))
                Y.append(label)

    return np.array(Y), np.array(X)

In [12]:
# Character-count features and their +1/-1 labels.
# Later cells read `moi_Y`, which no visible cell assigned, so bind the labels
# under both names to keep the notebook runnable on a fresh kernel.
moi_Y, moi_X = generateVectors(chars)
Y = moi_Y

In [14]:
# Standardise the character-count features; the scaler is fitted on moi_X itself.
scaler = StandardScaler()
moi_X_standard = scaler.fit_transform(moi_X)

# Preview the raw (unscaled) vectors of the first 20 rows.
moi_X[:20]



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
# Join the embedding/lexicon features and the character-count features column-wise.
X = np.hstack ((lucas_X, moi_X))
# NOTE(review): relies on `moi_Y` already being bound in the kernel; confirm an earlier cell defines it.
Y = moi_Y
X.shape

(200000, 101)

In [14]:
# Standardise the combined feature matrix; the scaler is fitted on all rows of X.
scaler = StandardScaler()
X_standard = scaler.fit_transform(X)

In [None]:
# Multi-layer perceptron trained on the standardised combined features.
# random_state pins weight initialisation and batch sampling so reruns give the same model.
# NOTE(review): batch_size=1 updates the weights after every single sample; confirm this is intended.
# NOTE(review): despite its name, `clf_cnn` holds an MLPClassifier, not a convolutional network.
clf_cnn = MLPClassifier(batch_size=1, random_state=0)
clf_cnn.fit(X_standard, Y)

In [47]:
# 5-fold cross-validation of the MLP configuration on the standardised features.
# NOTE(review): X_standard was scaled with a scaler fitted on all rows, so every
# fold's held-out rows contributed to the scaling statistics.
cross_val_score (clf_cnn, X_standard, Y, cv=5)

array([ 0.7729  ,  0.770875,  0.7638  ,  0.76255 ,  0.75995 ])

In [29]:
def trainScore(Y, X, classifier):
    """Return the fraction of rows of ``X`` that ``classifier`` labels as in ``Y``.

    Args:
        Y: Expected labels, one per row of ``X`` (array-like).
        X: 2-D feature array; ``X.shape[0]`` is the number of samples.
        classifier: Object exposing ``predict(X)`` that returns one label per
            row.

    Returns:
        float: Share of rows whose predicted label equals the expected one.

    Raises:
        ValueError: If ``X`` has no rows, or ``Y`` does not hold exactly one
            label per row of ``X``.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("trainScore needs at least one sample")

    labels = np.asarray(Y).ravel()
    if labels.shape[0] != n_samples:
        raise ValueError("Y must hold exactly one label per row of X")

    # One batched predict call instead of one call per row.
    predictions = np.asarray(classifier.predict(X)).ravel()

    return float(np.mean(predictions == labels))

In [22]:
# Accuracy of the MLP on the standardised combined features it was fitted on
# (a training-set score, not a held-out estimate).
trainScore (moi_Y, X_standard, clf_cnn)

0.812795

In [4]:
# Load the saved lexicon features and stack them positives first, then
# negatives — the same row order used for the other feature matrices.
lexicon_pos = np.load ('pos_tweets_lexicon_features.npy')
lexicon_neg = np.load ('neg_tweets_lexicon_features.npy')
lexicon_X = np.vstack ((lexicon_pos, lexicon_neg))

In [5]:
# Sanity check: (number of tweets, number of lexicon features).
lexicon_X.shape

(200000, 6)

In [19]:
# Baseline: logistic regression with default settings on the raw (unscaled)
# character-count features only.
clf_log = LogisticRegression ()
clf_log.fit (moi_X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# 2-fold cross-validation of the character-count baseline.
cross_val_score (clf_log, moi_X, Y, cv=2)

array([ 0.7507 ,  0.74893])

In [25]:
# Combine character counts, lexicon features and the embedding-based features column-wise.
# NOTE(review): generate_data already joins lexicon features into lucas_X via
# concatenate_features; if those are the same features stored in the .npy
# files, the lexicon columns appear twice here — confirm.
X = np.hstack ((moi_X, lexicon_X, lucas_X))

In [26]:
# Standardise the combined features; the scaler is fitted on all rows of X.
scaler = StandardScaler()
tX = scaler.fit_transform(X)

In [43]:
# Refit logistic regression with default settings, now on the standardised combined features.
# NOTE(review): the recorded output reports tol=1e-05 although no `tol` is
# passed here; the output looks stale relative to this code — confirm.
clf_log = LogisticRegression ()
clf_log.fit (tX, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=1e-05,
          verbose=0, warm_start=False)

In [35]:
# 2-fold cross-validation of logistic regression on the standardised combined features.
# NOTE(review): tX was scaled with a scaler fitted on all rows, so each fold's
# held-out rows contributed to the scaling statistics.
cross_val_score (clf_log, tX, Y, cv=2)

array([ 0.75577,  0.75313])

In [44]:
# Accuracy of the fitted logistic regression on the rows it was trained on
# (a training-set score, not a held-out estimate).
trainScore (Y, tX, clf_log)

0.757065