In [1]:
from __future__ import print_function, division
from builtins import range

In [2]:
import sys
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [3]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from gensim.models import KeyedVectors



In [4]:
# Read the training and test splits; both are semicolon-separated CSV files
# whose first row carries the column names.
train, test = (
    pd.read_csv(csv_path, header=0, sep=';')
    for csv_path in ('data/Train300.csv', 'data/Test.csv')
)

In [5]:
# Keep only the sentiment label and the tweet text, under shorter names.
# Renaming first and selecting by the NEW names keeps this cell re-runnable:
# on a second execution `rename` finds nothing to rename (missing keys are
# ignored by default) and the selection still succeeds, instead of raising
# KeyError on the already-removed 'sentiment' / 'tweet_text' columns.
column_names = {'sentiment': 'label', 'tweet_text': 'content'}
kept_columns = list(column_names.values())

train = train.rename(columns=column_names)[kept_columns]
test = test.rename(columns=column_names)[kept_columns]
train.head()

Unnamed: 0,label,content
0,1,"@bejusmila é um aplicativo a parte, o youtube ..."
1,1,@Alixandriiaa No problem! :) ^tm
2,1,"@FellipeC @DoctorClimao Não vejo, pelo menos a..."
3,1,@b4rbixando @barbixas Lembra das sábias palavr...
4,1,ola obgzinho querido :) https://t.co/osihldty12


In [6]:
class GloveVectorizer:
    """Represent each sentence as the mean of its words' pre-trained vectors.

    The vectors are read from a text file whose first line is a header and
    whose remaining lines have the form ``word v0 v1 ... vD-1`` separated by
    single spaces.

    Attributes set by ``__init__``:
        word2vec:  dict mapping word -> 1-D float32 vector.
        embedding: array of shape (V, D) with one row per line read.
        word2idx:  dict mapping word -> row index in ``embedding``.
        V, D:      number of rows and number of columns of ``embedding``.
    """

    def __init__(self, path='glove/glove_s100/glove_s100.txt'):
        """Load the word vectors stored at ``path``.

        Args:
            path: location of the embedding text file; the default is the
                file this notebook was written against.

        Raises:
            ValueError: if the file contains no word vectors.
        """
        # load in pre-trained word vectors
        print('Loading word vectors...')
        word2vec = {}
        embedding = []
        idx2word = []
        with open(path, encoding='utf-8') as f:
            # is just a space-separated text file in the format:
            # word vec[0] vec[1]
            # The first line is a header, not a vector; the default keeps an
            # empty file from surfacing as a bare StopIteration.
            next(f, None)
            for line in f:
                values = line.rstrip().split(' ')
                if len(values) < 2:
                    # Blank or vector-less line (e.g. trailing newline at the
                    # end of the file): it would yield an empty vector and
                    # make the embedding matrix ragged.
                    continue
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)

        if not embedding:
            raise ValueError('No word vectors found in %r' % (path,))

        print('Found %s word vectors.' % len(word2vec))

        ## save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        """No-op: the vectors are pre-trained, nothing is learned from ``data``."""
        pass

    def transform(self, data):
        """Average the vectors of the known words of every sentence.

        Each sentence is lower-cased and split on whitespace; tokens missing
        from the vocabulary are ignored.

        Args:
            data: sized iterable of strings.

        Returns:
            Array of shape (len(data), D). Rows of sentences without any
            known word are left as zeros; how many there are is printed.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = [
                self.word2vec[word]
                for word in sentence.lower().split()
                if word in self.word2vec
            ]
            if vecs:
                X[n] = np.mean(vecs, axis=0)
            else:
                emptycount += 1
        print('Number of samples with no words found: %s / %s' % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)

In [7]:
class Word2VecVectorizer:
    """Represent each sentence as the mean of its words' word2vec vectors.

    The vectors are loaded with gensim's ``KeyedVectors`` from a file in
    word2vec text format and kept in ``self.word_vectors``.
    """

    def __init__(self, path='glove/cbow_s100/cbow_s100.txt'):
        """Load the word vectors stored at ``path``.

        Args:
            path: location of the word2vec-format file; the default is the
                file this notebook was written against.
        """
        print('Loading in word vectors...')
        self.word_vectors = KeyedVectors.load_word2vec_format(path)
        print('Finished loading in word vectors')

    def fit(self, data):
        """No-op: the vectors are pre-trained, nothing is learned from ``data``."""
        pass

    def transform(self, data):
        """Average the vectors of the known words of every sentence.

        Each sentence is split on whitespace and the tokens are looked up
        as-is (no lower-casing); tokens missing from the vocabulary are
        ignored.

        Args:
            data: sized iterable of strings.

        Returns:
            Array of shape (len(data), D). Rows of sentences without any
            known word are left as zeros; how many there are is printed.
        """
        # Ask the model for its dimensionality instead of probing a specific
        # word, which fails with KeyError on vocabularies lacking that word.
        self.D = self.word_vectors.vector_size

        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    # throws KeyError if word not found
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    continue
            if vecs:
                X[n] = np.mean(vecs, axis=0)
            else:
                emptycount += 1
        print('Number of samples with no words found: %s / %s' % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)


In [8]:
# Build the featurizer once (it loads the embeddings from disk) and turn the
# training tweets into averaged word-vector features.
vectorizer = Word2VecVectorizer()
Ytrain = train['label']
Xtrain = vectorizer.fit_transform(train['content'])

Loading in word vectors...
Finished loading in word vectors
Number of samples with no words found: 4241 / 300000


In [9]:
# Featurize the held-out tweets with the vectorizer built for the training set.
Ytest = test['label']
Xtest = vectorizer.transform(test['content'])

Number of samples with no words found: 65 / 5000


In [10]:
# create the model, train it, print scores
# random_state pins the forest's bootstrap sampling and feature choices so a
# fresh "Restart & Run All" reproduces the same scores; n_jobs=-1 only spreads
# the work over all CPU cores and does not change the result.
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(Xtrain, Ytrain)
# The gap between these two accuracies indicates how much the model overfits.
print('train score:', model.score(Xtrain, Ytrain))
print('test score:', model.score(Xtest, Ytest))

train score: 0.9920933333333334
test score: 0.7584
