In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
import pickle

In [10]:
class GrantMattModel():
    """Tweet sentiment scorer built from word embeddings plus hand-made features.

    Each tweet is represented by the concatenation of
      * an averaged word-embedding vector of width ``EMBED_DIM`` and
      * the vectorised surface features from ``extract_features``
        (``NUMCAPS`` and ``LENGTH``),
    and scored by a bagged ``LinearSVC``.
    """

    # Width of the vectors returned by the pickled embedding model.
    # NOTE(review): assumed to match the model in 'docModel.pkl' -- confirm.
    EMBED_DIM = 100

    def __init__(self):
        """Load the pickled embedding model from 'docModel.pkl' in the CWD."""
        # SECURITY: pickle.load executes arbitrary code contained in the file;
        # only load 'docModel.pkl' from a trusted source.
        with open('docModel.pkl','rb') as f:
            self.embedding = pickle.load(f)

    def train(self, features, embeds, labels):
        """Fit the feature vectorizer and the classifier.

        Args:
            features: list of feature dicts, one per tweet (see
                ``extract_features``).
            embeds: 2-D array with one embedding row per tweet (see
                ``embed_tweet``).
            labels: 1-D array of target labels aligned with ``features``.
        """
        cores = 1
        self.vectorizer = DictVectorizer(sparse=False)
        # With ``cores`` estimators each one is trained on 1/cores of the data.
        self.sentiment = BaggingClassifier(svm.LinearSVC(), max_samples=1.0/cores,
                                           n_estimators=cores, n_jobs=cores)

        feat_vec = self.vectorizer.fit_transform(features)

        # Column layout is [embedding | vectorised features]; predict() and
        # evaluate() must build their inputs in the same order.
        embeds = np.concatenate((embeds, feat_vec), axis=1)
        self.sentiment.fit(embeds, labels)

    def extract_features(self, tweet):
        """Return the surface-feature dict for one tweet.

        The tweet is split on single spaces. ``LENGTH`` is the number of
        resulting tokens (empty tokens and @-mentions included); ``NUMCAPS``
        counts upper-case characters in every non-empty token that does not
        start with '@'.
        """
        tokens = tweet.split(' ')
        feats = {'NUMCAPS': 0, 'LENGTH': len(tokens)}
        for word in tokens:
            if word and word[0] != '@':
                # Bag-of-words features are currently disabled:
                # feats['WORD='+word.lower()] = 1
                feats['NUMCAPS'] += sum(1 for char in word if char.isupper())
        return feats

    def embed_tweet(self,tweet):
        """Return the averaged embedding vector for one tweet.

        Vectors of in-vocabulary tokens (non-empty, not starting with '@') are
        summed and divided by the TOTAL token count, so mentions, empty tokens
        and out-of-vocabulary words pull the average towards zero.
        """
        tokens = tweet.split(' ')
        out = np.zeros(self.EMBED_DIM)
        vocab = self.embedding.wv.vocab
        for word in tokens:
            if word and word[0] != '@' and word in vocab:
                out += self.embedding[word]
        # str.split(' ') never returns an empty list, so this cannot divide by 0.
        return out/len(tokens)

    def predict(self, newTweet):
        """Return the classifier's decision-function value for one tweet."""
        feats = self.extract_features(newTweet)
        # transform() yields a 1-row matrix for a single dict; take that row.
        feat_vec = self.vectorizer.transform(feats)[0]
        embed = self.embed_tweet(newTweet)

        tweet_vec = np.concatenate((embed, feat_vec)).reshape((1,-1))

        return self.sentiment.decision_function(tweet_vec)[0]

    def evaluate(self, test_df):
        """Return the classifier's ``score`` on a labelled DataFrame.

        Args:
            test_df: DataFrame with a 'text' column and a numeric 'sentiment'
                column. Labels are mapped with ``sentiment/2 - 1`` (0 -> -1,
                2 -> 0, 4 -> 1). Any index is accepted; rows are read by
                position.

        Returns:
            Whatever ``self.sentiment.score`` returns for the built inputs.
        """
        # DataFrame.as_matrix was removed in pandas 1.0; np.asarray works on
        # old and new pandas alike.
        test_labels = np.asarray(test_df['sentiment'])/2-1
        # Read tweets positionally: test_df['text'][i] is a label lookup and
        # fails for slices/filters whose index does not start at 0.
        texts = test_df['text'].tolist()

        test_features = [self.extract_features(text) for text in texts]
        test_embeds = np.zeros((len(texts), self.EMBED_DIM))
        for i, text in enumerate(texts):
            test_embeds[i] = self.embed_tweet(text)
        feat_vec = self.vectorizer.transform(test_features)
        test_vec = np.concatenate((test_embeds, feat_vec), axis=1)

        # TODO: a confidence-weighted accuracy based on
        # np.tanh(decision_function(test_vec)) was sketched here but never enabled.
        return self.sentiment.score(test_vec, test_labels)


In [3]:
# Column names for the header-less CSV files read in this notebook.
cols = ['sentiment','id','date','query_string','user','text']

# Read the training file and keep only the 'sentiment' and 'text' columns.
tweet_df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    names=cols,
    encoding='utf-8',
).drop(['id','date','query_string','user'], axis=1)

In [13]:
# Build the model object; the constructor unpickles 'docModel.pkl' from the
# current working directory, so that file must be present.
gm = GrantMattModel()

In [5]:
from multiprocessing import Pool

# Compute per-tweet feature dicts and embedding vectors in 8 worker processes.
tweet_list = tweet_df['text'].tolist()

# The context manager shuts the workers down once both (blocking) map calls
# have returned; the original left the pool and its processes alive.
with Pool(8) as p:
    features = p.map(gm.extract_features, tweet_list)
    embeds = p.map(gm.embed_tweet, tweet_list)

# Stack the per-tweet vectors into one 2-D array (one row per tweet).
embeds = np.array(embeds)

In [10]:
# Single-process computation of the feature dicts and embedding matrix.
# The text column is materialised once and walked positionally; the previous
# tweet_df['text'][i] did two label-based Series lookups per row, which is slow
# over the full training set and only correct for a default 0..n-1 index.
tweet_list = tweet_df['text'].tolist()
features = [{} for _ in range(len(tweet_list))]
embeds = np.zeros((len(tweet_list), 100))
for i, text in enumerate(tweet_list):
    features[i] = gm.extract_features(text)
    embeds[i] = gm.embed_tweet(text)

    # Coarse progress indicator.
    if i % 100000 == 0:
        print(i)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000


In [6]:
# Map the raw sentiment codes with x/2 - 1 (0 -> -1, 4 -> 1).
# DataFrame.as_matrix was removed in pandas 1.0; np.asarray on the column
# yields the same 1-D array on both old and new pandas versions.
labels = np.asarray(tweet_df['sentiment'])/2-1

In [7]:
# Fit the DictVectorizer and the bagged LinearSVC on the training features,
# embeddings and labels computed in the cells above.
gm.train(features, embeds, labels)

In [8]:
# Read the evaluation file with the same column layout as the training data
# and keep only the 'sentiment' and 'text' columns.
test_df = pd.read_csv(
    'testdata.manual.2009.06.14.csv',
    header=None,
    names=cols,
    encoding='utf-8',
).drop(['id','date','query_string','user'], axis=1)

In [15]:
from time import time

# Score the first n test rows and report the mean wall-clock time per row.
n = 400
start = time()
accuracy = gm.evaluate(test_df[:n])
print(accuracy)
# Measured after the first print, so the figure includes that print call.
seconds_per_tweet = (time() - start) / n
print(seconds_per_tweet)

0.49
0.00015676438808441163


In [65]:
# Persist the whole model object (every attribute currently set on it)
# using the newest pickle protocol this interpreter supports.
with open('modelv03.pickle', 'wb') as model_file:
    pickle.dump(gm, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [1]:
# Score a single ad-hoc tweet. predict() returns the classifier's
# decision-function value for it.
# NOTE: this needs `gm` to exist in the kernel; the recorded output below is a
# NameError from running the cell before `gm` was created or loaded.
newTweet = "At least I'm not dead yet!"

score = gm.predict(newTweet)
print('Sentiment:', score)


NameError: name 'gm' is not defined

In [60]:
# Hey Grant, can you turn the classification step into a pipeline?
# I think I can import a pickled pipeline into Java. -- Sam

In [11]:
# Keep references to the fitted vectorizer and classifier in notebook-level
# names. Presumably so they survive deleting/recreating `gm` -- confirm.
v = gm.vectorizer
s = gm.sentiment

In [14]:
# Re-attach the previously stashed vectorizer and classifier to the current
# `gm` object, so it can predict/evaluate without calling train() again.
gm.vectorizer = v
gm.sentiment = s

In [4]:
# Load a previously pickled model object into `gm`.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
# NOTE(review): the save cell in this notebook writes 'modelv03.pickle' while
# this reads 'modelv02.pickle' -- confirm which version is intended.
with open('modelv02.pickle','rb') as f:
    gm = pickle.load(f)

In [12]:
# Remove the embedding attribute from the model, then drop the `gm` name
# itself. Presumably done to release memory before rebuilding the model --
# confirm. Any later cell that uses `gm` must recreate or reload it first.
del gm.embedding
del gm