# WORD EMBEDDINGS

In [2]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import random
import os
import string




The training data must be in a single text file with one tweet per line.

In [3]:
# Load the stemmed tweets. The file has no header row; the first column
# becomes the index and the remaining text column is labelled 1.
X = pd.read_csv("x-stemmer.csv", index_col=0, header=None)

In [4]:
# Preview the first rows to check the index and the text column.
X.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
570306133677760513,said
570301130888122368,plu ad commerci experi tacki
570301083672813571,today must mean need take anoth trip
570301031407624196,realli aggress blast obnoxi entertain guest fa...
570300817074462722,realli big bad thing


In [5]:
# Write the corpus file: one tweet per line, in the same row order as X, so
# that line i of data.txt corresponds to row i of X.
# The context manager closes (and flushes) the file even if a write fails,
# and the text column is selected by its label (1) rather than through the
# `.ix` indexer, which newer pandas releases no longer provide.
# NOTE(review): a missing value would be written as the literal text "nan" —
# confirm that x-stemmer.csv has no empty tweets.
with open('data.txt', 'w', encoding='utf8') as f:
    for text in X[1]:
        f.write("%s\n" % text)

In [6]:
class LabeledLineSentenceLabeled(object):
    """Turn line-oriented text files into tagged documents for Doc2Vec.

    Every line of every source file becomes one ``LabeledSentence`` whose
    words are the whitespace-separated tokens of the line and whose single
    tag is ``'<prefix>_<line number>'`` (line numbers start at 0 per file).

    Parameters
    ----------
    sources : dict
        Maps a file path to the tag prefix used for the lines of that file.

    Raises
    ------
    ValueError
        If two sources share the same prefix (tags would collide).
    """

    def __init__(self, sources):
        self.sources = sources
        # Populated by to_array(); None means the corpus is not loaded yet.
        self.sentences = None

        # Make sure that prefixes are unique across sources.
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def _read_documents(self):
        """Yield one tagged document per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def __iter__(self):
        """Stream the tagged documents straight from disk."""
        return self._read_documents()

    def to_array(self):
        """Load all tagged documents into ``self.sentences`` and return that list."""
        self.sentences = list(self._read_documents())
        return self.sentences

    def sentences_perm(self):
        """Return a shuffled copy of the loaded documents.

        The corpus is loaded on first use, so calling this before
        ``to_array()`` no longer fails. ``self.sentences`` itself keeps its
        original order.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

In [7]:
# Map each corpus file to the tag prefix for its lines; the documents of
# data.txt are therefore tagged 'data_0', 'data_1', ...
sources = {'data.txt':'data'}
sentences = LabeledLineSentenceLabeled(sources)

In [8]:
# Create the Doc2Vec model (100-dimensional vectors) and build its vocabulary
# from the whole corpus; to_array() also caches the documents on `sentences`.
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())
# Total number of tokens in the corpus, passed to train() later. It is counted
# on the list cached by to_array(): order is irrelevant for a sum, so there is
# no need to build and shuffle a full copy of the corpus just to count words.
nr_words = sum(len(x.words) for x in sentences.sentences)


In [9]:
# Train for 20 epochs on a shuffled copy of the corpus. The shuffle happens
# once, before training starts, not once per epoch.
model.train(sentences.sentences_perm(), epochs=20, total_words=nr_words)

1561435

In [10]:
# Persist the trained model to disk.
model.save('data-20.d2v')

In [11]:
# Sanity check: nearest neighbours of a (stemmed) word. Word-level queries go
# through the model's word vectors (`model.wv`); calling most_similar on the
# model object itself is deprecated.
# NOTE(review): when every neighbour scores ~0.9998 the word vectors are
# barely differentiated — worth revisiting the training settings.
model.wv.most_similar('good')

[('haha', 0.9998568892478943),
 ('pictur', 0.9998269081115723),
 ('winter', 0.9998247027397156),
 ('train', 0.999823808670044),
 ('tho', 0.9998233318328857),
 ('video', 0.9998232126235962),
 ('choic', 0.9998228549957275),
 ('sf', 0.9998192191123962),
 ('awesom', 0.9998174905776978),
 ('game', 0.9998162388801575)]

Get the embedding (document vector) of a sentence by its tag, e.g. for the first tweet:

```
model.docvecs['data_0']
```

### STORAGE PROCEDURE

In [12]:
def format_filename(s):
    """Build a filesystem-friendly name from an arbitrary string.

    Works as a whitelist filter: only ASCII letters, digits, the characters
    ``-_.()`` and the space survive; every surviving space is then turned
    into an underscore.

    Note: the result can still be an unusable filename such as ``''``,
    ``'.'`` or ``'..'``. Callers should add a prefix and/or an extension to
    rule those out.

    Adapted from https://gist.github.com/seanh/93666
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = [ch for ch in s if ch in allowed]
    # Spaces are allowed through the filter but not wanted in the result.
    return ''.join(kept).replace(' ', '_')

In [13]:
def get_model_name(transformer, model, cleaner_name):
    """Return the identifier '<cleaner>-<transformer>-<model>' for a stored result.

    NOTE(review): the ``transformer`` and ``model`` arguments are ignored.
    The body reads the module-level variables ``transformer_name`` and
    ``model_name`` instead, so the function only works while those globals
    are defined (NameError otherwise). Only ``cleaner_name`` is taken from
    the arguments. Switching the body to its parameters requires checking
    the argument order at every call site first.
    """
    # Transformer and model names are lower-cased; the cleaner name is used as given.
    name = '%s-%s-%s' %(cleaner_name, transformer_name.lower(), model_name.lower())
    return name

In [14]:
def dump_results(res):
    """Pickle a result dictionary to the file derived from its ``'name'`` entry.

    Parameters
    ----------
    res : dict
        Result record; ``res['name']`` determines the output path via
        ``get_filename_for_model_name``.

    Raises
    ------
    KeyError
        If ``res`` has no ``'name'`` entry.
    """
    # Imported locally: the notebook's import cell does not import pickle.
    import pickle

    fname = get_filename_for_model_name(res['name'])

    # Create the output directory on first use instead of failing on open().
    target_dir = os.path.dirname(fname)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The context manager makes sure the file is flushed and closed.
    with open(fname, 'wb') as fout:
        pickle.dump(res, fout)

In [15]:
def get_filename_for_model_name(name):
    """Return the path under ``models`` where the result called ``name`` is stored.

    The name is sanitised with ``format_filename`` and given the ``.model``
    extension before being joined to the directory.
    """
    safe_stem = format_filename(name)
    return os.path.join('models', '%s.model' % safe_stem)

In [16]:
def is_grid_search_with_transformer(model):
    """Tell whether ``model`` is a grid search wrapped around a two-step pipeline.

    Returns True only when ``model`` is a ``GridSearchCV`` whose estimator
    exposes exactly two ``steps`` (presumably transformer + classifier).
    Any other object, including a grid search over a plain estimator without
    ``steps``, yields False instead of raising.
    """
    # Use the GridSearchCV class imported by the notebook; the bare name
    # `sklearn` is never imported, so the fully qualified path was a NameError.
    if not isinstance(model, GridSearchCV):
        return False
    steps = getattr(model.estimator, 'steps', None)
    return steps is not None and len(steps) == 2

## PREDICTION

In [17]:
# Feature matrix for the classifiers: row i holds the trained document vector
# tagged 'data_<i>', i.e. the vector of the i-th line written to data.txt.
# The width is read from the model instead of repeating the literal 100, so it
# cannot drift from the dimension the Doc2Vec model was created with.
X_vecs = np.zeros((len(X), model.vector_size))

for i in range(len(X)):
    prefix = 'data_%s' %i
    vector = model.docvecs[prefix]
    X_vecs[i] = vector

In [18]:
# Load the labels (no header row, first column used as index), show a preview,
# then flatten the values into the 1-D array used as the training target.
y_frame = pd.read_csv("y.csv", index_col=0, header=None)
print(y_frame.head())
y = y_frame.values.ravel()

                    1
0                    
570306133677760513  0
570301130888122368  1
570301083672813571  0
570301031407624196 -1
570300817074462722 -1


In [19]:
# Candidate classifiers for the grid search. Each entry is
# [display name, estimator instance, hyper-parameter grid]; the grid keys are
# plain estimator parameter names.
# MultinomialNB is imported but deliberately not part of the grid
# (NOTE(review): presumably because it needs non-negative features, which
# doc2vec vectors are not — confirm before re-enabling).
models = [
    [
        'LogisticRegression-l2',
        LogisticRegression(max_iter=2000),
        {
            'C': (0.001, 0.01, 0.1, 1, 10, 100, 1000),
            'penalty': ['l2'],
            'class_weight': ('balanced', None),
            'solver': ('newton-cg', 'sag', 'lbfgs')
        },
    ],
    [
        'LinearSVC',
        LinearSVC(),
        {
            'C': [0.001, 0.01, 0.1, 1, 10],
        }
    ],
    [
        'Perceptron',
        Perceptron(),
        {
            'alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
            # `max_iter` replaces the `n_iter` parameter, which current
            # scikit-learn releases reject as an invalid parameter.
            'max_iter': [5, 10, 15, 20, 50],
            'penalty': [None, 'l2', 'l1', 'elasticnet']
        }
    ],
    [
        'DecisionTreeClassifier',
        DecisionTreeClassifier(),
        {
            'min_samples_split': range(10, 500, 20),
            'max_depth': range(1, 20, 2)
        }
    ],
    [
        'RandomForestClassifier',
        RandomForestClassifier(),
        {
            'max_depth': [10, 50, 110, None],
            # 'auto' means sqrt(n_features) for classifiers, so listing both
            # 'auto' and 'sqrt' fitted every forest twice; 'auto' is also
            # rejected by recent scikit-learn releases.
            'max_features': ['sqrt'],
            'n_estimators': [200, 500, 1000]
        }
    ]
]

In [20]:
# Grid-search every candidate classifier on the doc2vec features and cache the
# outcome on disk; candidates whose result file already exists are skipped.
transformer_name = 'doc2vec'
cleaner_name = 'stemmer'
K = 5  # number of cross-validation folds

# The loop variable is named `estimator` rather than `model` so that the
# trained Doc2Vec object bound to `model` is not overwritten; otherwise the
# cells that read `model.docvecs` fail when re-run after this cell.
for model_name, estimator, model_params in models:
    print('%s with %s (%s cleaner)...' %(transformer_name, model_name, cleaner_name))

    # Passed by keyword: the signature is (transformer, model, cleaner_name)
    # and the positional call had the first two values swapped.
    name = get_model_name(transformer=transformer_name, model=model_name, cleaner_name=cleaner_name)
    fname = get_filename_for_model_name(name)

    if os.path.exists(fname):
        print('Skipping. Model exists at %s' %fname)
        continue

    pipeline = Pipeline([(model_name, estimator)])

    # GridSearchCV addresses pipeline parameters as '<step name>__<param>'.
    parameters = {model_name + "__" + k: v for k, v in model_params.items()}
    print(parameters)

    grid = GridSearchCV(
        pipeline,
        parameters,
        n_jobs=-1,
        cv=K
    )
    grid.fit(X_vecs, y)

    res = {
        'name': name,
        'transformer': transformer_name,
        'transformer_name': transformer_name,
        'model': grid.best_estimator_.get_params(),
        'model_name': model_name,
        'cleaner': cleaner_name,
        'score': grid.best_score_,
    }

    dump_results(res)
    print('Saved to %s' %fname)

doc2vec with LogisticRegression-l2 (stemmer cleaner)...
Skipping. Model exists at models\stemmer-doc2vec-logisticregression-l2.model
doc2vec with LinearSVC (stemmer cleaner)...
Skipping. Model exists at models\stemmer-doc2vec-linearsvc.model
doc2vec with Perceptron (stemmer cleaner)...
Skipping. Model exists at models\stemmer-doc2vec-perceptron.model
doc2vec with DecisionTreeClassifier (stemmer cleaner)...
Skipping. Model exists at models\stemmer-doc2vec-decisiontreeclassifier.model
doc2vec with RandomForestClassifier (stemmer cleaner)...
Skipping. Model exists at models\stemmer-doc2vec-randomforestclassifier.model
