# MODELS

In [1]:
from glob import glob
import re
import pickle
import os
import string

import nltk
from nltk.corpus import stopwords

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from stemmercleaner import StemmerCleaner

# algorithms
from sklearn.linear_model import LogisticRegression,Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Make sure the output directory for the pickled models exists.
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs('models', exist_ok=True)

In [3]:
# Load the tweet dataset from data.csv, using the tweet_id column as the index.
tweets = pd.read_csv("data.csv", index_col="tweet_id")

In [4]:
# Preview the first rows of the loaded frame.
tweets.head()

Unnamed: 0_level_0,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
def encode_sentiment_classes(v):
    """Map a sentiment label to its integer class.

    Returns 0 for 'neutral' and 1 for 'positive'. Every other value
    (including 'negative') is mapped to -1.
    """
    for label, code in (('neutral', 0), ('positive', 1)):
        if v == label:
            return code
    return -1

In [6]:
# Raw tweet text column.
X_raw = tweets.text
# Fit the project-local StemmerCleaner on the raw text and apply it to the same text.
X = StemmerCleaner().fit(X_raw).transform(X_raw)

In [7]:
# Persist the cleaned text to x-stemmer.csv (UTF-8).
X.to_csv('x-stemmer.csv', encoding='utf8')

In [8]:
# Encode the airline_sentiment labels as integers (see encode_sentiment_classes).
y = tweets.airline_sentiment.apply(encode_sentiment_classes)

In [9]:
# Persist the encoded labels to y.csv (UTF-8).
y.to_csv("y.csv", encoding="utf8")

In [10]:
# Rebind y to its underlying values and preview the first five labels.
# NOTE(review): y is rebound to a different kind of object here, so re-running
# this cell on its own would call .values on the result — confirm, and consider
# a separate name for the array.
y = y.values
y[:5]

array([ 0,  1,  0, -1, -1], dtype=int64)

In [11]:
# Rebind X to its underlying values and preview the first five cleaned tweets.
# NOTE(review): X is rebound to a different kind of object here, so re-running
# this cell on its own would call .values on the result — confirm, and consider
# a separate name for the array.
X = X.values
X[:5]

array(['said', 'plu ad commerci experi tacki',
       'today must mean need take anoth trip',
       'realli aggress blast obnoxi entertain guest face amp littl recours',
       'realli big bad thing'], dtype=object)

### STORAGE PROCEDURE

In [12]:
def format_filename(s):
    """Build a filesystem-friendly name from an arbitrary string.

    Whitelist approach: only ASCII letters, digits and the characters
    ``-_.() `` survive; every other character is dropped. Spaces that
    remain are then turned into underscores.

    Note: the result can still be an invalid filename such as an empty
    string, ``.`` or ``..``. Callers are expected to add a prefix and/or a
    file extension to avoid that.

    Adapted from https://gist.github.com/seanh/93666
    """
    allowed = "-_.() " + string.ascii_letters + string.digits
    kept = [ch for ch in s if ch in allowed]
    # Underscores instead of spaces in the final name.
    return ''.join(kept).replace(' ', '_')

In [13]:
def get_model_name(transformer, model, cleaner_name):
    """Build the identifier under which a trained model is stored.

    The result has the form ``<cleaner>-<transformer>-<model>`` with the
    transformer and model names lower-cased.

    Parameters
    ----------
    transformer : str
        Name of the vectorizer/transformer (second component of the result).
    model : str
        Name of the estimator (third component of the result).
    cleaner_name : str
        Name of the text cleaner (first component, used as given).

    Returns
    -------
    str
        The combined model name.
    """
    # Use the arguments themselves. The previous body read module-level
    # variables (`transformer_name`, `model_name`) and ignored its parameters,
    # so it only worked while globals with those names happened to exist.
    return '%s-%s-%s' % (cleaner_name, transformer.lower(), model.lower())

In [14]:
def dump_results(res):
    """Pickle a result dictionary to its file under the models directory.

    Parameters
    ----------
    res : dict
        Result record; ``res['name']`` determines the target filename via
        ``get_filename_for_model_name``.
    """
    fname = get_filename_for_model_name(res['name'])
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way through.
    with open(fname, 'wb') as fh:
        pickle.dump(res, fh)

In [15]:
def get_filename_for_model_name(name):
    """Return the path of the pickle file that stores the model called `name`.

    The name is sanitised with ``format_filename``, given a ``.model``
    extension and placed inside the ``models`` directory.
    """
    safe_name = format_filename(name)
    return os.path.join('models', '%s.model' % safe_name)

In [16]:
def is_grid_search_with_transformer(model):
    """Tell whether `model` is a grid search over a two-step pipeline.

    Parameters
    ----------
    model : object
        Any object; typically a fitted or unfitted estimator.

    Returns
    -------
    bool
        True when `model` is a ``GridSearchCV`` whose wrapped estimator
        exposes a ``steps`` list of length 2 (transformer + model),
        False otherwise.
    """
    # Use the GridSearchCV class imported at the top of the notebook: the bare
    # name `sklearn` is never bound by `from sklearn... import ...` statements,
    # so referencing `sklearn.model_selection...` raised NameError.
    if not isinstance(model, GridSearchCV):
        return False
    # A grid search around a plain estimator has no `steps`; report False
    # instead of raising AttributeError.
    steps = getattr(model.estimator, 'steps', None)
    return steps is not None and len(steps) == 2

## GRIDSEARCH

Only the stemmer cleaner is used here, since it performed better in all situations.

In [17]:
# Text cleaners to evaluate. Each entry is a (cleaner name, cleaned data) pair;
# the training loop unpacks it as `cleaner_name, data`.
cleaners = [
#     ('basic', BasicCleaner().fit(X_raw).transform(X_raw)),
    ('stemmer', X),
]

In [18]:
# Vectorizers to evaluate. Each entry is [name, transformer instance, parameter grid];
# the name is also used as the pipeline step name in the training loop.
# Both grids are empty, so only the default settings of each vectorizer are used.
transformers = [
    [
        'CountVectorizerDefault', 
        CountVectorizer(),
        {}
    ],
    [
        'TfidfVectorizerDefault', 
        TfidfVectorizer(),
        {}
    ]
]

In [19]:
# Seed for every estimator with a stochastic component, so that repeated runs
# of the grid search produce the same models.
RANDOM_STATE = 42

# Estimators to evaluate. Each entry is [name, estimator instance, parameter grid].
# The name doubles as the pipeline step name; grid keys are written without a
# step prefix because the training loop adds "<name>__" to each of them.
models = [
    [
        'LogisticRegression-l2',
        LogisticRegression(max_iter=2000, random_state=RANDOM_STATE),
        {
            'C': (0.001, 0.01, 0.1, 1, 10, 100, 1000),
            'penalty': ['l2'],
            'class_weight': ('balanced', None),
            'solver': ('newton-cg', 'sag', 'lbfgs')
        },
    ],
    [
        'LinearSVC',
        LinearSVC(random_state=RANDOM_STATE),
        {
            'C': [0.001, 0.01, 0.1, 1, 10],
        }
    ],
    [
        'Perceptron',
        # tol=None disables early stopping, so `max_iter` is the exact number
        # of passes over the data, which is what the removed `n_iter`
        # parameter used to mean.
        Perceptron(tol=None, random_state=RANDOM_STATE),
        {
            'alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
            # `n_iter` no longer exists in scikit-learn; `max_iter` replaces it.
            'max_iter': [5, 10, 15, 20, 50],
            'penalty': [None, 'l2', 'l1', 'elasticnet']
        }
    ],
    [
        'MultinomialNB',
        MultinomialNB(),
        {
            'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]
        }
    ],
    [
        'DecisionTreeClassifier',
        DecisionTreeClassifier(random_state=RANDOM_STATE),
        {
            'min_samples_split': range(10, 500, 20),
            'max_depth': range(1, 20, 2)
        }
    ],
    [
        'RandomForestClassifier',
        RandomForestClassifier(random_state=RANDOM_STATE),
        {
            'max_depth': [10, 50, 110, None],
            # 'auto' was an alias of 'sqrt' for classifiers (so the grid tried
            # the same setting twice) and is rejected by current scikit-learn.
            'max_features': ['sqrt', 'log2'],
            'n_estimators': [200, 500, 1000]
        }
    ]
]

In [20]:
# Number of cross-validation folds used by every grid search below.
CV_FOLDS = 5

# Train one grid-searched pipeline per (cleaner, transformer, model) combination.
# Results are pickled under models/, and a combination whose file already exists
# is skipped, so the cell can be re-run cheaply.
for cleaner_name, data in cleaners:
    for transformer_name, transformer, transformer_params in transformers:
        for model_name, model, model_params in models:

            print('%s with %s (%s cleaner)...' %(transformer_name, model_name, cleaner_name))

            # Keyword arguments keep the transformer and model names from being
            # passed in each other's position.
            name = get_model_name(
                transformer=transformer_name,
                model=model_name,
                cleaner_name=cleaner_name,
            )
            fname = get_filename_for_model_name(name)

            if os.path.exists(fname):
                print('Skipping. Model exists at %s' %fname)
                continue

            pipeline = Pipeline([
                (transformer_name, transformer),
                (model_name, model),
            ])

            # GridSearchCV addresses pipeline parameters as "<step>__<param>",
            # so prefix both grids with the name of the step they belong to.
            parameters = {
                **{
                    '%s__%s' % (transformer_name, key): values
                    for key, values in transformer_params.items()
                },
                **{
                    '%s__%s' % (model_name, key): values
                    for key, values in model_params.items()
                },
            }
            print(parameters)

            grid = GridSearchCV(
                pipeline,
                parameters,
                n_jobs=-1,
                cv=CV_FOLDS
            )

            # Fit on this cleaner's data directly rather than rebinding the
            # notebook-level X, and keep the loop variable `model` untouched.
            grid.fit(data, y)
            best_model = grid.best_estimator_
            best_score = grid.best_score_

            res = {
                'name': name,
                'transformer': transformer,
                'transformer_name': transformer_name,
                'model': best_model,
                'model_name': model_name,
                'cleaner': cleaner_name,
                'score': best_score,
            }

            dump_results(res)
            print('Saved to %s' %fname)

CountVectorizerDefault with LogisticRegression-l2 (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-logisticregression-l2.model
CountVectorizerDefault with LinearSVC (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-linearsvc.model
CountVectorizerDefault with Perceptron (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-perceptron.model
CountVectorizerDefault with MultinomialNB (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-multinomialnb.model
CountVectorizerDefault with DecisionTreeClassifier (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-decisiontreeclassifier.model
CountVectorizerDefault with RandomForestClassifier (stemmer cleaner)...
Skipping. Model exists at models\stemmer-countvectorizerdefault-randomforestclassifier.model
TfidfVectorizerDefault with LogisticRegression-l2 (stemmer cleaner)...
Skipping.