### Preprocessing

In [1]:
import datetime
import pandas as pd
import warnings
import sklearn.externals as extjoblib
import joblib
warnings.filterwarnings("ignore")

#from google.cloud import storage

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
from sklearn.tree import DecisionTreeClassifier
#from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

In [2]:
# Single seed reused for the train/test split and the decision tree so that
# both are repeatable from run to run.
RANDOM_SEED = 42

In [3]:
# Load the cleaned, not-yet-vectorized corpus and discard every row whose
# `Text` is missing; the surviving `Text` column is the model input.
df_proc = (
    pd.read_csv('cleaned_non-vectorized_data.csv')
    .dropna(subset=['Text'])
)
X = df_proc['Text']

In [4]:
# Preview the filtered frame (pandas elides the middle rows when displaying it).
df_proc

Unnamed: 0,Text,tokens,disgust,joy,anger,surprised,sad,fear,neutral
0,come mert ’ today let u take care lunch enjoy ...,"['come', 'mert', '’', 'today', 'let', 'u', 'ta...",0,0,0,0,0,0,1
1,nxt gt lay 20 staff tech 's latest cutback rb_...,"['nxt', 'gt', 'lay', '20', 'staff', 'tech', ""'...",0,0,0,0,0,0,1
2,layoff 20 workforce 100 employee sf bay area h...,"['layoff', '20', 'workforce', '100', 'employee...",0,0,0,0,0,0,1
3,today ’ lunch special smoked pork sausage onio...,"['today', '’', 'lunch', 'special', 'smoked', '...",0,0,0,0,0,0,1
4,come mert ’ today grab salmon cake two home co...,"['come', 'mert', '’', 'today', 'grab', 'salmon...",0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
9277,traik01 cdc people warmed u 2 year ago .... sa...,"['traik01', 'cdc', 'people', 'warmed', 'u', '2...",0,0,0,0,1,0,0
9278,sorry ’ promo code share lately 😭 promos autom...,"['sorry', '’', 'promo', 'code', 'share', 'late...",0,0,0,0,1,0,0
9279,poor lad http //t.co/36o565zsc3,"['poor', 'lad', 'http', '//t.co/36o565zsc3']",0,0,0,0,1,0,0
9280,one day able bill order tmobile bill sadly tod...,"['one', 'day', 'able', 'bill', 'order', 'tmobi...",0,0,0,0,1,0,0


In [5]:
# Targets: the seven emotion columns, one per label, in their original order.
y = df_proc[[
    'disgust',
    'joy',
    'anger',
    'surprised',
    'sad',
    'fear',
    'neutral',
]]

In [6]:
# Preview the label frame that will be passed to the multi-output classifier.
y

Unnamed: 0,disgust,joy,anger,surprised,sad,fear,neutral
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
9277,0,0,0,0,1,0,0
9278,0,0,0,0,1,0,0
9279,0,0,0,0,1,0,0
9280,0,0,0,0,1,0,0


In [7]:
# Hold out 20% of the rows for the final evaluation; the split is seeded so
# the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [8]:
# Text -> token counts -> tf-idf weights, kept as two separate steps so the
# grid search can tune each one independently.
# Starting configuration: lower-cased unigrams and bigrams, English stop words
# removed, min_df=50 as the minimum document frequency for a term to be kept.
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=50,
)
transformer = TfidfTransformer()

In [10]:
# Base learner: a seeded decision tree with default hyper-parameters.
# (max_features='log2' and class_weight='balanced' are intentionally left
# disabled here.)
clf = DecisionTreeClassifier(random_state=RANDOM_SEED)

# Wrap the tree in MultiOutputClassifier so `y` can have several label columns.
multi_out_clf = MultiOutputClassifier(estimator=clf)

In [11]:
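# Manual, by-hand equivalent of fitting the pipeline built in the next cell
# (vectorize -> tf-idf transform -> fit the classifier); kept for reference only.
# multi_out_clf.fit(transformer.fit_transform(vectorizer.fit_transform(X_train)), y_train)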

In [12]:
# Chain vectorizer -> tf-idf transformer -> multi-output classifier.
# The step names matter: they are the prefixes used in the grid-search keys
# (e.g. 'vectorizer__max_df').
pipeline = Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('transformer', transformer),
        ('multi-classifier', multi_out_clf),
    ]
)

In [13]:
# Grid-search space, keyed as '<pipeline step>__<parameter>'.
# 3 * 4 * 2 * 2 * 2 = 96 candidate combinations.
parameters = {
    # CountVectorizer settings
    'vectorizer__max_df': (0.5, 0.75, 1.0),
    'vectorizer__max_features': (None, 5000, 10000, 50000),
    'vectorizer__ngram_range': ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    # TfidfTransformer settings
    'transformer__use_idf': (True, False),
    'transformer__norm': ('l1', 'l2'),
}


In [14]:
# List every tunable parameter name exposed by the pipeline; useful for
# checking the spelling of the keys used in `parameters`.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vectorizer', 'transformer', 'multi-classifier', 'vectorizer__analyzer', 'vectorizer__binary', 'vectorizer__decode_error', 'vectorizer__dtype', 'vectorizer__encoding', 'vectorizer__input', 'vectorizer__lowercase', 'vectorizer__max_df', 'vectorizer__max_features', 'vectorizer__min_df', 'vectorizer__ngram_range', 'vectorizer__preprocessor', 'vectorizer__stop_words', 'vectorizer__strip_accents', 'vectorizer__token_pattern', 'vectorizer__tokenizer', 'vectorizer__vocabulary', 'transformer__norm', 'transformer__smooth_idf', 'transformer__sublinear_tf', 'transformer__use_idf', 'multi-classifier__estimator__ccp_alpha', 'multi-classifier__estimator__class_weight', 'multi-classifier__estimator__criterion', 'multi-classifier__estimator__max_depth', 'multi-classifier__estimator__max_features', 'multi-classifier__estimator__max_leaf_nodes', 'multi-classifier__estimator__min_impurity_decrease', 'multi-classifier__estimator__min_impurity_split', 'multi-classif

In [15]:
# Exhaustive search over `parameters` on the training split only.
# n_jobs=-1 runs the fits in parallel; verbose=1 prints the fit count.
# No `scoring` is passed, so the pipeline's own default score is used.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV(estimator=Pipeline(steps=[('vectorizer',
                                        CountVectorizer(min_df=50,
                                                        ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('transformer', TfidfTransformer()),
                                       ('multi-classifier',
                                        MultiOutputClassifier(estimator=DecisionTreeClassifier(random_state=42)))]),
             n_jobs=-1,
             param_grid={'transformer__norm': ('l1', 'l2'),
                         'transformer__use_idf': (True, False),
                         'vectorizer__max_df': (0.5, 0.75, 1.0),
                         'vectorizer__max_features': (None, 5000, 10000, 50000),
                         'vectorizer__ngram_range': ((1, 1), (1, 2))},
             verbose=1)

In [16]:
# Full parameter dictionary of the winning pipeline. This includes every
# parameter of every step, not only the ones that were searched;
# NOTE(review): `grid_search.best_params_` holds just the tuned subset.
best_parameters = grid_search.best_estimator_.get_params()

In [17]:
# Inspect the winning configuration.
best_parameters

{'memory': None,
 'steps': [('vectorizer',
   CountVectorizer(max_df=0.75, min_df=50, ngram_range=(1, 2),
                   stop_words='english')),
  ('transformer', TfidfTransformer(norm='l1', use_idf=False)),
  ('multi-classifier',
   MultiOutputClassifier(estimator=DecisionTreeClassifier(random_state=42)))],
 'verbose': False,
 'vectorizer': CountVectorizer(max_df=0.75, min_df=50, ngram_range=(1, 2),
                 stop_words='english'),
 'transformer': TfidfTransformer(norm='l1', use_idf=False),
 'multi-classifier': MultiOutputClassifier(estimator=DecisionTreeClassifier(random_state=42)),
 'vectorizer__analyzer': 'word',
 'vectorizer__binary': False,
 'vectorizer__decode_error': 'strict',
 'vectorizer__dtype': numpy.int64,
 'vectorizer__encoding': 'utf-8',
 'vectorizer__input': 'content',
 'vectorizer__lowercase': True,
 'vectorizer__max_df': 0.75,
 'vectorizer__max_features': None,
 'vectorizer__min_df': 50,
 'vectorizer__ngram_range': (1, 2),
 'vectorizer__preprocessor': None,

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score

def print_classification_scores(y_test, pred):
    """Print a fixed set of classification metrics for a set of predictions.

    Prints, in order: accuracy, then precision, recall, F1 and ROC AUC, the
    last four computed with ``average='micro'``.

    Parameters
    ----------
    y_test : true labels, in any form accepted by the sklearn metric functions.
    pred : predicted labels, in the same layout as ``y_test``.

    Returns
    -------
    None. The scores are written to stdout only.

    NOTE(review): ``pred`` is also what ``roc_auc_score`` receives as its
    scores, so if hard 0/1 predictions are passed the AUC is computed from
    labels rather than probabilities.
    """
    # Accuracy takes no averaging argument, so it is handled on its own.
    print('Accuracy Score:', accuracy_score(y_test, pred))

    micro_averaged = (
        ('Precision Score:', precision_score),
        ('Recall Score:', recall_score),
        ('F1 Score:', f1_score),
        ('AUC Score:', roc_auc_score),
    )
    for label, metric in micro_averaged:
        print(label, metric(y_test, pred, average='micro'))

In [19]:
# Predict on the held-out split with the fitted grid search and report the
# metrics; this is the only place the test data is used.
pred = grid_search.predict(X_test)
print_classification_scores(y_test, pred)

Accuracy Score: 0.4639008620689655
Precision Score: 0.624432104997476
Recall Score: 0.5834905660377359
F1 Score: 0.6032674957327482
AUC Score: 0.7575289474780291


In [None]:
# Inspect the raw predictions for the test split.
pred