# Supervised Learning with scikit-learn

* http://scikit-learn.org/stable/supervised_learning.html
* Bare bones Python ML implementations: https://github.com/eriklindernoren/ML-From-Scratch

### Classification Metrics:

* http://scikit-learn.org/stable/modules/model_evaluation.html
* Today we will be using accuracy (compare to fscore, mrr, ...)

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

## Prepare Data (once again, sentiment analysis)

In [2]:
data = pd.read_pickle('sentiment.pkl')

data.shape

(8001, 4)

In [3]:
data[:5]

Unnamed: 0,polarity,tweet,split_tweet,w2v
888312,4,Breaky burrito at Whole Foods is a good way to...,"[breaky, burrito, at, whole, foods, is, a, goo...","[-0.077301025, 0.6977997, 1.0196915, 2.9152832..."
516573,0,i'm out! gonna check my facebook. please!!!!!!...,"[i'm, out, !, gonna, check, my, facebook, ., p...","[0.74108887, -0.13287354, 0.54785156, 1.409133..."
970735,4,yay just won mac msf in petticoat on ebay and ...,"[yay, just, won, mac, msf, in, petticoat, on, ...","[0.20593262, -0.48257446, -0.27441406, 1.66168..."
862961,4,@shezDOPEx3 i love you more,"[i, love, you, more]","[0.14550781, -0.41455078, 0.107910156, 0.83886..."
122643,0,@mahdi Maybe the problem is from my ISP,"[maybe, the, problem, is, from, my, isp]","[0.318573, 0.029671669, 0.38720703, 1.3040771,..."


In [4]:
dev=data.sample(frac=0.1,random_state=200)
train=data.drop(dev.index)


train.shape, dev.shape, Counter(train.polarity)

((7201, 4), (800, 4), Counter({4: 3586, 0: 3615}))

In [5]:
neg = train[train.polarity == 0]
pos = train[train.polarity == 4]

neg.w2v.shape, pos.w2v.shape

((3615,), (3586,))

In [6]:
pos.w2v[:1]

888312    [-0.077301025, 0.6977997, 1.0196915, 2.9152832...
Name: w2v, dtype: object

In [7]:
[0] * 10

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [8]:
pos = [x for x in pos.w2v if type(x) is not np.float64]
neg = [x for x in neg.w2v if type(x) is not np.float64]
labels = len(neg) * [0] + len(pos) * [1]

train_data =  list(neg) + list(pos)

len(train_data)

7158

## Logistic Regression

In [9]:
from sklearn import linear_model
model = linear_model.LogisticRegression(penalty='l2')

## Support Vector Machines

In [43]:
from sklearn import svm
model = svm.SVC()

In [45]:
# Linear SVM
model = svm.LinearSVC()

## Decision Tree

* A deeper explanation: http://nbviewer.jupyter.org/github/justmarkham/DAT4/blob/master/notebooks/15_decision_trees.ipynb

In [11]:
from sklearn import tree
model = tree.DecisionTreeClassifier()

## Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

## Neural Network

In [13]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1, 2), random_state=1)

## Pipeline (logres & SVM)

In [54]:
import sklearn
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
model = Pipeline([('logres', linear_model.LogisticRegression(penalty='l2')),
                  ('svm', svm.SVC())
                 ])


## Train & Test

In [14]:
from sklearn.metrics import accuracy_score

model.fit(train_data, labels)

dev = data[data.index.isin(dev.index)]

neg = dev[dev.polarity == 0].w2v
pos = dev[dev.polarity == 1].w2v

pos = [x for x in pos if type(x) is not np.float64]
neg = [x for x in neg if type(x) is not np.float64]

neg_guess = [model.predict(v.reshape(1, -1)) for v in neg]
pos_guess = [model.predict(v.reshape(1, -1)) for v in pos]

accuracy_score(neg_guess+pos_guess, len(neg_guess)*[0] + len(pos_guess) * [1])

1.0