## Feature extraction 

In [1]:
import pandas as pd
# Parse each split once and take both the text and the label column from the
# same frame, instead of re-reading the CSV for every column.
train_df = pd.read_csv("/code/data/train.csv")
val_df = pd.read_csv("/code/data/val.csv")

# Features are the raw headline strings; targets are the `is_sarcastic` labels.
X_train = train_df['headline']
y_train = train_df['is_sarcastic']

X_val = val_df['headline']
y_val = val_df['is_sarcastic']

In [2]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): `vectorizer` is not referenced again in the visible cells; the
# object that is actually fitted and used below is `tfidf_vectorizer`.
vectorizer = TfidfVectorizer()

# TF-IDF vectorizer with default settings. A variant that drops English stop
# words is kept as a commented-out alternative:
#tfidf_vectorizer = TfidfVectorizer(stop_words='english') 
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training headlines only and build the training
# feature matrix in the same step.
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the validation headlines with the vectorizer fitted above (no
# re-fitting), so both matrices share the same feature space.
tfidf_val = tfidf_vectorizer.transform(X_val)

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    """Run one ``GridSearchCV`` per estimator and tabulate the per-fold scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    params : dict
        Maps the same names to a parameter grid accepted by ``GridSearchCV``
        (a dict, or a list of dicts). Entries whose name is not in ``models``
        are never used.

    Raises
    ------
    ValueError
        If any name in ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        # Walk `models` in its own order so the error message is deterministic.
        missing_params = [name for name in models if name not in params]
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by `fit`: display name -> fitted GridSearchCV.
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a ``GridSearchCV`` for every registered estimator.

        Each search is stored in ``self.grid_searches`` under the estimator's
        name, replacing any previous search with that name. All keyword
        arguments are forwarded unchanged to ``GridSearchCV``;
        ``return_train_score`` is always enabled.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per evaluated parameter combination.

        Each row holds the estimator name, the min / mean / max / std of the
        per-fold test scores, and the parameter values of that combination
        (NaN for parameters that do not apply to the row). The name of each
        estimator is printed as it is processed.

        Parameters
        ----------
        sort_by : str
            Column to sort by, in descending order.

        Returns
        -------
        pandas.DataFrame
            Statistic columns first, followed by the parameter columns.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        stat_columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']

        records = []
        for name, search in self.grid_searches.items():
            print(name)
            results = search.cv_results_
            candidates = results['params']
            # Use the fitted `n_splits_` rather than the `cv` argument: `cv`
            # may be None or a splitter object, which cannot be passed to range().
            fold_scores = np.column_stack([
                np.asarray(results["split{}_test_score".format(i)])
                for i in range(search.n_splits_)
            ])
            # After stacking, row j holds the per-fold scores of candidate j.
            for candidate, scores in zip(candidates, fold_scores):
                records.append({
                    **candidate,
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                })

        if not records:
            raise ValueError("No grid search results to summarize; call fit() first.")

        # Building the frame from records in one go keeps numeric columns
        # numeric and avoids the column-wise concat + transpose round trip.
        df = pd.DataFrame(records).sort_values([sort_by], ascending=False)

        columns = stat_columns + [c for c in df.columns if c not in stat_columns]

        return df[columns]

In [6]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Candidate estimators, keyed by the name used to look up their grid in
# `params1` and to label rows in the score summary.
models1 = {
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

# Hyper-parameter grids, one per key in `models1`. The former
# 'ExtraTreesClassifier' grid was dropped: no such estimator is registered in
# `models1`, so that entry was never searched.
params1 = {
    'RandomForestClassifier': { 'n_estimators': [16, 32] },
    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
    # A list of grids: the linear kernel has no `gamma`, so it is searched
    # separately from the RBF kernel.
    'SVC': [
        {'kernel': ['linear'], 'C': [1, 10]},
        {'kernel': ['rbf'], 'C': [1, 10], 'gamma': [0.001, 0.0001]},
    ]
}

In [7]:
# Grid-search every candidate estimator on the TF-IDF training matrix,
# scoring each fold by accuracy and using four parallel workers.
helper1 = EstimatorSelectionHelper(models=models1, params=params1)
helper1.fit(X=tfidf_train, y=y_train, n_jobs=4, scoring='accuracy')

Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Running GridSearchCV for SVC.
Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed:   11.2s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed:   11.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:  2.4min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:   10.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed:    5.7s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   6 out of   6 | elapsed:    5.7s finished


In [8]:
# Rank every evaluated parameter combination by its best single-fold score
# (`max_score`), highest first.
helper1.score_summary(sort_by='max_score')

RandomForestClassifier
SVC
GradientBoostingClassifier
AdaBoostClassifier


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,gamma,kernel,learning_rate,n_estimators
2,SVC,0.827568,0.830354,0.835576,0.00369522,1.0,,linear,,
1,RandomForestClassifier,0.783368,0.795412,0.807618,0.00990074,,,,,32.0
3,SVC,0.784416,0.794946,0.805522,0.00861618,10.0,,linear,,
0,RandomForestClassifier,0.781796,0.789646,0.801503,0.00852951,,,,,16.0
9,GradientBoostingClassifier,0.754542,0.76175,0.766207,0.00514381,,,,0.8,32.0
11,GradientBoostingClassifier,0.754193,0.760934,0.766032,0.00497109,,,,1.0,32.0
8,GradientBoostingClassifier,0.734976,0.745443,0.753626,0.00778352,,,,0.8,16.0
10,GradientBoostingClassifier,0.737596,0.744453,0.749083,0.00494683,,,,1.0,16.0
13,AdaBoostClassifier,0.724668,0.732223,0.736897,0.00539215,,,,,32.0
12,AdaBoostClassifier,0.703005,0.712247,0.720825,0.00729006,,,,,16.0


## Classifiers 

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial naive Bayes baseline on the TF-IDF features.
clf1 = MultinomialNB()

# Fit on the training split and report validation accuracy, so the score shown
# under this cell is reproduced when the notebook is re-run from a fresh kernel.
clf1.fit(tfidf_train, y_train)
pred1 = clf1.predict(tfidf_val)
metrics.accuracy_score(y_val, pred1)

0.8322851153039832

In [4]:
from sklearn import svm

# Support-vector classifier. `probability=True` enables `predict_proba`, which
# the soft-voting ensemble built from `clf2` in a later cell needs.
clf2 = svm.SVC(gamma='scale', probability=True)

# NOTE(review): the standalone fit/evaluation below is commented out, so the
# accuracy displayed under this cell presumably comes from an earlier run —
# confirm before relying on it. Uncomment to reproduce it.
# clf2.fit(tfidf_train, y_train)
# pred2 = clf2.predict(tfidf_val)
# metrics.accuracy_score(y_val, pred2)

0.8481248544141626

In [5]:
from itertools import product
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble of the naive Bayes and SVM classifiers, equally
# weighted: the predicted class is the one with the highest averaged
# class probability.
eclf = VotingClassifier(
    estimators=[('mnb', clf1), ('svm', clf2)],
    voting='soft',
    weights=[1, 1],
)

# Train the ensemble on the TF-IDF training matrix, then score it on the
# validation split.
eclf.fit(tfidf_train, y_train)
pred3 = eclf.predict(tfidf_val)
metrics.accuracy_score(y_val, pred3)

0.8481248544141626

In [None]:
from sklearn.model_selection import GridSearchCV

0.8621010948054973