In [1]:
# List the files available in the `simplesentiment` input directory.
!ls ../input/simplesentiment

products_sentiment_sample_submission.csv  products_sentiment_train.tsv
products_sentiment_test.tsv


In [2]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression , LogisticRegressionCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler, Binarizer, StandardScaler

from sklearn.feature_selection import SelectKBest, chi2

from sklearn.model_selection import cross_val_score, GridSearchCV

In [3]:
# Directory that holds the input data files.
PATH_TO_DATA = '../input/simplesentiment/'

# File names inside PATH_TO_DATA: index 0 is the training set, index 1 the test set.
files = [
    'products_sentiment_train.tsv',
    'products_sentiment_test.tsv',
]

In [4]:
# The training file is read with header=None, so the two column names are
# supplied explicitly.
train = pd.read_csv(
    f"{PATH_TO_DATA}{files[0]}",
    sep='\t',
    header=None,
    names=['text', 'target'],
)
# The test file is read with pandas' default header handling.
test = pd.read_csv(f"{PATH_TO_DATA}{files[1]}", sep='\t')

In [5]:
# Count how many training rows carry each target label.
train['target'].value_counts()

1    1274
0     726
Name: target, dtype: int64

In [8]:
# The text column is the model input; the target column holds the labels.
X, y = train['text'], train['target']

In [234]:
def mean_std(estimator, X, y, cv=5, n_jobs=4, scoring='accuracy'):
    """Cross-validate ``estimator`` and summarise the per-fold scores.

    Parameters
    ----------
    estimator
        Estimator (or pipeline) handed unchanged to ``cross_val_score``.
    X, y
        Data and targets handed unchanged to ``cross_val_score``.
    cv
        Cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 5, the value that used to be hard-coded.
    n_jobs
        Number of parallel jobs forwarded to ``cross_val_score``.
        Defaults to 4, the value that used to be hard-coded.
    scoring
        Scoring forwarded to ``cross_val_score``.
        Defaults to ``'accuracy'``, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(mean, std, scores)``: ``np.mean`` and ``np.std`` of the per-fold
        scores, followed by the scores object returned by ``cross_val_score``.
    """
    scores = cross_val_score(
        estimator, X, y, cv=cv, n_jobs=n_jobs, scoring=scoring
    )
    return np.mean(scores), np.std(scores), scores

# SGDClassifier

In [11]:
from sklearn.linear_model import SGDClassifier

In [105]:
# TF-IDF n-grams -> max-abs scaling -> SGD classifier whose `alpha` is tuned by
# an inner 5-fold grid search scored on accuracy.
# Alternatives the author left disabled: a
# CountVectorizer(ngram_range=(1, 4), max_features=60000) vectorizer and a
# SelectKBest(score_func=chi2, k=50000) selection step.
sgd = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(ngram_range=(1, 4), sublinear_tf=True, max_features=50000),
    ),
    ('abs_scale', MaxAbsScaler()),
    (
        'classifier',
        GridSearchCV(
            estimator=SGDClassifier(
                penalty='l2',
                class_weight='balanced',
                random_state=11,
                n_jobs=-1,
            ),
            # 151 log-spaced candidates from 1e-5 to 1e-1.
            param_grid={'alpha': np.logspace(-5, -1, 151)},
            scoring='accuracy',
            cv=5,
            n_jobs=6,
        ),
    ),
])

In [106]:
# Fit the whole pipeline on the training data, then display the `alpha` picked
# by the grid-search step together with its best cross-validated score.
sgd.fit(X, y)
alpha_search = sgd.named_steps['classifier']
alpha_search.best_params_, alpha_search.best_score_

({'alpha': 0.0034145488738336043}, 0.771)

In [107]:
%%time

# Cross-validate the complete SGD pipeline. Its classifier step is itself a
# GridSearchCV, so `alpha` is re-tuned inside every outer fold.
mean_std(sgd, X, y)

CPU times: user 535 ms, sys: 71.7 ms, total: 606 ms
Wall time: 13.7 s


(0.7725000000000001,
 0.007582875444051565,
 array([0.7775, 0.785 , 0.7675, 0.7675, 0.765 ]))

# PassiveAggressiveClassifier

In [198]:
from sklearn.linear_model import PassiveAggressiveClassifier

# N-gram counts -> max-abs scaling -> Passive-Aggressive classifier whose `C`
# is tuned by an inner 5-fold grid search scored on accuracy.
# Alternative the author left disabled: TfidfVectorizer(ngram_range=(1, 4),
# sublinear_tf=True, max_features=50000) as the vectorizer.
# Note: `pa` is rebound to a fixed-C pipeline in the following cell.
pa = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 4), max_features=60000)),
    ('abs_scale', MaxAbsScaler()),
    (
        'classifier',
        GridSearchCV(
            estimator=PassiveAggressiveClassifier(n_jobs=4, random_state=1),
            # 51 log-spaced candidates from 1e-4 to 1e-3.
            param_grid={'C': np.logspace(-4, -3, 51)},
            scoring='accuracy',
            cv=5,
            n_jobs=4,
        ),
    ),
])

In [243]:
# Passive-Aggressive pipeline with a fixed `C` rather than a grid search:
# TF-IDF n-grams -> max-abs scaling -> classifier.
# Alternative the author left disabled:
# CountVectorizer(ngram_range=(1, 4), max_features=60000) as the vectorizer.
pa = Pipeline(steps=[
    (
        'vectorizer',
        TfidfVectorizer(max_features=50000, sublinear_tf=True, ngram_range=(1, 4)),
    ),
    ('abs_scale', MaxAbsScaler()),
    (
        'classifier',
        PassiveAggressiveClassifier(C=5 * 10**-4, random_state=13, n_jobs=4),
    ),
])

In [244]:
%%time

# Cross-validate the fixed-C Passive-Aggressive pipeline; returns
# (mean, std, per-fold scores).
mean_std(pa, X, y)

CPU times: user 53.1 ms, sys: 48.8 ms, total: 102 ms
Wall time: 1.82 s


(0.7715, 0.00982344135219425, array([0.7725, 0.7675, 0.78  , 0.755 , 0.7825]))

# Random Forest

In [228]:
# RandomForestClassifier is used below, so it must be imported here; the
# original cell imported only ExtraTreesClassifier and failed with NameError.
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# N-gram counts -> max-abs scaling -> random forest with balanced class weights.
rf = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 4), max_features=50000)),
    ('abs_scale', MaxAbsScaler()),
    (
        'classifier',
        RandomForestClassifier(
            n_jobs=4,
            random_state=1,
            n_estimators=4000,
            class_weight='balanced',
        ),
    ),
])

%%time

# Cross-validate the random-forest pipeline; returns (mean, std, per-fold scores).
# NOTE(review): `%%time` is a cell magic and must be the first line of its
# cell -- confirm this is a separate cell from the `rf` definition above.
mean_std(rf, X, y)