In [3]:
import json
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer

In [4]:
# Constants

# File paths
DATA_FP = "./data/" # Data file path
TWEETS_FP = DATA_FP + "tweets.json"
TRAIN_DEV_FP = DATA_FP + "labels-train+dev.tsv"
TEST_FP = DATA_FP + "labels-test.tsv"

# Column names
COL_ID = 'ID'
COL_TWEET = 'Tweet'
COL_LABEL = 'Label'

---
# Data Preprocessing

## Tweets

In [5]:
# process the first file (Tweets)
tweets = []
with open(TWEETS_FP, 'r') as tweets_fh:  # Tweets file handle
    for line in tweets_fh:   # put each line in a list of lines
        j_content = json.loads(line)
        tweets.append(j_content)

tweets = pd.DataFrame(tweets, columns=[COL_ID, COL_TWEET])  # make a dataframe out of it

In [6]:
tweets # looks good!

Unnamed: 0,ID,Tweet
0,483885347374243841,اللهم أفرح قلبي وقلب من أحب وأغسل أحزاننا وهمو...
1,484023414781263872,إضغط على منطقتك يتبين لك كم يتبقى من الوقت عن ...
2,484026168300273664,اللَّهٌمَّ صَلِّ وَسَلِّمْ عَلىٰ نَبِيِّنَآ مُ...
3,483819942878650369,@Dinaa_ElAraby اها يا بيبي والله اتهرست علي تو...
4,483793769079123971,• افضل كتاب قرأته هو : أمي (ابراهام لنكولن)\n🌹...
5,483934868070350849,@hudc7721 انتظري اجل \nخيره لك يارب 😘
6,483863369972473856,(وإن تجهر بالقول فإنه يعلم السر وأخفى) [طه:7]\...
7,483871567311413248,ﺧﻟك ﻋزﯾز آﻟﻧﻓس ﻟۈ ﮪﻣۈﻣك ﺟﺑآللاﭠﺷﺷﮐي ﻟﻟﻧآﺳس ﻣن ...
8,483931429902884864,عشان الجنّة أجمل ؟ الله يبعدنا عن كل ذنب مايخ...
9,483773756897124352,توجيه كيفية تثبيت البرامج الثابتة ROM التحميل ...


## Labels

In [7]:
# deal with both label documents

train_dev_labels = pd.read_csv(TRAIN_DEV_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])
test_labels = pd.read_csv(TEST_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])

In [8]:
train_dev_labels # looks about right

Unnamed: 0,Label,ID
0,ar,483762194908479488
1,ar,483762916097654784
2,ar,483764828784582656
3,ar,483765526683209728
4,ar,483768342315282432
5,ar,483770765985083392
6,ar,483770900127285248
7,ar,483770997892345857
8,ar,483773690769702912
9,ar,483773756897124352


In [9]:
tweets[COL_ID]=tweets[COL_ID].astype(int) # to allow for merge, need the same type

train_dev_data = pd.merge(tweets, train_dev_labels, on=COL_ID) # merge by ID
test_data = pd.merge(tweets, test_labels, on=COL_ID) # merge by ID

In [10]:
train_dev_data.head()

Unnamed: 0,ID,Tweet,Label
0,483885347374243841,اللهم أفرح قلبي وقلب من أحب وأغسل أحزاننا وهمو...,ar
1,484023414781263872,إضغط على منطقتك يتبين لك كم يتبقى من الوقت عن ...,ar
2,484026168300273664,اللَّهٌمَّ صَلِّ وَسَلِّمْ عَلىٰ نَبِيِّنَآ مُ...,ar
3,483819942878650369,@Dinaa_ElAraby اها يا بيبي والله اتهرست علي تو...,ar
4,483793769079123971,• افضل كتاب قرأته هو : أمي (ابراهام لنكولن)\n🌹...,ar


In [11]:
test_data.tail() 

Unnamed: 0,ID,Tweet,Label
13447,491992347278725120,『 』 免費的女性向戀愛遊戲★想和哥哥假結婚看看嗎？　https://t.co/YOUkKU...,zh-TW
13448,494650331528433666,#204.7.25 花蓮區漁會 http://t.co/yOukLkXwCr,zh-TW
13449,485752157937741824,@makzihau 当然是史地啦,zh-CN
13450,484586108282736641,@Official_SABC1 Moloooo nakuwe!!!,zu
13451,484672019091300352,精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神...,zh-TW


In [12]:
def drop_n_shuffle(data):
    data_no_na = data.dropna().copy()
    return data_no_na.sample(frac=1)

train_dev_data_prepared = drop_n_shuffle(train_dev_data).reset_index(drop = True)
train_set = train_dev_data_prepared.sample(frac=0.9, random_state=0) # take 90% of the data, reshuffle
test_set = drop_n_shuffle(test_data)
dev_set = train_dev_data_prepared.drop(train_set.index) # take 10% that remain

In [13]:
print(train_set.head()) #some checks
print(type(train_set))
print(dev_set.head())
print(type(dev_set))
print(test_set.head())
print(type(test_set))

                       ID                                              Tweet  \
36399  488684418572111872                                                Wtg   
20147  488074632235589633  Que Holanda le esté ganando a Brasil no mejora...   
3049   493779280405741568  “@viniloskombi: Mañana martes 29 amanecemos en...   
23766  488336323900493824  ほんまだいすき！友は宝やっ！\n最高にたのしかったです😭💓 http://t.co/G8Vl...   
38766  487533560379621377                                 @iqbaalupha sm" ^^   

      Label  
36399   und  
20147    es  
3049     es  
23766    ja  
38766   und  
<class 'pandas.core.frame.DataFrame'>
                    ID                                              Tweet  \
10  490919702957666304               il veux pas vivre longtemp ce lui la   
19  487397772337741826                                              Tst!!   
43  485776569655255040  You're so beautiful that you made me forget my...   
52  491719616172163072                           @ivansancheeez  pot ser.   
55  48898523

In [14]:
# drop the ID columns, not needed anymore

train = train_set.drop(COL_ID, axis=1)
dev = dev_set.drop(COL_ID, axis=1)
test = test_set.drop(COL_ID, axis=1)

In [15]:
train.head()

Unnamed: 0,Tweet,Label
36399,Wtg,und
20147,Que Holanda le esté ganando a Brasil no mejora...,es
3049,“@viniloskombi: Mañana martes 29 amanecemos en...,es
23766,ほんまだいすき！友は宝やっ！\n最高にたのしかったです😭💓 http://t.co/G8Vl...,ja
38766,"@iqbaalupha sm"" ^^",und


In [16]:
dev.head()

Unnamed: 0,Tweet,Label
10,il veux pas vivre longtemp ce lui la,fr
19,Tst!!,und
43,You're so beautiful that you made me forget my...,en
52,@ivansancheeez pot ser.,und
55,@SunshineZhouMi handsome mimi,en


In [17]:
test.head()

Unnamed: 0,Tweet,Label
1783,Oh wait wasn't him lol,en
2475,@InjaIsMyName @Harry_Styles anytime. I'm off t...,en
1613,@justinbieber please follow me ♥ ♥ ♥ i love yo...,en
1532,@SiahhLaw this is why everyone needs to stay a...,en
10388,よこちんクリーナーもらった http://t.co/r7H31I8P9o,ja


In [18]:
test

Unnamed: 0,Tweet,Label
1783,Oh wait wasn't him lol,en
2475,@InjaIsMyName @Harry_Styles anytime. I'm off t...,en
1613,@justinbieber please follow me ♥ ♥ ♥ i love yo...,en
1532,@SiahhLaw this is why everyone needs to stay a...,en
10388,よこちんクリーナーもらった http://t.co/r7H31I8P9o,ja
5237,Argentine Primera Division suspends play follo...,en
1552,3. there's nothing u can do for them to see wh...,en
10653,@yambiknim 정말 개 같군,ko
12986,@s724x_ 😂😭,und
1997,my fckn ear killing me !!!!!!!,en


In [19]:
train.describe() # descrption of data

Unnamed: 0,Tweet,Label
count,48122,48122
unique,48050,75
top,:(,en
freq,8,16897


In [20]:
train.groupby(COL_LABEL).size() # more description of teh data. See that there are lots of Arabian and English tweets, also quite a few in Spanish and Portuguese

Label
ar          2089
ar             1
ar_LATN        8
az             1
bg             1
bn             8
bs             4
ca            18
cs             3
cy             1
da             5
de           156
dv             1
el            34
en         16897
es          5398
et             2
fa            16
fi            14
fr           865
gl             2
ha             1
he            21
hi            14
hi-Latn       14
hr             4
ht             2
hu            13
hy             2
id          2733
           ...  
ne             2
nl           162
no            11
pl            84
ps             1
ps_LATN        1
pt          2592
ro            10
ru           886
si             1
sl             2
sq             9
sr            22
su             7
sv            45
sw             6
ta             9
th           419
tl           289
tn             1
tr           607
uk            13
und         4303
ur             7
ur_LATN       12
vi            16
wo             1
xh      

In [21]:
X_train = train.Tweet # split the data in Series
y_train = train.Label
X_test = test.Tweet
y_test = test.Label

In [22]:
print(type(X_train))
print(type(y_train))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [23]:
from sklearn.preprocessing import LabelEncoder
#encode the labels. First step means simple encoding, the second makes a series out of the array that was outputted and
    #the third step means we output strings again (strings are apparently needed as a format)

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_train_series = pd.Series(y_train_encoded)
y_train_str = y_train_series.apply(str)

y_test_encoded = label_encoder.fit_transform(y_test)
y_test_series = pd.Series(y_test_encoded)
y_test_str = y_test_series.apply(str)

In [24]:
pipeline1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf0', MultinomialNB())]) #first test with a pipeline

In [25]:
pipeline1.fit(X_train, y_train_str) #using y_train_str

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf0',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [26]:
pipeline1.predict(X_test)

array(['14', '14', '14', ..., '14', '14', '67'], dtype='<U2')

In [27]:
y_test_str #doesn't look too right, but it is a very simple model

0        10
1        10
2        10
3        10
4        23
5        10
6        10
7        27
8        52
9        10
10       11
11       39
12        0
13       39
14       23
15       15
16       36
17       11
18       11
19       39
20       23
21       10
22       39
23       23
24        0
25       23
26       39
27       10
28       50
29        0
         ..
13422    21
13423    21
13424    23
13425    10
13426    52
13427    10
13428    23
13429     0
13430    10
13431    10
13432    50
13433    10
13434    11
13435    23
13436    21
13437    10
13438    52
13439    11
13440    10
13441    10
13442     8
13443    52
13444    48
13445    10
13446    52
13447    10
13448    10
13449    10
13450    10
13451    52
Length: 13452, dtype: object

In [28]:
#honestly i don't know why i did that anymore!!!
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_tranformer = TfidfTransformer(smooth_idf=True).fit(X_train_counts)
X_train_tfidf = tfidf_tranformer.transform(X_train_counts)
print(type(X_train_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [29]:
#same here
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train_str)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
#follows here
y_predicted = nb_clf.predict(X_train_tfidf)
print(y_predicted[0:50])

['14' '15' '15' '32' '14' '29' '32' '29' '14' '14' '14' '51' '32' '19'
 '15' '14' '14' '19' '14' '14' '14' '14' '32' '14' '15' '14' '14' '32'
 '14' '14' '32' '14' '15' '14' '32' '14' '32' '14' '14' '29' '32' '14'
 '14' '15' '32' '14' '32' '14' '15' '32']


In [31]:
#and here
y_train_str

0        67
1        15
2        15
3        32
4        67
5        29
6        32
7        29
8        19
9        62
10       14
11       51
12       32
13       19
14       15
15       70
16       14
17       19
18       14
19       14
20       31
21       14
22       32
23       14
24       15
25       14
26       14
27       32
28       67
29        0
         ..
48092     0
48093    32
48094    32
48095    32
48096    67
48097    63
48098    15
48099    67
48100    14
48101    14
48102    14
48103    14
48104    14
48105    32
48106    14
48107    14
48108    14
48109    51
48110    67
48111    15
48112    14
48113    29
48114    14
48115    51
48116    15
48117    14
48118    14
48119    51
48120    51
48121    67
Length: 48122, dtype: object

In [32]:
#first serious pipeline with ngrams and tfidf
pipeline_NB01 = Pipeline([
    ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf01', MultinomialNB())
])

In [33]:
#param_grid is fitting the pipeline_NB01
param_grid01 = {'clf01__alpha': [0.2, 0.6, 0.8, 1.0],
                 'clf01__fit_prior': [True, False]}  #'ngram__ngram_range': [(1, 1), (1, 2), (1, 4)]


In [34]:
#This model here seems to work
gs_NB01= GridSearchCV(pipeline_NB01, param_grid01, cv=2, n_jobs=2, verbose=1)
gs_NB01.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:   49.3s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('ngram',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 4),
                                           

In [35]:
y_NB01 = gs_NB01.predict(X_test)

In [36]:
y_test #results are quite terrible though 
# they are actually better than with the avg word length extractor included in the features :D wtf o_o

1783      en
2475      en
1613      en
1532      en
10388     ja
5237      en
1552      en
10653     ko
12986    und
1997      en
6680      es
11063     pt
536       ar
11314     pt
9952      ja
7058      fr
10795     nl
5872      es
6013      es
10879     pt
8265      ja
3352      en
11500     pt
8770      ja
9         ar
8474      ja
11279     pt
2361      en
12131     tr
83        ar
        ... 
7799      id
7474      id
9197      ja
3379      en
12327    und
5056      en
9537      ja
378       ar
3351      en
4400      en
12045     tr
1880      en
6129      es
8377      ja
7201      id
4114      en
12614    und
6535      es
2819      en
1690      en
577       de
12447    und
11911     th
2288      en
12275    und
4341      en
1788      en
2218      en
5391      en
13048    und
Name: Label, Length: 13452, dtype: object

In [37]:
accuracy_score(y_test, y_NB01) 

0.732530478739221

But here we really need to find the best_model out of this data i got. I don't remember how to see it in the matrix, see Tutorial!

In [45]:
#Average word length extractor, inspired  by https://michelleful.github.io/code-blog/2015/06/20/pipelines/)
class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in dataframe, extracts tweet column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, tweet):
        """Helper code to compute average word length of a tweet"""
        return np.mean([len(word) for word in tweet.split()])

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        # the result of the transform needs to be a 2d array a.k.a. dataframe
        # https://stackoverflow.com/a/50713209
        result = df.apply(self.average_word_length).to_frame()
        print(result)
        return result

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

In [46]:
# read this blog post on how to construct feature unions :) 
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

# the problem was that our previous pipeline went sorta like this:
# create n-grams from text (CountVectorizer) -> tfidf from ngrams (TfidfTransformer) -> average length from ngrams (AvgWLExtractor) ummm :) that wouldn't work... we need to compute the average word length from the original data (tweets / strings).
# that's why we have to do these two "pipelines" separately => now we just compute the avg from the strings like so...

pipeline_NB1 = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer())
        ])),
        # second feature
        ('ave_scaled', Pipeline([
            ('ave', AverageWordLengthExtractor()),
            ('scale', Normalizer())
        ]))
    ])),
    ('nb_clf', MultinomialNB()) # classifier
])

In [40]:
#-*- coding: utf-8 -*-
y_train.describe()

count     48122
unique       75
top          en
freq      16897
Name: Label, dtype: object

In [41]:
param_grid1 = {'nb_clf__alpha': [0.2, 0.6, 0.8, 1.0],
                 'nb_clf__fit_prior': [True, False]}  #'ngram__ngram_range': [(1, 1), (1, 2), (1, 4)]


In [47]:
gs_NB1= GridSearchCV(pipeline_NB1, param_grid1, cv=2, n_jobs=2, verbose=1)
gs_NB1.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:  1.1min finished


           Tweet
36399   3.000000
20147   5.050000
3049    7.437500
23766  16.333333
38766   5.333333
5650    9.500000
13094  15.666667
33107   6.052632
43340   4.200000
42515   8.000000
51396   6.000000
44254   4.375000
34152  13.333333
51384   3.818182
771     4.625000
47383   4.666667
47590   5.800000
27982   5.578947
40752   4.136364
16687   6.200000
43396   7.000000
48082   6.000000
24973  17.500000
40278   5.500000
44075   4.500000
16965   6.800000
11910   5.000000
7958   24.000000
40026   7.000000
2134    6.750000
...          ...
24834   6.411765
45173  40.333333
37938  10.000000
46872  16.000000
23163   1.000000
49577   5.428571
42737   6.000000
171    18.000000
39140   6.166667
1798    5.000000
36724   6.000000
28622   6.555556
10810   4.217391
3765   24.000000
19841   5.500000
4587    8.333333
2240    6.200000
6833    6.200000
3397    3.400000
11795   5.882353
50072   3.142857
18164   6.000000
24022   5.300000
8294    4.529412
15467   5.285714
32018   4.600000
44447   7.0588

GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
   

In [54]:
y_NB1 = gs_NB1.predict(X_test)

           Tweet
1783    3.600000
2475    5.444444
1613    3.285714
1532    4.272727
10388  17.500000
5237    7.058824
1552    3.105263
10653   3.750000
12986   4.500000
1997    4.166667
6680    3.961538
11063   4.230769
536     4.863636
11314   5.200000
9952   12.666667
7058    3.944444
10795   7.333333
5872    3.500000
6013    4.625000
10879   4.000000
8265   18.000000
3352    6.666667
11500   4.200000
8770   17.666667
9       6.266667
8474    1.000000
11279   4.000000
2361    3.722222
12131  11.444444
83      6.500000
...          ...
7799    7.272727
7474    4.500000
9197    2.900000
3379    5.250000
12327   7.500000
5056    7.875000
9537   12.500000
378     4.714286
3351    4.200000
4400    5.000000
12045   7.312500
1880    4.833333
6129    4.333333
8377   10.200000
7201   10.000000
4114    4.423077
12614  10.500000
6535    5.562500
2819    4.333333
1690    5.800000
577     8.500000
12447  16.500000
11911  16.500000
2288    4.117647
12275   8.833333
4341    6.166667
1788    5.0000

In [55]:
accuracy_score(y_test, y_NB1)
# not rly sure why the accuracy is lower than NB01, kinda confused bout this :D

0.6856972940826643

In [123]:
# Let's test without avg but with a nested FeatureUnion with 1 feature (pipeline)
# This should have the same accuracy as NB01, and it does... which implies that the pipeline/featureunion thing is correct. I'd say we have to look into hyperparams to fix the accuracy.
pipeline_NB2 = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('nb_clf', MultinomialNB()) # classifier
])

gs_NB2= GridSearchCV(pipeline_NB2, param_grid1, cv=2, n_jobs=2, verbose=1) # same param_grid as NB1
gs_NB2.fit(X_train, y_train)
y_NB2 = gs_NB2.predict(X_test)
accuracy_score(y_test, y_NB2)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:   52.5s finished


0.7334225393993459

In [138]:
#have a try with SGD, same features!
pipeline_SGD = Pipeline([
    ('feats', FeatureUnion([
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer()), 
        ])),
        ('ave', AverageWordLengthExtractor())
    ])),
    ('SGD_clf', SGDClassifier())# classifier
])

In [139]:
grid_param_SGD = {'SGD_clf__loss': ['hinge', 'log'],
                  'SGD_clf__penalty': ['none', 'l1', 'l2'],
                  'SGD_clf__max_iter': [50, 100, 500, 1000]}

In [140]:
gs_SGD = GridSearchCV(pipeline_SGD, grid_param_SGD, cv=2, n_jobs=4, verbose=1)
gs_SGD.fit(X_train, y_train)
##NOTE: it crashed^s at the same point: could not convert string to float: 'ញាំថ្នាំផ្តាសសាយខ្លាំពេកឡើងគេញលែងចង់ចង់ក្រោកហើយ....'
#Maybe some problem with the encoding?

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed:  5.5min finished


<class 'pandas.core.series.Series'>


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feats',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
      

In [136]:
y_SGD = gs_SGD.predict(X_test)

In [141]:
accuracy_score(y_test, y_SGD)

0.8403954802259888