In [34]:
import json
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [35]:
# Constants

# File paths
DATA_FP = "./data/" # Data file path
TWEETS_FP = DATA_FP + "tweets.json"
TRAIN_DEV_FP = DATA_FP + "labels-train+dev.tsv"
TEST_FP = DATA_FP + "labels-test.tsv"

# Column names
COL_ID = 'ID'
COL_TWEET = 'Tweet'
COL_LABEL = 'Label'

---
# Data Preprocessing

## Tweets

In [36]:
# process the first file (Tweets)
tweets = []
with open(TWEETS_FP, 'r') as tweets_fh:  # Tweets file handle
    for line in tweets_fh:   # put each line in a list of lines
        j_content = json.loads(line)
        tweets.append(j_content)

tweets = pd.DataFrame(tweets, columns=[COL_ID, COL_TWEET])  # make a dataframe out of it

In [37]:
tweets # looks good!

Unnamed: 0,ID,Tweet
0,483885347374243841,اللهم أفرح قلبي وقلب من أحب وأغسل أحزاننا وهمو...
1,484023414781263872,إضغط على منطقتك يتبين لك كم يتبقى من الوقت عن ...
2,484026168300273664,اللَّهٌمَّ صَلِّ وَسَلِّمْ عَلىٰ نَبِيِّنَآ مُ...
3,483819942878650369,@Dinaa_ElAraby اها يا بيبي والله اتهرست علي تو...
4,483793769079123971,• افضل كتاب قرأته هو : أمي (ابراهام لنكولن)\n🌹...
5,483934868070350849,@hudc7721 انتظري اجل \nخيره لك يارب 😘
6,483863369972473856,(وإن تجهر بالقول فإنه يعلم السر وأخفى) [طه:7]\...
7,483871567311413248,ﺧﻟك ﻋزﯾز آﻟﻧﻓس ﻟۈ ﮪﻣۈﻣك ﺟﺑآللاﭠﺷﺷﮐي ﻟﻟﻧآﺳس ﻣن ...
8,483931429902884864,عشان الجنّة أجمل ؟ الله يبعدنا عن كل ذنب مايخ...
9,483773756897124352,توجيه كيفية تثبيت البرامج الثابتة ROM التحميل ...


## Labels

In [38]:
# deal with both label documents

train_dev_labels = pd.read_csv(TRAIN_DEV_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])
test_labels = pd.read_csv(TEST_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])

In [39]:
train_dev_labels # looks about right

Unnamed: 0,Label,ID
0,ar,483762194908479488
1,ar,483762916097654784
2,ar,483764828784582656
3,ar,483765526683209728
4,ar,483768342315282432
5,ar,483770765985083392
6,ar,483770900127285248
7,ar,483770997892345857
8,ar,483773690769702912
9,ar,483773756897124352


In [40]:
tweets[COL_ID]=tweets[COL_ID].astype(int) # to allow for merge, need the same type

train_dev_data = pd.merge(tweets, train_dev_labels, on=COL_ID) # merge by ID
test_data = pd.merge(tweets, test_labels, on=COL_ID) # merge by ID

In [41]:
train_dev_data.head()

Unnamed: 0,ID,Tweet,Label
0,483885347374243841,اللهم أفرح قلبي وقلب من أحب وأغسل أحزاننا وهمو...,ar
1,484023414781263872,إضغط على منطقتك يتبين لك كم يتبقى من الوقت عن ...,ar
2,484026168300273664,اللَّهٌمَّ صَلِّ وَسَلِّمْ عَلىٰ نَبِيِّنَآ مُ...,ar
3,483819942878650369,@Dinaa_ElAraby اها يا بيبي والله اتهرست علي تو...,ar
4,483793769079123971,• افضل كتاب قرأته هو : أمي (ابراهام لنكولن)\n🌹...,ar


In [42]:
test_data.tail() 

Unnamed: 0,ID,Tweet,Label
13447,491992347278725120,『 』 免費的女性向戀愛遊戲★想和哥哥假結婚看看嗎？　https://t.co/YOUkKU...,zh-TW
13448,494650331528433666,#204.7.25 花蓮區漁會 http://t.co/yOukLkXwCr,zh-TW
13449,485752157937741824,@makzihau 当然是史地啦,zh-CN
13450,484586108282736641,@Official_SABC1 Moloooo nakuwe!!!,zu
13451,484672019091300352,精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神統一精神...,zh-TW


In [43]:
def drop_n_shuffle(data):
    data_no_na = data.dropna().copy()
    return data_no_na.sample(frac=1)

train_dev_data_prepared = drop_n_shuffle(train_dev_data).reset_index(drop = True)
train_set = train_dev_data_prepared.sample(frac=0.9, random_state=0) # take 90% of the data, reshuffle
test_set = drop_n_shuffle(test_data)
dev_set = train_dev_data_prepared.drop(train_set.index) # take 10% that remain

In [44]:
print(train_set.head()) #some checks
print(type(train_set))
print(dev_set.head())
print(type(dev_set))
print(test_set.head())
print(type(test_set))

                       ID                                              Tweet  \
36399  491331570322907136  dois mil e catorze pior ano #MTVHottest One Di...   
20147  488373339380285440            Chidinma you so cuuuteee #ChurchillShow   
3049   487037857337724928  Posta que hace como dos dias no salgo de mi ca...   
23766  489222461859196928  @RevyChamber who's ready for this weekend's ev...   
38766  489431347136851968       muita onda esse comercial do NUCE com Magrão   

      Label  
36399   und  
20147    en  
3049     es  
23766    en  
38766    pt  
<class 'pandas.core.frame.DataFrame'>
                    ID                                              Tweet  \
10  492611996479533058                             @vampirechance thnx ;)   
19  487018018589777920  New Job: Environmental, Occupation, Safety, an...   
43  488785743330811904            @yugro f4bfae1ecbe90d04e55f5f1dd5a75b3a   
52  486612107845459970  @De_Zurda Este partido #BRAvsALE le da la razó...   
55  49230617

In [45]:
# drop the ID columns, not needed anymore

train = train_set.drop(COL_ID, axis=1)
dev = dev_set.drop(COL_ID, axis=1)
test = test_set.drop(COL_ID, axis=1)

In [46]:
train.head()

Unnamed: 0,Tweet,Label
36399,dois mil e catorze pior ano #MTVHottest One Di...,und
20147,Chidinma you so cuuuteee #ChurchillShow,en
3049,Posta que hace como dos dias no salgo de mi ca...,es
23766,@RevyChamber who's ready for this weekend's ev...,en
38766,muita onda esse comercial do NUCE com Magrão,pt


In [47]:
dev.head()

Unnamed: 0,Tweet,Label
10,@vampirechance thnx ;),und
19,"New Job: Environmental, Occupation, Safety, an...",en
43,@yugro f4bfae1ecbe90d04e55f5f1dd5a75b3a,und
52,@De_Zurda Este partido #BRAvsALE le da la razó...,es
55,@JoeLydon51,und


In [48]:
test.head()

Unnamed: 0,Tweet,Label
5857,Tardee demasiado buenas con las que mas quiero...,es
10019,フィズの尻尾に巻いてある布…何あれ？,ja
1974,“@tinysatii: Fting jay is so funny 😭😂🙌”Yall Ir...,en
12988,"Pesta natal heboh, ada di Now Jakarta The Chri...",und
10262,4th見終わった\n寝る,ja


In [49]:
test

Unnamed: 0,Tweet,Label
5857,Tardee demasiado buenas con las que mas quiero...,es
10019,フィズの尻尾に巻いてある布…何あれ？,ja
1974,“@tinysatii: Fting jay is so funny 😭😂🙌”Yall Ir...,en
12988,"Pesta natal heboh, ada di Now Jakarta The Chri...",und
10262,4th見終わった\n寝る,ja
10568,俺たち～ガチで吐きにっ！,ja
3098,Why human always do the same mistake? 'coz hum...,en
8431,誰か部内宴でどぶろっくとテツandトモを一緒にやってくれる人いませんか,ja
5491,Todo es cuestion de superaaaar.,es
7100,Franchement si tu pouvais ne pas exister ça me...,fr


In [50]:
train.describe() # descrption of data

Unnamed: 0,Tweet,Label
count,48122,48122
unique,48054,76
top,:(,en
freq,6,16849


In [51]:
train.groupby(COL_LABEL).size() # more description of teh data. See that there are lots of Arabian and English tweets, also quite a few in Spanish and Portuguese

Label
ar          2031
ar             1
ar_LATN       10
az             1
bg             2
bn             7
bs             3
ca            20
cs             3
cy             1
da             7
de           153
dv             1
el            35
en         16849
es          5401
et             2
fa            18
fi            14
fr           874
gl             2
ha             1
he            27
hi            15
hi-Latn       15
hr             5
ht             2
hu            13
hy             2
id          2734
           ...  
nl           163
no             9
pl            83
ps             1
ps_LATN        1
pt          2606
ro            11
ru           894
si             1
sl             2
sq             7
sr            19
su             9
sv            49
sw             6
ta             8
ta_LATN        1
th           422
tl           289
tn             1
tr           606
uk            16
und         4357
ur             6
ur_LATN       12
vi            14
wo             1
xh      

In [52]:
X_train = train.Tweet # split the data in Series
y_train = train.Label
X_test = test.Tweet
y_test = test.Label

In [53]:
print(type(X_train))
print(type(y_train))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [54]:
from sklearn.preprocessing import LabelEncoder
#encode the labels. First step means simple encoding, the second makes a series out of the array that was outputted and
    #the third step means we output strings again (strings are apparently needed as a format)

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_train_series = pd.Series(y_train_encoded)
y_train_str = y_train_series.apply(str)

y_test_encoded = label_encoder.fit_transform(y_test)
y_test_series = pd.Series(y_test_encoded)
y_test_str = y_test_series.apply(str)

In [55]:
pipeline1 = Pipeline([('tfidf', TfidfVectorizer()), ('clf0', MultinomialNB())]) #first test with a pipeline

In [56]:
pipeline1.fit(X_train, y_train_str) #using y_train_str

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf0',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [57]:
pipeline1.predict(X_test)

array(['15', '14', '14', ..., '14', '14', '0'], dtype='<U2')

In [58]:
y_test_str #doesn't look too right, but it is a very simple model

0        11
1        23
2        10
3        52
4        23
5        23
6        10
7        23
8        11
9        15
10       36
11       39
12       23
13       41
14       23
15       48
16       23
17       39
18       10
19       52
20       21
21       11
22        0
23       10
24       15
25       10
26       23
27       21
28       52
29       10
         ..
13422    10
13423    52
13424    11
13425    23
13426    23
13427    52
13428    23
13429    15
13430    21
13431    23
13432    41
13433    11
13434    50
13435    10
13436    10
13437    23
13438    23
13439    10
13440     8
13441    10
13442    11
13443    11
13444    39
13445    21
13446    11
13447    10
13448    23
13449    23
13450    10
13451     0
Length: 13452, dtype: object

In [59]:
#honestly i don't know why i did that anymore!!!
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_tranformer = TfidfTransformer(smooth_idf=True).fit(X_train_counts)
X_train_tfidf = tfidf_tranformer.transform(X_train_counts)
print(type(X_train_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [60]:
#same here
nb_clf = MultinomialNB()
nb_clf.fit(X_train_tfidf, y_train_str)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [61]:
#follows here
y_predicted = nb_clf.predict(X_train_tfidf)
print(y_predicted[0:50])

['14' '14' '15' '14' '51' '15' '15' '29' '32' '14' '14' '29' '19' '15'
 '32' '14' '14' '14' '15' '14' '15' '51' '68' '14' '0' '32' '14' '14' '14'
 '32' '15' '29' '15' '29' '14' '14' '51' '14' '14' '14' '32' '14' '14'
 '15' '15' '15' '14' '29' '32' '0']


In [62]:
#and here
y_train_str

0        68
1        14
2        15
3        14
4        51
5        15
6        15
7        29
8        32
9        14
10       14
11       29
12       19
13       15
14       32
15       14
16       18
17       63
18       15
19       61
20       15
21       51
22       68
23       68
24        0
25       32
26       14
27       66
28       14
29       32
         ..
48092    32
48093    64
48094    14
48095    53
48096    32
48097    51
48098    14
48099    36
48100    14
48101    14
48102    15
48103    68
48104    15
48105    14
48106    15
48107    32
48108    32
48109    14
48110     0
48111    14
48112    15
48113    68
48114    14
48115    15
48116    64
48117    51
48118    32
48119    14
48120    32
48121    31
Length: 48122, dtype: object

In [64]:
#first serious pipeline with ngrams and tfidf
pipeline_NB01 = Pipeline([
    ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
    ('tfidf', TfidfTransformer()),
    ('clf01', MultinomialNB())
])

In [65]:
#param_grid is fitting the pipeline_NB01
param_grid01 = {'clf01__alpha': [0.2, 0.6, 0.8, 1.0],
                 'clf01__fit_prior': [True, False]}  #'ngram__ngram_range': [(1, 1), (1, 2), (1, 4)]


In [66]:
#This model here seems to work
gs_NB01= GridSearchCV(pipeline_NB01, param_grid01, cv=2, n_jobs=2, verbose=1)
gs_NB01.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:   54.0s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('ngram',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 4),
                                           

In [113]:
y_NB01 = gs_NB01.predict(X_test)

In [114]:
y_test #results are quite terrible though 
# they are actually better than with the avg word length extractor included in the features :D wtf o_o

5857      es
10019     ja
1974      en
12988    und
10262     ja
10568     ja
3098      en
8431      ja
5491      es
7100      fr
10804     nl
11141     pt
8484      ja
11781     ru
9145      ja
11875     th
8236      ja
11145     pt
903       en
13265    und
7542      id
5736      es
98        ar
4328      en
7141      fr
2267      en
8122      ja
7473      id
13181    und
3533      en
        ... 
3482      en
12302    und
5578      es
8998      ja
9113      ja
12873    und
10017     ja
7021      fr
7694      id
10117     ja
11774     ru
6887      es
12150     tr
2042      en
4824      en
9738      ja
8207      ja
5360      en
585       de
3605      en
5443      es
6502      es
10951     pt
7536      id
5936      es
1683      en
8733      ja
9740      ja
3279      en
302       ar
Name: Label, Length: 13452, dtype: object

In [115]:
accuracy_score(y_test, y_NB01) 

0.7334225393993459

But here we really need to find the best_model out of this data i got. I don't remember how to see it in the matrix, see Tutorial!

In [88]:
#Average word length extractor, inspired  by https://michelleful.github.io/code-blog/2015/06/20/pipelines/)
class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in dataframe, extracts tweet column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, tweet):
        """Helper code to compute average word length of a tweet"""
        return np.mean([len(word) for word in tweet.split()])

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        # the result of the transform needs to be a 2d array a.k.a. dataframe
        # https://stackoverflow.com/a/50713209
        return df.apply(self.average_word_length).to_frame()

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

In [124]:
# read this blog post on how to construct feature unions :) 
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

# the problem was that our previous pipeline went sorta like this:
# create n-grams from text (CountVectorizer) -> tfidf from ngrams (TfidfTransformer) -> average length from ngrams (AvgWLExtractor) ummm :) that wouldn't work... we need to compute the average word length from the original data (tweets / strings).
# that's why we have to do these two "pipelines" separately => now we just compute the avg from the strings like so...

pipeline_NB1 = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer())
        ])),
        # second feature
        ('ave', AverageWordLengthExtractor())
    ])),
    ('nb_clf', MultinomialNB()) # classifier
])

In [118]:
#-*- coding: utf-8 -*-
y_train.describe()

count     48122
unique       76
top          en
freq      16849
Name: Label, dtype: object

In [125]:
param_grid1 = {'nb_clf__alpha': [0.2, 0.6, 0.8, 1.0],
                 'nb_clf__fit_prior': [True, False]}  #'ngram__ngram_range': [(1, 1), (1, 2), (1, 4)]


In [126]:
gs_NB1= GridSearchCV(pipeline_NB1, param_grid1, cv=2, n_jobs=2, verbose=1)
gs_NB1.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:  1.0min finished


<class 'pandas.core.series.Series'>


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
   

In [127]:
y_NB1 = gs_NB1.predict(X_test)

<class 'pandas.core.series.Series'>


In [128]:
accuracy_score(y_test, y_NB1)
# not rly sure why the accuracy is lower than NB01, kinda confused bout this :D

0.605783526613143

In [123]:
# Let's test without avg but with a nested FeatureUnion with 1 feature (pipeline)
# This should have the same accuracy as NB01, and it does... which implies that the pipeline/featureunion thing is correct. I'd say we have to look into hyperparams to fix the accuracy.
pipeline_NB2 = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('nb_clf', MultinomialNB()) # classifier
])

gs_NB2= GridSearchCV(pipeline_NB2, param_grid1, cv=2, n_jobs=2, verbose=1) # same param_grid as NB1
gs_NB2.fit(X_train, y_train)
y_NB2 = gs_NB2.predict(X_test)
accuracy_score(y_test, y_NB2)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  16 out of  16 | elapsed:   52.5s finished


0.7334225393993459

In [138]:
#have a try with SGD, same features!
pipeline_SGD = Pipeline([
    ('feats', FeatureUnion([
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer()), 
        ])),
        ('ave', AverageWordLengthExtractor())
    ])),
    ('SGD_clf', SGDClassifier())# classifier
])

In [139]:
grid_param_SGD = {'SGD_clf__loss': ['hinge', 'log'],
                  'SGD_clf__penalty': ['none', 'l1', 'l2'],
                  'SGD_clf__max_iter': [50, 100, 500, 1000]}

In [140]:
gs_SGD = GridSearchCV(pipeline_SGD, grid_param_SGD, cv=2, n_jobs=4, verbose=1)
gs_SGD.fit(X_train, y_train)
##NOTE: it crashed^s at the same point: could not convert string to float: 'ញាំថ្នាំផ្តាសសាយខ្លាំពេកឡើងគេញលែងចង់ចង់ក្រោកហើយ....'
#Maybe some problem with the encoding?

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed:  5.5min finished


<class 'pandas.core.series.Series'>


GridSearchCV(cv=2, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feats',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
      

In [136]:
y_SGD = gs_SGD.predict(X_test)

In [141]:
accuracy_score(y_test, y_SGD)

0.8403954802259888