In [211]:
import json
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.neural_network import MLPClassifier

In [212]:
# Constants

# File paths
DATA_FP = "./data/" # Data file path
TWEETS_FP = DATA_FP + "tweets.json"
TRAIN_DEV_FP = DATA_FP + "labels-train+dev.tsv"
TEST_FP = DATA_FP + "labels-test.tsv"

# Column names
COL_ID = 'ID'
COL_TWEET = 'Tweet'
COL_LABEL = 'Label'

---
# Data

## Tweets

In [213]:
# process the first file (Tweets)
tweets = []
with open(TWEETS_FP, 'r') as tweets_fh:  # Tweets file handle
    for line in tweets_fh:   # put each line in a list of lines
        j_content = json.loads(line)
        tweets.append(j_content)

tweets = pd.DataFrame(tweets, columns=[COL_ID, COL_TWEET])  # make a dataframe out of it

## Labels

In [214]:
# deal with both label documents

train_dev_labels = pd.read_csv(TRAIN_DEV_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])
test_labels = pd.read_csv(TEST_FP, sep='\t', header=None, names=[COL_LABEL, COL_ID])

# deal with class imbalance in the train set
lang_occurence = train.groupby(COL_LABEL).size()
MIN_NR_OCCURENCES = 5  # minimum number of instances that we require to be present in the training set for a given language to be included in fitting of the model
balanced_languages = lang_occurence.where(lambda x: x >= MIN_NR_OCCURENCES).dropna().index.values
balanced_labels = train_dev_labels.Label.isin(balanced_languages)

# Option 1 - replace rows that are labelled with an imbalanced language
train_dev_labels.loc[~balanced_labels, 'Label'] = 'unknown'  # ~ is the element-wise logical not

# Option 2 - keep the rows that are labelled with a balanced language
# train_dev_labels = train_dev_labels[balanced_labels]

In [215]:
tweets[COL_ID] = tweets[COL_ID].astype(int) # to allow for merge, need the same type

train_dev_data = pd.merge(tweets, train_dev_labels, on=COL_ID) # merge by ID
test_data = pd.merge(tweets, test_labels, on=COL_ID) # merge by ID

### Splitting

In [216]:
def drop_n_shuffle(data):
    data_no_na = data.dropna().copy()
    return data_no_na.sample(frac=1)

train_dev_data_prepared = drop_n_shuffle(train_dev_data).reset_index(drop = True)
train_set = train_dev_data_prepared.sample(frac=0.9, random_state=0) # take 90% of the data, reshuffle
test_set = drop_n_shuffle(test_data)
dev_set = train_dev_data_prepared.drop(train_set.index) # take 10% that remain

In [217]:
# drop the ID columns, not needed anymore

train = train_set.drop(COL_ID, axis=1)
dev = dev_set.drop(COL_ID, axis=1)
test = test_set.drop(COL_ID, axis=1)

In [218]:
X_train = train.Tweet 
y_train = train.Label
X_dev = dev.Tweet
y_dev = dev.Label
X_test = test.Tweet
y_test = test.Label

# Part 1 - Language classification with linear classification

## Multinomial Naïve Bayes

In [219]:
#Average word length extractor, inspired  by https://michelleful.github.io/code-blog/2015/06/20/pipelines/)
class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Takes in dataframe, extracts tweet column, outputs average word length"""

    def __init__(self):
        pass

    def average_word_length(self, tweet):
        """Helper code to compute average word length of a tweet"""
        return np.mean([len(word) for word in tweet.split()])

    def transform(self, df, y=None):
        """The workhorse of this feature extractor"""
        # the result of the transform needs to be a 2d array a.k.a. dataframe
        # https://stackoverflow.com/a/50713209
        result = df.apply(self.average_word_length).to_frame()
        return result

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

In [235]:
pipeline_NB = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(analyzer='word')),
            ('tfidf', TfidfTransformer())
        ])),
        # second feature
        ('ave_scaled', Pipeline([
            ('ave', AverageWordLengthExtractor()),
            ('scale', MinMaxScaler())
        ]))
    ])),
    ('nb_clf', MultinomialNB()) # classifier
])

param_grid1 = {'nb_clf__alpha': [0.1, 0.2, 0.3, 0.6],
                'nb_clf__fit_prior': [False],
                'features__ngram_tfidf__ngram__ngram_range': [(1, 2), (1, 4)]}

In [None]:
gs_NB = GridSearchCV(pipeline_NB, param_grid1, cv=4, n_jobs=-1, verbose=10)
gs_NB.fit(X_train, y_train)

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   22.8s


In [None]:
y_NB = gs_NB.predict(X_dev)
accuracy_score(y_dev, y_NB)

### Results

In [232]:
res = pd.DataFrame.from_dict(gs_NB1.cv_results_)
res.sort_values(by='rank_test_score')[['rank_test_score', 'param_features__ngram_tfidf__ngram__ngram_range', 'param_nb_clf__alpha', 'param_nb_clf__fit_prior', 'mean_test_score']]

Unnamed: 0,rank_test_score,param_features__ngram_tfidf__ngram__ngram_range,param_nb_clf__alpha,param_nb_clf__fit_prior,mean_test_score
1,1,"(1, 2)",0.2,False,0.844728
9,2,"(1, 4)",0.2,False,0.840946
3,3,"(1, 2)",0.6,False,0.831179
5,4,"(1, 2)",0.8,False,0.826337
11,5,"(1, 4)",0.6,False,0.825776
7,6,"(1, 2)",1.0,False,0.821973
13,7,"(1, 4)",0.8,False,0.82029
15,8,"(1, 4)",1.0,False,0.816176
0,9,"(1, 2)",0.2,True,0.675346
8,10,"(1, 4)",0.2,True,0.660363


## SGD

In [175]:
pipeline_SGD = Pipeline([
    ('feats', FeatureUnion([
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer()), 
        ])),
        # second feature
        ('ave_scaled', Pipeline([
            ('ave', AverageWordLengthExtractor()),
            ('scale', MinMaxScaler())
        ]))
    ])),
    ('SGD_clf', SGDClassifier())# classifier
])

In [176]:
grid_param_SGD = {'SGD_clf__loss': ['hinge', 'log'],
                  'SGD_clf__penalty': ['none', 'l1', 'l2'],
                  'SGD_clf__max_iter': [100, 200]}

In [177]:
gs_SGD = GridSearchCV(pipeline_SGD, grid_param_SGD, cv=4, n_jobs=-1, verbose=10)
gs_SGD.fit(X_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  4.7min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('feats',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
      

In [178]:
y_SGD = gs_SGD.predict(X_dev)
accuracy_score(y_dev, y_SGD)

0.8447727697774453

In [210]:
res = pd.DataFrame.from_dict(gs_SGD.cv_results_)
res.sort_values(by='rank_test_score')[['rank_test_score', 'param_SGD_clf__loss', 'param_SGD_clf__max_iter', 'param_SGD_clf__penalty', 'mean_test_score']]

Unnamed: 0,rank_test_score,param_SGD_clf__loss,param_SGD_clf__max_iter,param_SGD_clf__penalty,mean_test_score
3,1,hinge,200,none,0.835709
0,2,hinge,100,none,0.835564
2,3,hinge,100,l2,0.794917
5,4,hinge,200,l2,0.792216
6,5,log,100,none,0.789348
9,6,log,200,none,0.789265
4,7,hinge,200,l1,0.724596
1,8,hinge,100,l1,0.717219
10,9,log,200,l1,0.68717
7,10,log,100,l1,0.686277


# Part 2 - MLP

In [187]:
MLP_CLF = 'MLP_clf'

pipeline_MLP = Pipeline([
    ('features', FeatureUnion([
        # first feature
        ('ngram_tfidf', Pipeline([
            ('ngram', CountVectorizer(ngram_range=(1, 4), analyzer='word')),
            ('tfidf', TfidfTransformer())
        ])),
        # second feature
        ('ave_scaled', Pipeline([
            ('ave', AverageWordLengthExtractor()),
            ('scale', MinMaxScaler())
        ]))
    ])),
    (MLP_CLF, MLPClassifier()) 
])

grid_param_MLP = { MLP_CLF + '__hidden_layer_sizes': [(25,)],
                   MLP_CLF + '__activation': ['relu'],
                   MLP_CLF + '__solver': ['adam'],
                   MLP_CLF + '__max_iter': [20],
                   MLP_CLF + '__momentum': [0.9]}

In [195]:
gs_MLP = GridSearchCV(pipeline_MLP, grid_param_MLP, n_jobs=-1, verbose=10)
gs_MLP.fit(X_train, y_train)

Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 96.2min
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed: 96.2min remaining: 96.2min
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 96.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 96.5min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('ngram_tfidf',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('ngram',
                                                                                         CountVectorizer(analyzer='word',
                                                                                                         binary=False,
                                                                                                         decode_error='strict',
                                                                                                         dtype=<class 'numpy.int64'>,
   

In [197]:
y_mlp = gs_MLP.predict(X_dev)

In [198]:
accuracy_score(y_dev, y_mlp)

0.814101365251543

In [77]:
models = pd.DataFrame.from_dict(gs_MLP.cv_results_)
models

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_MLP_clf__activation,param_MLP_clf__hidden_layer_sizes,param_MLP_clf__max_iter,param_MLP_clf__momentum,param_MLP_clf__solver,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,277.731456,0.010334,1.302234,0.010232,tanh,"(4, 3)",50,0.9,sgd,"{'MLP_clf__activation': 'tanh', 'MLP_clf__hidd...",0.350828,0.351429,0.351128,0.0003,5
1,373.916279,0.055184,1.333323,0.01304,tanh,"(4, 3)",50,0.9,adam,"{'MLP_clf__activation': 'tanh', 'MLP_clf__hidd...",0.648922,0.673115,0.661007,0.012096,2
2,357.605429,0.338305,1.623696,0.034237,tanh,"(5, 3)",50,0.9,sgd,"{'MLP_clf__activation': 'tanh', 'MLP_clf__hidd...",0.350828,0.351429,0.351128,0.0003,5
3,474.193237,0.280791,1.402589,0.120243,tanh,"(5, 3)",50,0.9,adam,"{'MLP_clf__activation': 'tanh', 'MLP_clf__hidd...",0.597974,0.627356,0.612651,0.014691,4
4,264.683728,0.129365,1.416832,0.077422,relu,"(4, 3)",50,0.9,sgd,"{'MLP_clf__activation': 'relu', 'MLP_clf__hidd...",0.350828,0.351429,0.351128,0.0003,5
5,353.529966,0.124757,1.388335,0.100204,relu,"(4, 3)",50,0.9,adam,"{'MLP_clf__activation': 'relu', 'MLP_clf__hidd...",0.668937,0.740422,0.704647,0.035743,1
6,340.663349,0.152085,1.470408,0.085144,relu,"(5, 3)",50,0.9,sgd,"{'MLP_clf__activation': 'relu', 'MLP_clf__hidd...",0.350828,0.351429,0.351128,0.0003,5
7,547.581941,0.392927,1.741871,0.329597,relu,"(5, 3)",50,0.9,adam,"{'MLP_clf__activation': 'relu', 'MLP_clf__hidd...",0.600507,0.705021,0.652716,0.052257,3
