In [1]:
%matplotlib inline

In [2]:
import json
import re
import unicodedata
import itertools
import collections

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np

In [4]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import metrics, model_selection, pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin

Using Theano backend.


In [5]:
def text_without_entities(tweet_json):
    """Return the tweet's text with every entity span removed.

    ``tweet_json['entities']`` is read as a mapping whose values are lists of
    entities, each carrying an ``'indices'`` pair ``(start, end)`` that marks
    the half-open character range the entity occupies in ``tweet_json['text']``.
    """
    entity_groups = tweet_json['entities'].values()
    spans = [entity['indices'] for group in entity_groups for entity in group]

    chars = list(tweet_json['text'])
    for start, end in spans:
        # Blank the span out in place so later spans still line up with the
        # original character positions.
        chars[start:end] = [None] * (end - start)

    return ''.join(ch for ch in chars if ch)

In [6]:
def read_tweetlid_json(f_name):
    """Load a TweetLID file (one JSON object per line) into two data frames.

    Parameters
    ----------
    f_name : str
        Path of the file to read. Each non-blank line must be a JSON object
        with the keys ``id``, ``text``, ``entities`` and ``tweetlid_lang``.

    Returns
    -------
    (features, labels) : tuple of pandas.DataFrame
        Both frames are indexed by tweet ``id``. ``features`` has the columns
        ``text`` and ``text_without_entities``; ``labels`` has one 0/1 column
        per language code in ``labels`` below.

    Notes
    -----
    ``tweetlid_lang`` is split on ``/`` and ``+``, and every resulting tag is
    set to 1. A tag that is not one of the eight known codes ends up in an
    extra column that is not part of the returned label frame, so such a tweet
    has an all-zero label row.
    """
    labels = ['ca', 'en', 'es', 'eu', 'gl', 'pt', 'und', 'other']

    records = []
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which may be unable to decode tweet text.
    with open(f_name, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            t = json.loads(line)
            record = {
                'id': t['id'],
                'text': t['text'],
                'text_without_entities': text_without_entities(t),
                'tweetlid_lang': t['tweetlid_lang'],
            }
            # Start with every known language switched off, then switch on
            # each language named in the annotation.
            record.update(dict.fromkeys(labels, 0))
            record.update(dict.fromkeys(re.split('[/]|[+]', t['tweetlid_lang']), 1))
            records.append(record)

    frame = pd.DataFrame.from_records(records, index='id')

    return frame[['text', 'text_without_entities']], frame[labels]

In [7]:
class TweetCleanup(BaseEstimator, TransformerMixin):
    """Pipeline step that selects which text column is passed downstream.

    With ``keep_entities=True`` the ``'text'`` column is used; otherwise the
    ``'text_without_entities'`` column is used.
    """

    def __init__(self, keep_entities=True):
        self.keep_entities = keep_entities

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of the selected column of ``X``."""
        column = 'text' if self.keep_entities else 'text_without_entities'
        return X[column].values

In [8]:
class TweetTransformer(BaseEstimator, TransformerMixin):
    """Optional string normalisation applied to every tweet.

    ``case='lowercase'`` lowercases each text; any other value leaves the case
    untouched. A non-``None`` ``normal_form`` is handed to
    ``unicodedata.normalize`` as the target form.
    """

    def __init__(self, case=None, normal_form=None):
        self.case = case
        self.normal_form = normal_form

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a list with the configured normalisations applied to each text."""
        lowercase = self.case == 'lowercase'

        result = []
        for text in X:
            # Lowercasing comes first, Unicode normalisation second.
            if lowercase:
                text = str.lower(text)
            if self.normal_form is not None:
                text = unicodedata.normalize(self.normal_form, text)
            result.append(text)

        return result

In [23]:
class TweetClassifier(BaseEstimator, TransformerMixin, ClassifierMixin):
    """Bag-of-n-grams features fed into a small dense Keras network.

    Parameters
    ----------
    analyzer, ngram_range, min_df, max_df
        Forwarded to ``CountVectorizer``, which is always created with
        ``lowercase=False``.
    optimizer
        Optimizer passed to ``model.compile``.
    activation
        Activation applied after the 32-unit hidden layer.
    epochs, batch_size, verbose
        Forwarded to ``KerasClassifier.fit``.

    Attributes set by ``fit``: ``vect`` (the fitted vectorizer), ``input_dim``
    (number of n-gram features) and ``classifier`` (the fitted Keras wrapper).
    """

    def __init__(
        self,
        analyzer='char', ngram_range=(2, 2), min_df=1, max_df=1.0,
        optimizer='rmsprop', activation='relu', epochs=20, batch_size=32,
        verbose=0,
    ):
        self.analyzer = analyzer
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        self.optimizer = optimizer
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

        # Kept so that ``vect`` exists right after construction, as before.
        # ``fit`` rebuilds it from the current parameter values.
        self.vect = self._build_vectorizer()

    def _build_vectorizer(self):
        """Create an unfitted CountVectorizer from the current parameter values."""
        return CountVectorizer(
            analyzer=self.analyzer,
            ngram_range=self.ngram_range,
            lowercase=False,
            min_df=self.min_df,
            max_df=self.max_df,
        )

    def fit(self, X, y=None):
        """Fit the vectorizer on ``X`` and train the network on the counts."""
        # scikit-learn's set_params() (used by GridSearchCV) updates attributes
        # without re-running __init__, so a vectorizer built only in __init__
        # would silently ignore searched values of ngram_range/min_df/max_df.
        self.vect = self._build_vectorizer()
        X = self.vect.fit_transform(X)

        # create_model reads this to size the input layer.
        self.input_dim = X.shape[1]
        self.classifier = KerasClassifier(self.create_model)

        self.classifier.fit(
            X, y,
            epochs=self.epochs, batch_size=self.batch_size,
            verbose=self.verbose,
        )

        return self

    def transform(self, X):
        # NOTE(review): delegates to ``self.classifier.transform``; confirm that
        # the Keras scikit-learn wrapper actually provides a ``transform`` method.
        return self.classifier.transform(self.vect.transform(X))

    def predict(self, X):
        """Vectorize ``X`` and return the wrapped classifier's predictions."""
        return self.classifier.predict(self.vect.transform(X))

    def predict_proba(self, X):
        """Vectorize ``X`` and return the wrapped classifier's class probabilities."""
        return self.classifier.predict_proba(self.vect.transform(X))

    def create_model(self):
        """Build and compile the network: Dense(32) -> activation -> Dense(8) -> softmax."""
        model = Sequential()
        model.add(Dense(32, input_dim=self.input_dim))
        model.add(Activation(self.activation))
        # Output width is hard-coded to 8 -- presumably one unit per language
        # label; TODO confirm it always matches the number of classes in ``y``.
        model.add(Dense(8))
        model.add(Activation('softmax'))

        model.compile(
            optimizer=self.optimizer,
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )

        return model

In [24]:
# Full model: choose the text column, normalise the strings, then classify.
pipe = pipeline.Pipeline(steps=[
    ('cleanup', TweetCleanup()),
    ('transformer', TweetTransformer()),
    ('nn', TweetClassifier()),
])

In [25]:
# Hyper-parameter grid for the pipeline; keys are '<step name>__<parameter>'.
# Alternatives that are currently switched off are kept in trailing comments.
param_grid = {
    # 'cleanup' step
    'cleanup__keep_entities': [True, False],

    # 'transformer' step
    'transformer__case': [None, 'lowercase'],
    # 'transformer__normal_form': [None, 'NFD', 'NFC'],

    # 'nn' step -- n-gram features
    'nn__ngram_range': [(2, 2)],  # also tried: (3, 3)
    'nn__min_df': [1],
    'nn__max_df': [1.0],

    # 'nn' step -- network and training
    'nn__optimizer': ['rmsprop', 'adam'],
    'nn__activation': ['relu', 'sigmoid'],
    'nn__epochs': [2],  # also tried: 5, 10, 20
    'nn__batch_size': [32],  # also tried: 64, 128
}

In [26]:
# Training split: `train` holds the two text columns, `y_train` the 0/1 language columns.
train, y_train = read_tweetlid_json('TweetLID_corpusV2/tweetlid-training-tweets.json')

In [27]:
# Exhaustive search over `param_grid`: 3-fold cross-validation scored by
# accuracy, using all available cores (n_jobs=-1).
grid = model_selection.GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# argmax over the 0/1 language columns reduces every tweet to one class index
# (the first column holding the row maximum), so tweets annotated with several
# languages are trained as single-label examples.
grid.fit(train, y_train.values.argmax(axis=1));

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [20]:
# Best hyper-parameter combination found by the grid search.
# NOTE(review): the saved output lists only `nn__*` keys although the grid also
# searches `cleanup__*` and `transformer__*` -- it looks stale; re-run to refresh.
grid.best_params_

{'nn__activation': 'relu',
 'nn__batch_size': 32,
 'nn__epochs': 2,
 'nn__max_df': 1.0,
 'nn__min_df': 1,
 'nn__ngram_range': (2, 2),
 'nn__optimizer': 'rmsprop'}

In [15]:
# Test split, loaded the same way as the training split.
test, y_test = read_tweetlid_json('TweetLID_corpusV2/tweetlid-test-tweets.json')

In [16]:
# Boolean mask: a class is predicted when its probability exceeds 0.3, so a
# tweet can end up with several predicted classes or with none.
y_test_estimate = grid.best_estimator_.predict_proba(test) > 0.3

In [17]:
# One row per test tweet, one column per language, holding the predictions.
run = pd.DataFrame(y_test_estimate, index=test.index, columns=y_test.columns)

# Join the names of the selected languages with '+'. A row with no selected
# language produces an empty string.
run_output = run.apply(lambda r: '+'.join(r.index[r > 0.3]), axis='columns')

# header=False is explicit because newer pandas versions write a header row
# for a Series by default, which would add a bogus first line to the run file.
run_output.to_csv('run_output.tsv', sep='\t', header=False)

In [18]:
# !perl TweetLID_corpusV2/tweetLID_eval.pl \
# -r TweetLID_corpusV2/tweetlid-test-tweets.tsv \
# -d run_output.tsv \
# 2> /dev/null

In [19]:
# Run the evaluation script on run_output.tsv against the reference file.
# stderr is discarded, so any error messages from the script are not shown.
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetLID-testOfficial-7july.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category pt : P => 0.809480401093892 , R => 0.901522842639594 , F => 0.853025936599424 
Category en : P => 0.73209549071618 , R => 0.756855575868373 , F => 0.744269662921348 
Category amb : P => 1 , R => 0.907692307692308 , F => 0.951612903225806 
Category gl : P => 0.548780487804878 , R => 0.415704387990762 , F => 0.473061760840999 
Category und : P => 0.904761904761905 , R => 0.0402542372881356 , F => 0.077079107505071 
Category eu : P => 0.898507462686567 , R => 0.654347826086957 , F => 0.757232704402516 
Category ca : P => 0.705723542116631 , R => 0.859303090072321 , F => 0.774977764601245 
Category es : P => 0.881475310511966 , R => 0.961582947785856 , F => 0.919788209261893 

Global results : P => 0.810103074961502 , R => 0.687157901928038 , F => 0.693881006169788 

Submitted run contains => 19993 tweets. From those 18423 are in the reference. 
Provided reference has => 18423 tweets. From those 0 tweets were 