In [1]:
%matplotlib inline

In [2]:
import json
import re
import unicodedata
import itertools
import collections

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np

In [4]:
from sklearn import metrics, model_selection, pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin

In [5]:
def text_without_entities(tweet_json):
    entities = tweet_json['entities'].values()
    indicies = list(itertools.chain.from_iterable((e['indices'] for e in es) for es in entities))

    text = list(tweet_json['text'])
    for start, end in indicies:
        length = end - start
        text[start:end] = [None] * length

    return ''.join(filter(None, text))

In [6]:
def read_tweetlid_json(f_name):
    labels = ['ca', 'en', 'es', 'eu', 'gl', 'pt', 'und', 'other']
    with open(f_name) as f:
        data = map(json.loads, f)
        data = [
            {
                'id': t['id'],
                'text': t['text'],
                'text_without_entities': text_without_entities(t),
                'tweetlid_lang': t['tweetlid_lang'],
                **{
                    lang: 0 for lang in labels
                },
                **{
                    lang: 1 #/ len(re.split('[/]|[+]', t['tweetlid_lang']))
                    for lang in re.split('[/]|[+]', t['tweetlid_lang'])
                }
            }
            for t in data
        ]

    data = pd.DataFrame.from_records(data, index='id')

    return data[['text', 'text_without_entities']], data[labels]

In [7]:
class TweetCleanup(BaseEstimator, TransformerMixin):
    def __init__(self, keep_entities=True):
        self.keep_entities = keep_entities

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.keep_entities:
            return X['text'].values
        else:
            return X['text_without_entities'].values

In [8]:
class TweetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, case=None,normal_form=None):
        self.case = case
        self.normal_form = normal_form

    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        if self.case == 'lowercase':
            X = map(str.lower, X)
        
        if self.normal_form is not None:
            X = map(lambda text: unicodedata.normalize(self.normal_form, text), X)
            
        return list(X)

In [9]:
class TweetClassifier(BaseEstimator, TransformerMixin, ClassifierMixin):
    def __init__(
        self,
        analyzer='char', ngram_range=(2, 2), min_df=1, max_df=1.0,
        optimizer='rmsprop', activation='relu', epochs=20, batch_size=32,
        verbose=0,
    ):
        self.analyzer = analyzer
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        self.vect = CountVectorizer(
            analyzer=analyzer,
            ngram_range=ngram_range,
            lowercase=False,
            min_df=min_df,
            max_df=max_df,
        )
        
        self.optimizer = optimizer
        self.activation = activation
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        
    def fit(self, X, y=None):
        from keras.wrappers.scikit_learn import KerasClassifier

        X = self.vect.fit_transform(X)
        
        self.input_dim = X.shape[1]
        self.classifier = KerasClassifier(self.create_model)
        
        self.classifier.fit(
            X, y,
            epochs=self.epochs, batch_size=self.batch_size,
            verbose=self.verbose,
        )
        
        return self
        
    def transform(self, X):
        return self.classifier.transform(self.vect.transform(X))
    
    def predict(self, X):
        return self.classifier.predict(self.vect.transform(X))

    def predict_proba(self, X):
        return self.classifier.predict_proba(self.vect.transform(X))

    def create_model(self):
        import tensorflow as tf
        from keras.backend.tensorflow_backend import set_session

        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.05
        set_session(tf.Session(config=config))
        
        from keras.models import Sequential
        from keras.layers import Dense, Activation
        from keras.utils import multi_gpu_model
        
        from random import randrange
        
        with tf.device('/device:GPU:{}'.format(randrange(4))):
        
            model = Sequential()
            model.add(Dense(32, input_dim=self.input_dim))
            model.add(Activation(self.activation))
            model.add(Dense(8))
            model.add(Activation('softmax'))

#             model = multi_gpu_model(model, gpus=4, cpu_merge=True, cpu_relocation=False)

            model.compile(
                optimizer=self.optimizer,
                loss='categorical_crossentropy',
                metrics=['accuracy'],
            )

        return model

In [10]:
pipe = pipeline.Pipeline(
    [
        ('cleanup', TweetCleanup()),
        ('transformer', TweetTransformer()),
        ('nn', TweetClassifier()),
    ]
)

In [11]:
param_grid = {
    'cleanup__keep_entities': [
        True,
        False,
    ],
    
    'transformer__case': [None, 'lowercase',],
    'transformer__normal_form': [
        None,
        'NFD',
#         'NFC',
    ],
    
    'nn__ngram_range': [
        (1, 2),
        (2, 2),
        (1, 3),
        (3, 3),
#         (4, 4),
    ],
#     'nn__min_df': [1, 10],
#     'nn__max_df': [1.0, 0.9],
#     'nn__optimizer': ['rmsprop', 'adam'],
    'nn__activation': [
        'relu',
        'sigmoid',
    ],
    'nn__epochs': [
#         2,
#         5,
#         10,
#         20,
        30,
    ],
    'nn__batch_size': [
        32,
#         64,
#         128,
    ],
    'nn__verbose': [0],
}

In [12]:
train, y_train = read_tweetlid_json('TweetLID_corpusV2/tweetlid-training-tweets.json')

In [13]:
grid = model_selection.GridSearchCV(
    pipe, cv=3, param_grid=param_grid,
    scoring='accuracy',
    n_jobs=4,
    verbose=1,
)

In [None]:
grid.fit(train, y_train.values.argmax(axis=1));

Fitting 3 folds for each of 64 candidates, totalling 192 fits


Using TensorFlow backend.
Using TensorFlow backend.
Using TensorFlow backend.
Using TensorFlow backend.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  7.2min


In [None]:
_keys = [k for k in grid.cv_results_.keys() if k.startswith('param_') or k == 'mean_test_score']
runs = pd.DataFrame(
    {k: grid.cv_results_[k] for k in _keys},
#     
)

runs.columns =[k_[len('param_'):] if k_.startswith('param_') else k_ for k_ in _keys]
runs

In [None]:
runs.to_csv('runs.csv', index=False)

In [24]:
grid.best_params_

{'cleanup__keep_entities': False,
 'nn__activation': 'sigmoid',
 'nn__batch_size': 32,
 'nn__epochs': 30,
 'nn__ngram_range': (2, 2),
 'nn__verbose': 0,
 'transformer__case': 'lowercase',
 'transformer__normal_form': None}

In [25]:
test, y_test = read_tweetlid_json('TweetLID_corpusV2/tweetlid-test-tweets.json')

In [26]:
y_test_estimate = grid.best_estimator_.predict_proba(test) > 0.3

In [27]:
run = pd.DataFrame(y_test_estimate, index=test.index, columns=y_test.columns)
run_output = run.apply(lambda r: '+'.join(r.index[r]), axis='columns')
run_output.to_csv('run_output.tsv', sep='\t')

In [29]:
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetLID-testOfficial-7july.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category und : P => 0.638297872340426 , R => 0.190677966101695 , F => 0.293637846655791 
Category es : P => 0.905294439665285 , R => 0.956378056840714 , F => 0.930135390301715 
Category gl : P => 0.410301953818828 , R => 0.533487297921478 , F => 0.463855421686747 
Category pt : P => 0.814932126696833 , R => 0.914213197969543 , F => 0.861722488038278 
Category eu : P => 0.847715736040609 , R => 0.726086956521739 , F => 0.782201405152225 
Category amb : P => 1 , R => 0.915384615384615 , F => 0.955823293172691 
Category en : P => 0.729218106995885 , R => 0.809872029250457 , F => 0.767431788653097 
Category ca : P => 0.699791883454735 , R => 0.884286653517423 , F => 0.78129538193436 

Global results : P => 0.755694014876575 , R => 0.741298346688458 , F => 0.729512876949363 

Submitted run contains => 19993 tweets. From those 18423 are in the reference. 
Provided reference has => 18423 tweets. From those 0 tweets were l

In [30]:
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetlid-test-tweets.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category en : P => 0.72911787665886 , R => 0.802405498281787 , F => 0.7640081799591 
Category ca : P => 0.691510365251727 , R => 0.886148007590133 , F => 0.776822844469088 
Category pt : P => 0.816517493897478 , R => 0.912272727272727 , F => 0.861743237440962 
Category es : P => 0.906317300789663 , R => 0.957818071466505 , F => 0.931356276050312 
Category amb : P => 1 , R => 0.910394265232975 , F => 0.953095684803002 
Category ind : P =>  , R => 0 , F => 0 
Category und : P => 0.636963696369637 , R => 0.188845401174168 , F => 0.291320754716981 
Category gl : P => 0.40625 , R => 0.530042918454936 , F => 0.459962756052142 
Category eu : P => 0.845036319612591 , R => 0.725571725571726 , F => 0.78076062639821 

Global results : P => 0.670190339175551 , R => 0.657055401671662 , F => 0.646563373321089 

Submitted run contains => 19993 tweets. From those 19993 are in the reference. 
Provided reference has => 19993 tweets.