In [1]:
%matplotlib inline

In [2]:
import json
import re
import unicodedata
import itertools
import collections

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Activation

Using Theano backend.


In [4]:
def text_without_entities(tweet_json):
    '''
    The text of the tweet without entities (hashtags, urls and
    user mentions).

    Every character covered by the ``indices`` span of any entity listed
    under ``tweet_json['entities']`` is dropped, whatever the entity type;
    the remaining characters are joined back in their original order.
    A tweet with no ``entities`` key (or a null value for it) is returned
    unchanged.

    :param tweet_json: decoded tweet, a dict with a ``'text'`` string and
        optionally an ``'entities'`` dict mapping an entity type to a list
        of entities, each carrying an ``'indices'`` pair ``[start, end]``.
    :return: the tweet text with all entity spans removed.
    '''
    # Not every tweet is guaranteed to carry entities; treat that as "nothing to strip".
    entities = (tweet_json.get('entities') or {}).values()
    indicies = itertools.chain.from_iterable(
        (e['indices'] for e in es) for es in entities
    )

    # Mask the removed characters with None instead of deleting them, so the
    # offsets of the entities processed later stay valid.
    text = list(tweet_json['text'])
    for start, end in indicies:
        text[start:end] = [None] * (end - start)

    return ''.join(c for c in text if c is not None)

In [7]:
def read_data(f_name):
    '''
    Read a file with one JSON-encoded tweet per line and build the model
    inputs and labels.

    :param f_name: path of the file to read. Every tweet must provide the
        ``id``, ``text`` and ``tweetlid_lang`` keys.
    :return: a pair ``(input_, labels)``.

        ``input_`` is a data frame indexed by tweet id whose columns are a
        two-level index: the first level is the Unicode form of the text
        (``'raw'``, ``'NFD'`` or ``'NFC'``), the second level is the cleanup
        applied (``'none'`` or ``'without_entities'``).

        ``labels`` is a data frame indexed by tweet id with one column per
        language. ``tweetlid_lang`` is split on ``/`` and ``+`` and each
        resulting language gets the weight ``1 / number_of_languages``;
        all other languages get 0.
    '''
    languages = ['ca', 'en', 'es', 'eu', 'gl', 'pt', 'und', 'other']

    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(f_name, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            t = json.loads(line)
            tweet_langs = re.split('[/]|[+]', t['tweetlid_lang'])

            record = {
                'id': t['id'],
                'text': t['text'],
                'text_without_entities': text_without_entities(t),
                'tweetlid_lang': t['tweetlid_lang'],
            }
            # Start with a zero weight for every known language, then share
            # the unit weight equally between the languages of this tweet.
            record.update(dict.fromkeys(languages, 0))
            record.update(dict.fromkeys(tweet_langs, 1 / len(tweet_langs)))
            records.append(record)

    data = pd.DataFrame.from_records(records, index='id')

    input_ = data[['text', 'text_without_entities']]
    labels = data[languages]

    def postprocess(text):
        # One row per tweet with the text as is and in both normal forms.
        return pd.Series(
            {
                'raw': text,
                'NFD': unicodedata.normalize('NFD', text),
                'NFC': unicodedata.normalize('NFC', text),
            }
        )

    # Stack the variants of both text columns under a 'cleanup' key, then
    # move that key into the columns so a variant is selected as
    # input_[normal_form, cleanup].
    input_ = pd.concat(
        [
            input_['text'].apply(postprocess),
            input_['text_without_entities'].apply(postprocess),
        ],
        keys=['none', 'without_entities'],
        names=['cleanup', 'tweet_id'],
    ).unstack('cleanup')

    return input_, labels

In [11]:
def ngrams(text, length=2):
    '''
    Lazily yield every run of `length` consecutive items of `text`,
    joined into a string, from left to right.

    Nothing is yielded when `text` has fewer than `length` items.
    '''
    symbols = list(text)

    # A window starting at `start` fits as long as start + length <= len(symbols).
    for start in range(len(symbols) - length + 1):
        yield ''.join(symbols[start:start + length])

In [12]:
def vocabulary(items):
    '''
    Build a frequency table of `items`.

    The result is a data frame indexed by the distinct items, in the order
    produced by `value_counts`, with two columns: `count`, the number of
    occurrences, and `id`, the position of the row (0, 1, 2, ...).
    '''
    frequencies = pd.Series(list(items)).value_counts().to_frame('count')
    frequencies['id'] = [position for position, _ in enumerate(frequencies.index)]

    return frequencies

In [16]:
def extract_features(t, vocab=None):
    '''
    Turn texts into a matrix of character n-gram counts.

    :param t: sized collection of texts; it is iterated more than once.
    :param vocab: frequency table as returned by `vocabulary`, indexed by
        n-gram and with an `id` column giving the feature column of each
        n-gram. When omitted, it is built from the n-grams of `t`.
    :return: a pair ``(vocab, X)`` where ``X`` has one row per text and one
        column per vocabulary entry, and ``X[i, j]`` is the number of times
        the n-gram with id ``j`` occurs in the i-th text. N-grams that are
        not in the vocabulary are ignored.
    '''
    if vocab is None:
        vocab = vocabulary(itertools.chain.from_iterable(map(ngrams, t)))

    # Resolve n-gram -> column once, instead of doing a label-based pandas
    # lookup for every text.
    ngram_ids = dict(zip(vocab.index, vocab['id']))

    X = np.zeros((len(t), len(vocab)))

    for i, text in enumerate(t):
        for ngram, count in collections.Counter(ngrams(text)).items():
            column = ngram_ids.get(ngram)
            if column is not None:
                X[i, column] = count

    return vocab, X

In [10]:
# Load the training corpus: `train` holds the text variants of every tweet,
# `train_labels` the per-language label weights.
train, train_labels = read_data('TweetLID_corpusV2/tweetlid-training-tweets.json')

In [13]:
# Keep a random 80% of the training tweets for fitting. The fixed seed makes
# the train/dev split identical after every kernel restart.
local_train = train.sample(frac=0.80, random_state=0)

In [14]:
# The development set is every training tweet that was not sampled into `local_train`.
dev = train.loc[~train.index.isin(local_train.index)]

In [15]:
# Sanity check: the two splits together have as many tweets as the full training set.
assert len(train) == len(local_train) + len(dev)

In [17]:
# Build the n-gram vocabulary and the count matrix from the raw, uncleaned training text.
vocab, X_train = extract_features(local_train['raw', 'none'])

In [18]:
# Label rows in the same order as the rows of `local_train`.
y_train = train_labels.loc[local_train.index]

In [51]:
# Reuse the training vocabulary so the columns of X_dev match those of X_train.
_, X_dev = extract_features(dev['raw', 'none'], vocab=vocab)

In [52]:
# Label rows in the same order as the rows of `dev`.
y_dev = train_labels.loc[dev.index]

In [53]:
# Load the test corpus the same way as the training corpus.
test, y_test = read_data('TweetLID_corpusV2/tweetlid-test-tweets.json')

In [54]:
# Reuse the training vocabulary so the columns of X_test match those of X_train.
_, X_test = extract_features(test['raw', 'none'], vocab=vocab)

  app.launch_new_instance()
  app.launch_new_instance()


In [26]:
# Feed-forward classifier: one hidden layer of 32 ReLU units over the n-gram
# counts, followed by a softmax over the 8 label columns.
model = Sequential([
    Dense(32, input_dim=X_train.shape[1]),
    Activation('relu'),
    Dense(8),
    Activation('softmax'),
])

In [27]:
# Train with RMSprop on categorical cross-entropy and report accuracy
# alongside the loss.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Fit on the local training split: 10 passes over the data in batches of 32.
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11adfa208>

In [31]:
# [loss, accuracy] on the data the model was fitted on.
score = model.evaluate(X_train, y_train, batch_size=128)
score



[0.16657257468479941, 0.94538480800335079]

In [55]:
# [loss, accuracy] on the held-out development split.
score = model.evaluate(X_dev, y_dev, batch_size=128)
score



[0.52916348294626481, 0.85957304861960693]

In [56]:
# [loss, accuracy] on the test corpus.
score = model.evaluate(X_test, y_test.values, batch_size=128)
score



[0.79870648619361606, 0.84689641374481073]

In [60]:
# Softmax outputs for every test tweet, one column per label.
y_test_estimate = model.predict(X_test)

In [69]:
# For every test tweet keep the single label with the highest predicted value.
run_output = pd.DataFrame(y_test_estimate, index=test.index, columns=y_test.columns).idxmax(axis=1)

In [73]:
# Write one "<tweet id>\t<label>" line per tweet. The header is disabled
# explicitly: recent pandas versions write a header row for a Series by
# default, which would add a bogus first line to the run file.
run_output.to_csv('run_output.tsv', sep='\t', header=False)

In [75]:
# Score the run file (-d) against the gold standard reference (-r) with the corpus evaluation script.
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetlid-test-tweets.tsv \
-d run_output.tsv


Gold standard reference file: TweetLID_corpusV2/tweetlid-test-tweets.tsv 
Provided run file: run_output.tsv 


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category en : P => 0.733210671573137 , R => 0.68412017167382 , F => 0.707815275310835 
Category amb : P => 1 , R => 0.874551971326165 , F => 0.933078393881453 
Category eu : P => 0.821510297482838 , R => 0.746361746361746 , F => 0.782135076252723 
Category ca : P => 0.734469488730071 , R => 0.845034788108792 , F => 0.785882352941176 
Category gl : P => 0.436724565756824 , R => 0.377682403433476 , F => 0.405063291139241 
Category pt : P => 0.862238074008025 , R => 0.879090909090909 , F => 0.870582939455323 
Category und : P => 0.598958333333333 , R => 0.11252446183953 , F => 0.189456342668863 
Category ind : P =>  , R => 0 , F => 0 
Category es : P => 0.909271816037736 , R => 0.936039453717754 , F => 0.922461492448034 

Global results : P => 0.677375916324663 , R => 0.606156211728021 , F => 0.62183057378862

In [77]:
# Score the same run file against the testOfficial reference; the script's stderr is discarded.
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetLID-testOfficial-7july.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category und : P => 0.601123595505618 , R => 0.113347457627119 , F => 0.190730837789661 
Category gl : P => 0.44 , R => 0.381062355658199 , F => 0.408415841584158 
Category eu : P => 0.832116788321168 , R => 0.743478260869565 , F => 0.785304247990815 
Category es : P => 0.90804690009637 , R => 0.934231182351483 , F => 0.920952962736713 
Category ca : P => 0.738904899135447 , R => 0.842866535174227 , F => 0.787469287469287 
Category en : P => 0.734892787524366 , R => 0.688584474885845 , F => 0.710985384252711 
Category amb : P => 1 , R => 0.880769230769231 , F => 0.936605316973415 
Category pt : P => 0.860684184432325 , R => 0.881218274111675 , F => 0.870830198143968 

Global results : P => 0.764471144376912 , R => 0.683194721430918 , F => 0.701411759617591 

Submitted run contains => 19993 tweets. From those 18423 are in the reference. 
Provided reference has => 18423 tweets. From those 0 tweets were left unanswere