In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from sklearn import metrics, model_selection, pipeline

In [4]:
import util as u

In [5]:
# Processing chain assembled from the project's `util` components. Each tuple
# is (step name, component); the step names are the prefixes used by the
# `<step>__<param>` keys of the hyper-parameter grid.
pipe = pipeline.Pipeline(steps=[
    ('cleanup', u.TweetCleanup()),
    ('transformer', u.TweetTransformer()),
    ('nn', u.TweetClassifier()),
    ('label', u.TweetThresholdLabeller()),
])

In [6]:
# Hyper-parameter grid for the pipeline; keys follow the `<step>__<param>` form.
# Every list currently holds a single value, so the search evaluates exactly
# one candidate. Alternatives that are switched off for this run:
#   cleanup__keep_entities:   False
#   transformer__case:        None
#   transformer__normal_form: 'NFD', 'NFC'
#   nn__ngram_range:          (1, 2), (1, 3), (3, 3), (4, 4)
#   nn__min_df:               [1, 10]              (whole key disabled)
#   nn__max_df:               [1.0, 0.9]           (whole key disabled)
#   nn__optimizer:            ['rmsprop', 'adam']  (whole key disabled)
#   nn__activation:           'relu'
#   nn__epochs:               2, 5, 10, 30
#   nn__batch_size:           64, 128
param_grid = {
    # 'cleanup' step
    'cleanup__keep_entities': [True],
    # 'transformer' step
    'transformer__case': ['lowercase'],
    'transformer__normal_form': [None],
    # 'nn' step
    'nn__ngram_range': [(2, 2)],
    'nn__activation': ['sigmoid'],
    'nn__epochs': [20],
    'nn__batch_size': [32],
    'nn__verbose': [0],
}

In [7]:
from sklearn.metrics import make_scorer

def tweetlid_score(y, pred):
    """Metric function wrapped by ``make_scorer`` for the grid search.

    The predictions are thresholded with ``pred > 0``, handed together with
    ``y`` to ``u.tweetlid_run``, and the resulting run is scored by
    ``u.tweetlid_eval_f1``, whose return value is passed through unchanged.

    NOTE(review): ``pred`` presumably supports element-wise comparison
    (e.g. an array) -- confirm against what the pipeline's final step returns.
    """
    above_zero = pred > 0
    submission = u.tweetlid_run(y, above_zero)
    return u.tweetlid_eval_f1(submission)

In [8]:
# Wrap the metric function as a scikit-learn scorer; higher values are
# treated as better when candidates are ranked.
score = make_scorer(
    tweetlid_score,
    greater_is_better=True,
)

In [9]:
# Cross-validated search over `param_grid`: 3 folds, a single worker
# process, and candidates ranked by the custom `score` scorer.
grid = model_selection.GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=score,
    cv=3,
    n_jobs=1,
    verbose=1,
)

In [10]:
# Training split: the two values returned by the util reader are later passed
# to `grid.fit` as inputs and targets.
train, y_train = u.read_tweetlid_json(
    'TweetLID_corpusV2/tweetlid-training-tweets.json',
)

In [11]:
# Run the grid search on the training split. The trailing semicolon keeps the
# fitted object's repr out of the cell output.
grid.fit(train, y_train);

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Using Theano backend.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min finished


In [12]:
# Collect the search results via the util helper.
# NOTE(review): presumably a pandas DataFrame (it is written with `.to_csv`
# in the next cell) -- confirm in util.
runs = u.runs_frame(grid)

In [13]:
# Persist the search results to runs.csv without the index column.
runs.to_csv('runs.csv', index=False)

In [14]:
# Show the best-scoring parameter combination found by the search.
grid.best_params_

{'cleanup__keep_entities': True,
 'nn__activation': 'sigmoid',
 'nn__batch_size': 32,
 'nn__epochs': 20,
 'nn__ngram_range': (2, 2),
 'nn__verbose': 0,
 'transformer__case': 'lowercase',
 'transformer__normal_form': None}

In [15]:
# Test split, read with the same util loader as the training data.
test, y_test = u.read_tweetlid_json('TweetLID_corpusV2/tweetlid-test-tweets.json')

In [16]:
# Predict on the test split through the fitted search object.
# NOTE(review): the result is thresholded with `> 0` in the next cell; confirm
# what the pipeline's final 'label' step returns from `predict_proba`, since
# raw probabilities would almost always be > 0.
pred_test = grid.predict_proba(test)

In [17]:
# Build the run from the thresholded test predictions and write it as a
# tab-separated file, which the evaluation script reads as its `-d` input.
tweetlid_run = u.tweetlid_run(y_test, pred_test > 0)
tweetlid_run.to_csv('run_output.tsv', sep='\t')

In [18]:
# Score run_output.tsv (-d) with the corpus' Perl evaluation script against the
# tweetLID-testOfficial-7july.tsv reference (-r). stderr is discarded, so any
# warnings or errors from the script will not appear in the cell output.
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetLID-testOfficial-7july.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category amb : P => 1 , R => 0.919230769230769 , F => 0.957915831663327 
Category en : P => 0.653817082388511 , R => 0.789954337899543 , F => 0.715467328370554 
Category ca : P => 0.723772321428571 , R => 0.852728468113083 , F => 0.782976154542711 
Category pt : P => 0.837176470588235 , R => 0.902587519025875 , F => 0.86865234375 
Category gl : P => 0.380305602716469 , R => 0.517321016166282 , F => 0.438356164383562 
Category es : P => 0.892631902784187 , R => 0.9588565763384 , F => 0.92455986616745 
Category und : P => 0.606382978723404 , R => 0.120762711864407 , F => 0.201413427561837 
Category eu : P => 0.792873051224944 , R => 0.773913043478261 , F => 0.783278327832783 

Global results : P => 0.73586992623179 , R => 0.729419305264578 , F => 0.709077430534028 

Submitted run contains => 19993 tweets. From those 18423 are in the reference. 
Provided reference has => 18423 tweets. From those 0 tweets were left una

In [19]:
# Score the same run_output.tsv (-d) against the tweetlid-test-tweets.tsv
# reference (-r) instead of the official one. stderr is discarded, so any
# warnings or errors from the script will not appear in the cell output.
!perl TweetLID_corpusV2/tweetLID_eval.pl \
-r TweetLID_corpusV2/tweetlid-test-tweets.tsv \
-d run_output.tsv \
2> /dev/null


 RESULTS ONLY taking into account SUBMITTED RESULTS IN THE REFERENCE: 
Category pt : P => 0.83790628957366 , R => 0.901862789641072 , F => 0.86870897155361 
Category gl : P => 0.3824 , R => 0.512875536480687 , F => 0.438130155820348 
Category es : P => 0.892831945816283 , R => 0.960094074804643 , F => 0.925242186072016 
Category ca : P => 0.718234981392876 , R => 0.854522454142947 , F => 0.780473714615829 
Category en : P => 0.654532476802284 , R => 0.787124463519313 , F => 0.71473109898675 
Category eu : P => 0.787685774946921 , R => 0.771309771309771 , F => 0.779411764705882 
Category und : P => 0.608040201005025 , R => 0.11839530332681 , F => 0.198198198198198 
Category amb : P => 1 , R => 0.903225806451613 , F => 0.949152542372881 
Category ind : P =>  , R => 0 , F => 0 

Global results : P => 0.653514629948561 , R => 0.645490022186318 , F => 0.628227625813946 

Submitted run contains => 19993 tweets. From those 19993 are in the reference. 
Provided reference has => 19993 tweets. 