In [1]:
import json
import pandas as pd

In [41]:
# Execute the project's helper scripts inside the notebook namespace so that
# the names they define become available to the cells below.
# NOTE(review): dialogs_preproc, spearman_scorer, spearman, the feature
# extractors and the transformer classes used later are not defined in this
# notebook, so they presumably come from these scripts -- confirm which one
# provides what.
%run freq_ngrams.py
# -n: run the script with __name__ set to the file's own name instead of
# '__main__', so any `if __name__ == '__main__':` section in it is skipped.
%run -n metric.py
%run utils.py
%run pipeline.py
%run w2v_features.py
%run features.py

In [3]:
# Days 24-27 were available for training; day 28 and the final holdout
# were not available at that time.
train_files = [
    'datasets/train_20170724.json',
    'datasets/train_20170725.json',
    'datasets/train_20170726.json',
    'datasets/train_20170727.json',
]

# Concatenate the records of every daily dump into a single list
# (each file is expected to hold a JSON array at top level).
dialogs = []
for path in train_files:
    # JSON text is UTF-8; state it explicitly so that loading does not depend
    # on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        dialogs.extend(json.load(f))

# dialogs_preproc is defined outside this notebook; it is unpacked here into
# the model inputs and the matching targets.
data_tr, target_tr = dialogs_preproc(dialogs)

In [4]:
# Evaluation data.
# NOTE(review): despite the "train_" prefix this file is used as the test set
# throughout the notebook -- presumably the final holdout that was released
# later; confirm.
test_files = [
    'datasets/train_final.json',
]

# Same loading procedure as for the training files: concatenate the JSON
# arrays of all listed files into one list.
dialogs = []
for path in test_files:
    # JSON text is UTF-8; state it explicitly so that loading does not depend
    # on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        dialogs.extend(json.load(f))

# dialogs_preproc is defined outside this notebook; it is unpacked here into
# the model inputs and the matching targets.
data_tst, target_tst = dialogs_preproc(dialogs)

In [5]:
# Feature extractors handed to simple_transfromer, in their original order.
# Bare names are passed as they are; entries written as calls (get_*())
# contribute whatever object the call returns.
# NOTE(review): the group captions below are inferred from the identifier
# names only -- the extractors themselves are defined outside this notebook.
extractors = [
    # message length
    min_len,
    max_len,
    avg_len,
    # word counts
    min_words,
    max_words,
    avg_words,
    # QRatio / WRatio similarity
    min_QRatio,
    max_QRatio,
    avg_QRatio,
    min_WRatio,
    max_WRatio,
    avg_WRatio,
    # n-gram overlap
    ngram_intersection,
    # vocabulary-based extractors built by factory calls
    get_human_words(),
    get_human_words_norm(),
    get_bot_words(),
    get_bot_words_norm(),
    get_stop_words_share(),
    # partial-ratio similarity
    min_partial_ratio,
    max_partial_ratio,
    avg_partial_ratio,
    # dialogue structure
    dialogue_len,
    dialog_len_share,
    max_in_row,
    started_dialogue,
    finished_dialogue,
    # word2vec vocabulary coverage
    get_words_not_in_w2v(),
    # timing
    min_reply_time,
    avg_reply_time,
    # surface / punctuation cues
    two_phrases_in_a_row,
    capital_letters_ratio,
    question_marks,
    has_a_question,
    punctuation_marks,
]

# Full feature pipeline: the hand-crafted extractors first, then the PCA word
# and character features for each side, then the 2- and 3-gram frequency
# features. Members are created in the same order as they are listed.
tr1 = DataFrameFeatureUnion(
    [simple_transfromer(extractors)]
    + [PCAWordFeatures(side=side) for side in ('user1', 'user2')]
    + [PCACharFeatures(side=side) for side in ('user1', 'user2')]
    + [FreqNgrams(ngram_len=n) for n in (2, 3)]
)

In [6]:
%%time
# Fit the feature pipeline on the training dialogues only, then apply the
# already-fitted pipeline to the evaluation dialogues (no refitting on them).
features_tr = tr1.fit_transform(data_tr)
features_tst = tr1.transform(data_tst)

CPU times: user 3min 6s, sys: 1.82 s, total: 3min 8s
Wall time: 3min 11s


In [7]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest regressor fitted directly on the target.
# random_state pins the forest's bootstrap / feature sampling; without it the
# score changes on every run and a recorded number cannot be reproduced.
m = RandomForestRegressor(
    n_estimators=200,
    max_depth=6,
    min_samples_leaf=5,
    random_state=0,
)
m.fit(features_tr, target_tr)

# Score on the evaluation features; spearman_scorer (defined outside this
# notebook) is called with (estimator, X, y).
spearman_scorer(m, features_tst, target_tst)

0.59198789016217235

In [8]:
from catboost import CatBoostRegressor

# CatBoost regressor with its default settings (no parameters overridden).
# Rebinds the notebook-global `m`, replacing the previously fitted model.
m = CatBoostRegressor()
m.fit(features_tr, target_tr)

# Score on the evaluation features; spearman_scorer (defined outside this
# notebook) is called with (estimator, X, y).
spearman_scorer(m, features_tst, target_tst)

  return f(*args, **kwds)


0.69830802084401244

In [50]:
from catboost import CatBoostRegressor

# Same CatBoost model, but trained with the Poisson loss instead of the
# default one.
# NOTE(review): presumably chosen because the target is a non-negative
# score -- confirm against dialogs_preproc.
m = CatBoostRegressor(loss_function='Poisson')
m.fit(features_tr, target_tr)

# Score on the evaluation features; spearman_scorer (defined outside this
# notebook) is called with (estimator, X, y).
spearman_scorer(m, features_tst, target_tst)

  return f(*args, **kwds)


0.72215377711892592

In [53]:
from xgboost import XGBRegressor

# XGBoost regressor with the Poisson objective: 500 trees of depth 3 with a
# learning rate of 0.03.
# Rebinds the notebook-global `m`, replacing the previously fitted model.
m = XGBRegressor(n_estimators=500, max_depth=3, learning_rate=0.03, objective='count:poisson')
m.fit(features_tr, target_tr)

# Score on the evaluation features; spearman_scorer (defined outside this
# notebook) is called with (estimator, X, y).
spearman_scorer(m, features_tst, target_tst)

0.70423843038732781

In [56]:
from lightgbm import LGBMRegressor

# LightGBM regressor with the Poisson objective, using the same tree count,
# depth and learning rate as the XGBoost run.
# Rebinds the notebook-global `m`, replacing the previously fitted model.
m = LGBMRegressor(n_estimators=500, max_depth=3, learning_rate=0.03, objective='poisson')
m.fit(features_tr, target_tr)

# Score on the evaluation features; spearman_scorer (defined outside this
# notebook) is called with (estimator, X, y).
spearman_scorer(m, features_tst, target_tst)

0.67469713219748628

In [93]:
from sklearn.model_selection import train_test_split

def prepare_pairs(features, target, n_pairs=100, random_state=None):
    """Expand a per-row dataset into a pairwise-comparison dataset.

    Every row of ``features`` is used once as the *anchor*. For each anchor,
    ``n_pairs`` other rows (*partners*) are drawn with ``train_test_split``,
    stratified by their target. Each resulting pair row holds the partner's
    features under the original column names and the anchor's features under
    ``sample_<column>`` names. The label of a pair is True when the partner's
    target is strictly lower than the anchor's target.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table indexed by a two-level MultiIndex named
        ``('dialogId', 'Id')``.
    target : pandas.Series
        Target values carrying the same index as ``features``.
    n_pairs : int, default 100
        Number of partners drawn per anchor (passed to ``train_test_split``
        as ``train_size``).
    random_state : None, int or numpy.random.RandomState, default None
        Seed for the partner sampling. An int is turned into one
        ``RandomState`` that is shared by all anchors, so the whole set of
        pairs is reproducible while different anchors still get different
        draws. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pairs : pandas.DataFrame
        One row per (partner, anchor) pair, indexed by
        ``('dialogId', 'Id', 'sample_dialog_id', 'sample_id')`` where the
        first two levels identify the partner and the last two the anchor.
    pairs_y : pandas.Series
        Boolean labels with the same index as ``pairs``.

    Raises
    ------
    ValueError
        If ``features`` is empty, or (raised by ``train_test_split``) if
        ``n_pairs`` partners cannot be drawn in a stratified way.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    if len(features) == 0:
        raise ValueError('prepare_pairs needs at least one row in `features`')

    # One shared random stream for all anchors; reusing the same int for every
    # split would give every anchor the same permutation.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    pair_index = ['dialogId', 'Id', 'sample_dialog_id', 'sample_id']
    slices_x, slices_y = [], []
    for e in features.index:
        row = features.loc[e]
        row_y = target.loc[e]
        sample_dialog_id, sample_id = e

        # Partners are drawn from everything except the anchor itself.
        _features = features.drop(e, axis=0)
        _target = target.drop(e, axis=0)
        sample_x, _x, sample_y, _y = train_test_split(
            _features,
            _target,
            train_size=n_pairs,
            stratify=_target,
            random_state=rng)

        # Attach the anchor's features in one concat instead of inserting the
        # columns one by one, which fragments the frame.
        anchor_names = ['sample_{}'.format(col) for col in row.index]
        anchor_cols = pd.DataFrame(
            dict(zip(anchor_names, (val for _, val in row.items()))),
            index=sample_x.index,
            columns=anchor_names)
        sample_x = pd.concat([sample_x, anchor_cols], axis=1)
        sample_x['sample_dialog_id'] = sample_dialog_id
        sample_x['sample_id'] = sample_id
        slices_x.append(sample_x.reset_index().set_index(pair_index))

        partner_y = sample_y.copy().to_frame()
        partner_y['sample_dialog_id'] = sample_dialog_id
        partner_y['sample_id'] = sample_id
        # Select the single remaining column explicitly; .squeeze() would
        # collapse to a scalar when only one partner is drawn.
        partner_y = partner_y.reset_index().set_index(pair_index).iloc[:, 0]
        slices_y.append(partner_y < row_y)

    pairs = pd.concat(slices_x)
    pairs_y = pd.concat(slices_y)
    return pairs, pairs_y

In [94]:
%%time
# Expand the training features into the pairwise dataset: every training row
# is paired with other training rows sampled by prepare_pairs, and each pair
# is labelled by comparing the two targets.
pairs_x, pairs_y = prepare_pairs(features_tr, target_tr)

CPU times: user 3min 56s, sys: 7.16 s, total: 4min 3s
Wall time: 4min 15s


In [95]:
from lightgbm import LGBMClassifier

# Pairwise model: a binary classifier over the pairs built by prepare_pairs,
# where the label is True when the partner's target is lower than the
# anchor's target.
# Rebinds the notebook-global `m`; the pairwise scoring cell relies on `m`
# being this classifier.
m = LGBMClassifier(n_estimators=500, max_depth=3, learning_rate=0.03)
m.fit(pairs_x, pairs_y)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.03,
        max_bin=255, max_depth=3, min_child_samples=10, min_child_weight=5,
        min_split_gain=0, n_estimators=500, nthread=-1, num_leaves=31,
        objective='binary', reg_alpha=0, reg_lambda=0, seed=0, silent=True,
        subsample=1, subsample_for_bin=50000, subsample_freq=1)

In [96]:
%%time
# Build the same pairwise representation for the evaluation rows; partners
# are sampled from the evaluation set itself.
# NOTE(review): prepare_pairs stratifies the partner sampling on the target
# it is given, so target_tst influences which partners are drawn here --
# confirm this is acceptable for the evaluation.
pairs_tst, pairs_tst_y = prepare_pairs(features_tst, target_tst)

CPU times: user 2min 9s, sys: 2.99 s, total: 2min 12s
Wall time: 2min 17s


In [97]:
# Class probabilities for every (partner, anchor) pair of the evaluation set.
preds = m.predict_proba(pairs_tst)

# Keep column 1 of the probabilities together with the identifiers of the
# anchor each pair belongs to.
dialogue_preds = pd.DataFrame({'scores': preds[:, 1]}, index=pairs_tst.index)
dialogue_preds = dialogue_preds.reset_index()[['sample_dialog_id', 'sample_id', 'scores']]

# Add up the pair probabilities of each anchor; the total is used as the
# anchor's ranking score.
scores = dialogue_preds.groupby(['sample_dialog_id', 'sample_id']).sum()

# Put the true targets and the ranking scores side by side, aligned on index.
pred_true_df = pd.DataFrame({'target': target_tst, 'rank': scores['scores']})

spearman(pred_true_df['target'], pred_true_df['rank'])

0.73324411074387852