In [1]:
import os

# Project root. Defaults to the original author's checkout so existing runs
# behave the same; set the TFM_MAIN_PATH environment variable to run the
# notebook from another machine without editing this cell.
MAIN_PATH = os.environ.get("TFM_MAIN_PATH", "/home/carlos/MasterDS/tfm")
# Data directories derived from the project root (trailing slash is kept
# because the values are built as plain strings, not Path objects).
JSON_DATA_PATH = '{}/data/json/'.format(MAIN_PATH)
CSV_DATA_PATH = '{}/data/csv/'.format(MAIN_PATH)

In [2]:
import sys
# Put the project root first on the module search path so that the
# `scripts.*` imports used by this notebook can be resolved
# (presumably the `scripts` package lives directly under MAIN_PATH).
sys.path.insert(0, MAIN_PATH)

In [3]:
%load_ext autoreload
%autoreload 2
from scripts.models.rank_with_model.rank_with_model import RankModel
from scripts.models.rank_with_model.metrics import RankModelMetrics
from scripts.models.ltr.train import LTRTrain
%reload_ext autoreload

# Learning to Rank: salida ordenada

Una vez que se ha obtenido un score que permite ordenar los eventos dentro
de un partido, se debe construir el "resumen" a partir de esa ordenación.

La aproximación más sencilla sería ordenar los eventos de cada partido y escoger los N eventos con mayor score para representar la noticia. Sin embargo, como ya se ha comentado con
anterioridad, esta aproximación presenta varios problemas: puede haber muchos
eventos con el mismo score y, además, muchos eventos pueden estar
referidos a la misma frase del artículo.

# Uso de modelos

Una vez obtenido un baseline usando directamente los scores, se
pasa a ordenar los eventos según los scores predichos por los modelos entrenados.

## Random forest

In [4]:
# --- Features fed to the model -------------------------------------------
num_features = ['tfidf_sum']
# Categorical feature name -> list of values declared for it
# (presumably the allowed categories used for encoding; confirm in LTRTrain).
cat_features_dict = {
    'is_key_event': [0, 1],
    'n_players_cat': ['no_player', 'one_player', 'more_than_one_player'],
}

# --- Learning-to-rank data/target configuration (passed as `ltr_params`) ---
ltr_params = {
    'key_events': ['goal', 'red_card', 'penalty'],
    'lags': [1, 3, 5],
    'target_metric': 'cosine_emb',
    'drop_teams': True,
    'lemma': True,
    'metric_params': {
        'embedding': 'roberta-base-nli-stsb-mean-tokens',
        'text_process': 'basic',
    },
    'count_vec_kwargs': {
        'ngram_range': (1, 2),
        'strip_accents': 'unicode',
    },
}

# --- Model search settings (passed as `model_params`, `cv`, `opt_metric`) ---
# Each entry lists candidate values; this looks like a hyper-parameter grid
# for the random forest of this section (assumption; confirm in LTRTrain).
model_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [2, 5, 7],
    'bootstrap': [True],
}
cv = 5
opt_metric = 'neg_mean_squared_error'

In [5]:
# CV: build the learning-to-rank trainer from the feature declarations,
# the candidate model parameters and the cross-validation settings
# defined in the configuration cell.
ltr_train = LTRTrain(
    cat_features_dict=cat_features_dict,
    num_features=num_features,
    model_params=model_params,
    ltr_params=ltr_params,
    cv=cv,
    opt_metric=opt_metric,
)

Setting target metric to cosine_emb


In [6]:
# Wrap the trainer in the ranking model. n=10 is presumably the number of
# top-ranked events kept per match (TODO confirm in RankModel).
rank_model = RankModel(ltr_train, n=10)

Model already trained


In [7]:
# Run the ranking pipeline. Per the recorded log it ranks events with a
# "row_number" approach and saves summaries.csv under the model directory.
rank_model.run()

Categorizing n_players...
Computing new length...
Dropping {'n_stop', 'advantage', 'sim_previous_5', 'sentence_ix', 'total_length', 'length', 'n_players', 'json_file', 'equalize', 'players_importance', 'event_ix', 'url', 'sim_previous_1', 'score', 'position', 'sim_previous_3'}
Ranking events using row_number approach...
Saving to /home/carlos/MasterDS/tfm/models/rank_with_model/975aab0d92/summaries.csv


## Metrics

In [15]:
# Fetch the ground-truth targets through the trainer nested inside the rank
# model (the recorded log shows they are read from a targets.csv file).
ground_truth_df = rank_model.ltr.ltr.targets.get_targets()

Reading targets from /home/carlos/MasterDS/tfm/data/ltr/targets/6467a8c4d1/targets.csv


In [14]:
# Model scores for every event; the frame displayed further down has the
# columns url, event_ix and score.
scores_df = rank_model.get_scores_df()

Categorizing n_players...
Computing new length...
Dropping {'position', 'total_length', 'sim_previous_3', 'json_file', 'advantage', 'score', 'equalize', 'url', 'event_ix', 'n_players', 'sim_previous_1', 'sentence_ix', 'players_importance', 'length', 'n_stop', 'sim_previous_5'}


In [20]:
# NOTE(review): execution counts show this display (In [20]) ran after the
# rename cell placed below it (In [19]), which is why the target column
# already appears as `ground_truth` in the recorded output.
ground_truth_df

Unnamed: 0,event_ix,sentence_ix,ground_truth,url,json_file
0,0,4,0.409866,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
1,1,3,0.417585,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
2,2,4,0.318657,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
3,3,3,0.134359,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
4,4,4,0.419201,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
...,...,...,...,...,...
423795,107,6,0.297914,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423796,108,6,0.199009,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423797,109,7,0.491633,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423798,110,7,0.350315,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json


In [19]:
# Rename the target column so it does not clash with the model's `score`
# column when both frames are merged.
# Rebind instead of using inplace=True: the frame was returned by
# rank_model's targets object, and mutating it in place could leak the
# rename into that object's state and makes re-running cells order-dependent.
# NOTE(review): this cell was executed before the display cell above it
# (In [19] vs In [20]); it belongs right after the get_targets() call.
ground_truth_df = ground_truth_df.rename(columns={'score': 'ground_truth'})

In [17]:
# Example match used below to compute NDCG by hand for a single game.
url = 'http://www.premierleague.com/match/38678'

In [18]:
# Inspect the predicted scores (columns: url, event_ix, score).
scores_df

Unnamed: 0,url,event_ix,score
0,http://www.premierleague.com/match/38678,0,0.387173
1,http://www.premierleague.com/match/38678,1,0.259158
2,http://www.premierleague.com/match/38678,2,0.356472
3,http://www.premierleague.com/match/38678,3,0.256403
4,http://www.premierleague.com/match/38678,4,0.348194
...,...,...,...
423795,https://www.espn.com/soccer/report?gameId=522804,107,0.261912
423796,https://www.espn.com/soccer/report?gameId=522804,108,0.254965
423797,https://www.espn.com/soccer/report?gameId=522804,109,0.390691
423798,https://www.espn.com/soccer/report?gameId=522804,110,0.356763


In [27]:
# Pair each predicted score with its ground-truth value by matching on
# (url, event_ix), keeping only events present in both frames, then discard
# the identifier columns that are not needed to compute the metric.
drop_cols = ['json_file', 'event_ix', 'sentence_ix']
results_df = (
    scores_df
    .merge(ground_truth_df, on=['url', 'event_ix'], how='inner')
    .drop(columns=drop_cols)
)

In [28]:
# Inspect the merged frame: one predicted `score` and one `ground_truth`
# value per row, keyed by match url.
results_df

Unnamed: 0,url,score,ground_truth
0,http://www.premierleague.com/match/38678,0.387173,0.409866
1,http://www.premierleague.com/match/38678,0.259158,0.417585
2,http://www.premierleague.com/match/38678,0.356472,0.318657
3,http://www.premierleague.com/match/38678,0.256403,0.134359
4,http://www.premierleague.com/match/38678,0.348194,0.419201
...,...,...,...
423795,https://www.espn.com/soccer/report?gameId=522804,0.261912,0.297914
423796,https://www.espn.com/soccer/report?gameId=522804,0.254965,0.199009
423797,https://www.espn.com/soccer/report?gameId=522804,0.390691,0.491633
423798,https://www.espn.com/soccer/report?gameId=522804,0.356763,0.350315


In [29]:
# Keep only the rows belonging to the example match.
url_df = results_df.loc[results_df['url'] == url]

In [38]:
from sklearn.metrics import ndcg_score
import numpy as np

In [39]:
# ndcg_score works on 2-D inputs (one row per query), so the single match is
# wrapped as a 1 x n_events array for both predictions and true relevances.
scores = url_df['score'].to_numpy()[np.newaxis, :]
ground_truth = url_df['ground_truth'].to_numpy()[np.newaxis, :]

In [40]:
# Predicted scores of the example match as a single-row 2-D array.
scores

array([[0.38717349, 0.25915811, 0.35647164, 0.25640289, 0.34819362,
        0.39744092, 0.38657355, 0.25636408, 0.37185176, 0.38690945,
        0.29263462, 0.39080962, 0.38717349, 0.37167088, 0.34352603,
        0.25898455, 0.36696722, 0.24682219, 0.36644667, 0.37252241,
        0.29124762, 0.50213291, 0.49682173, 0.29109324, 0.37113143,
        0.3873369 , 0.25832205, 0.35456027, 0.34388373, 0.25832205,
        0.39734621, 0.39648939, 0.29554012, 0.49782297, 0.36106491,
        0.24682219, 0.37282961, 0.26326241, 0.38009759, 0.29109324,
        0.34348482, 0.25898455, 0.25869545, 0.34350594, 0.25869545,
        0.34348482, 0.36153483, 0.26051243, 0.39734621, 0.38737585,
        0.25882541, 0.37060976, 0.37059736, 0.26129537, 0.39769341,
        0.2915095 , 0.50123063, 0.37014893, 0.24561336, 0.24436686,
        0.26067017, 0.38549774, 0.25636408, 0.37046694, 0.49403585,
        0.37042084, 0.25640289, 0.28197471, 0.3704534 , 0.25636408,
        0.25640289, 0.37043263, 0.25487725, 0.25

In [41]:
# Ground-truth values of the example match as a single-row 2-D array.
ground_truth

array([[0.40986603, 0.41758507, 0.31865731, 0.13435927, 0.41920128,
        0.38135689, 0.4614774 , 0.20302108, 0.40898579, 0.37311709,
        0.29680306, 0.39059401, 0.40986603, 0.2527889 , 0.3825624 ,
        0.3785888 , 0.51525545, 0.25448772, 0.58829701, 0.27070698,
        0.3140164 , 0.43958074, 0.49398625, 0.33381161, 0.36793208,
        0.34364462, 0.4074727 , 0.34399807, 0.31291628, 0.4074727 ,
        0.29483151, 0.4116157 , 0.27980566, 0.57544255, 0.47494888,
        0.17011783, 0.25148401, 0.30301556, 0.40850282, 0.33381161,
        0.55360907, 0.2358924 , 0.24368083, 0.38276559, 0.24368083,
        0.34998846, 0.41154194, 0.19155502, 0.23389085, 0.44498497,
        0.17830354, 0.33378857, 0.4064461 , 0.1538114 , 0.41452235,
        0.27531984, 0.49988562, 0.45725051, 0.16821909, 0.37322342,
        0.31500453, 0.24832232, 0.20302108, 0.34513324, 0.6317721 ,
        0.4530853 , 0.13435927, 0.20192146, 0.58809513, 0.20302108,
        0.13435927, 0.37342313, 0.23059903, 0.10

In [62]:
# NDCG@10 for the example match: true relevances go first, predicted scores
# second (sklearn's y_true, y_score order). k=10 matches the n=10 passed to
# RankModel.
ndcg_score(ground_truth, scores, k=10)

0.8089719576128829

In [59]:
# Toy input for a sanity check of ndcg_score: `a` holds relevances that
# increase left to right, `b` holds the same values in reversed order.
a = [[0.1, 0.2, 0.3]]
b = [list(reversed(row)) for row in a]

In [61]:
# Toy check: when the predicted scores are the exact reverse of the true
# relevances, NDCG@2 falls well below 1 (about 0.53 in the recorded output).
ndcg_score(a, b, k=2)

0.5307212739772436

In [63]:
# Directory associated with this rank model (run() logged saving
# summaries.csv under a path of the same form).
# NOTE(review): the hash in the recorded output differs from the one logged
# by run() earlier in the notebook, so the saved outputs come from
# different runs/configurations.
rank_model.path

'/home/carlos/MasterDS/tfm/models/rank_with_model/3829e2a65c'

In [64]:
# Scratch array for checking np.mean. Note that this rebinds `a`, which
# previously held the nested-list toy relevances.
a = np.array([1, 2, 3])

In [65]:
# Scratch check: arithmetic mean of the toy array.
np.mean(a)

2.0

## Usando librería

In [8]:
# Project helper for computing ranking metrics for the model (presumably the
# library version of the manual NDCG computation done earlier; TODO confirm).
metrics = RankModelMetrics(rank_model)

In [9]:
# Compute the metric over the whole dataset; the recorded run iterated over
# 4523 items and took roughly a minute and a half.
avg_metric = metrics.get_metrics()

Categorizing n_players...
Computing new length...
Dropping {'n_stop', 'advantage', 'sim_previous_5', 'sentence_ix', 'total_length', 'length', 'n_players', 'json_file', 'equalize', 'players_importance', 'event_ix', 'url', 'sim_previous_1', 'score', 'position', 'sim_previous_3'}
Reading targets from /home/carlos/MasterDS/tfm/data/ltr/targets/819812546b/targets.csv


100%|██████████| 4523/4523 [01:34<00:00, 47.69it/s]


In [79]:
# Resulting aggregate metric (presumably the mean NDCG across matches;
# TODO confirm in RankModelMetrics).
avg_metric

0.8401264834493798