In [2]:
MAIN_PATH = "/home/carlos/MasterDS/tfm"
JSON_DATA_PATH = '{}/data/json/'.format(MAIN_PATH)
CSV_DATA_PATH = '{}/data/csv/'.format(MAIN_PATH)

In [3]:
import sys
sys.path.insert(0, MAIN_PATH)

In [4]:
%load_ext autoreload
%autoreload 2
from scripts.text.article_text_processor import ArticleTextProcessor
from scripts.text.basic_text_processor import BasicTextProcessor
from scripts.extractive_summary.ltr.learn_to_rank import LearnToRank
from scripts.extractive_summary.ltr.ltr_features_targets import LTRFeaturesTargets 
from scripts.extractive_summary.ltr.ltr_features import LTRFeatures
from scripts.extractive_summary.ltr.ltr_targets import LTRTargets
from scripts.extractive_summary.ltr.ltr_features_tf import LTRFeaturesTF
from scripts.extractive_summary.ltr.ltr_features_targets_tf import LTRFeaturesTargetsTF

from scripts.conf import TEAMS

from rouge import Rouge

%reload_ext autoreload

In [4]:
import pandas as pd

# Learning to Rank

El objetivo es entrenar un algoritmo que sea capaz de ordenar los eventos de acuerdo a la "probabilidad" de que aparezcan en las noticias.
La propuesta es la siguiente (basado en este [artículo](https://www.aclweb.org/anthology/P16-1129.pdf)):

- Primero, se debe asignar un score a cada evento, que cuantifique lo propenso que es un evento a tener información que aparece en el artículo.
Para ello, se proponen diferentes métodos para obtener este score, que se basan en calcular distancias o palabras en común entre cada evento y las 
frases de cada artículo

- Una vez construidos los targets, debemos sintetizar la información de cada evento en un vector numérico de características. Las features propuestas se
detallan en la sección Features.

- Con todo junto, tendremos para cada partido, vectores de características con su correspondiente target, por lo que se puede entrenar un modelo supervisado
que trate de predecir este target. Al ser numérico, la primera aproximación puede ser entrenar un modelo de regresión, aunque se podría convertir en un 
problema de clasificación si fijamos un umbral de aparece/ no aparece, para poder estimar una probabilidad usando un algoritmo de clasificación.


## Targets

In [5]:
processor = ArticleTextProcessor()
text_proc = BasicTextProcessor()

In [6]:
all_files = processor.load_json()

In [7]:
season_file = 'premier_league_2019_2020.json'
league_season_teams = TEAMS[season_file.split('.')[0]]

In [29]:
all_files[season_file].keys()

dict_keys(['http://www.premierleague.com/match/38678', 'http://www.premierleague.com/match/38679', 'http://www.premierleague.com/match/38680', 'http://www.premierleague.com/match/38681', 'http://www.premierleague.com/match/38682', 'http://www.premierleague.com/match/38683', 'http://www.premierleague.com/match/38684', 'http://www.premierleague.com/match/38685', 'http://www.premierleague.com/match/38686', 'http://www.premierleague.com/match/38687', 'http://www.premierleague.com/match/38674', 'http://www.premierleague.com/match/38671', 'http://www.premierleague.com/match/38673', 'http://www.premierleague.com/match/38668', 'http://www.premierleague.com/match/38669', 'http://www.premierleague.com/match/38676', 'http://www.premierleague.com/match/38677', 'http://www.premierleague.com/match/38670', 'http://www.premierleague.com/match/38675', 'http://www.premierleague.com/match/38672', 'http://www.premierleague.com/match/38662', 'http://www.premierleague.com/match/38659', 'http://www.premierle

In [8]:
match_dict = all_files[season_file]["http://www.premierleague.com/match/46975"]
events = match_dict['events']

In [27]:
match_dict['article']

"Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the race for the Golden Boot.\nAubameyang opened the scoring in the fifth minute from the penalty spot after a Video Assistant Referee review of Craig Dawson's foul on Alexandre Lacazette.\nArsenal doubled their advantage on 24 minutes through Kieran Tierney's first goal for the club.\xa0\nIt was 3-0 on 33 minutes, Aubameyang producing an overhead kick into the net.\nWatford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.\nWelbeck fired in from Ismaila Sarr's cross to make it 3-2 in the 66th minute before Emiliano Martinez denied the former Arsenal forward an equaliser.\nArsenal rise two places to eighth on 56 points, while\xa0Watford go down in 19th with 34 points.\xa0\nSee: Arsenal report |\xa0Watford report"

In [28]:
events

['Penalty conceded by Craig Dawson (Watford) after a foul in the penalty area.',
 'Penalty Arsenal. Alexandre Lacazette draws a foul in the penalty area.',
 'VAR Decision: Penalty Arsenal.',
 'Goal!   Arsenal 1, Watford 0. Pierre-Emerick Aubameyang (Arsenal) converts the penalty with a right footed shot to the bottom left corner.',
 'Attempt missed. Danny Welbeck (Watford) left footed shot from the left side of the box is high and wide to the left. Assisted by Adam Masina.',
 'Attempt saved. Ismaila Sarr (Watford) right footed shot from the centre of the box is saved in the centre of the goal. Assisted by Abdoulaye Doucouré with a cross.',
 'Troy Deeney (Watford) wins a free kick in the defensive half.',
 'Foul by Dani Ceballos (Arsenal).',
 'Corner,  Watford. Conceded by Kieran Tierney.',
 'Granit Xhaka (Arsenal) wins a free kick in the defensive half.',
 'Foul by Christian Kabasele (Watford).',
 'Attempt blocked. Pierre-Emerick Aubameyang (Arsenal) right footed shot from outside the 

### ROUGE

Utilizaremos esta métrica para asignar un score a cada par evento-frase artículo, para generar un target que indique qué evento tiene 
más opciones de aparecer en el resumen. Con este target se entrenará un modelo Learning to rank, de tal forma que se pueda construir un resumen 
con el conjunto de eventos más representativo de cada partido. Inspirado en [link](https://www.aclweb.org/anthology/P16-1129.pdf)

Probamos los tipos de ROUGE disponibles en el paquete: ROUGE-1, 2 y ROUGE-L. También probamos a usar f1 score y recall como score, ya que hemos visto
con los anteriores experimentos que el las palabras de los eventos.

Vemos que esto puede tener varios problemas:

- Al ser los eventos por lo general mucho más largos que los resúmenes, es probable que la información resultante sea redundante
- Los estilos y palabras usados son bastante distintos en los eventos (más simples y menor vocabulario) que en los artículos (uso de frases más
complejas y una mayor variedad en el vocabulario)

__ROUGE-1__

In [14]:
metric_params = {'rouge_mode': 'rouge-1', 'rouge_metric': 'r'}

In [15]:
ltr_metrics = LTRTargets(metric='rouge', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to rouge


In [16]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams)

En este ejemplo se ve muy bien uno de los problemas. Para la misma frase del artículo aparecen muchos eventos!

In [17]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.42857142857142855
Event: Goal.  Arsenal 3, Watford 0. Pierre-Emerick Aubameyang (Arsenal) right footed shot from very close range to the top left corner. Assisted by Kieran Tierney.
Nearest article sentence: Arsenal doubled their advantage on 24 minutes through Kieran Tierney's first goal for the club. 

Score: 0.42857142857142855
Event: Goal.  Arsenal 2, Watford 0. Kieran Tierney (Arsenal) left footed shot from the centre of the box to the bottom right corner. Assisted by Pierre-Emerick Aubameyang.
Nearest article sentence: Arsenal doubled their advantage on 24 minutes through Kieran Tierney's first goal for the club. 

Score: 0.36363636363636365
Event: Attempt saved. Danny Welbeck (Watford) right footed shot from the centre of the box is saved in the centre of the goal. Assisted by Troy Deeney.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.36363636363636365
Eve

__ROUGE-2__

Al usar esta métrica, las correspondencias que se obtienen no tienen demasiado sentido...

In [20]:
metric_params = {'rouge_mode': 'rouge-2', 'rouge_metric': 'r'}

In [21]:
ltr_metrics = LTRTargets(metric='rouge', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to rouge


In [22]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams)

Con bigramas la cosa se complica, y parece que se base unicamente en los nombres de los futbolistas.

In [23]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.3
Event: Goal!   Arsenal 3, Watford 1. Troy Deeney (Watford) converts the penalty with a right footed shot to the bottom right corner.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.2222222222222222
Event: Attempt saved. Pierre-Emerick Aubameyang (Arsenal) right footed shot from the left side of the box is saved in the bottom left corner. Assisted by Eddie Nketiah.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the race for the Golden Boot.

Score: 0.2222222222222222
Event: Pierre-Emerick Aubameyang (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the race for the Golden Boot.

Score: 0.2222222222222222

__ROUGE-L__

In [24]:
metric_params = {'rouge_mode': 'rouge-l', 'rouge_metric': 'r'}

In [25]:
ltr_metrics = LTRTargets(metric='rouge', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to rouge


In [26]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams)

Este realiza una mezcla

In [27]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.36363636363636365
Event: Goal!   Arsenal 3, Watford 1. Troy Deeney (Watford) converts the penalty with a right footed shot to the bottom right corner.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.36363636363636365
Event: Penalty Watford. Danny Welbeck draws a foul in the penalty area.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.3333333333333333
Event: Pierre-Emerick Aubameyang (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: It was 3-0 on 33 minutes, Aubameyang producing an overhead kick into the net.

Score: 0.3
Event: Attempt saved. Pierre-Emerick Aubameyang (Arsenal) right footed shot from the left side of the box is saved in the bottom left corner. Assisted by Eddie Nketiah.
Nearest article sentence: Watford were re

### Distancia coseno

Usando esta distancia, empezamos a tener claro que muchas de las correspondencias se deben únicamente a la aparición del nombre de un jugador.
De nuevo vemos como muchos eventos se asocian a la misma frase del artículo (sin mucha información) solo porque coincide de pleno con el nombre.

In [9]:
metric_params = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

In [10]:
ltr_metrics = LTRTargets(metric='cosine_tfidf', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to cosine_tfidf


In [11]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=True, league_season_teams=league_season_teams)

Event: Penalty conceded by Craig Dawson (Watford) after a foul in the penalty area.
Nearest article sentence: Aubameyang opened the scoring in the fifth minute from the penalty spot after a Video Assistant Referee review of Craig Dawson's foul on Alexandre Lacazette.
Processed event: penalty concede craig dawson foul penalty area
Processed article sentence: aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre

Event: Penalty Arsenal. Alexandre Lacazette draws a foul in the penalty area.
Nearest article sentence: Aubameyang opened the scoring in the fifth minute from the penalty spot after a Video Assistant Referee review of Craig Dawson's foul on Alexandre Lacazette.
Processed event: penalty alexandre draw foul penalty area
Processed article sentence: aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre

Event: VAR Decision: Penalty Arsenal.
Nearest article sentence: Watford 

In [35]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.5477420284957557
Event: Goal!   Arsenal 3, Watford 1. Troy Deeney (Watford) converts the penalty with a right footed shot to the bottom right corner.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.4996658472922424
Event: Attempt saved. Danny Welbeck (Watford) right footed shot from the centre of the box is saved in the centre of the goal. Assisted by Troy Deeney.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.4948205264658473
Event: Attempt saved. Pierre-Emerick Aubameyang (Arsenal) right footed shot from the left side of the box is saved in the bottom left corner. Assisted by Eddie Nketiah.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the ra

__Examinando tfidf__

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
import pandas as pd

In [15]:
proc_events, proc_article_sents = ltr_metrics._process_events_article(match_dict)

In [16]:
proc_events

['penalty concede craig dawson foul penalty area',
 'penalty alexandre draw foul penalty area',
 'var decision penalty',
 'goal pierre emerick aubameyang convert penalty footed shot left corner',
 'attempt miss danny welbeck leave footed shot left box high wide left assist adam masina',
 'attempt save sarr footed shot centre box save centre goal assist doucoure cross',
 'troy deeney win free kick defensive half',
 'foul dani ceballos',
 'corner concede kieran tierney',
 'granit xhaka win free kick defensive half',
 'foul christian kabasele',
 'attempt block pierre emerick aubameyang footed shot outside box block assist pepe',
 'doucoure win free kick attack half',
 'foul alexandre',
 'attempt block doucoure leave footed shot centre box block assist sarr',
 'attempt save roberto pereyra footed shot left yard box save centre goal assist doucoure',
 'corner concede rob holding',
 'corner concede ainsley',
 'attempt miss danny welbeck header centre box miss assist roberto pereyra cross fol

In [17]:
proc_article_sents

['relegate premier league lose pierre emerick aubameyang brace race golden',
 'aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre',
 'double advantage minute kieran tierney goal club',
 'minute aubameyang produce overhead kick net',
 'pull minute troy deeney convert penalty danny welbeck foul david luiz',
 'welbeck fire sarr cross minute emiliano martinez deny forward equaliser',
 'rise eighth point point',
 '',
 'report report']

In [41]:
[proc_events.index(event) for event in proc_events if 'tierney' in event]

[8, 24, 27, 29, 8, 88, 29]

In [42]:
[event for event in proc_events if 'tierney' in event]

['corner concede kieran tierney',
 'goal kieran tierney leave footed shot centre box corner assist pierre emerick aubameyang',
 'goal pierre emerick aubameyang footed shot close range left corner assist kieran tierney',
 'kieran tierney win free kick defensive half',
 'corner concede kieran tierney',
 'attempt miss kieran tierney leave footed shot difficult angle left high wide left assist reiss nelson',
 'kieran tierney win free kick defensive half']

In [53]:
proc_article_sents

['relegate premier league lose pierre emerick aubameyang brace race golden',
 'aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre',
 'double advantage minute kieran tierney goal club',
 'minute aubameyang produce overhead kick net',
 'pull minute troy deeney convert penalty danny welbeck foul david luiz',
 'welbeck fire sarr cross minute emiliano martinez deny forward equaliser',
 'rise eighth point point',
 'report report']

In [54]:
len(proc_article_sents)

8

In [55]:
len(proc_events)

105

In [44]:
count_vec_kwargs = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

In [45]:
pipe = Pipeline([('count', CountVectorizer(**count_vec_kwargs)),
                         ('tfid', TfidfTransformer())])

In [46]:
X = pipe.fit_transform(proc_article_sents)

In [47]:
tfidf_df = pd.DataFrame(X.todense(), columns=pipe['count'].get_feature_names())

In [49]:
article_sentences = ltr_metrics.text_proc.get_sentences(match_dict['article'])
article_sentences_text = [str(sent).replace('\n', '') for sent in article_sentences]

In [50]:
article_sentences_text[2]

"Arsenal doubled their advantage on 24 minutes through Kieran Tierney's first goal for the club.\xa0"

In [51]:
proc_article_sents[2]

'double advantage minute kieran tierney goal club'

In [52]:
pd_df_sent = tfidf_df.loc[2]

In [53]:
pd_df_sent[pd_df_sent>0].sort_values(ascending=False)

tierney goal        0.284726
tierney             0.284726
minute kieran       0.284726
kieran tierney      0.284726
kieran              0.284726
goal club           0.284726
goal                0.284726
double advantage    0.284726
double              0.284726
club                0.284726
advantage minute    0.284726
advantage           0.284726
minute              0.164852
Name: 2, dtype: float64

In [54]:
events[24]

'Goal.  Arsenal 2, Watford 0. Kieran Tierney (Arsenal) left footed shot from the centre of the box to the bottom right corner. Assisted by Pierre-Emerick Aubameyang.'

In [55]:
proc_events[24]

'goal kieran tierney leave footed shot centre box corner assist pierre emerick aubameyang'

In [56]:
X_events = pipe.transform(proc_events)

In [57]:
tfidf_events_df = pd.DataFrame(X_events.todense(), columns=pipe['count'].get_feature_names())

In [58]:
pd_df_sent_event = tfidf_events_df.loc[24]

In [59]:
pd_df_sent_event[pd_df_sent_event>0].sort_values(ascending=False)

tierney               0.342207
pierre emerick        0.342207
pierre                0.342207
kieran tierney        0.342207
kieran                0.342207
goal                  0.342207
emerick aubameyang    0.342207
emerick               0.342207
aubameyang            0.251306
Name: 24, dtype: float64

### Distancia coseno (TF)

Probamos a realizar un enfoque más simpre, usando solo un TF.

In [12]:
metric_params = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

In [13]:
ltr_metrics = LTRTargets(metric='cosine_tf', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to cosine_tf


In [14]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=True, league_season_teams=league_season_teams)

Event: Penalty conceded by Craig Dawson (Watford) after a foul in the penalty area.
Nearest article sentence: Aubameyang opened the scoring in the fifth minute from the penalty spot after a Video Assistant Referee review of Craig Dawson's foul on Alexandre Lacazette.
Processed event: penalty concede craig dawson foul penalty area
Processed article sentence: aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre

Event: Penalty Arsenal. Alexandre Lacazette draws a foul in the penalty area.
Nearest article sentence: Aubameyang opened the scoring in the fifth minute from the penalty spot after a Video Assistant Referee review of Craig Dawson's foul on Alexandre Lacazette.
Processed event: penalty alexandre draw foul penalty area
Processed article sentence: aubameyang open scoring fifth minute penalty spot video assistant referee review craig dawson foul alexandre

Event: VAR Decision: Penalty Arsenal.
Nearest article sentence: Watford 

In [15]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.5400617248673217
Event: Goal!   Arsenal 3, Watford 1. Troy Deeney (Watford) converts the penalty with a right footed shot to the bottom right corner.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.512989176042577
Event: Attempt saved. Pierre-Emerick Aubameyang (Arsenal) right footed shot from the left side of the box is saved in the bottom left corner. Assisted by Eddie Nketiah.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the race for the Golden Boot.

Score: 0.512989176042577
Event: Attempt blocked. Pierre-Emerick Aubameyang (Arsenal) right footed shot from outside the box is blocked. Assisted by Nicolas Pépé.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not

### Distancia Levenshtein

Esta distancia se basa en el número de cambios a realizar en una secuencia
para conseguir otra. Se puede aplicar tanto a palabras como a secuencias
más largas.

In [16]:
metric_params = {'norm': True}

In [18]:
ltr_metrics = LTRTargets(metric='leve', metric_params=metric_params, lemma=False, drop_teams=True)

Setting target metric to leve


In [None]:
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=True, league_season_teams=league_season_teams)

In [15]:
ltr_metrics.print_scores_info(match_dict, event_article_list)

Score: 0.5400617248673217
Event: Goal!   Arsenal 3, Watford 1. Troy Deeney (Watford) converts the penalty with a right footed shot to the bottom right corner.
Nearest article sentence: Watford pulled one back 10 minutes later, Troy Deeney converting a penalty after Danny Welbeck had been fouled by David Luiz.

Score: 0.512989176042577
Event: Attempt saved. Pierre-Emerick Aubameyang (Arsenal) right footed shot from the left side of the box is saved in the bottom left corner. Assisted by Eddie Nketiah.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not enough in the race for the Golden Boot.

Score: 0.512989176042577
Event: Attempt blocked. Pierre-Emerick Aubameyang (Arsenal) right footed shot from outside the box is blocked. Assisted by Nicolas Pépé.
Nearest article sentence: Watford were relegated from the Premier League after losing 3-2 to Arsenal as Pierre-Emerick Aubameyang's brace was not

### WMD

WMD (Word Movers Distance) es una distancia basada en la representación de palabras usando Word Embeddings. La principal ventaja de usar word embeddings
es que la distancia puede ser pequeña, aunque no haya palabras en común (sinónimos)

Modelos disponibles en gensim:

__EJECUTAR NUEVO PARTIDO__

In [9]:
import gensim.downloader as api

In [10]:
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [11]:
api.info()['models']['word2vec-google-news-300']

{'num_records': 3000000,
 'file_size': 1743563840,
 'base_dataset': 'Google News (about 100 billion words)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py',
 'license': 'not found',
 'parameters': {'dimension': 300},
 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).",
 'read_more': ['https://code.google.com/archive/p/word2vec/',
  'https://arxiv.org/abs/1301.3781',
  'https://arxiv.org/abs/1310.4546',
  'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvec

In [61]:
metric_params = {'norm': True}

In [63]:
ltr_metrics = LTRTargets(metric = 'wmd', metric_params=metric_params, lemma=True, drop_teams=True)

Setting target metric to wmd


In [None]:
# Esto a veces casca o tarda mucho
event_article_list = ltr_metrics.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams)

In [135]:
ltr_metrics.print_scores_info(match_dict, event_article_list, reverse=False)

Score: 0.13826085867936916
Event: Substitution, Arsenal. Reiss Nelson replaces Nicolas Pépé.
Nearest article sentence: signing Nicholas Pepe failed to last the course, being substituted for Reiss Nelson after 72 minutes.

Score: 0.16723420581333923
Event: Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.
Nearest article sentence: Goalkeeper Bernd Leno excelled for Arsenal with fine saves from Maguire and Marcus Rashford's late free-kick, while Bukayo Saka's goalbound shot crucially struck Victor Lindelof and flew over the top.

Score: 0.17164765828453452
Event: Attempt missed. Scott McTominay (Manchester United) header from very close range is just a bit too high. Assisted by Ashley Young with a cross following a corner.
Nearest article sentence: There was no shortage of effort but this was a scrappy mess of a gam

## Features

Una vez estudiados las distintas distancias o métricas que se pueden usar como target, se pasa a construir las features del modelo.
Siguiendo el artículo, se pueden incorporar las siguientes:

- Posición del evento
- Longitud del evento (después de quitar stopwords)
- Número de stopwords
- Suma de pesos TF-IDF de cada palabra en el evento
- Similiaridad a eventos vecinos
- Presencia de palabras que indican eventos importantes: goles, tarjetas, var...
- Cambios en el resultado: 0/1 en función de si ha habido cambios (equivaldría a tener la palabra gol...)
- Si el cambio sirve de empate o para poner a alguien por delante
- Parte de la que se encuentra cada evento (puede equivaler a posición del evento...)
- Número de jugadores que aparecen en el evento
- Identificación de jugadores importantes, que aparecen mucho en el partido

In [47]:
key_events = ['goal', 'red_card', 'penalty']

In [None]:
ltr_features = LTRFeatures(key_events)

In [78]:
ltr_features.processor.league_season_teams = league_season_teams

In [81]:
goal_event = [e for e in events if 'goal' in e.lower()][0]

In [82]:
goal_event

'Goal!   Arsenal 1, Watford 0. Pierre-Emerick Aubameyang (Arsenal) converts the penalty with a right footed shot to the bottom left corner.'

In [83]:
count_vec_kwargs = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

In [84]:
tfidf_dict = ltr_features._match_level_features(events, **count_vec_kwargs)

In [90]:
features_dict = ltr_features.create_features(match_dict, league_season_teams, **count_vec_kwargs)

In [91]:
len(features_dict['players_importance'])

105

In [92]:
features = ltr_features.get_features_pandas(match_dict, league_season_teams, **count_vec_kwargs)

In [93]:
features

Unnamed: 0,length,n_stop,is_key_event,n_players,players_importance,advantage,equalize,position,tfidf_sum,sim_previous_1,sim_previous_3,sim_previous_5
0,8,5,0,1,0.038462,0,0,0.009524,3.569398,0.000000,0.000000,0.000000
1,7,3,0,1,0.023077,0,0,0.019048,3.313886,0.439070,0.000000,0.000000
2,4,0,0,0,0.000000,0,0,0.028571,2.557825,0.277217,0.000000,0.000000
3,14,6,1,1,0.030769,1,0,0.038095,4.965324,0.088368,0.096671,0.000000
4,16,10,0,2,0.123077,0,0,0.047619,5.107512,0.159605,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
100,13,6,0,2,0.069231,0,0,0.961905,4.678774,0.000000,0.015249,0.000000
101,4,1,0,0,0.000000,0,0,0.971429,2.525825,0.016328,0.057778,0.021915
102,8,3,0,1,0.015385,0,0,0.980952,3.724435,0.000000,0.012434,0.260072
103,18,12,0,2,0.076923,0,0,0.990476,5.744472,0.007046,0.077170,0.011431


In [94]:
len(events)

105

## Features: tf/tfidf

Adicionalmente a estas features comentadas, podemos hacer algo más
automático, como la generación de un tf (o tfidf) que sirva como
vector de características.

In [12]:
mode = 'tfidf'
count_vec_kwargs = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}
drop_teams = True
lemma = True

In [13]:
ltr_features_tf = LTRFeaturesTF(mode, count_vec_kwargs, drop_teams,
                               lemma)

Setting mode to tfidf


In [14]:
ltr_features_tf.run_all_matches()

Finished processing events
Training tfidf
Saving to /home/carlos/MasterDS/tfm/data/ltr/features/69b3cc6a31/features.pickle


In [7]:
processed_events_list = ltr_features_tf._all_events_list()

In [8]:
type(processed_events_list)

list

In [9]:
processed_events_list[0]

'attempt block ilkay gundogan footed shot outside box block assist bernardo silva'

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [12]:
tf = CountVectorizer(**count_vec_kwargs)

In [13]:
x = tf.fit_transform(processed_events_list)

In [14]:
x

<423800x69252 sparse matrix of type '<class 'numpy.int64'>'
	with 5208446 stored elements in Compressed Sparse Row format>

In [20]:
pipe = ltr_features_tf._choose_pipeline()

In [21]:
x = pipe.fit_transform(processed_events_list)

In [22]:
x

<423800x69252 sparse matrix of type '<class 'numpy.float64'>'
	with 5208446 stored elements in Compressed Sparse Row format>

## Features + targets

In [5]:
key_events = ['goal', 'red_card', 'penalty']
lags = [1, 3, 5]
target_metric = 'rouge'
drop_teams = True
lemma = True
metric_params = {'rouge_mode': 'rouge-1', 'rouge_metric': 'r'}
#metric_params = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}
count_vec_kwargs = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

train_perc = 0.7
val_perc = 0.2

In [6]:
processor = ArticleTextProcessor(drop_teams=drop_teams, lemma=lemma)

#### Features

In [7]:
features = LTRFeatures(key_events=key_events, lags=lags, processor=processor,
                                    count_vec_kwargs=count_vec_kwargs)
targets = LTRTargets(metric=target_metric, metric_params=metric_params, processor=processor)

Setting target metric to rouge


In [8]:
features.run_all_matches()

0it [00:00, ?it/s]

4523 matches have already been processed
Updated all_files
Results path in /home/carlos/MasterDS/tfm/data/ltr/features/b8bcd377c1/features.csv





#### Targets

In [15]:
targets.run_all_matches()

0it [00:00, ?it/s]

4523 matches have already been processed
Updated all_files
Results path in /home/carlos/MasterDS/tfm/data/ltr/targets/c868aa4c6d/targets.csv





#### Todo

In [16]:
ltr = LTRFeaturesTargets(target_metric=target_metric, 
                        key_events=key_events,
                        lags=lags,
                        metric_params=metric_params,
                        count_vec_kwargs=count_vec_kwargs,
                        drop_teams=drop_teams,
                        lemma=lemma)

Setting target metric to rouge


In [17]:
ltr.run_target_features()

Reading targets from /home/carlos/MasterDS/tfm/data/csv/summaries/ltr/targets/c868aa4c6d/targets.csv
Reading features from /home/carlos/MasterDS/tfm/data/csv/summaries/ltr/features/b8bcd377c1/features.csv
Writing to /home/carlos/MasterDS/tfm/data/csv/summaries/ltr/features_targets/341d2aa93d/features_targets.csv


In [13]:
ltr.file_path

'/home/carlos/MasterDS/tfm/data/csv/summaries/ltr/features_targets/c4f2c5790f/features_targets.csv'

## TF Features + targets

In [9]:
mode = 'tfidf'
count_vec_kwargs = {'strip_accents': 'unicode'}
target_metric = 'cosine_tfidf'
drop_teams = True
lemma = True
#metric_params = {'rouge_mode': 'rouge-1', 'rouge_metric': 'r'}
metric_params = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

train_perc = 0.7
val_perc = 0.2

In [6]:
processor = ArticleTextProcessor(drop_teams=drop_teams, lemma=lemma)

#### Features

In [7]:
features = LTRFeaturesTF(mode=mode, count_vec_kwargs=count_vec_kwargs, lemma=lemma,
                         drop_teams=drop_teams, processor=processor)
targets = LTRTargets(metric=target_metric, metric_params=metric_params, processor=processor)

Setting mode to tfidf
Setting target metric to rouge


In [97]:
features.run_all_matches()

Writing config in /home/carlos/MasterDS/tfm/data/ltr/features/66b1bb47b2/config.pickle
Finished processing events
Training tfidf
Saving to /home/carlos/MasterDS/tfm/data/ltr/features/66b1bb47b2/features.pickle


In [24]:
x = features.get_features()

Reading features from /home/carlos/MasterDS/tfm/data/ltr/features/69b3cc6a31/features.pickle


In [25]:
type(x)

scipy.sparse.csr.csr_matrix

In [84]:
x.shape

(423800, 69252)

#### Targets

In [20]:
targets.run_all_matches()

0it [00:00, ?it/s]

4523 matches have already been processed
Updated all_files
Results path in /home/carlos/MasterDS/tfm/data/ltr/targets/c868aa4c6d/targets.csv





In [28]:
targets_df = targets.get_targets()

Reading targets from /home/carlos/MasterDS/tfm/data/ltr/targets/c868aa4c6d/targets.csv


In [22]:
targets_df

Unnamed: 0,event_ix,sentence_ix,score,url,json_file
0,0,4,0.153846,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
1,1,3,0.181818,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
2,2,2,0.222222,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
3,3,0,0.000000,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
4,4,5,0.200000,http://www.premierleague.com/match/38678,premier_league_2018_2019.json
...,...,...,...,...,...
423795,107,0,0.000000,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423796,108,3,0.133333,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423797,109,7,0.181818,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json
423798,110,1,0.090909,https://www.espn.com/soccer/report?gameId=522804,italian_serie_a_2018_2019.json


In [29]:
y = targets_df['score'].values

In [30]:
type(y)

numpy.ndarray

In [86]:
y.shape

(423800,)

#### TODO

In [10]:
ltr = LTRFeaturesTargetsTF(target_metric=target_metric, 
                            metric_params=metric_params,
                            count_vec_kwargs=count_vec_kwargs,
                            mode=mode,
                            drop_teams=drop_teams,
                            lemma=lemma,
                            train_perc=train_perc,
                            val_perc=val_perc)

Setting target metric to cosine_tfidf
Setting mode to tfidf


In [11]:
ltr.run_target_features()

Writing config in /home/carlos/MasterDS/tfm/data/ltr/features_targets/11aeeba295/config.pickle
Reading features from /home/carlos/MasterDS/tfm/data/ltr/features/66b1bb47b2/features.pickle
Reading targets from /home/carlos/MasterDS/tfm/data/ltr/targets/de5e71ba7d/targets.csv
Saving datasets in /home/carlos/MasterDS/tfm/data/ltr/features_targets/11aeeba295
