In [1]:
import os

# Root of the project checkout. The original author's location is kept as the default,
# but it can be overridden with the TFM_MAIN_PATH environment variable so the notebook
# is not tied to a single machine. An empty value falls back to the default.
MAIN_PATH = os.environ.get("TFM_MAIN_PATH") or "/home/carlos/MasterDS/tfm"
# Data directories derived from MAIN_PATH (the trailing slash is part of the value).
JSON_DATA_PATH = '{}/data/json/'.format(MAIN_PATH)
CSV_DATA_PATH = '{}/data/csv/'.format(MAIN_PATH)

In [2]:
import sys
sys.path.insert(0, MAIN_PATH)

In [3]:
%load_ext autoreload
%autoreload 2
from scripts.text.article_text_processor import ArticleTextProcessor
from scripts.text.basic_text_processor import BasicTextProcessor
from scripts.extractive_summary.ltr.learn_to_rank import LearnToRank
from scripts.extractive_summary.ltr.ltr_features import LTRFeatures

from scripts.conf import TEAMS

from rouge import Rouge

%reload_ext autoreload

# Learn to Rank

In [4]:
processor = ArticleTextProcessor()
text_proc = BasicTextProcessor()

In [5]:
all_files = processor.load_json()

In [6]:
# Season file to analyse; the part of its name before the first dot is the key
# used to look up that season's entry in TEAMS.
season_file = 'premier_league_2019_2020.json'
season_key = season_file.partition('.')[0]
league_season_teams = TEAMS[season_key]

In [7]:
match_dict = all_files[season_file]['https://www.bbc.com/sport/football/49791610']
events = match_dict['events']

In [136]:
events

['Foul by Matteo Guendouzi (Arsenal).',
 'Scott McTominay (Manchester United) wins a free kick in the defensive half.',
 'Granit Xhaka (Arsenal) wins a free kick in the defensive half.',
 'Foul by Jesse Lingard (Manchester United).',
 'Corner,  Arsenal. Conceded by Axel Tuanzebe.',
 'Hand ball by Bukayo Saka (Arsenal).',
 'Corner,  Manchester United. Conceded by Calum Chambers.',
 'Foul by Calum Chambers (Arsenal).',
 'Daniel James (Manchester United) wins a free kick on the left wing.',
 'Calum Chambers (Arsenal) is shown the yellow card for a bad foul.',
 'Offside, Manchester United. Ashley Young tries a through ball, but Harry Maguire is caught offside.',
 'Hand ball by Andreas Pereira (Manchester United).',
 'Offside, Arsenal. Bernd Leno tries a through ball, but Bukayo Saka is caught offside.',
 'Pierre-Emerick Aubameyang (Arsenal) wins a free kick on the left wing.',
 'Foul by Paul Pogba (Manchester United).',
 'Bukayo Saka (Arsenal) wins a free kick in the attacking half.',
 'Fo

## ROUGE

Utilizaremos esta métrica para asignar un score a cada par evento-frase artículo, para generar un target que indique qué evento tiene 
más opciones de aparecer en el resumen. Con este target se entrenará un modelo Learning to rank, de tal forma que se pueda construir un resumen 
con el conjunto de eventos más representativo de cada partido. Inspirado en [link](https://www.aclweb.org/anthology/P16-1129.pdf)

Probamos los tipos de ROUGE disponibles en el paquete: ROUGE-1, 2 y ROUGE-L. También probamos a usar f1 score y recall como score, ya que hemos visto
con los anteriores experimentos que el las palabras de los eventos.

Vemos que esto puede tener varios problemas:

- Al ser los eventos por lo general mucho más largos que los resúmenes, es probable que la información resultante sea redundante
- Los estilos y palabras usados son bastante distintos en los eventos (más simples y menor vocabulario) que en los artículos (uso de frases más
complejas y una mayor variedad en el vocabulario)

In [59]:
ltr = LearnToRank(lemma=True, drop_teams=True)

Setting target metric to rouge


__ROUGE-1__

In [60]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-1')

In [61]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.6666666616666668
Event: Foul by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5714285665306124
Event: Hand ball by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.49999999555555563
Event: Substitution, Arsenal. Reiss Nelson replaces Nicolas Pépé.
Nearest article sentence: signing Nicholas Pepe failed to last the course, being substituted for Reiss Nelson after 72 minutes.

Score: 0.49999999531250006
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.4285714236734694
Event: VAR Decision: Goal  Manchester United 1-1 Arsenal (Pierre-Emerick Aubameyang).
Nearest article sentence: Arsenal equalised just before the hour courtesy of Pierre-Emerick Aubameyang's cool finish.

Score: 0.3999999958
Event: Bukayo Saka (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Bukayo [Saka] is progress

In [64]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-1', rouge_metric='r')

En este ejemplo se ve muy bien uno de los problemas. ¡Los 11 primeros eventos se asocian a la misma frase del artículo!

In [65]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.6666666666666666
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Attempt blocked. Bukayo Saka (Arsenal) right footed shot from the centre of the box is blocked. Assisted by Calum Chambers.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Bukayo Saka (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Attempt saved. Bukayo Saka (Arsenal) left footed shot from the centre of the box is saved in the bottom right corner.
Neare

__ROUGE-2__

Al usar esta métrica, las correspondencias que se obtienen no tienen demasiado sentido...

In [57]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-2')

In [58]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.4999999950000001
Event: Foul by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.39999999520000007
Event: Hand ball by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.33333332888888895
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.33333332847222225
Event: VAR Decision: Goal  Manchester United 1-1 Arsenal (Pierre-Emerick Aubameyang).
Nearest article sentence: Arsenal equalised just before the hour courtesy of Pierre-Emerick Aubameyang's cool finish.

Score: 0.2857142807142858
Event: Pierre-Emerick Aubameyang (Arsenal) wins a free kick on the left wing.
Nearest article sentence: Arsenal equalised just before the hour courtesy of Pierre-Emerick Aubameyang's cool finish.

Score: 0.266666661688889
Event: Offside, Arsenal. Joe Willock tries a through ball, but Pierre-Emerick Aubameyang is caught offside.
Near

In [66]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-2', rouge_metric='r')

In [67]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.5
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5
Event: Attempt blocked. Bukayo Saka (Arsenal) right footed shot from the centre of the box is blocked. Assisted by Calum Chambers.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5
Event: Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5
Event: Bukayo Saka (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5
Event: Attempt saved. Bukayo Saka (Arsenal) left footed shot from the centre of the box is saved in the bottom right corner.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5
Event: F

__ROUGE-L__

In [62]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-l')

In [63]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.6666666616666668
Event: Foul by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5714285665306124
Event: Hand ball by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.49999999531250006
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.44444444000000005
Event: Offside, Arsenal. Bukayo Saka tries a through ball, but Sead Kolasinac is caught offside.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.4285714236734694
Event: VAR Decision: Goal  Manchester United 1-1 Arsenal (Pierre-Emerick Aubameyang).
Nearest article sentence: Arsenal equalised just before the hour courtesy of Pierre-Emerick Aubameyang's cool finish.

Score: 0.3999999958
Event: Bukayo Saka (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.3999999958
Event: 

In [68]:
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams, rouge_mode='rouge-l', rouge_metric='r')

In [69]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.6666666666666666
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Attempt blocked. Bukayo Saka (Arsenal) right footed shot from the centre of the box is blocked. Assisted by Calum Chambers.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Bukayo Saka (Arsenal) wins a free kick in the attacking half.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.6666666666666666
Event: Attempt saved. Bukayo Saka (Arsenal) left footed shot from the centre of the box is saved in the bottom right corner.
Neare

## Distancia coseno

Usando esta distancia, empezamos a tener claro que muchas de las correspondencias se deben únicamente a la aparición del nombre de un jugador.
De nuevo vemos cómo muchos eventos se asocian a la misma frase del artículo (sin mucha información) solo porque coincide de pleno con el nombre.

In [128]:
count_vec_kwargs = {'ngram_range': (1, 3), 'strip_accents': 'unicode'}

In [129]:
ltr = LearnToRank(target_metric = 'cosine_tfidf', drop_teams=True, lemma=True)

Setting target metric to cosine_tfidf


In [130]:
event_article_list = ltr.create_match_targets(match_dict, verbose=True, league_season_teams=league_season_teams, **count_vec_kwargs)

Event: Foul by Matteo Guendouzi (Arsenal).
Nearest article sentence: United had the edge on chances, although they were grateful to David de Gea for a fine double save from Saka and Matteo Guendouzi in the first half
Processed event: foul matteo guendouzi
Processed article sentence: edge chance grateful david gea fine double save saka matteo guendouzi half

Event: Scott McTominay (Manchester United) wins a free kick in the defensive half.
Nearest article sentence: In a disappointing encounter that was a pale shadow of their mighty clashes of years gone by, Scott McTominay gave Manchester United the lead at the end of an attritional first-half with a rising drive from the edge of the area.
Processed event: scott mctominay win free kick defensive half
Processed article sentence: disappointing encounter shadow mighty year go scott mctominay give lead end attritional half rise drive edge area

Event: Granit Xhaka (Arsenal) wins a free kick in the defensive half.
Nearest article sentence: M

In [79]:
ltr.print_scores_info(match_dict, event_article_list)

Score: 0.5357070396843301
Event: Substitution, Arsenal. Joe Willock replaces Bukayo Saka.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5357070396843301
Event: Attempt blocked. Bukayo Saka (Arsenal) right footed shot from the centre of the box is blocked. Assisted by Calum Chambers.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5357070396843301
Event: Attempt saved. Bukayo Saka (Arsenal) left footed shot from the centre of the box is saved in the bottom right corner.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5357070396843301
Event: Foul by Bukayo Saka (Arsenal).
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5357070396843301
Event: Offside, Arsenal. Bukayo Saka tries a through ball, but Sead Kolasinac is caught offside.
Nearest article sentence: Bukayo [Saka] is progressing well

Score: 0.5357070396843301
Event: Attempt missed. Nicolas Pépé (Arsenal) left footed shot from outside the box 

__Examinando tfidf__

In [80]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
import pandas as pd

In [85]:
proc_events, proc_article_sents = ltr.metrics._process_events_article(match_dict)

In [86]:
proc_events

['foul matteo guendouzi',
 'scott mctominay win free kick defensive half',
 'granit xhaka win free kick defensive half',
 'foul jesse lingard',
 'corner concede axel tuanzebe',
 'hand ball bukayo saka',
 'corner concede calum chambers',
 'foul calum chambers',
 'daniel james win free kick left wing',
 'calum chambers show yellow card bad foul',
 'offside ashley young try ball harry maguire offside',
 'hand ball andreas pereira',
 'offside bernd leno try ball bukayo saka offside',
 'pierre emerick aubameyang win free kick left wing',
 'foul pogba',
 'bukayo saka win free kick attack half',
 'foul jesse lingard',
 'matteo guendouzi win free kick defensive half',
 'foul andreas pereira',
 'bukayo saka win free kick left wing',
 'foul ashley young',
 'sokratis win free kick defensive half',
 'foul marcus rashford',
 'marcus rashford show yellow card bad foul',
 'foul lucas torreira',
 'axel tuanzebe win free kick defensive half',
 'attempt save andreas pereira leave footed shot centre box 

In [87]:
proc_article_sents

['grim stalemate old trafford provide evidence illustrate far away premier league challenge club',
 'disappointing encounter shadow mighty year go scott mctominay give lead end attritional half rise drive edge area',
 'equalise hour courtesy pierre emerick aubameyang cool finish',
 'originally rule offside video assistant referee confirm striker onside harry maguire',
 'goalkeeper bernd leno excel fine save maguire marcus rashford free kick bukayo saka goalbound shot crucially strike victor lindelof fly',
 'mctominay head great chance ashley young corner force winner',
 'result take',
 'mean bad start seven game year fail reach double figure time season finish',
 'current circumstance club instantly demonstrate captain lead side',
 'captain old ashley young press defensive service aaron bissaka absence injury lead granit xhaka newly appoint divisive figure supporter',
 'young lead fashion contest decision show remarkable knack get referee kevin friend instant numerous occasion',
 'xhak

In [88]:
len(proc_article_sents)

37

In [89]:
len(proc_events)

119

In [90]:
# Two-step text pipeline: n-gram counts (configured by count_vec_kwargs) followed by
# TF-IDF weighting. The 'count' step is accessed by name elsewhere in the notebook.
pipeline_steps = [
    ('count', CountVectorizer(**count_vec_kwargs)),
    ('tfid', TfidfTransformer()),
]
pipe = Pipeline(pipeline_steps)

In [91]:
X = pipe.fit_transform(proc_article_sents)

In [92]:
# Dense TF-IDF matrix of the article sentences, one column per n-gram in the fitted vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it and fall back to the old method otherwise.
article_vectorizer = pipe['count']
article_feature_names = (
    getattr(article_vectorizer, 'get_feature_names_out', None)
    or article_vectorizer.get_feature_names
)()
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
tfidf_df = pd.DataFrame(X.toarray(), columns=article_feature_names)

In [94]:
article_sentences = ltr.metrics.text_proc.get_sentences(match_dict['article'])
article_sentences_text = [str(sent).replace('\n', '') for sent in article_sentences]

In [110]:
article_sentences_text[2]

"Arsenal equalised just before the hour courtesy of Pierre-Emerick Aubameyang's cool finish."

In [111]:
proc_article_sents[2]

'equalise hour courtesy pierre emerick aubameyang cool finish'

In [112]:
pd_df_sent = tfidf_df.loc[2]

In [113]:
pd_df_sent[pd_df_sent>0].sort_values(ascending=False)

pierre emerick aubameyang    0.27735
pierre emerick               0.27735
hour courtesy pierre         0.27735
hour courtesy                0.27735
equalise hour courtesy       0.27735
equalise hour                0.27735
emerick aubameyang cool      0.27735
emerick aubameyang           0.27735
courtesy pierre emerick      0.27735
courtesy pierre              0.27735
cool finish                  0.27735
aubameyang cool finish       0.27735
aubameyang cool              0.27735
Name: 2, dtype: float64

In [117]:
events[59]

'Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.'

In [118]:
proc_events[59]

'goal pierre emerick aubameyang leave footed shot centre box centre goal assist bukayo saka ball goal award follow var review'

In [119]:
X_events = pipe.transform(proc_events)

In [120]:
# Dense TF-IDF matrix of the events, expressed in the vocabulary fitted on the article sentences.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when the
# installed version provides it and fall back to the old method otherwise.
events_vectorizer = pipe['count']
events_feature_names = (
    getattr(events_vectorizer, 'get_feature_names_out', None)
    or events_vectorizer.get_feature_names
)()
# toarray() yields a plain ndarray instead of the np.matrix returned by todense().
tfidf_events_df = pd.DataFrame(X_events.toarray(), columns=events_feature_names)

In [105]:
tfidf_events_df

Unnamed: 0,aaron bissaka,aaron bissaka absence,absence injury,absence injury lead,action mctominay,action mctominay drive,action performance,action performance litter,appeal handball,appeal handball sead,...,year fail reach,year go,year go scott,young confidence,young corner,young corner force,young lead,young lead fashion,young press,young press defensive
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
pd_df_sent_event = tfidf_events_df.loc[59]

In [122]:
pd_df_sent_event[pd_df_sent_event>0].sort_values(ascending=False)

pierre emerick aubameyang    0.512653
pierre emerick               0.512653
emerick aubameyang           0.512653
bukayo saka                  0.459956
Name: 59, dtype: float64

# WMD

WMD (Word Mover's Distance) es una distancia basada en la representación de palabras usando Word Embeddings. La principal ventaja de usar word embeddings
es que la distancia puede ser pequeña aunque no haya palabras en común (por ejemplo, con sinónimos).

Modelos disponibles en gensim:

In [4]:
import gensim.downloader as api

In [10]:
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [13]:
api.info()['models']['word2vec-google-news-300']

{'num_records': 3000000,
 'file_size': 1743563840,
 'base_dataset': 'Google News (about 100 billion words)',
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/word2vec-google-news-300/__init__.py',
 'license': 'not found',
 'parameters': {'dimension': 300},
 'description': "Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).",
 'read_more': ['https://code.google.com/archive/p/word2vec/',
  'https://arxiv.org/abs/1301.3781',
  'https://arxiv.org/abs/1310.4546',
  'https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvec

In [132]:
ltr = LearnToRank(target_metric = 'wmd', lemma=True, drop_teams=True)

Setting target metric to wmd


In [134]:
# This call sometimes crashes or takes a long time
event_article_list = ltr.create_match_targets(match_dict, verbose=False, league_season_teams=league_season_teams)

In [135]:
ltr.print_scores_info(match_dict, event_article_list, reverse=False)

Score: 0.13826085867936916
Event: Substitution, Arsenal. Reiss Nelson replaces Nicolas Pépé.
Nearest article sentence: signing Nicholas Pepe failed to last the course, being substituted for Reiss Nelson after 72 minutes.

Score: 0.16723420581333923
Event: Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.
Nearest article sentence: Goalkeeper Bernd Leno excelled for Arsenal with fine saves from Maguire and Marcus Rashford's late free-kick, while Bukayo Saka's goalbound shot crucially struck Victor Lindelof and flew over the top.

Score: 0.17164765828453452
Event: Attempt missed. Scott McTominay (Manchester United) header from very close range is just a bit too high. Assisted by Ashley Young with a cross following a corner.
Nearest article sentence: There was no shortage of effort but this was a scrappy mess of a gam

# Features

Una vez estudiadas las distintas distancias o métricas que se pueden usar como target, se pasa a construir las features del modelo.
Siguiendo el artículo, se pueden incorporar las siguientes:

- Posición del evento
- Longitud del evento (después de quitar stopwords)
- Número de stopwords
- Suma de pesos TF-IDF de cada palabra en el evento
- Similitud con eventos vecinos
- Presencia de palabras que indican eventos importantes: goles, tarjetas, var...
- Cambios en el resultado: 0/1 en función de si ha habido cambios (equivaldría a tener la palabra gol...)
- Si el cambio sirve de empate o para poner a alguien por delante
- Parte en la que se encuentra cada evento (puede equivaler a posición del evento...)
- Número de jugadores que aparecen en el evento
- Identificación de jugadores importantes, que aparecen mucho en el partido

In [8]:
key_events = ['goal', 'red_card', 'penalty']

In [9]:
ltr_features = LTRFeatures(key_events)

In [10]:
ltr_features.processor.league_season_teams = league_season_teams

In [11]:
# Keep the events whose lower-cased text contains the substring 'goal' and pick the
# second match (index 1); raises IndexError if fewer than two events match.
# NOTE(review): the substring test also matches events that merely mention "goal"
# (e.g. shot descriptions), so the positional pick depends on event order — confirm
# that index 1 is the intended event.
goal_event = [e for e in events if 'goal' in e.lower()][1]

In [12]:
goal_event

'Goal!  Manchester United 1, Arsenal 1. Pierre-Emerick Aubameyang (Arsenal) left footed shot from the centre of the box to the centre of the goal. Assisted by Bukayo Saka with a through ball.Goal awarded following VAR Review.'

In [14]:
count_vec_kwargs = {'ngram_range': (1, 2), 'strip_accents': 'unicode'}

In [15]:
tfidf_dict = ltr_features._match_level_features(events, **count_vec_kwargs)

In [104]:
x = tfidf_dict['data']
lags = [1, 3, 5]

In [105]:
sim_mat = ltr_features._lag_similarities(x, lags)

In [106]:
sim_mat.shape

(119, 3)

In [138]:
# numpy is not imported by any other visible cell, so import it here; without this the
# cell fails with NameError on a fresh kernel.
import numpy as np

# Sum of the values in the first row of x (row-wise sum, then first entry).
np.sum(x, axis=1)[0]

2.5477822735685622

In [112]:
processed_events = [' '.join(ltr_features.key_events_sum.process_match_text(event)) for event in events]

In [211]:
features_dict = ltr_features.create_features(match_dict, league_season_teams, **count_vec_kwargs)

In [213]:
len(features_dict['players_importance'])

118

In [16]:
features = ltr_features.get_features_pandas(match_dict, league_season_teams, **count_vec_kwargs)

In [17]:
features

Unnamed: 0,length,n_stop,is_key_event,n_players,players_importance,advantage,equalize,position,tfidf_sum,sim_previous_1,sim_previous_3,sim_previous_5
0,4,1,0,1,0.084034,0,0,0.008403,2.547782,0.000000,0.000000,0.000000
1,9,3,0,1,0.084034,0,0,0.016807,4.014484,0.000000,0.000000,0.000000
2,8,3,0,1,0.042017,0,0,0.025210,3.759987,0.470642,0.000000,0.000000
3,5,1,0,1,0.050420,0,0,0.033613,2.804595,0.000000,0.058789,0.000000
4,5,1,0,1,0.058824,0,0,0.042017,2.929353,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
114,15,5,0,1,0.016807,0,0,0.966387,5.168148,0.216441,0.035032,0.301559
115,8,3,0,1,0.033613,0,0,0.974790,3.735468,0.034824,0.000000,0.022269
116,5,1,0,0,0.000000,0,0,0.983193,2.768652,0.000000,0.000000,0.062422
117,8,3,0,1,0.025210,0,0,0.991597,3.706503,0.000000,0.028283,0.060310


In [209]:
len(events)

119

In [210]:
events

['Foul by Matteo Guendouzi (Arsenal).',
 'Scott McTominay (Manchester United) wins a free kick in the defensive half.',
 'Granit Xhaka (Arsenal) wins a free kick in the defensive half.',
 'Foul by Jesse Lingard (Manchester United).',
 'Corner,  Arsenal. Conceded by Axel Tuanzebe.',
 'Hand ball by Bukayo Saka (Arsenal).',
 'Corner,  Manchester United. Conceded by Calum Chambers.',
 'Foul by Calum Chambers (Arsenal).',
 'Daniel James (Manchester United) wins a free kick on the left wing.',
 'Calum Chambers (Arsenal) is shown the yellow card for a bad foul.',
 'Offside, Manchester United. Ashley Young tries a through ball, but Harry Maguire is caught offside.',
 'Hand ball by Andreas Pereira (Manchester United).',
 'Offside, Arsenal. Bernd Leno tries a through ball, but Bukayo Saka is caught offside.',
 'Pierre-Emerick Aubameyang (Arsenal) wins a free kick on the left wing.',
 'Foul by Paul Pogba (Manchester United).',
 'Bukayo Saka (Arsenal) wins a free kick in the attacking half.',
 'Fo