In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time

import gensim
from gensim.models.word2vec import Word2Vec

from sklearn.manifold import TSNE

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

import alignment_data    as ad
import feature_extractor as fe
import player_sequential as ps



# Location of the parsed event log, relative to this notebook.
PARSED_MATCHES_CSV = '../../French Ligue One 20162017 season - Match Day 1- 19/parsed_matches.csv'

# low_memory=False makes pandas read the file in one pass instead of chunked
# type inference, which avoids mixed-dtype columns.
df = pd.read_csv(PARSED_MATCHES_CSV, low_memory=False)

In [2]:
# Build, for every event row, a textual description ("desc") made of the names
# of its active feature columns, and a hash of that description ("hash").
t0 = time.time()

# NOTE(review): `df` is rebound to its own transformed version, so re-running
# this cell feeds an already-aligned frame back into align_events.
# Confirm that is harmless, or reload the CSV before re-running.
df = ad.align_events(df)
# presumably one row per event with indicator-style feature columns — TODO confirm
res = fe.extract_features(df)
# Column position -> column name, used to translate non-zero positions into names.
id2cols = dict(enumerate(res.columns))
# np.nonzero returns a 1-tuple holding the array of non-zero positions for a row;
# the builtin sum() over that tuple (0 + array) collapses it to the array itself.
hashes = res.apply(np.nonzero, axis=1).apply(sum)
# Description = alphabetically sorted names of the non-zero columns, space-separated.
hashes = hashes.apply(lambda x: " ".join(sorted([id2cols[k] for k in x])))
# Assigned as a Series, i.e. aligned on index with `df`.
df['desc'] = hashes
# Assigned as a plain list, i.e. by position rather than by index.
# NOTE(review): assumes `res` has the same row order (and index) as `df` — confirm.
df['hash'] = hashes.apply(ps.get_hash).tolist()

# Elapsed wall-clock time formatted as MMmSSs.
print("Done in {:02d}m{:02d}s".format(*divmod(int(time.time() - t0), 60)))

Done in 00m48s


In [3]:
# Lookup tables keyed by event hash: textual description and event type id.
# Built with zip over the de-duplicated columns instead of DataFrame.iterrows(),
# which creates a Series per row (slow) and can coerce values to a common dtype.
# If a hash maps to several values, the last one wins, as before.
hash_desc_pairs = df[['hash', 'desc']].drop_duplicates()
hash2text = dict(zip(hash_desc_pairs['hash'], hash_desc_pairs['desc']))
hash_type_pairs = df[['hash', 'type_id']].drop_duplicates()
hash2event = dict(zip(hash_type_pairs['hash'], hash_type_pairs['type_id']))

# One "document" per (match_id, period_id, SPLIT) group: the group's event
# hashes joined by single spaces, in row order.
docs = df.groupby(['match_id', 'period_id', "SPLIT"]).agg({"hash": lambda x : " ".join(x)}).hash.tolist()
# Keep only documents with at least two tokens
# (presumably because a single event offers no context to learn from).
docs = [d for d in docs if len(d.split())>1]

# Token counts are computed once and reused for the summary below.
doc_lengths = [len(d.split()) for d in docs]

print("%d docs" % len(docs))
# Guard the empty case: np.mean([]) is NaN and "%d" % NaN raises ValueError.
print("Average length of doc : %d" % (np.mean(doc_lengths) if doc_lengths else 0))

17689 docs
Average length of doc : 18


In [4]:
# Tokenise the corpus once: each document is a space-separated run of event hashes.
sentences = [doc.split() for doc in docs]

# Skip-gram model (sg=1) with 200-dimensional vectors and a context window of 5.
# min_count=0 keeps every hash in the vocabulary, however rare.
# NOTE: `size` and `iter` are the pre-4.0 gensim keyword names.
ev2vec = Word2Vec(size=200, window=5, min_count=0, sg=1, iter=15)
ev2vec.build_vocab(sentences)
ev2vec.train(
    sentences,
    total_examples=ev2vec.corpus_count,
    epochs=ev2vec.epochs,
)
print('Number of words : ', len(ev2vec.wv.vocab))

Number of words :  1182


In [5]:
# One embedding per vocabulary entry, in the vocab mapping's iteration order.
vocab_words = list(ev2vec.wv.vocab.keys())
event_vectors = [ev2vec.wv[word] for word in vocab_words]

# Project the embeddings onto 2-D with t-SNE; random_state is fixed so the
# layout is repeatable for the same input.
tsne_params = dict(n_components=2, random_state=0, n_iter=10000, perplexity=3, verbose=True)
tsne_model = TSNE(**tsne_params)
tsne_ev2v = tsne_model.fit_transform(event_vectors)

[t-SNE] Computing 10 nearest neighbors...
[t-SNE] Indexed 1182 samples in 0.017s...


  """Entry point for launching an IPython kernel.


[t-SNE] Computed neighbors for 1182 samples in 0.394s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1182
[t-SNE] Computed conditional probabilities for sample 1182 / 1182
[t-SNE] Mean sigma: 0.092232
[t-SNE] KL divergence after 250 iterations with early exaggeration: 86.418266
[t-SNE] Error after 10000 iterations: 1.133914


In [9]:
# Interactive scatter plot of the 2-D t-SNE projection; hovering over a point
# shows the textual description of the corresponding event.
output_notebook()

# Materialise the vocabulary once so the title count and the labels below are
# guaranteed to come from the same sequence.
vocab_words = list(ev2vec.wv.vocab.keys())

plot_vecs = bp.figure(plot_width=700, plot_height=600, title="A map of %d word vectors" % len(vocab_words),
    # "save" replaces the legacy "previewsave" alias, which newer Bokeh
    # releases no longer recognise as a tool name.
    tools="pan,wheel_zoom,box_zoom,reset,hover,save",
    x_axis_type=None, y_axis_type=None, min_border=1)

tsne_df = pd.DataFrame(tsne_ev2v, columns=['x', 'y'])
# Row i of the projection belongs to the i-th vocabulary word: this relies on
# the vocab iterating in the same order as when the vectors were collected.
tsne_df['event'] = [hash2text[word] for word in vocab_words]

plot_vecs.scatter(x='x', y='y', size=8, source=tsne_df)
# Configure the hover tool created via the `tools` string above.
hover = plot_vecs.select(dict(type=HoverTool))
hover.tooltips = {"event": "@event"}
show(plot_vecs)

In [None]:
# Optional export: uncomment to write the trained event vectors to disk in word2vec text format.
#ev2vec.wv.save_word2vec_format(fname="_event_vectors_200dim_15epochs.txt")