In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap.umap_ as umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
# Notebook-wide pandas display limits: render up to 1000 columns and 3000 rows.
pd.set_option('display.max_columns', 1000)
pd.set_option("display.max_rows", 3000)
# Make pandas treat +/-inf as missing values.
# NOTE(review): this option is deprecated in recent pandas releases (2.1+) — confirm the
# pinned pandas version before upgrading.
pd.set_option('use_inf_as_na', True)

In [4]:
# Load the defender feature table from a path relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
def_vectors = pd.read_pickle("../data/defender_clusters/defender_vectors.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '../data/defender_clusters/defender_vectors.pkl'

In [None]:
def_vectors[def_vectors['player_name'].str.contains("Vert")]

In [None]:
def_vectors = def_vectors[def_vectors['progaccpass'] + def_vectors['proginaccpass'] > 20]

In [None]:
def_vectors.reset_index(inplace=True)
def_vectors.drop(['index'],inplace=True,axis=1)

In [None]:
def_vectors['progaccpass_pm']=def_vectors['progaccpass']/def_vectors['matches_played']
def_vectors['proginaccpass_pm']=def_vectors['proginaccpass']/def_vectors['matches_played']

In [None]:
cols = ['player_name','team','position','footedness','progaccpass_pm','proginaccpass_pm','progpreference_per_region','progaccuracy_per_region','total_off_region_per_pass','off_avgcontri','off_val_opp_avg']
def_vectors_fil = def_vectors[cols]

In [None]:
def_vectors_foot_sep = pd.get_dummies(def_vectors_fil,prefix=['foot'],columns=['footedness'])

In [None]:
def_vectors_foot_sep['position'].value_counts()

In [None]:
def ind_cols(df):
    """Expand the per-region list columns of ``df`` into one scalar column per region.

    Each of the four source columns holds one list per row. For the preference, accuracy
    and offensive-value columns only the elements from position 4 onwards are kept; the
    ``off_avgcontri`` lists are used whole. In every case the kept values must number
    exactly four, one per region (LF, LC, RC, RF).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``progpreference_per_region``, ``progaccuracy_per_region``,
        ``total_off_region_per_pass`` and ``off_avgcontri``.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with the four list columns removed and the
        sixteen ``att_<region>_<suffix>`` columns appended in the order
        pref, acc, off, offcontri.
    """
    region_names = ['att_LF', 'att_LC', 'att_RC', 'att_RF']
    # (source column, new-column suffix, slice applied to each row's list).
    # The list order fixes the order of the appended columns.
    specs = [
        ('progpreference_per_region', 'pref', slice(4, None)),
        ('progaccuracy_per_region', 'acc', slice(4, None)),
        ('total_off_region_per_pass', 'off', slice(4, None)),
        ('off_avgcontri', 'offcontri', slice(None)),
    ]
    expanded = [
        pd.DataFrame(
            [values[keep] for values in df[source].tolist()],
            columns=['{}_{}'.format(region, suffix) for region in region_names],
            # Reuse the caller's index: with a default RangeIndex here, concat(axis=1)
            # would align on labels and pad with NaN rows whenever df is not indexed 0..n-1.
            index=df.index,
        )
        for source, suffix, keep in specs
    ]
    combined = pd.concat([df] + expanded, axis=1)
    return combined.drop(columns=[source for source, _, _ in specs])

In [None]:
def_vectors_ind_footsep = ind_cols(def_vectors_foot_sep)

In [None]:
def_vectors_ind_footsep.head()

In [None]:
scaler = StandardScaler()
feat_scaled=scaler.fit_transform(def_vectors_ind_footsep[def_vectors_ind_footsep.columns.difference(['player_name','team','position','off_val_opp_avg','foot_left'])].values)


In [None]:
def_vectors_footsep_scaled = pd.concat([def_vectors_ind_footsep[['player_name','team','position','off_val_opp_avg']],pd.DataFrame(feat_scaled,columns=def_vectors_ind_footsep.columns.difference(['player_name','team','position','off_val_opp_avg','foot_left']))],axis=1)


In [None]:
def_vectors_footsep_scaled.head()

## LCB Clusters

In [None]:
lcb_def_vectors = def_vectors_footsep_scaled[def_vectors_footsep_scaled['position']=='L_CB']

In [None]:
lcb_def_vectors[lcb_def_vectors['player_name'].str.contains("Lap")]

In [None]:
lcb_def_vectors.reset_index(inplace=True)
lcb_def_vectors.drop(['index'],axis=1,inplace=True)

In [None]:
lcb_def_vectors.head()

In [None]:
lcb_def_vectors = lcb_def_vectors.merge(def_vectors[['player_name','team','position','footedness']],on=['player_name','position','team'])


In [None]:
standard_embedding = umap.UMAP(random_state=np.random.RandomState(42)).fit_transform(lcb_def_vectors[lcb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1])

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=4,
    min_dist=0.0,
    n_components=2,
    random_state=np.random.RandomState(42),
).fit_transform(lcb_def_vectors[lcb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)

In [None]:
labels = hdbscan.HDBSCAN(
    min_samples=2,
    min_cluster_size=5,
).fit_predict(clusterable_embedding)

In [None]:
clustered = (labels >= 0)
a0 = (labels==0)
a1 = (labels==1)
a2 = (labels==2)
a3 = (labels==3)

fig1 = plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            alpha=0.5)
fig2 = plt.scatter(standard_embedding[a0, 0],
            standard_embedding[a0, 1],
            #c=labels[a0],
            cmap='Spectral')
fig3 = plt.scatter(standard_embedding[a1, 0],
            standard_embedding[a1, 1],
            #c=labels[a1],
            cmap='Spectral')
fig4 = plt.scatter(standard_embedding[a2, 0],
            standard_embedding[a2, 1],
            #c=labels[a2],
            cmap='Spectral')
fig5 = plt.scatter(standard_embedding[a3, 0],
            standard_embedding[a3, 1],
            #c=labels[a2],
            cmap='Spectral')
plt.legend([fig2, fig3, fig4, fig5], np.unique(labels))
plt.title('LCB',fontsize=12,fontweight='bold')
plt.figtext(0.5,0.01,'Silhouette Score = 0.809',ha='center',fontsize=12)

In [None]:
silhouette_avg = silhouette_score(clusterable_embedding, labels) 

In [None]:
silhouette_avg

In [None]:
lcb_def_vectors['groups'] = labels

In [None]:
lcb_def_vectors[lcb_def_vectors['groups']==2]

In [None]:

from IPython.display import display, HTML
from sklearn.tree import _tree, DecisionTreeClassifier
import pandas as pd

def pretty_print(df):
    """Display ``df`` as an HTML table, turning escaped ``\\n`` sequences into line breaks."""
    table_html = df.to_html()
    # The literal two-character sequence backslash + n is what gets swapped for <br>.
    table_html = table_html.replace("\\n","<br>")
    return display(HTML(table_html))

def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
    """Collect, for every class, the root-to-leaf rules of the leaves that predict it.

    Parameters
    ----------
    tree : fitted DecisionTreeClassifier
        Its ``tree_`` and ``classes_`` attributes are read.
    feature_names : sequence
        Indexed by the feature ids stored in the tree to name each split.

    Returns
    -------
    dict
        Maps a class label to a list of ``(rule_string, class_probability)`` tuples, one per
        leaf whose majority class is that label. ``rule_string`` joins the split conditions
        with " and " (or is "ALL" for a root-only tree); ``class_probability`` is the
        majority class's share of the leaf's value vector.
    """
    sk_tree: _tree.Tree = tree.tree_
    class_labels = tree.classes_
    rules_by_class = dict()

    # Depth-first walk with an explicit stack. The right child is pushed before the left one
    # so the left subtree is popped (visited) first, i.e. leaves come out left to right.
    pending = [(0, [])]
    while pending:
        node, conditions = pending.pop()
        feature_idx = sk_tree.feature[node]

        if feature_idx == _tree.TREE_UNDEFINED:
            # Leaf: normalise the value vector and keep the winning class.
            proportions = sk_tree.value[node][0]
            proportions = proportions / proportions.sum()
            winner = proportions.argmax()
            rule_text = " and ".join(conditions) if len(conditions) > 0 else "ALL"
            rules_by_class.setdefault(class_labels[winner], []).append(
                (rule_text, proportions[winner])
            )
            continue

        # Internal node: extend the path with the condition for each branch.
        feature = feature_names[feature_idx]
        cut = sk_tree.threshold[node]
        pending.append(
            (sk_tree.children_right[node], conditions + ["({} > {})".format(feature, cut)])
        )
        pending.append(
            (sk_tree.children_left[node], conditions + ["({} <= {})".format(feature, cut)])
        )

    return rules_by_class

def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    """Fit a decision tree that predicts ``clusters`` from ``data`` and display its rules.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; its column names are used in the rule text.
    clusters : array-like
        Cluster label for each row of ``data``.
    min_samples_leaf : int
        Passed to ``DecisionTreeClassifier(min_samples_leaf=...)``.
    pruning_level : float
        Passed to ``DecisionTreeClassifier(ccp_alpha=...)``.

    Displays (via ``pretty_print``) one row per cluster label with its instance count and
    the concatenated "[probability] rule" lines; returns nothing.
    """
    # Surrogate model whose leaves describe the clusters.
    classifier = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
    classifier.fit(data, clusters)

    # One (label, multi-line rule text) row per class that owns at least one leaf.
    rules_by_class = get_class_rules(classifier, data.columns)
    rule_rows = [
        (label, "".join("[{}] {}\n\n".format(prob, text) for text, prob in rules))
        for label, rules in rules_by_class.items()
    ]
    rules_df = pd.DataFrame(rule_rows, columns=['class_name', 'rule_list'])

    # Cluster sizes; the left merge keeps clusters that got no leaf of their own.
    counts_df = pd.Series(clusters).value_counts().reset_index()
    counts_df.columns = ['class_name', 'instance_count']

    merged = pd.merge(counts_df, rules_df, on='class_name', how='left')
    pretty_print(merged.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])

In [None]:
# Describe the current `labels` as decision rules over the two clusterable-embedding
# coordinates (named c0 and c1); when run top to bottom these are the LCB results.
cluster_report(pd.DataFrame(clusterable_embedding,columns=['c0','c1']),labels,min_samples_leaf=2,pruning_level=0.05)

## R_CB Clusters

In [None]:
rcb_def_vectors = def_vectors_footsep_scaled[def_vectors_footsep_scaled['position']=='R_CB']

In [None]:
rcb_def_vectors.reset_index(inplace=True)
rcb_def_vectors.drop(['index'],axis=1,inplace=True)

In [None]:
rcb_def_vectors = rcb_def_vectors.merge(def_vectors[['player_name','team','position','footedness']],on=['player_name','position','team'])


In [None]:
standard_embedding = umap.UMAP(random_state=np.random.RandomState(30)).fit_transform(rcb_def_vectors[rcb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1],s=5)

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=4,
    min_dist=0.0,
    n_components=2,
    random_state=np.random.RandomState(30),
).fit_transform(rcb_def_vectors[rcb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)

In [None]:
labels = hdbscan.HDBSCAN(
    min_samples=2,
    min_cluster_size=5,
).fit_predict(clusterable_embedding)

In [None]:
clustered = (labels >= 0)
a0 = (labels==0)
a1 = (labels==1)
a2 = (labels==2)

fig1 = plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            alpha=0.5)
fig2 = plt.scatter(standard_embedding[a0, 0],
            standard_embedding[a0, 1],
            #c=labels[a0],
            cmap='Spectral')
fig3 = plt.scatter(standard_embedding[a1, 0],
            standard_embedding[a1, 1],
            #c=labels[a1],
            cmap='Spectral')
fig4 = plt.scatter(standard_embedding[a2, 0],
            standard_embedding[a2, 1],
            #c=labels[a2],
            cmap='Spectral')
plt.legend([fig2, fig3, fig4], np.unique(labels))
plt.title('RCB',fontsize=12,fontweight='bold')
plt.figtext(0.5,0.01,'Silhouette Score = 0.619',ha='center',fontsize=12)

In [None]:
silhouette_avg = silhouette_score(clusterable_embedding, labels) 

In [None]:
silhouette_avg

In [None]:
rcb_def_vectors['groups'] = labels

In [None]:
rcb_def_vectors[rcb_def_vectors['groups']==2]

In [None]:

from IPython.display import display, HTML
from sklearn.tree import _tree, DecisionTreeClassifier
import pandas as pd

def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
  inner_tree: _tree.Tree = tree.tree_
  classes = tree.classes_
  class_rules_dict = dict()

  def tree_dfs(node_id=0, current_rule=[]):
    # feature[i] holds the feature to split on, for the internal node i.
    split_feature = inner_tree.feature[node_id]
    if split_feature != _tree.TREE_UNDEFINED: # internal node
      name = feature_names[split_feature]
      threshold = inner_tree.threshold[node_id]
      # left child
      left_rule = current_rule + ["({} <= {})".format(name, threshold)]
      tree_dfs(inner_tree.children_left[node_id], left_rule)
      # right child
      right_rule = current_rule + ["({} > {})".format(name, threshold)]
      tree_dfs(inner_tree.children_right[node_id], right_rule)
    else: # leaf
      dist = inner_tree.value[node_id][0]
      dist = dist/dist.sum()
      max_idx = dist.argmax()
      if len(current_rule) == 0:
        rule_string = "ALL"
      else:
        rule_string = " and ".join(current_rule)
      # register new rule to dictionary
      selected_class = classes[max_idx]
      class_probability = dist[max_idx]
      class_rules = class_rules_dict.get(selected_class, [])
      class_rules.append((rule_string, class_probability))
      class_rules_dict[selected_class] = class_rules
    
  tree_dfs() # start from root, node_id = 0
  return class_rules_dict

def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    # Create Model
    tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
    tree.fit(data, clusters)
    
    # Generate Report
    feature_names = data.columns
    class_rule_dict = get_class_rules(tree, feature_names)

    report_class_list = []
    for class_name in class_rule_dict.keys():
        rule_list = class_rule_dict[class_name]
        combined_string = ""
        for rule in rule_list:
            combined_string += "[{}] {}\n\n".format(rule[1], rule[0])
        report_class_list.append((class_name, combined_string))
        
    cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
    cluster_instance_df.columns = ['class_name', 'instance_count']
    report_df = pd.DataFrame(report_class_list, columns=['class_name', 'rule_list'])
    report_df = pd.merge(cluster_instance_df, report_df, on='class_name', how='left')
    pretty_print(report_df.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])

In [None]:
cluster_report(pd.DataFrame(clusterable_embedding,columns=['c0','c1']),labels,min_samples_leaf=2,pruning_level=0.05)

## LB Clusters

In [None]:
lb_def_vectors = def_vectors_footsep_scaled[def_vectors_footsep_scaled['position']=='LB']

In [None]:
lb_def_vectors.reset_index(inplace=True)
lb_def_vectors.drop(['index'],axis=1,inplace=True)

In [None]:
lb_def_vectors.head()

In [None]:
lb_def_vectors = lb_def_vectors.merge(def_vectors[['player_name','team','position','footedness']],on=['player_name','position','team'])


In [None]:
standard_embedding = umap.UMAP(random_state=np.random.RandomState(22)).fit_transform(lb_def_vectors[lb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1],s=5)

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=6,
    min_dist=0.0,
    n_components=2,
    random_state=np.random.RandomState(22),
).fit_transform(lb_def_vectors[lb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)

In [None]:
labels = hdbscan.HDBSCAN(
    min_samples=2,
    min_cluster_size=6,
).fit_predict(clusterable_embedding)

In [None]:
labels

In [None]:
clustered = (labels >= 0)
a0 = (labels==0)
a1 = (labels==1)
a2 = (labels==2)
fig1 = plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            alpha=0.5)
fig2 = plt.scatter(standard_embedding[a0, 0],
            standard_embedding[a0, 1],
            #c=labels[a0],
            cmap='Spectral')
fig3 = plt.scatter(standard_embedding[a1, 0],
            standard_embedding[a1, 1],
            #c=labels[a1],
            cmap='Spectral')
fig4 = plt.scatter(standard_embedding[a2, 0],
            standard_embedding[a2, 1],
            #c=labels[a1],
            cmap='Spectral')
plt.legend([fig2, fig3, fig4], np.unique(labels))
plt.title('LB',fontsize=12,fontweight='bold')
plt.figtext(0.5,0.01,'Silhouette Score = 0.478',ha='center',fontsize=12)

In [None]:
silhouette_avg = silhouette_score(clusterable_embedding, labels) 

In [None]:
silhouette_avg

In [None]:
lb_def_vectors['groups'] = labels

In [None]:
lb_def_vectors[lb_def_vectors['groups']==0]

In [None]:
from IPython.display import display, HTML
from sklearn.tree import _tree, DecisionTreeClassifier
import pandas as pd

def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
  inner_tree: _tree.Tree = tree.tree_
  classes = tree.classes_
  class_rules_dict = dict()

  def tree_dfs(node_id=0, current_rule=[]):
    # feature[i] holds the feature to split on, for the internal node i.
    split_feature = inner_tree.feature[node_id]
    if split_feature != _tree.TREE_UNDEFINED: # internal node
      name = feature_names[split_feature]
      threshold = inner_tree.threshold[node_id]
      # left child
      left_rule = current_rule + ["({} <= {})".format(name, threshold)]
      tree_dfs(inner_tree.children_left[node_id], left_rule)
      # right child
      right_rule = current_rule + ["({} > {})".format(name, threshold)]
      tree_dfs(inner_tree.children_right[node_id], right_rule)
    else: # leaf
      dist = inner_tree.value[node_id][0]
      dist = dist/dist.sum()
      max_idx = dist.argmax()
      if len(current_rule) == 0:
        rule_string = "ALL"
      else:
        rule_string = " and ".join(current_rule)
      # register new rule to dictionary
      selected_class = classes[max_idx]
      class_probability = dist[max_idx]
      class_rules = class_rules_dict.get(selected_class, [])
      class_rules.append((rule_string, class_probability))
      class_rules_dict[selected_class] = class_rules
    
  tree_dfs() # start from root, node_id = 0
  return class_rules_dict

def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    # Create Model
    tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
    tree.fit(data, clusters)
    
    # Generate Report
    feature_names = data.columns
    class_rule_dict = get_class_rules(tree, feature_names)

    report_class_list = []
    for class_name in class_rule_dict.keys():
        rule_list = class_rule_dict[class_name]
        combined_string = ""
        for rule in rule_list:
            combined_string += "[{}] {}\n\n".format(rule[1], rule[0])
        report_class_list.append((class_name, combined_string))
        
    cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
    cluster_instance_df.columns = ['class_name', 'instance_count']
    report_df = pd.DataFrame(report_class_list, columns=['class_name', 'rule_list'])
    report_df = pd.merge(cluster_instance_df, report_df, on='class_name', how='left')
    pretty_print(report_df.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])

In [None]:
cluster_report(pd.DataFrame(clusterable_embedding,columns=['c0','c1']),labels,min_samples_leaf=2,pruning_level=0.04)

## RB Clusters

In [None]:
rb_def_vectors = def_vectors_footsep_scaled[def_vectors_footsep_scaled['position']=='RB']

In [None]:
rb_def_vectors.reset_index(inplace=True)
rb_def_vectors.drop(['index'],axis=1,inplace=True)

In [None]:
rb_def_vectors.head()

In [None]:
rb_def_vectors = rb_def_vectors.merge(def_vectors[['player_name','team','position','footedness']],on=['player_name','position','team'])


In [None]:
standard_embedding = umap.UMAP(random_state=np.random.RandomState(22),learning_rate=1).fit_transform(rb_def_vectors[rb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)
plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1],s=5)

In [None]:
clusterable_embedding = umap.UMAP(
    n_neighbors=6,
    min_dist=0.0,
    n_components=2,
    random_state=np.random.RandomState(22),
).fit_transform(rb_def_vectors[rb_def_vectors.columns.difference(['player_name','team','position','off_val_opp_avg','footedness'])].values)

In [None]:
labels = hdbscan.HDBSCAN(
    min_samples=2,
    min_cluster_size=6,
).fit_predict(clusterable_embedding)

In [None]:
labels

In [None]:
clustered = (labels >= 0)
a0 = (labels==0)
a1 = (labels==1)
a2 = (labels==2)

fig1 = plt.scatter(standard_embedding[~clustered, 0],
            standard_embedding[~clustered, 1],
            c=(0.5, 0.5, 0.5),
            alpha=0.5)
fig2 = plt.scatter(standard_embedding[a0, 0],
            standard_embedding[a0, 1],
            #c=labels[a0],
            cmap='Spectral')
fig3 = plt.scatter(standard_embedding[a1, 0],
            standard_embedding[a1, 1],
            #c=labels[a1],
            cmap='Spectral')
fig4 = plt.scatter(standard_embedding[a2, 0],
            standard_embedding[a2, 1],
            #c=labels[a2],
            cmap='Spectral')
plt.legend([fig1, fig2, fig3, fig4], np.unique(labels))
plt.title('RB',fontsize=12,fontweight='bold')
plt.figtext(0.5,0.01,'Silhouette Score = 0.424',ha='center',fontsize=12)

In [None]:
silhouette_avg = silhouette_score(clusterable_embedding, labels) 

In [None]:
silhouette_avg

In [None]:
rb_def_vectors['groups'] = labels

In [None]:
rb_def_vectors[rb_def_vectors['groups']==-1]

In [None]:
from IPython.display import display, HTML
from sklearn.tree import _tree, DecisionTreeClassifier
import pandas as pd

def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
  inner_tree: _tree.Tree = tree.tree_
  classes = tree.classes_
  class_rules_dict = dict()

  def tree_dfs(node_id=0, current_rule=[]):
    # feature[i] holds the feature to split on, for the internal node i.
    split_feature = inner_tree.feature[node_id]
    if split_feature != _tree.TREE_UNDEFINED: # internal node
      name = feature_names[split_feature]
      threshold = inner_tree.threshold[node_id]
      # left child
      left_rule = current_rule + ["({} <= {})".format(name, threshold)]
      tree_dfs(inner_tree.children_left[node_id], left_rule)
      # right child
      right_rule = current_rule + ["({} > {})".format(name, threshold)]
      tree_dfs(inner_tree.children_right[node_id], right_rule)
    else: # leaf
      dist = inner_tree.value[node_id][0]
      dist = dist/dist.sum()
      max_idx = dist.argmax()
      if len(current_rule) == 0:
        rule_string = "ALL"
      else:
        rule_string = " and ".join(current_rule)
      # register new rule to dictionary
      selected_class = classes[max_idx]
      class_probability = dist[max_idx]
      class_rules = class_rules_dict.get(selected_class, [])
      class_rules.append((rule_string, class_probability))
      class_rules_dict[selected_class] = class_rules
    
  tree_dfs() # start from root, node_id = 0
  return class_rules_dict

def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    # Create Model
    tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, ccp_alpha=pruning_level)
    tree.fit(data, clusters)
    
    # Generate Report
    feature_names = data.columns
    class_rule_dict = get_class_rules(tree, feature_names)

    report_class_list = []
    for class_name in class_rule_dict.keys():
        rule_list = class_rule_dict[class_name]
        combined_string = ""
        for rule in rule_list:
            combined_string += "[{}] {}\n\n".format(rule[1], rule[0])
        report_class_list.append((class_name, combined_string))
        
    cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
    cluster_instance_df.columns = ['class_name', 'instance_count']
    report_df = pd.DataFrame(report_class_list, columns=['class_name', 'rule_list'])
    report_df = pd.merge(cluster_instance_df, report_df, on='class_name', how='left')
    pretty_print(report_df.sort_values(by='class_name')[['class_name', 'instance_count', 'rule_list']])

In [None]:
cluster_report(pd.DataFrame(clusterable_embedding,columns=['c0','c1']),labels,min_samples_leaf=4,pruning_level=0.04)

## Merging the groups to lineups

In [None]:
# Suffixes of the cluster pickle files to load; each is read from
# ../data/clusters/clusters_vaep/cluster_<suffix>.pkl and the frames are stacked row-wise.
# NOTE(review): the suffixes look like right/left codes for the four defenders — confirm.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
four_clusters = ['rlll', 'rrll', 'rrlr', 'rrrl', 'rrrr']
df_fourclusters = pd.concat(
    (pd.read_pickle(f'../data/clusters/clusters_vaep/cluster_{i}.pkl')
     for i in four_clusters),
    axis=0)

In [None]:
# Count non-null wyId values for every (team, RB, R_CB, L_CB, LB) combination.
df_4clusters_team=pd.DataFrame(df_fourclusters.groupby(['team','RB','R_CB','L_CB','LB'])['wyId'].count())

In [None]:
# Turn the group keys back into ordinary columns.
df_4clusters_team.reset_index(inplace=True)

In [None]:
df_4clusters_team[df_4clusters_team['team'].str.contains("Chelsea")]

In [None]:
 # Lookup from a player name as spelled in the lineup table (key) to its replacement spelling
 # (value); it is applied to the RB / R_CB / L_CB / LB columns in the next cell.
 # NOTE(review): the replacements are presumably the spellings used in def_vectors — confirm.
 player_map = {  'RamiroFunesMori': 'JoseRamiroFunesMori',
                'KurtZouma': 'KurtHappyZouma',
                'Danilo': 'DaniloLuizdaSilva',
                'CesarAzpilicueta': 'CesarAzpilicuetaTanco',
                'EzequielSchelotto': 'MatiasEzequielSchelotto',
                'GaetanBong': 'GaetanBongSongo',
                'HectorBellerin': 'HectorBellerinMoruno',
                'AhmedHegazi': 'AhmedHegazy',
                'JamaalLascelles': 'JamalLascelles',
                'AngelRangel': 'AngelRangelZaragoza',
                'Zanka': 'MathiasJattahNjieJorgensen',
                'EricBailly': 'EricBertrandBailly',
                'MarcosRojo': 'FaustinoMarcosAlbertoRojo',
                'AngeloOgbonna': 'AngeloObinzeOgbonna',
                'DavinsonSanchez': 'DavinsonSanchezMina',
                'JavierManquillo': 'JavierManquilloGaitan',
                'TommySmith': 'TomSmith',
                'Bruno': 'BrunoSaltorGrau',
                'JosephGomez': 'JoeGomez',
                'AlbertoMoreno':'AlbertoMorenoPerez',
                'LuisAntonioValencia':'LuisAntonioValenciaMosquera',
                'NicolasOtamendi':'NicolasHernanOtamendi',
                'NachoMonreal':'IgnacioMonrealEraso',
                'CedricSoares':'CedricRicardoAlvesSoares',
                'JoelMatip':'JoelAndreJobMatip',
                'MiguelBritos':'MiguelAngelBritosCabrera',
                'VictorLindelof':'VictorNilssonLindelof',
                'JamesCollins':'JamesMichaelCollins',
                'CucoMartina':'RhuendlyMartina',
                'DavidLuiz':'DavidLuizMoreiraMarinho',
                'ChancelMbemba':'ChancelMbembaMangulu',
                'PabloZabaleta':'PabloJavierZabaletaGirod',
                'KikoFemenia':'FranciscoFemeniaFar',
                'JoseFonte':'JoseMigueldaRochaFonte',
                'JesusGamez':'JesusGamezDuarte'}

In [None]:
positions = ['RB','R_CB','L_CB','LB']
# Rewrite each position column with the spelling from `player_map`, leaving names that have
# no entry untouched. Whole-column assignment replaces the former row loop, whose chained
# `df[pos][index] = ...` write is not guaranteed to reach the frame (it can land on a
# temporary copy) and whose bare `except:` hid every kind of error, not just a missing key.
for pos in positions:
    df_4clusters_team[pos] = df_4clusters_team[pos].map(lambda name: player_map.get(name, name))

In [None]:
df_4clusters_team[df_4clusters_team['team'].str.contains("Manchester City")]

In [None]:
# The wyId count produced by the groupby is used as the number of matches for that back four.
df_4clusters_team.rename(columns = {'wyId':'matches_played'},inplace=True)

In [None]:
# The same three steps are repeated for each position: left-merge that position's cluster
# labels on (player name, team), drop the redundant player_name column, then rename
# `groups` to <position>_groups. Players without a match keep NaN in the new column.
df_4clusters_team = df_4clusters_team.merge(rb_def_vectors[['player_name','team','groups']], how = 'left',
                                            left_on=['RB','team'], right_on = ['player_name','team'])

In [None]:
df_4clusters_team.drop(['player_name'],inplace=True,axis=1)

In [None]:
df_4clusters_team.rename(columns = {'groups':'RB_groups'},inplace=True)

In [None]:
# Right centre-back cluster labels.
df_4clusters_team = df_4clusters_team.merge(rcb_def_vectors[['player_name','team','groups']], how = 'left',
                                            left_on=['R_CB','team'], right_on = ['player_name','team'])

In [None]:
df_4clusters_team.drop(['player_name'],inplace=True,axis=1)

In [None]:
df_4clusters_team.rename(columns = {'groups':'R_CB_groups'},inplace=True)

In [None]:
# Left centre-back cluster labels.
df_4clusters_team = df_4clusters_team.merge(lcb_def_vectors[['player_name','team','groups']], how = 'left',
                                            left_on=['L_CB','team'], right_on = ['player_name','team'])

In [None]:
df_4clusters_team.drop(['player_name'],inplace=True,axis=1)

In [None]:
df_4clusters_team.rename(columns = {'groups':'L_CB_groups'},inplace=True)

In [None]:
df_4clusters_team[df_4clusters_team['team'].str.contains("Tottenham Hotspur")]

In [None]:
# Left-back cluster labels.
df_4clusters_team = df_4clusters_team.merge(lb_def_vectors[['player_name','team','groups']], how ='left',
                                            left_on=['LB','team'], right_on = ['player_name','team'])

In [None]:
df_4clusters_team.drop(['player_name'],inplace=True,axis=1)

In [None]:
df_4clusters_team.rename(columns = {'groups':'LB_groups'},inplace=True)

In [None]:
df_4clusters_team[df_4clusters_team['team'].str.contains("West Bromwich Albion")]

In [None]:
# Two hand-picked lists of team names.
# NOTE(review): neither list is referenced again in the visible part of the notebook.
top_five = ['Manchester City','Manchester United','Tottenham Hotspur','Liverpool','Chelsea']

In [None]:
bottom_five = ['Huddersfield Town','Southampton','Swansea City','Stoke City','West Bromwich Albion']

In [None]:
# Manchester United back fours for which all four players received a cluster label.
df_4clusters_team[df_4clusters_team['team'].str.contains("Manchester United")].dropna()

In [None]:
# Total matches per combination of the four cluster labels for Manchester United
# (groupby drops rows whose key contains NaN by default).
x= pd.DataFrame(df_4clusters_team[df_4clusters_team['team'].str.contains("Manchester United")].groupby(['RB_groups','R_CB_groups','L_CB_groups','LB_groups'])['matches_played'].sum())

In [None]:
x.reset_index(inplace=True)

In [None]:
x

In [None]:
# Attach each position's cluster label to the unscaled per-region features (inner join on
# player, team and position), one frame per position.
def_vectors_ind_footsep_lcb = def_vectors_ind_footsep.merge(lcb_def_vectors[['player_name','position','team','groups']],on=['player_name','team','position'])

In [None]:
len(def_vectors_ind_footsep_lcb)

In [None]:
def_vectors_ind_footsep_rcb = def_vectors_ind_footsep.merge(rcb_def_vectors[['player_name','position','team','groups']],on=['player_name','team','position'])

In [None]:
len(def_vectors_ind_footsep_rcb)

In [None]:
def_vectors_ind_footsep_lb = def_vectors_ind_footsep.merge(lb_def_vectors[['player_name','position','team','groups']],on=['player_name','team','position'])

In [None]:
len(def_vectors_ind_footsep_lb)

In [None]:
def_vectors_ind_footsep_rb = def_vectors_ind_footsep.merge(rb_def_vectors[['player_name','position','team','groups']],on=['player_name','team','position'])

In [None]:
len(def_vectors_ind_footsep_rb)

In [None]:
def_vectors_ind_footsep_lb[def_vectors_ind_footsep_lb['groups']==2]

In [None]:
# Summary statistics of the unscaled features, one cell per left-back cluster id 0..3.
# NOTE(review): a cluster id that HDBSCAN did not produce gives an empty selection here.
def_vectors_ind_footsep_lb[def_vectors_ind_footsep_lb['groups']==0].describe()

In [None]:
def_vectors_ind_footsep_lb[def_vectors_ind_footsep_lb['groups']==1].describe()

In [None]:
def_vectors_ind_footsep_lb[def_vectors_ind_footsep_lb['groups']==2].describe()

In [None]:
def_vectors_ind_footsep_lb[def_vectors_ind_footsep_lb['groups']==3].describe()

In [None]:
def get_region_graph(df,pref_cols,acc_cols,off_cols,offcontri_cols,pos):
    """Draw a 2x2 grid of grouped bar charts comparing clusters region by region.

    For every cluster id in ``df['groups']`` except -1, the median of each listed column is
    computed (rounded to 2 decimals) and drawn as one bar per region. The figure is saved
    to ``../paper_charts/<pos>`` and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``groups`` column plus every column named in the four lists.
    pref_cols, acc_cols, off_cols, offcontri_cols : list of str
        Columns for the preference, accuracy, offensive-value and offensive-contribution
        panels; four per list, in LF, LC, RC, RF order to match the tick labels.
    pos : str
        Appended to ``../paper_charts/`` to form the output file name.

    Raises
    ------
    ValueError
        If ``df['groups']`` holds no cluster id other than -1, so there is nothing to draw.
    """
    groups = [g for g in np.sort(df['groups'].unique()) if g != -1]
    print(groups)
    if not groups:
        raise ValueError("df['groups'] contains no cluster other than -1; nothing to plot")

    barWidth = 0.2
    region_labels = ['LF','LC','RC','RF']
    # One (columns, title) pair per panel, in row-major order of the 2x2 grid.
    panels = [
        (pref_cols, 'Progressive Pass Preference (in %)'),
        (acc_cols, 'Progressive Pass Accuracy (in %)'),
        (off_cols, 'Offensive Value per Progressive Pass'),
        (offcontri_cols, 'Offensive Contribution'),
    ]
    # Bars of one region sit at base, base + w, ..., base + (n-1)w, so the middle of the
    # group is base + (n-1)w/2. A fixed offset of w is only right for exactly three clusters.
    centre_offset = barWidth * (len(groups) - 1) / 2

    fig, axes = plt.subplots(2, 2, figsize = (15,10))
    for ax, (metric_cols, title) in zip(axes.flat, panels):
        base = np.arange(len(metric_cols))
        for slot, g in enumerate(groups):
            in_group = df[df['groups']==g]
            medians = [np.round(in_group[c].median(),2) for c in metric_cols]
            # Label with the real cluster id, as a string: bar() expects str labels, and
            # the positional index is wrong whenever the ids are not exactly 0..n-1.
            ax.bar(base + slot * barWidth, medians, width=barWidth, label=str(g))
        ax.set_xticks(base + centre_offset)
        ax.set_xticklabels(region_labels)
        ax.set_title(title,fontweight='bold')
        ax.tick_params(axis='both', which='major', labelsize=14)

    # Every panel carries the same cluster entries, so one shared legend is built from the last.
    handles, legend_labels = axes[1, 1].get_legend_handles_labels()
    fig.legend(handles, legend_labels, loc=(0.65,0.94), ncol = len(groups))
    fig.text(0.5, 0.07, 'Regions', ha='center',fontsize = 14,fontweight='bold')
    fig.text(0.07, 0.5, 'Values', va='center', rotation='vertical', fontsize = 14,fontweight='bold')
    plt.savefig("../paper_charts/"+pos)
    plt.show()
    

In [None]:
# Region-by-region comparison of the left-back clusters; the figure is written to
# ../paper_charts/LB by get_region_graph.
get_region_graph(def_vectors_ind_footsep_lb,
                 pref_cols = ['att_LF_pref','att_LC_pref','att_RC_pref','att_RF_pref'],
                 acc_cols = ['att_LF_acc','att_LC_acc','att_RC_acc','att_RF_acc'],
                 off_cols = ['att_LF_off','att_LC_off','att_RC_off','att_RF_off'],
                 offcontri_cols = ['att_LF_offcontri','att_LC_offcontri','att_RC_offcontri','att_RF_offcontri'],
                 pos = "LB")