In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [28]:
lines = pd.read_csv("lines.csv")

In [29]:
lines = lines[lines.situation=='5on5']

In [30]:
feature_columns = [
    "xGoalsPercentage", "corsiPercentage", "fenwickPercentage", "xOnGoalFor",
    "xGoalsFor", "shotsOnGoalFor", "blockedShotAttemptsFor", "hitsFor", 
    "takeawaysFor", "giveawaysFor"
]

In [None]:
# Filter the lines to used features and scale the dataframe
X = lines[feature_columns]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Function to calculate the precision of knn label predictions for the k nearest neighbors
def precision_at_k(knn_indices, cluster_labels, k):
    precision_scores = []
    for i, neighbors in enumerate(knn_indices):
        true_label = cluster_labels[i]
        retrieved_labels = [cluster_labels[j] for j in neighbors[1:k+1]]  
        relevant_count = sum(1 for label in retrieved_labels if label == true_label)
        precision_scores.append(relevant_count / k)
    return sum(precision_scores) / len(precision_scores)

In [None]:
# Baseline values of 5 clusters/neighbors more testing needed
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters)
cluster_labels = kmeans.fit_predict(X_scaled)  

num_neighbors = 5
knn = NearestNeighbors(n_neighbors=num_neighbors+1, metric='cosine') # use cosine similarity to "normalize" data
knn.fit(X_scaled)
_, indices = knn.kneighbors(X_scaled)

# Get the KNN label for each line combination and add to list
knn_labels = []
for i in range(len(lines)):
    knn_labels.append(cluster_labels[indices[i][0]])  

# Calculate silhoutte score from KNN
silhouette_knn = silhouette_score(X_scaled, knn_labels, metric="cosine")

#Calculate precision at k (function above)
precision_k = precision_at_k(indices, cluster_labels, num_neighbors)

print(silhouette_knn)
print(precision_k)

0.4444131485608894
0.9182496607869742


In [None]:
# Finds the five most similar lines for each line and store in dictionary
similar_lines = {lines["lineId"].iloc[i]: list(lines["lineId"].iloc[indices[i][1:]]) for i in range(len(lines))}

num_lines = 0

# Print a few example similar lines
for line, similar in similar_lines.items():
    if num_lines < 5:
        print(f"Main Line")
        print(f"Players: {lines[lines["lineId"]==line].name.values[0]}")
        for sim in similar:
            print(f"    Similar Line")
            print(f"    Players: {lines[lines["lineId"]==sim].name.values[0]}")
        num_lines+= 1

Main Line
Players: Luostarinen-Lundell-Reinhart
    Similar Line
    Players: Necas-Drury-Noesen
    Similar Line
    Players: Bratt-Toffoli-Hughes
    Similar Line
    Players: Bertuzzi-Matthews-Domi
    Similar Line
    Players: Forsling-Montour
    Similar Line
    Players: Guentzel-Aho-Jarvis
Main Line
Players: Lindell-Hakanp
    Similar Line
    Players: Romanov-Dobson
    Similar Line
    Players: Dahlin-Samuelsson
    Similar Line
    Players: York-Sanheim
    Similar Line
    Players: Valimaki-Kesselring
    Similar Line
    Players: Gostisbehere-Maatta
Main Line
Players: Kunin-Granlund-Zadina
    Similar Line
    Players: Girgensons-Cozens-Okposo
    Similar Line
    Players: Eklund-Bordeleau-Hoffman
    Similar Line
    Players: Hirose-Juulsen
    Similar Line
    Players: Pyyhtia-Danforth-Olivier
    Similar Line
    Players: Zetterlund-Sturm-Hoffman
Main Line
Players: Krug-Leddy
    Similar Line
    Players: Lacombe-Lyubushkin
    Similar Line
    Players: Kubalik-Ostapchuk