In [119]:
import json
import pandas as pd
import os

In [121]:
df = pd.read_csv("cluster.csv")
print(len(df))
df = df.sort_values(by = 'total_nodes',ascending = True)

113


### Cluster vs Nodes

In [122]:
total_clusters = len(df)

# Define the conditions
conditions = {
    "== 2     ": df[df['total_nodes'] == 2],
    "== 3     ": df[df['total_nodes'] == 3],
    "== 4     ": df[df['total_nodes'] == 4],
    "== 5     ": df[df['total_nodes'] == 5],
    ">5 & <=10": df[(df['total_nodes'] > 5) & (df['total_nodes'] <= 10)],
    ">10      ": df[df['total_nodes'] > 10]
}
print("Total Clusters: ",total_clusters)
# Calculate the percentage for each condition
for condition, subset in conditions.items():
    percentage = len(subset) / total_clusters * 100
    print(f"Clusters with players {condition}: {percentage:.2f}% & {len(subset)}clusters" )
    

Total Clusters:  113
Clusters with players == 2     : 54.87% & 62clusters
Clusters with players == 3     : 23.01% & 26clusters
Clusters with players == 4     : 8.85% & 10clusters
Clusters with players == 5     : 3.54% & 4clusters
Clusters with players >5 & <=10: 8.85% & 10clusters
Clusters with players >10      : 0.88% & 1clusters


### Cluster vs Matches

In [123]:
# Calculate the average matches played per person for each cluster size
average_matches_per_person = {}
for size in range(2, 12):
    cluster_size_df = df[df['total_nodes'] == size]
    average_matches_per_person[size] = cluster_size_df['total_matches'].mean() / size


# For cluster size between 5 and 10
cluster_size_df = df[(df['total_nodes'] >= 5) & (df['total_nodes'] <= 10)]
average_matches_per_person['(5,10]'] = cluster_size_df['total_matches'].mean() / cluster_size_df['total_nodes'].mean()

# For cluster size greater than 10
cluster_size_df = df[df['total_nodes'] > 10]
average_matches_per_person['(10,max]'] = cluster_size_df['total_matches'].mean() / cluster_size_df['total_nodes'].mean()

# Print the results
for size_range, average_matches in average_matches_per_person.items():
    print(f"Average matches played per person in a cluster having size {size_range}: {2*average_matches:.2f}")


Average matches played per person in a cluster having size 2: 6.47
Average matches played per person in a cluster having size 3: 11.69
Average matches played per person in a cluster having size 4: 12.00
Average matches played per person in a cluster having size 5: 14.90
Average matches played per person in a cluster having size 6: 12.83
Average matches played per person in a cluster having size 7: 27.33
Average matches played per person in a cluster having size 8: nan
Average matches played per person in a cluster having size 9: 25.11
Average matches played per person in a cluster having size 10: 30.20
Average matches played per person in a cluster having size 11: nan
Average matches played per person in a cluster having size (5,10]: 22.88
Average matches played per person in a cluster having size (10,max]: 36.33


### Players vs Clusters

In [209]:
df = pd.read_csv("cluster.csv")

for ind,row in df.iterrows():
    for node in row["nodes"]:
        strr = row["nodes"].strip('{').strip('}')
        strr = strr.strip("'")
        lst = strr.split(',')
        for i in range(0,len(lst)):
            lst[i]=lst[i].strip(" ").strip("'")
            lst[i] = int(lst[i])
    df.at[ind,"nodes"]=lst

with open('player_matches.json') as f:
    match_counts = json.load(f)

# Function to calculate percentage of players with match counts exceeding a threshold
def calculate_percentage(df_slice, threshold):
    percent_players_played_above_threshold_matches=0
    for ind, row in df_slice.iterrows():
        total_matches = row['total_matches']
        total_players = row["total_nodes"]
        players_above_threshold = 0
        
        for node in row["nodes"]:
            if int(match_counts[str(node)]) >= threshold * total_matches:
                players_above_threshold += 1
                #print(threshold * total_matches,match_counts[str(node)])
        percent_players_played_above_threshold_matches+=100 * players_above_threshold/ total_players
    return percent_players_played_above_threshold_matches/len(df_slice)

cluster_analysis_discrete = {}
cluster_frequency = {}

ranges = {(2,5),(5,8),(8,11),(11,max(df['total_nodes'])+1)}
for lims in ranges:
    print("\n\n")
    for cluster_size in range(lims[0],lims[1]):
        if len(df[df["total_nodes"]==cluster_size])==0:
            continue
        lst = []
        thresholds = [0,0.02,0.06,0.3]
        mean_percentages = {}
        for threshold in thresholds:
            percentages = []
            cluster_frequency[cluster_size]=len(df[df["total_nodes"]==cluster_size])
            percentages.append(calculate_percentage(df[df["total_nodes"]==cluster_size], threshold))  # change for cluster_size
            mean_percentages[threshold] = sum(percentages) / len(percentages)
        print(f"Cluster Size: {cluster_size}")
        for threshold, percentage in mean_percentages.items():
            print(f"Percentage of players with more than {threshold*100}% matches: {percentage:.2f}%")
        for iter in mean_percentages.items():
            lst.append(iter)
        cluster_analysis_discrete[cluster_size]=lst
    print(cluster_analysis_discrete)

    cumulative_sums = {}
    counts = {}
    cluster_analysis_discrete
    # Iterate over each cluster size and each threshold to calculate the cumulative sum and count
    cluster_analysis_discrete=dict(sorted(cluster_analysis_discrete.items()))
    for cluster_size, thresholds_data in cluster_analysis_discrete.items():
        if cluster_size ==0:
            continue
        for threshold, percentage in thresholds_data:
            if threshold in cumulative_sums:
                cumulative_sums[threshold] += percentage*cluster_frequency[cluster_size]
                counts[threshold] += cluster_frequency[cluster_size]
            else:
                cumulative_sums[threshold] = percentage*cluster_frequency[cluster_size]
                counts[threshold] = cluster_frequency[cluster_size]

    # Calculate the average percentage for each threshold
    average_array = [(threshold, cumulative_sums[threshold] / counts[threshold]) for threshold in cumulative_sums]

    print("\nAverage array:", average_array)





Cluster Size: 110
Percentage of players with more than 0% matches: 100.00%
Percentage of players with more than 2.0% matches: 24.55%
Percentage of players with more than 6.0% matches: 7.27%
Percentage of players with more than 30.0% matches: 0.00%
{110: [(0, 100.0), (0.02, 24.545454545454547), (0.06, 7.2727272727272725), (0.3, 0.0)]}

Average array: [(0, 100.0), (0.02, 24.545454545454547), (0.06, 7.2727272727272725), (0.3, 0.0)]



Cluster Size: 9
Percentage of players with more than 0% matches: 100.00%
Percentage of players with more than 2.0% matches: 100.00%
Percentage of players with more than 6.0% matches: 88.89%
Percentage of players with more than 30.0% matches: 22.22%
Cluster Size: 10
Percentage of players with more than 0% matches: 100.00%
Percentage of players with more than 2.0% matches: 80.00%
Percentage of players with more than 6.0% matches: 80.00%
Percentage of players with more than 30.0% matches: 20.00%
{110: [(0, 100.0), (0.02, 24.545454545454547), (0.06, 7.2727272

In [110]:
df = pd.read_csv("cluster.csv")

for ind,row in df.iterrows():
    for node in row["nodes"]:
        lst = row["nodes"].strip('[').strip(']').split(',')
        for i in range(0,len(lst)):
            lst[i] = int(lst[i])
    df.at[ind,"nodes"]=lst

type(df['nodes'][0][0])

str

In [142]:
df = pd.read_csv("../dataset/squash_dataset.csv")
df["cntr_id"].unique()

array([3])

In [None]:
with open('player_matches.json') as f:
    match_counts = json.load(f)
    


1. 80% of matches are played by how many players (sort)