In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm as tdqm
import networkx as nx
import matplotlib.pyplot as plt

In [5]:
# Load the raw CSV tables from the current working directory.
teams_df = pd.read_csv('Teams.csv')
teams_members_df = pd.read_csv('TeamMemberships.csv')
competitions_df = pd.read_csv('competitions.csv')
competitions_tags_df = pd.read_csv('CompetitionTags.csv')
tags_df = pd.read_csv('Tags.csv')
# Only the user id and the performance tier are read from users.csv.
users_teirs_df = pd.read_csv('users.csv',usecols=['Id','PerformanceTier'])


KeyboardInterrupt: 

In [None]:
# Keep only the columns used later, give each generic `Id` column an explicit
# name, and drop incomplete rows from the team, membership and competition tables.
teams_df = (
    teams_df[['Id', 'CompetitionId', 'TeamLeaderId']]
    .rename(columns={'Id': 'TeamId'})
    .dropna()
)
teams_members_df = teams_members_df[['TeamId', 'UserId']].dropna()
competitions_df = (
    competitions_df[['Id', 'Slug', 'DeadlineDate', 'TotalCompetitors', 'Title']]
    .rename(columns={'Id': 'CompetitionId'})
    .dropna()
)

# Tags and user tiers are only renamed; no rows are dropped from them.
tags_df = tags_df[['Id', 'Name']].rename(columns={'Id': 'TagId', 'Name': 'TagName'})
users_teirs_df = users_teirs_df.rename(columns={'Id': 'UserId'})

In [None]:
# Drop competitions with zero competitors, then attach a single tag name to each
# competition. drop_duplicates keeps only the first tag row per CompetitionId;
# competitions without any tag are labelled 'General'.
competitions_df = (
    competitions_df.loc[competitions_df['TotalCompetitors'] != 0]
    .merge(competitions_tags_df[['CompetitionId', 'TagId']], on='CompetitionId', how='left')
    .drop_duplicates(subset=['CompetitionId'])
    .merge(tags_df[['TagId', 'TagName']], on='TagId', how='left')
)
competitions_df['TagName'] = competitions_df['TagName'].fillna('General')

In [None]:
# Number of competitions per tag name (including the 'General' fallback).
competitions_df['TagName'].value_counts()

In [None]:
# TeamLeaderId is stored as an integer from here on.
teams_df = teams_df.astype({'TeamLeaderId': int})

In [None]:
# One row per team membership, enriched (via left joins) with the team's
# competition, that competition's metadata, and the user's performance tier.
competition_columns = ['CompetitionId', 'Slug', 'DeadlineDate', 'TagName', 'Title']
user_to_competition = (
    teams_members_df
    .merge(teams_df[['TeamId', 'CompetitionId']], on='TeamId', how='left')
    .merge(competitions_df[competition_columns], on='CompetitionId', how='left')
    .merge(users_teirs_df, on='UserId', how='left')
)


In [None]:
# Inspect the joined membership table.
user_to_competition

In [None]:
# Parse DeadlineDate into a datetime column, then keep only the rows whose
# deadline falls strictly before 2025-01-01.
user_to_competition = user_to_competition.assign(
    DeadlineDate=pd.to_datetime(user_to_competition['DeadlineDate'])
)
deadline_before_cutoff = user_to_competition['DeadlineDate'] < '2025-01-01 00:00:00'
user_to_competition = user_to_competition.loc[deadline_before_cutoff]
user_to_competition

In [None]:
# Remove users that appear in fewer than two rows (i.e. a single participation).
participations_per_user = user_to_competition['UserId'].value_counts()
single_participation_users = participations_per_user[participations_per_user < 2]
is_single_participation = user_to_competition['UserId'].isin(single_participation_users.index)
user_to_competition = user_to_competition[~is_single_participation]
user_to_competition

In [None]:
# Count (without removing) the competitions that are left with fewer than two rows.
rows_per_slug = user_to_competition['Slug'].value_counts()
single_participation_competitions = rows_per_slug[rows_per_slug < 2]
len(single_participation_competitions)

In [None]:
# Distinct user ids and competition slugs; these become the two node sets of the graph.
users_set = set(user_to_competition['UserId'].tolist())
competitions_set = set(user_to_competition['Slug'].tolist())

In [None]:
# (user id, competition slug) pairs, one per row; used as the edge list of the graph.
user_competition = [
    (user_id, slug)
    for user_id, slug in zip(user_to_competition['UserId'], user_to_competition['Slug'])
]

In [None]:
#save the data
# user_to_competition.to_csv('user_to_competition.csv', index=False)


## Create the network
The network is a bipartite graph with two sets of nodes: users and competitions. Each edge represents a user's participation in a competition.

In [None]:
# Build the participation graph: users are tagged bipartite=0, competitions
# bipartite=1, and every (user, competition) pair becomes an edge.
network = nx.Graph()
for node_group, side in ((users_set, 0), (competitions_set, 1)):
    network.add_nodes_from(node_group, bipartite=side)
network.add_edges_from(user_competition)

# Persist the graph to disk in GraphML format.
nx.write_graphml(network, 'kaggle_users_network.graphml')

## Network Analysis

In [None]:
# Report the overall size of the graph.
node_count = network.number_of_nodes()
edge_count = network.number_of_edges()
print(f'number of nodes: {node_count}')
print(f'number of edges: {edge_count}')

In [None]:
# Check whether the whole graph forms a single connected component.
nx.is_connected(network)

In [None]:
# Split the nodes using the `bipartite` attribute assigned when the graph was
# built (0 = users, 1 = competitions). nx.bipartite.sets() is deliberately not
# used: it raises AmbiguousSolution when the graph is disconnected, and it
# decides which side is which by 2-colouring rather than by our own labels.
users = {node for node, side in network.nodes(data='bipartite') if side == 0}
competitions = {node for node, side in network.nodes(data='bipartite') if side == 1}

In [None]:
# nx.bipartite.degrees returns a pair of degree views; per the networkx docs the
# second one belongs to the node set passed in (`users`) and the first to the
# other side. Both are materialised as plain {node: degree} dicts.
degree_views = nx.bipartite.degrees(network, users)
competitions_degree = dict(degree_views[0])
users_degree = dict(degree_views[1])

In [None]:
import operator

# Find the (node, degree) pair with the largest degree on each side, once each.
by_degree = operator.itemgetter(1)
top_competition, top_competition_degree = max(competitions_degree.items(), key=by_degree)
top_user, top_user_degree = max(users_degree.items(), key=by_degree)

print(f'the competition with the highest degree is {top_competition} with a degree of {top_competition_degree}')
print(f'the user with the highest degree is {top_user} with a degree of {top_user_degree}')

### Degree Distribution of competitions

In [None]:
# Interactive histogram of competition degrees (linear axes).
import plotly.express as px
import pandas as pd

# One row per competition node with its degree.
df = pd.DataFrame(list(competitions_degree.items()), columns=['node', 'degree'])

fig = px.histogram(
    df,
    x="degree",
    nbins=1000,
    labels={'degree': 'Node Degree', 'count': 'Frequency'},
    text_auto=True,
)

# Small gap between bars; fix the visible window to y in [0, 500] and x in [0, 60000].
fig.update_layout(bargap=0.1)
fig.update_yaxes(range=[0, 500])
fig.update_xaxes(range=[0, 60000])
fig.show()

In [None]:
# Log-binned degree distribution of competitions.
#
# Plotly histograms only support equal-width bins, so the previous version
# computed log-spaced edges but never applied them (xbins size="auto" falls
# back to linear bins). The counts are now binned with NumPy and drawn as bars
# whose left/right sides coincide with the log-spaced bin edges.
import plotly.express as px
import pandas as pd
import numpy as np

# One row per competition node with its degree.
df = pd.DataFrame({'node': list(competitions_degree.keys()),
                   'degree': list(competitions_degree.values())})
degree_values = df['degree'].to_numpy()

# Log-spaced bin edges from the smallest degree (at least 1) to the largest.
lowest = max(1, degree_values.min())
highest = max(degree_values.max(), lowest + 1)  # keeps the edges strictly increasing
bins = np.geomspace(lowest, highest, num=50)
bins[0], bins[-1] = lowest, highest  # guard against floating-point drift at the ends

counts, bins = np.histogram(degree_values, bins=bins)

# Each bar is centred on its bin and exactly as wide as the bin (in data units).
fig = px.bar(x=(bins[:-1] + bins[1:]) / 2, y=counts,
             labels={'x': 'Node Degree', 'y': 'Frequency'})
fig.update_traces(width=np.diff(bins))

# Logarithmic x-axis so the log-spaced bins appear evenly wide.
fig.update_xaxes(type="log", title="Node Degree (Log Scale)")
fig.update_yaxes(title="Frequency")

fig.show()

### Degree Distribution of users



In [None]:
# Interactive histogram of user degrees (linear axes).
import plotly.express as px
import pandas as pd

# One row per user node with its degree.
df = pd.DataFrame(list(users_degree.items()), columns=['node', 'degree'])

fig = px.histogram(
    df,
    x="degree",
    nbins=1000,
    title="users Degree Distribution",
    labels={'degree': 'Node Degree', 'count': 'Frequency'},
    text_auto=True,
)

# Small gap between bars; fix the visible window to y in [0, 100] and x in [0, 5700].
fig.update_layout(bargap=0.1)
fig.update_yaxes(range=[0, 100])
fig.update_xaxes(range=[0, 5700])
fig.show()

In [None]:
# Log-binned degree distribution of users.
#
# Plotly histograms only support equal-width bins, so the previous version
# computed log-spaced edges but never applied them (xbins size="auto" falls
# back to linear bins). The counts are now binned with NumPy and drawn as bars
# whose left/right sides coincide with the log-spaced bin edges.
import plotly.express as px
import pandas as pd
import numpy as np

# One row per user node with its degree.
df = pd.DataFrame({'node': list(users_degree.keys()),
                   'degree': list(users_degree.values())})
degree_values = df['degree'].to_numpy()

# Log-spaced bin edges from the smallest degree (at least 1) to the largest.
lowest = max(1, degree_values.min())
highest = max(degree_values.max(), lowest + 1)  # keeps the edges strictly increasing
bins = np.geomspace(lowest, highest, num=50)
bins[0], bins[-1] = lowest, highest  # guard against floating-point drift at the ends

counts, bins = np.histogram(degree_values, bins=bins)

# Each bar is centred on its bin and exactly as wide as the bin (in data units).
fig = px.bar(x=(bins[:-1] + bins[1:]) / 2, y=counts,
             labels={'x': 'Node Degree', 'y': 'Frequency'})
fig.update_traces(width=np.diff(bins))

# Logarithmic x-axis so the log-spaced bins appear evenly wide.
fig.update_xaxes(type="log", title="Node Degree (Log Scale)")
fig.update_yaxes(title="Frequency")

fig.show()

In [None]:
from networkx.algorithms.community import louvain_communities

# Convert every node id to its string form, then detect communities with
# Louvain using a fixed seed.
network = nx.relabel_nodes(network, str)
communities = louvain_communities(network, seed=42)

len(communities)

In [None]:
# Map a running index (0, 1, 2, ...) to each detected community.
communities_dict = dict(enumerate(communities))

In [None]:
# Summarise every community: total node count and how many of its nodes are
# competitions. Rows are collected in a list and turned into a DataFrame once,
# instead of re-concatenating a growing frame on every iteration.
community_rows = []
for key, value in communities_dict.items():
    num_nodes = len(value)
    print(f'community {key} has {num_nodes} nodes')
    cum_competitions = [node for node in value if node in competitions_set]
    print(f'number of competitions in community {key} is {len(cum_competitions)}')
    community_rows.append({
        'community': key,
        'cumpetitions_number': len(cum_competitions),
        'number_of_nodes': num_nodes,
    })

# Indexed by community key, as before; the columns exist even when there are no communities.
cum_df = pd.DataFrame(
    community_rows,
    index=list(communities_dict.keys()),
    columns=['community', 'cumpetitions_number', 'number_of_nodes'],
)


In [None]:
# Stacked bar chart per community: competition count at the bottom,
# log(1 + number of nodes) stacked on top.
fig, ax = plt.subplots(figsize=(10, 6))

# log1p keeps the node counts on a scale comparable to the competition counts.
cum_df["log_number_of_nodes"] = np.log1p(cum_df["number_of_nodes"])

community_ids = cum_df["community"]
competition_counts = cum_df["cumpetitions_number"]

bar1 = ax.bar(community_ids, competition_counts, label="Competitions", color='steelblue')
bar2 = ax.bar(
    community_ids,
    cum_df["log_number_of_nodes"],
    bottom=competition_counts,
    label="Log(Number of Nodes)",
    color='orange',
)

ax.set(
    xlabel="Community",
    ylabel="Total Size (Competitions + log(Nodes))",
    title="Stacked Bar Chart: Community Breakdown (Log Scale for Nodes)",
)
ax.legend()

# Write the (integer-truncated) height in the middle of every non-empty segment.
for bar in [*bar1, *bar2]:
    height = bar.get_height()
    if not height > 0:
        continue
    ax.text(
        bar.get_x() + bar.get_width() / 2,
        bar.get_y() + height / 2,
        str(int(height)),
        ha='center', va='center', fontsize=10, color='black',
    )

ax.set_xticks(cum_df["community"])
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
def analyze_community(community):
    """Print summary statistics for one community and return per-node centralities.

    Parameters
    ----------
    community : iterable of nodes
        Nodes of the global ``network`` graph that make up the community.

    Returns
    -------
    pandas.DataFrame
        One row per node of the induced subgraph with the columns ``node``,
        ``cummunity_nodes`` (subgraph size), ``average_degree``,
        ``degree_centrality``, ``betweenness_centrality``,
        ``closeness_centrality`` and ``pagerank``. An empty frame with these
        columns is returned when the community has no nodes.
    """
    columns = ['node', 'cummunity_nodes', 'average_degree', 'degree_centrality',
               'betweenness_centrality', 'closeness_centrality', 'pagerank']

    subgraph = network.subgraph(community)
    node_count = subgraph.number_of_nodes()
    print(f"Graph Type: {type(subgraph)}")
    print(f"Number of Nodes: {node_count}")
    print(f"Number of Edges: {subgraph.number_of_edges()}")
    if node_count == 0:
        # Nothing to analyse; avoids dividing by zero below.
        return pd.DataFrame(columns=columns)

    # Computed once and reused for both the printout and the result frame.
    average_degree = sum(degree for _, degree in subgraph.degree()) / node_count
    print(f"Average Degree: {average_degree:.2f}")

    degree_centrality = nx.degree_centrality(subgraph)
    # NOTE(review): betweenness is computed exactly here; this may be slow on
    # large communities — confirm the runtime is acceptable.
    betweenness_centrality = nx.betweenness_centrality(subgraph)
    closeness_centrality = nx.closeness_centrality(subgraph)
    pagerank = nx.pagerank(subgraph)

    # max(d, key=d.get) returns the first node with the highest score.
    print((f'the most central node in the community by degree is: {max(degree_centrality, key=degree_centrality.get)}'))
    print((f'the most central node in the community by betweenness is: {max(betweenness_centrality, key=betweenness_centrality.get)}'))
    print((f'the most central node in the community by closeness is: {max(closeness_centrality, key=closeness_centrality.get)}'))
    print((f'the most central node in the community by pagerank is: {max(pagerank, key=pagerank.get)}'))

    # Look every metric up by node so the rows stay aligned regardless of the
    # iteration order of the individual result dicts.
    nodes = list(degree_centrality.keys())
    df = pd.DataFrame({
        'node': nodes,
        'cummunity_nodes': node_count,
        'average_degree': average_degree,
        'degree_centrality': [degree_centrality[node] for node in nodes],
        'betweenness_centrality': [betweenness_centrality[node] for node in nodes],
        'closeness_centrality': [closeness_centrality[node] for node in nodes],
        'pagerank': [pagerank[node] for node in nodes],
    })
    return df

In [None]:
# Analyse every community and stack the per-node metric tables. The frames are
# collected in a list and concatenated once, instead of re-concatenating a
# growing frame on every iteration.
community_frames = []
for key, value in communities_dict.items():
    print(f'community {key}')
    subgraph_df = analyze_community(value)
    subgraph_df['community'] = key
    community_frames.append(subgraph_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
all_communities_df = pd.concat(community_frames) if community_frames else pd.DataFrame()

In [None]:
# Keep only the user rows. Graph nodes were relabelled to strings before
# community detection, so the user ids are compared in string form; this works
# whether `users_set` currently holds ints or strings.
user_node_ids = {str(user_id) for user_id in users_set}
users_in_communities_df = all_communities_df[all_communities_df['node'].isin(user_node_ids)]
users_in_communities_df

In [None]:
# Draw the sub-network of a single community.
COMMUNITY_TO_DRAW = 25  # index into communities_dict
subgraph = network.subgraph(communities_dict[COMMUNITY_TO_DRAW])

# Graph node ids are strings, so compare user ids in string form as well.
users_set = {str(node) for node in users_set}

# Fix one explicit node order and use it for colours, sizes AND drawing.
# Iterating a set here (as before) does not match the order nx.draw uses, which
# assigned colours and sizes to the wrong nodes.
subgraph_nodes = list(subgraph.nodes)

# Users are blue, competitions red.
colors = ['blue' if str(node) in users_set else 'red' for node in subgraph_nodes]

# Node size grows with the node's degree inside the community.
sizes = [subgraph.degree[node] * 10 for node in subgraph_nodes]

plt.figure(figsize=(10, 8))
nx.draw(subgraph, nodelist=subgraph_nodes, with_labels=False,
        node_color=colors, node_size=sizes, edge_color='gray')

plt.show()

In [None]:
# Attach each user's community id and community-level metrics to the membership
# table. UserId is cast to str first because the metric table stores node ids as strings.
community_columns = ['node', 'community', 'cummunity_nodes', 'average_degree', 'degree_centrality']
community_features = all_communities_df[community_columns].rename(columns={'node': 'UserId'})

user_to_competition['UserId'] = user_to_competition['UserId'].astype(str)
user_to_competition = user_to_competition.merge(community_features, on='UserId', how='left')


In [None]:
# Spot-check the merged rows of a single user (id '368').
user_to_competition[user_to_competition['UserId']=='368']

In [None]:
# Inspect the membership table after the community features were merged in.
user_to_competition

In [None]:
# Persist the enriched membership table (row index is not written).
user_to_competition.to_csv('user_to_competition.csv', index=False)

In [None]:
# Missing values per column; rows with any missing value are dropped before modelling.
user_to_competition.isnull().sum()

In [None]:

# Build the modelling table: drop the user id and tag name, integer-encode the
# text columns, and express the deadline as whole seconds since the epoch.
data = user_to_competition.drop(columns=['UserId', 'TagName'])

categorical_cols = ['Slug', 'Title']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

data['DeadlineDate'] = data['DeadlineDate'].astype('int64').floordiv(10**9)

# Tier 0 rows are kept in this experiment (the filter that removes them is
# disabled); only rows with missing values are dropped.
# data = data[data['PerformanceTier'] !=0]
data = data.dropna()

# Target is the performance tier; every remaining column is a feature.
y = data['PerformanceTier']
X = data.drop(columns=['PerformanceTier'])
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the target so both parts keep the tier proportions.
# NOTE(review): the split is row-wise and the rows appear to be user–competition
# pairs, so the same user may end up in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from lightgbm import LGBMClassifier

# Multiclass LightGBM model predicting the performance tier.
# (Alternative that was tried: RandomForestClassifier(n_estimators=100).)
classificator = LGBMClassifier(n_estimators=500, objective='multiclass')
classificator.fit(X_train, y_train)

# Evaluate on the held-out rows: plain accuracy and support-weighted F1.
pred_classified = classificator.predict(X_test)
accuracy, f1 = (
    accuracy_score(y_test, pred_classified),
    f1_score(y_test, pred_classified, average='weighted'),
)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted tiers on the test rows.
confusion_matrix(y_test, pred_classified)

In [None]:
# Report the test metrics, then rank the features by the model's importance scores.
print(f'accuracy: {accuracy}')
print(f'f1: {f1}')

features = X.columns
features_importance = classificator.feature_importances_
features_importance_df = (
    pd.DataFrame({'feature': features, 'importance': features_importance})
    .sort_values(by='importance', ascending=False)
)

In [None]:
from lightgbm import LGBMRegressor

# Same features and target as the classifier, but the tier is fitted as a numeric value.
reg_model = LGBMRegressor(n_estimators=500)
reg_model.fit(X_train, y_train)
pred_regressed = reg_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error and R² of the regressor on the test rows.
mse = mean_squared_error(y_test, pred_regressed)
print(f'mse: {mse}')
r2 = r2_score(y_test, pred_regressed)
print(f'r2: {r2}')

In [None]:
# NOTE(review): `max_tier` is not assigned in any cell visible in this notebook,
# so unless it is defined somewhere not shown here this cell raises NameError on
# a fresh kernel. The merge key and the later comparison against a `MaxTier`
# column suggest it is a frame with `UserId` and `MaxTier` columns —
# TODO restore the cell that builds it.
users_max_tier = user_to_competition.merge(max_tier, on='UserId', how='left')
users_max_tier

In [None]:
# Keep only the rows whose performance tier equals the user's MaxTier, then
# build the modelling table the same way as for the first experiment.
at_max_tier = users_max_tier['PerformanceTier'] == users_max_tier['MaxTier']
users_max_tier = users_max_tier[at_max_tier]
data = users_max_tier.drop(columns=['UserId', 'TagName', 'MaxTier'])

# Integer-encode the text columns and keep the fitted encoders.
categorical_cols = ['Slug', 'Title']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Deadline as whole seconds since the epoch.
data['DeadlineDate'] = data['DeadlineDate'].astype('int64').floordiv(10**9)

# Kaggle performance tiers are treated as running from 1 to 5 here, so tier 0
# rows are removed, followed by rows with missing values.
data = data[data['PerformanceTier'] != 0].dropna()

# Target is the performance tier; every remaining column is a feature.
y = data['PerformanceTier']
X = data.drop(columns=['PerformanceTier'])
y.value_counts()

In [None]:
# Stratified 80/20 split of the filtered data, then the same multiclass
# LightGBM setup as in the first experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

max_classificator = LGBMClassifier(n_estimators=500, objective='multiclass')
max_classificator.fit(X_train, y_train)

# Evaluate on the held-out rows: plain accuracy and support-weighted F1.
pred_classified = max_classificator.predict(X_test)
accuracy, f1 = (
    accuracy_score(y_test, pred_classified),
    f1_score(y_test, pred_classified, average='weighted'),
)


In [None]:
# Report the test metrics, then rank the features by the model's importance scores.
print(f'accuracy: {accuracy}')
print(f'f1: {f1}')

features = X.columns
features_importance = max_classificator.feature_importances_
features_importance_df = (
    pd.DataFrame({'feature': features, 'importance': features_importance})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Confusion matrix of true vs. predicted tiers for the second (MaxTier-filtered) model.
confusion_matrix(y_test, pred_classified)