<a href="https://colab.research.google.com/github/gpdbs9409/WST-T01/blob/main/CDS4004_ProjectCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import required libraries

In [None]:
import re
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from networkx.algorithms.community.centrality import girvan_newman
from networkx.algorithms.community import kernighan_lin_bisection, partition_quality, modularity
warnings.filterwarnings('ignore')

# Fetching Data from website

In [None]:
# Request the web page (a browser-like User-Agent header is sent with the request)
site = "https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2023&s_w=&leihe=1&intern=0/"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(site, headers=hdr)
# Parse inside a context manager so the HTTP response is closed even if parsing fails
with urlopen(req) as page:
    soup = BeautifulSoup(page, 'html.parser')


def _parse_transfer_rows(table):
    """Return one (first-cell text, last-cell text) tuple per data row of `table`.

    The first <tr> is skipped, exactly as the `[1:]` slice did in the inline
    comprehension this helper replaces. Rows that contain no <td> cell are
    skipped instead of raising IndexError. The rest of the cell treats the
    first cell as the player name and the last cell as the fee.
    """
    parsed = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            continue
        parsed.append((cells[0].text.strip(), cells[-1].text.strip()))
    return parsed


# Initialize dictionaries to store all 'in' and 'out' players for each team
in_players = {}
out_players = {}

# Iterate through each team's name tag to collect 'in' and 'out' players
for team_headline in soup.find_all('h2', class_='content-box-headline'):
    team_name_tag = team_headline.find('a', title=True)
    if not team_name_tag:
        continue
    team_name = team_name_tag.get('title')

    # The first 'responsive-table' sibling after the headline is read as the 'in' table
    in_table_div = team_headline.find_next_sibling('div', class_='responsive-table')
    in_table = in_table_div.find('table') if in_table_div else None
    if in_table:
        in_players[team_name] = _parse_transfer_rows(in_table)

    # The next 'responsive-table' sibling after that one is read as the 'out' table
    out_table_div = in_table_div.find_next_sibling('div', class_='responsive-table') if in_table_div else None
    out_table = out_table_div.find('table') if out_table_div else None
    if out_table:
        out_players[team_name] = _parse_transfer_rows(out_table)

# Index every departure by player text so each arrival is matched with a single
# lookup. Selling teams are stored in the same order the nested scan visited them,
# so transfers_list comes out in the same order as before.
departures_by_player = {}
for from_team, players_out in out_players.items():
    for player_out, _out_fee in players_out:
        departures_by_player.setdefault(player_out, []).append(from_team)

# Create a list to store transfer details
transfers_list = []

# A transfer is an arrival whose player text also appears among the departures
# of a different team. Matching is on the scraped text only.
for to_team, players_in in in_players.items():
    for player, in_fee in players_in:
        for from_team in departures_by_player.get(player, ()):
            if from_team != to_team:
                transfers_list.append({
                    'Player Name': player,
                    'From Team': from_team,
                    'To Team': to_team,
                    'Fee In': in_fee,
                })

# Print out the transfer list
for transfer in transfers_list:
    print(f"Player Name: {transfer['Player Name']}, From Team: {transfer['From Team']}, To Team: {transfer['To Team']}, Fee: {transfer['Fee In']}")


In [None]:
# Dump the raw list of transfer dictionaries built in the scraping cell
print(transfers_list)

# Network Construction

In [None]:
# Load the scraped transfers into a table and convert the column labels to
# snake_case (e.g. 'Player Name' -> 'player_name')
df = pd.DataFrame(transfers_list)
snake_case_labels = df.columns.str.lower().str.replace(' ', '_')
df.columns = snake_case_labels

def clean_player_name(name):
  """Cut the abbreviated name off the end of a scraped player name.

  The text is assumed to be the full name immediately followed by a short
  form "<initial>. <last name>" (for example "Declan RiceD. Rice"). The short
  form is removed by dropping len(last word) + 3 characters from the end; the
  3 accounts for the initial, the period and the space.

  NOTE(review): only the final word is counted as the last name, so a short
  form with a multi-word surname is removed only partially, and a text
  shorter than the computed length yields an empty string.

  Args:
    name: Raw player text.

  Returns:
    The text without the trailing short form. Empty or whitespace-only input
    is returned unchanged (it used to raise IndexError).
  """
  parts = name.split()
  if not parts:
    # Nothing to strip; indexing parts[-1] would fail on an empty list
    return name

  # The last whitespace-separated word is taken as the short form's last name
  last_name = parts[-1]
  # Last name + initial + period + space
  length_to_remove = len(last_name) + 3

  return name[:-length_to_remove]

def check_loan(fee):
    """Return 1 if the fee text contains 'loan' (case-insensitive), otherwise 0."""
    return int('loan' in fee.lower())

# Drop the short-name suffix from every scraped player name
df['player_name'] = df['player_name'].apply(clean_player_name)

# Flag loans (1 = loan, 0 = not a loan) before the fee text is rewritten below
df['loan'] = df['fee_in'].apply(check_loan)


def _normalise_fee(fee_text):
    """Map a scraped fee text to the value stored in the fee_in column.

    Text containing 'free' becomes '€0.00m'; any other text without a '€'
    sign becomes the assumed placeholder '€3.00m'; everything else is kept.
    """
    if 'free' in fee_text:
        return '€0.00m'
    if '€' not in fee_text:
        return '€3.00m'
    return fee_text


# Replace fees that carry no amount with the assumed values
df['fee_in'] = df['fee_in'].apply(_normalise_fee)

# Show the rows that now carry the assumed €3.00m value
df[df['fee_in'] == '€3.00m']

In [None]:
G = nx.DiGraph()
# Fixed drawing coordinates for the 20 clubs, laid out in three columns (x = 1, 2, 3)
pos = {
    'Arsenal FC': (1, 10),
    'Chelsea FC': (1, 9),
    'Manchester City': (1, 8),
    'Manchester United': (1, 7),
    'Tottenham Hotspur': (1, 6),
    'Liverpool FC': (1, 5),
    'Everton FC': (2, 10),
    'Fulham FC': (2, 9),
    'Brentford FC': (2, 8),
    'Wolverhampton Wanderers': (2, 7),
    'Sheffield United': (2, 6),
    'Nottingham Forest': (2, 5),
    'Burnley FC': (2, 4),
    'Brighton & Hove Albion': (2, 3),
    'Crystal Palace': (3, 10),
    'Luton Town': (3, 9),
    'West Ham United': (3, 8),
    'Newcastle United': (3, 7),
    'Aston Villa': (3, 6),
    'AFC Bournemouth': (3, 5)
}

# Explicit node list: the from_team column (combine_first only falls back to
# to_team where from_team is null) plus Crystal Palace, which is appended by
# hand - presumably because it has no matched transfer. Clubs that only buy
# become nodes when their edge is added.
df2 = pd.DataFrame()
df2['combine'] = df['from_team'].combine_first(df['to_team'])
missed_team = {'combine':'Crystal Palace'}
df2 = pd.concat([df2, pd.DataFrame([missed_team])], ignore_index=True)

for team in df2['combine'].unique():
  G.add_node(str(team))

# One directed edge per (selling club -> buying club) pair, weighted by the fee in €m
for seller, buyer, fee_text in zip(df['from_team'], df['to_team'], df['fee_in']):
  fee_match = re.search(r'(?:€)(\d+\.\d+)(?:m)', fee_text)
  if fee_match is None:
    # Fee is not written as "€<digits>.<digits>m"; skip it instead of crashing
    # on fee_match.group(1)
    continue
  fee_label = fee_match.group(1)
  fee_total = float(fee_label)  # numeric, so weighted degrees can be summed
  if G.has_edge(seller, buyer):
    # add_edge replaces the attributes of an existing edge, so add the earlier
    # fee(s) between the same two clubs instead of losing them
    fee_total = round(G[seller][buyer]['weight'] + fee_total, 2)
    fee_label = f'{fee_total:.2f}'
  G.add_edge(seller, buyer, weight=fee_total, edge_labels=fee_label)

nx.draw(G, with_labels=True, hide_ticks=False, pos=pos)

In [None]:
G = nx.DiGraph()
# Fixed drawing coordinates for the 20 clubs, laid out in three columns (x = 1, 2, 3)
pos = {
    'Arsenal FC': (1, 10),
    'Chelsea FC': (1, 9),
    'Manchester City': (1, 8),
    'Manchester United': (1, 7),
    'Tottenham Hotspur': (1, 6),
    'Liverpool FC': (1, 5),
    'Everton FC': (2, 10),
    'Fulham FC': (2, 9),
    'Brentford FC': (2, 8),
    'Wolverhampton Wanderers': (2, 7),
    'Sheffield United': (2, 6),
    'Nottingham Forest': (2, 5),
    'Burnley FC': (2, 4),
    'Brighton & Hove Albion': (2, 3),
    'Crystal Palace': (3, 10),
    'Luton Town': (3, 9),
    'West Ham United': (3, 8),
    'Newcastle United': (3, 7),
    'Aston Villa': (3, 6),
    'AFC Bournemouth': (3, 5)
}

# Explicit node list: the from_team column (combine_first only falls back to
# to_team where from_team is null) plus Crystal Palace, appended by hand.
df2 = pd.DataFrame()
df2['combine'] = df['from_team'].combine_first(df['to_team'])
missed_team = {'combine':'Crystal Palace'}
df2 = pd.concat([df2, pd.DataFrame([missed_team])], ignore_index=True)

for team in df2['combine'].unique():
  G.add_node(str(team))

# One directed edge per (selling club -> buying club) pair, weighted by the fee in €m
for seller, buyer, fee_text in zip(df['from_team'], df['to_team'], df['fee_in']):
  fee_match = re.search(r'(?:€)(\d+\.\d+)(?:m)', fee_text)
  if fee_match is None:
    # Fees not written as "€<digits>.<digits>m" are left out of the graph
    continue
  fee_label = fee_match.group(1)
  fee_total = float(fee_label)
  if G.has_edge(seller, buyer):
    # add_edge replaces the attributes of an existing edge, so add the earlier
    # fee(s) between the same two clubs instead of losing them
    fee_total = round(G[seller][buyer]['weight'] + fee_total, 2)
    fee_label = f'{fee_total:.2f}'
  G.add_edge(seller, buyer, weight=fee_total, edge_labels=fee_label)

# Drawing the graph
nx.draw(G, with_labels=True, pos=pos, node_color='lightblue', edge_color='gray')

# To show weights
edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

plt.show()

In [None]:
G = nx.DiGraph()
# Fixed drawing coordinates for the 20 clubs, laid out in three columns (x = 1, 2, 3)
pos = {
    'Arsenal FC': (1, 10),
    'Chelsea FC': (1, 9),
    'Manchester City': (1, 8),
    'Manchester United': (1, 7),
    'Tottenham Hotspur': (1, 6),
    'Liverpool FC': (1, 5),
    'Everton FC': (2, 10),
    'Fulham FC': (2, 9),
    'Brentford FC': (2, 8),
    'Wolverhampton Wanderers': (2, 7),
    'Sheffield United': (2, 6),
    'Nottingham Forest': (2, 5),
    'Burnley FC': (2, 4),
    'Brighton & Hove Albion': (2, 3),
    'Crystal Palace': (3, 10),
    'Luton Town': (3, 9),
    'West Ham United': (3, 8),
    'Newcastle United': (3, 7),
    'Aston Villa': (3, 6),
    'AFC Bournemouth': (3, 5)
}

# Explicit node list: the from_team column (combine_first only falls back to
# to_team where from_team is null) plus Crystal Palace, appended by hand.
df2 = pd.DataFrame()
df2['combine'] = df['from_team'].combine_first(df['to_team'])
missed_team = {'combine':'Crystal Palace'}
df2 = pd.concat([df2, pd.DataFrame([missed_team])], ignore_index=True)

for team in df2['combine'].unique():
  G.add_node(str(team))

# One directed edge per (selling club -> buying club) pair, weighted by the fee in €m.
# Each accepted transfer is echoed together with the edge data after the update.
for seller, buyer, fee_text in zip(df['from_team'], df['to_team'], df['fee_in']):
  fee_match = re.search(r'(?:€)(\d+\.\d+)(?:m)', fee_text)
  if fee_match is None:
    # Fees not written as "€<digits>.<digits>m" are left out of the graph
    continue
  fee_label = fee_match.group(1)
  fee_total = float(fee_label)
  if G.has_edge(seller, buyer):
    # add_edge replaces the attributes of an existing edge, so add the earlier
    # fee(s) between the same two clubs; otherwise the weighted degrees used
    # for net spending would only count the last transfer of each pair
    fee_total = round(G[seller][buyer]['weight'] + fee_total, 2)
    fee_label = f'{fee_total:.2f}'
  G.add_edge(seller, buyer, weight=fee_total, edge_labels=fee_label)
  print(seller, buyer)
  print(G.get_edge_data(seller, buyer))

# Adjusting edge label positions
def adjust_label_pos(pos, x_shift=0.05, y_shift=0.05):
    """Build a shifted copy of a node-position mapping for placing edge labels.

    Args:
        pos: Mapping of node -> (x, y) coordinates; it is not modified.
        x_shift: Offset added to every x coordinate.
        y_shift: Offset added to every y coordinate.
    Returns:
        A new dict with the same keys and (x + x_shift, y + y_shift) values.
    """
    return {
        node: (coords[0] + x_shift, coords[1] + y_shift)
        for node, coords in pos.items()
    }

# Anchor points for the edge labels: the node layout moved 0.1 to the right
pos_labels = adjust_label_pos(pos, 0.1, 0)

# Nodes, arrows and team names
node_style = dict(node_color='lightblue', edge_color='gray', node_size=1500, font_size=8)
nx.draw(G, pos=pos, with_labels=True, **node_style)

# Overlay each edge's 'weight' attribute, positioned from the shifted layout
edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos_labels, font_size=7, edge_labels=edge_labels)

plt.show()

In [None]:
# Display the outgoing edges of G
G.out_edges()

In [None]:
# Display the incoming edges of G
G.in_edges()

In [None]:
# Display every node's degree, computed from the 'weight' edge attribute
G.degree(weight='weight')

In [None]:
# Display every node's in-degree, computed from the 'weight' edge attribute
G.in_degree(weight='weight')

In [None]:
# Display every node's out-degree, computed from the 'weight' edge attribute
G.out_degree(weight='weight')

In [None]:
# Per-team balance: weighted out-degree minus weighted in-degree.
# Edges run from the selling club to the buying club, so the out-degree sums
# the fees on a team's outgoing transfers and the in-degree those on its
# incoming transfers.
# NOTE(review): out - in is positive when a team's outgoing fees exceed its
# incoming ones - confirm this is the intended sign for "net spending".
weighted_in = dict(G.in_degree(weight='weight'))
net_spending = [
    (team, weighted_out - weighted_in[team])
    for team, weighted_out in G.out_degree(weight='weight')
]
net_spending

# Node Importance Analysis (Centrality)

In [None]:
# Compute each centrality once; every call returns a dict keyed by node
closeness_centrality = nx.closeness_centrality(G)
harmonic_centrality = nx.harmonic_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)
degree_centrality = nx.degree_centrality(G)

# Column label -> score dict, in the order the columns appear in the table
scores_by_column = {
    'Closeness Centrality': closeness_centrality,
    'Harmonic Centrality': harmonic_centrality,
    'Betweenness Centrality': betweenness_centrality,
    'Eigenvector Centrality': eigenvector_centrality,
    'Degree Centrality': degree_centrality,
}

# One row per team, one column per measure
table_columns = {'Team': list(G.nodes())}
for column_label, scores in scores_by_column.items():
    table_columns[column_label] = list(scores.values())
centrality_measures = pd.DataFrame(table_columns)

print("Centrality Measures:")
print(centrality_measures.to_string(index=False))

In [None]:
def _top_row(column):
    """Return the centrality_measures row holding the largest value in `column`."""
    return centrality_measures.loc[centrality_measures[column].idxmax()]


# Highest-scoring team for each measure
best_closeness = _top_row('Closeness Centrality')
best_harmonic = _top_row('Harmonic Centrality')
best_betweenness = _top_row('Betweenness Centrality')
best_eigenvector = _top_row('Eigenvector Centrality')
best_degree = _top_row('Degree Centrality')

print(f"\nCloseness Centrality:\nTeam: {best_closeness['Team']}\nValue: {best_closeness['Closeness Centrality']:.4f}")
print(f"\nHarmonic Centrality:\nTeam: {best_harmonic['Team']}\nValue: {best_harmonic['Harmonic Centrality']:.4f}")
print(f"\nBetweenness Centrality:\nTeam: {best_betweenness['Team']}\nValue: {best_betweenness['Betweenness Centrality']:.4f}")
print(f"\nEigenvector Centrality:\nTeam: {best_eigenvector['Team']}\nValue: {best_eigenvector['Eigenvector Centrality']:.4f}")
print(f"\nDegree Centrality:\nTeam: {best_degree['Team']}\nValue: {best_degree['Degree Centrality']:.4f}")



In [None]:
import matplotlib.pyplot as plt
def plot_centrality(centrality_measure, title):
    """Show a horizontal bar chart of one centrality measure, one bar per team.

    Args:
        centrality_measure: DataFrame with a 'Team' column plus the score
            column; every column other than 'Team' is flattened into the bar
            values (the calls in this notebook pass exactly one).
        title: Text used as the figure title.
    """
    team_labels = centrality_measure['Team']
    scores = centrality_measure.drop(columns=['Team']).values.reshape(-1)

    plt.figure(figsize=(12, 9))
    plt.barh(team_labels, scores, color='skyblue')
    plt.xlabel('Centrality Value')
    plt.title(title)

    # Write each value, to four decimals, at the end of its bar
    for bar_index, score in enumerate(scores):
        plt.text(score, bar_index, f'{score:.4f}', va='center', ha='left')

    plt.show()
# One chart per measure; the column label doubles as the chart title
for measure_label in (
    'Closeness Centrality',
    'Harmonic Centrality',
    'Betweenness Centrality',
    'Eigenvector Centrality',
    'Degree Centrality',
):
    plot_centrality(centrality_measures[['Team', measure_label]], measure_label)


# Community Detection (Girvan-Newman Algorithm, louvain_communities and infomap)

In [None]:
# Community Clustering using Girvan-Newman Algorithm

# Print every partition produced by girvan_newman for G.
# list(...) consumes girvan_communities, so the name cannot be iterated again afterwards.
girvan_communities = girvan_newman(G)
print(list(girvan_communities))

In [None]:
# find the best community with Girvan-Newman Algorithm

from networkx.algorithms.community import girvan_newman

def optimal_girvan_newman_communities(G):
    """Return the Girvan-Newman partition of G with the highest modularity.

    Every partition yielded by girvan_newman(G) is scored with modularity(G, ...);
    the first partition reaching the maximum score is kept.

    Args:
        G: Graph passed unchanged to girvan_newman and modularity.

    Returns:
        (communities, max_mod): the winning partition as a tuple of
        communities, and its modularity.

    Raises:
        ValueError: If girvan_newman(G) yields no partition at all.
    """
    optimal_communities = None
    # Start below any possible score so a best partition is always recorded,
    # including when no partition has positive modularity (previously that
    # case ended in an UnboundLocalError).
    max_mod = float('-inf')

    for communities in girvan_newman(G):
        mod = modularity(G, communities)
        if mod > max_mod:
            max_mod = mod
            optimal_communities = communities

    if optimal_communities is None:
        raise ValueError("girvan_newman produced no partition for this graph")

    return tuple(optimal_communities), max_mod

# Bind the result to the names the later cells read (optimal_girvan_communities
# and girvan_mod). The function keeps its own name, so this line can be re-run.
optimal_girvan_communities, girvan_mod = optimal_girvan_newman_communities(G)
gn_mod = girvan_mod  # earlier name for the same score, kept as an alias

print(f"Optimal Girvan-Newman Communities: {optimal_girvan_communities}")


Another alternative: louvain_communities as comparison

In [None]:
from networkx.algorithms.community import louvain_communities

def optimal_louvain_communities(G, seed=None):
    """Run Louvain community detection on G and score the partition.

    Args:
        G: Graph passed to louvain_communities and modularity.
        seed: Optional random seed forwarded to louvain_communities so that
            repeated runs give the same partition. The default None keeps the
            previous unseeded behaviour.

    Returns:
        (communities, mod): the partition as a tuple of communities, and its
        modularity. The partition is returned whatever its score; previously
        a score of 0 or less ended in an UnboundLocalError.
    """
    communities = louvain_communities(G, seed=seed)
    mod = modularity(G, communities)
    return tuple(communities), mod

# NOTE(review): this rebinds optimal_louvain_communities from the function to its
# result (the tuple of communities), which is what the later cells read; the
# function is no longer reachable under that name until its def is run again.
optimal_louvain_communities, louvain_mod = optimal_louvain_communities(G)

Another alternative: infomap as comparison

In [None]:
!pip install infomap

In [None]:
from infomap import Infomap

def optimal_infomap_communities(G):
    """Partition G with Infomap in directed mode and score the partition.

    Nodes are handed to Infomap as integer ids (their position in G.nodes());
    links are added from G.edges() without a weight argument.

    Args:
        G: Graph whose nodes and edges are copied into the Infomap object.

    Returns:
        (community_list, mod): a list with one set of node labels per Infomap
        module, and modularity(G, community_list).
    """
    im = Infomap("--directed")

    # Two-way mapping between the integer ids and the original node labels
    id_to_node = dict(enumerate(G.nodes()))
    node_to_id = {label: node_id for node_id, label in id_to_node.items()}

    # Copy the graph structure into Infomap
    for label in G.nodes():
        im.add_node(node_to_id[label])
    for source, target in G.edges():
        im.add_link(node_to_id[source], node_to_id[target])

    im.run()

    # Group the node labels by the module they were assigned to
    members_by_module = {}
    for node_id, module_id in im.modules:
        members_by_module.setdefault(module_id, []).append(id_to_node[node_id])

    community_list = [set(members) for members in members_by_module.values()]

    return community_list, modularity(G, community_list)

# NOTE(review): this rebinds optimal_infomap_communities from the function to its
# result (the list of community sets), which is what the later cells read; the
# function is no longer reachable under that name until its def is run again.
optimal_infomap_communities, infomap_mod = optimal_infomap_communities(G)

**Partition Quality and Modularity**

In [None]:
# Display the selected Girvan-Newman partition (the name must already be bound by an earlier cell)
optimal_girvan_communities

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Coverage and performance of each algorithm's selected partition
girvan_cov, girvan_per = partition_quality(G, optimal_girvan_communities)
louvain_cov, louvain_per = partition_quality(G, optimal_louvain_communities)
infomap_cov, infomap_per = partition_quality(G, optimal_infomap_communities)

# Same four-line report for every algorithm, each followed by a blank line
quality_report = (
    ("Girvan-Newman Partition:", girvan_cov, girvan_per, girvan_mod),
    ("Louvain Partition:", louvain_cov, louvain_per, louvain_mod),
    ("Infomap Partition:", infomap_cov, infomap_per, infomap_mod),
)
for heading, coverage_score, performance_score, modularity_score in quality_report:
    print(heading)
    print(f'The coverage of partition is: {coverage_score:0.3}')
    print(f'The performance of partition is: {performance_score:0.3}')
    print(f'The modularity of partition is: {modularity_score:0.3}')
    print()



**Data visualization with different communities**

In [None]:
def _community_colors(graph, communities, palette_name):
    """Return one colour per node of `graph`, in node order, chosen by community.

    A node gets the palette entry at the index of the first community that
    contains it. The palette is requested with len(communities) colours so that
    seaborn repeats colours when there are more communities than the named
    palette holds, instead of the lookup raising IndexError.
    """
    palette = sns.color_palette(palette_name, len(communities))
    colors = []
    for node in graph.nodes:
        for index, members in enumerate(communities):
            if node in members:
                colors.append(palette[index])
                break
    return colors


# Color maps for each partition
color_map_girvan = _community_colors(G, optimal_girvan_communities, 'pastel')
color_map_louvain = _community_colors(G, optimal_louvain_communities, 'muted')
color_map_infomap = _community_colors(G, optimal_infomap_communities, 'bright')

# Drawing the graph for each partition
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# The panels use the `pos` layout built in the network-construction cells;
# the `pos2` name used here before is not defined anywhere in the notebook.
partition_panels = (
    (ax1, color_map_girvan, "Girvan-Newman Partition"),
    (ax2, color_map_louvain, "Louvain Partition"),
    (ax3, color_map_infomap, "Infomap Partition"),
)
for panel_ax, node_colors, panel_title in partition_panels:
    nx.draw(G, with_labels=True, pos=pos, node_color=node_colors, edge_color='gray', node_size=1500, font_size=10, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

Visualizing the performance of the different algorithms

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Create a DataFrame with the partition quality metrics and modularity scores.
# It gets its own name so that the transfers table `df` used by the
# network-construction cells is not overwritten.
data = {
    'Partition': ['Girvan-Newman', 'Louvain', 'Infomap'],
    'Coverage': [girvan_cov, louvain_cov, infomap_cov],
    'Performance': [girvan_per, louvain_per, infomap_per],
    'Modularity': [girvan_mod, louvain_mod, infomap_mod]
}
partition_scores = pd.DataFrame(data)

# Set the style and color palette
sns.set(style='whitegrid')
palette = sns.color_palette('viridis', 3)

# Create a figure with subplots
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# One bar chart per metric, all on a common 0-1 scale
for metric_ax, metric in zip((ax1, ax2, ax3), ('Coverage', 'Performance', 'Modularity')):
    sns.barplot(x='Partition', y=metric, data=partition_scores, palette=palette, ax=metric_ax)
    metric_ax.set_title(metric)
    # Extend the lower limit below 0 only when a score is negative, so that
    # such a bar is not cut off
    metric_ax.set_ylim(min(0, partition_scores[metric].min()), 1)

# Adjust the spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()