## __Super Eagles 2024: How Well do you Know the Players?__



### 1. Introduction

This is a data-driven exploration of the Nigerian national football team, the Super Eagles, for the year __2024__. Leveraging Python's BeautifulSoup and requests libraries for web scraping, followed by Pandas for Exploratory Data Analysis (EDA) and Matplotlib and Seaborn for visualization, this project offers a comprehensive dive into the players' attributes and statistics.


##### Contents
1. Introduction
2. Data Gathering
3. Data Cleaning and Preparation
4. Data Preprocessing
5. Exploratory Data Analysis (EDA)
6. Data Visualization
7. Conclusion  

### 2. Data Gathering

In [None]:
pip install bs4

In [None]:
pip install seaborn

In [None]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# URL of the Transfermarkt page to scrape (Nigeria squad, saison_id 2023)
url = 'https://www.transfermarkt.com/nigeria/kader/verein/3444/saison_id/2023/plus/1'

# Send a browser-like User-Agent header with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# Fetch the page. requests waits forever by default, so set a timeout to make
# a stalled connection fail instead of hanging the notebook.
response = requests.get(url, headers=headers, timeout=30)

# Report the outcome: dump the HTML on success (status code 200), otherwise
# show the status code that came back
if response.status_code == 200:
    print(response.text)  # Print the content of the response
else:
    print(f'Request failed with status code: {response.status_code}')


In [None]:
# Parse the HTML with Python's built-in parser. Naming the parser explicitly
# keeps the result independent of which optional parsers (lxml, html5lib)
# happen to be installed; the generic 'html' name picks whichever is available.
soup = BeautifulSoup(response.text, 'html.parser')

# Locate the table to scrape: the first <table> with the CSS class "items"
table = soup.find('table', class_='items')

# Fail here with a clear message rather than with an AttributeError on None
# in a later cell
if table is None:
    raise ValueError(
        "No <table class='items'> found in the response; "
        "the request may have failed or the page layout may have changed."
    )

table

In [None]:
# Gather every <th> cell of the table; their stripped text gives the column titles
squad_titles = table.find_all('th')
squad_table_titles = [header_cell.text.strip() for header_cell in squad_titles]

# Insert an extra 'Position' title immediately after 'Player'
position_slot = squad_table_titles.index('Player') + 1
squad_table_titles.insert(position_slot, 'Position')

print(squad_table_titles)

In [None]:
# Create an empty DataFrame whose columns are the scraped header titles;
# the rows are added in the data-cleaning section
df = pd.DataFrame(columns = squad_table_titles)

# Display the (still empty) frame to check the column names
df

In [None]:
# Preview the raw contents: for every <tr> in the table, print the stripped
# text of its <td> cells as one list
column_data = table.find_all('tr')
for row in column_data:
    row_data = row.find_all('td')
    individual_row_data = []
    for data in row_data:
        individual_row_data.append(data.text.strip())
    print(individual_row_data)

### 3. Data Cleaning and Preparation

In [None]:
# Number of values a cleaned row needs in order to line up with the DataFrame columns
expected_num_columns = len(df.columns)

# The first <tr> is skipped (presumably the header row — it is handled separately via <th>)
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.get_text(strip=True) for data in row_data]

    # Take the 'title' attribute from the first cell whose first <a> tag has one.
    # NOTE(review): this is used as the club name — confirm against the page
    # markup that no earlier cell carries a titled link.
    club_title = None
    for data in row_data:
        link = data.find('a')
        if link is not None and 'title' in link.attrs:
            club_title = link['title']
            break

    # Overwrite the seventh value (index 6) with that title. Rows with fewer
    # than seven cells are left untouched so they cannot raise an IndexError;
    # the column-count check below discards them anyway.
    if club_title and len(individual_row_data) > 6:
        individual_row_data[6] = club_title

    # Drop the values at indices 1 and 2 (the second and third cells)
    if len(individual_row_data) > 2:
        del individual_row_data[1:3]

    # Append only rows that match the column count; any other row is skipped
    if len(individual_row_data) == expected_num_columns:
        # .loc with the next integer label appends the row at the end
        df.loc[len(df)] = individual_row_data
df

In [None]:
# In both count columns, a cell that is exactly "-" becomes the integer 0
for count_column in ('International matches', 'Goals'):
    df[count_column] = df[count_column].replace('-', 0)

# Clean "Height": remove every 'm' character, then turn the decimal comma into a dot
height_without_unit = df['Height'].str.replace('m', '', regex=True)
df['Height'] = height_without_unit.str.replace(',', '.', regex=True)

In [None]:
# Put the unit into the column names:
# 'Market value' -> 'Market value(EURO €)' and 'Height' -> 'Height(m)'
df.rename(
    columns={
        'Market value': 'Market value(EURO €)',
        'Height': 'Height(m)',
    },
    inplace=True,
)


In [None]:
# Manual corrections applied on top of the scraped table. Rows are addressed
# by index label, so this cell relies on the row order produced by the scrape.

# Position by index-label range (.loc label slices include both end points)
position_ranges = (
    (3, 11, 'Defender'),
    (12, 17, 'Midfield'),
    (18, 24, 'Forward'),
)
for first_label, last_label, position_name in position_ranges:
    df.loc[first_label:last_label, 'Position'] = position_name

# Single-cell overrides, written as {column: {row label: new value}}
manual_values = {
    'Player': {
        8: 'Zaidu Sanusi',
        1: 'Stanley Nwabali',
    },
    'International matches': {
        0: 35, 1: 9, 8: 22, 11: 40, 5: 71, 7: 34,
        10: 14, 9: 5, 4: 31, 3: 23, 6: 67, 15: 5,
        13: 24, 14: 34, 17: 71, 16: 76, 12: 5, 18: 109,
        21: 35, 19: 36, 23: 52, 20: 19, 24: 23, 22: 16,
    },
    'Goals': {
        5: 7, 7: 1, 13: 1, 17: 9,
        21: 21, 19: 5, 23: 15, 20: 5,
    },
    'Debut': {
        1: 'July 4, 2021',
    },
}
for column_name, overrides in manual_values.items():
    for row_label, new_value in overrides.items():
        df.at[row_label, column_name] = new_value


### 4. Data Preprocessing

In [None]:
# Reference date for the calculation: 1 February 2024
present_date = datetime(2024, 2, 1)

# Parse "Debut" as datetimes; values that cannot be parsed become NaT
df['Debut'] = pd.to_datetime(df['Debut'], errors='coerce')

# Years since debut = elapsed days / 365, rounded to 1 decimal place
days_since_debut = (present_date - df['Debut']).dt.days
df['Years Played'] = (days_since_debut / 365).round(1)


In [None]:
# Split "Date of birth/Age": capture group 1 is the text before the opening
# parenthesis, group 2 is the run of digits inside the parentheses
dob_and_age = df['Date of birth/Age'].str.extract(r'(.+)\((\d+)\)')
df[['Date of birth', 'Age']] = dob_and_age

# The combined column is no longer needed
df.drop(columns=['Date of birth/Age'], inplace=True)

# Move the two new columns from the end of the frame to positions 3 and 4
for target_position, column_name in enumerate(['Date of birth', 'Age'], start=3):
    df.insert(target_position, column_name, df.pop(column_name))


In [None]:
# Display the DataFrame to check the new 'Date of birth' and 'Age' columns
df

In [None]:
# Drop the euro sign from the market value text before it is converted to a number
market_column = 'Market value(EURO €)'
df[market_column] = df[market_column].str.replace('€', '', regex=True)

def convert_decimal_string_to_thousands(decimal_str):
    """Convert a market-value string such as '1.50m' or '850k' to a whole number.

    Despite the name, the result is the full amount, not a count of thousands.

    Args:
        decimal_str: Value text with the currency symbol already removed.
            A string containing 'm' is read as millions; anything else is
            read as thousands, with an optional trailing 'k'.

    Returns:
        The amount as an int, e.g. '1.15m' -> 1150000 and '850k' -> 850000.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    text = decimal_str.strip()
    if 'm' in text:
        number = float(text.rstrip('m'))
        multiplier = 1_000_000
    else:
        number = float(text.rstrip('k'))
        multiplier = 1_000
    # Round instead of truncating: binary floats make 1.15 * 100 evaluate to
    # 114.999..., which int() alone would cut down to 114 (i.e. 1,140,000).
    return int(round(number * multiplier))

# Convert every entry of the market value column from text to an integer amount
df['Market value(EURO €)'] = df['Market value(EURO €)'].apply(convert_decimal_string_to_thousands)



In [None]:

# Cast each column to the data type used by the analysis that follows
df['Player'] = df['Player'].astype(str)
df['Position'] = df['Position'].astype(str)
df['Club'] = df['Club'].astype(str)
df['Height(m)'] = df['Height(m)'].astype(float)
df['Foot'] = df['Foot'].astype(str)
df['Age'] = df['Age'].astype(int)
df['Goals'] = df['Goals'].astype(int)

# 'International matches' holds a mix of scraped strings and manually assigned
# integers; cast it like 'Goals' so that sorting, correlation and plotting
# treat it as a numeric column
df['International matches'] = df['International matches'].astype(int)

# Convert the 'Date of birth' column to datetime
df['Date of birth'] = pd.to_datetime(df['Date of birth'])


In [None]:
# Display the DataFrame after the type conversions
df

### 5. Exploratory Data Analysis (EDA)

In [None]:
# Overview of the DataFrame's structure: column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics for the numerical columns, giving insight into the
# distribution of the data
df.describe()

# To look at the numeric columns themselves: df.select_dtypes(include='number')

In [None]:
# Count the missing (null) values in each column
df.isnull().sum()

In [None]:
# Count the number of distinct values in each column
df.nunique()

In [None]:
# Show the five rows with the most goals (sorted from highest to lowest)
df.sort_values(by = "Goals", ascending= False).head(5)

In [None]:
# Five highest scorers: order by 'Goals' from largest to smallest and keep
# the first five rows
top_5_goals = df.sort_values(by='Goals', ascending=False).iloc[:5]

# Keep only name and goal tally, renumbering the rows from 0
top_goals = top_5_goals.loc[:, ['Player', 'Goals']].reset_index(drop=True)

top_goals

In [None]:
# Five most-capped players: order by 'International matches' from largest to
# smallest and keep the first five rows
appearance = df.sort_values(by='International matches', ascending=False).iloc[:5]

# Keep only the name and the number of matches
caps = appearance.loc[:, ['Player', 'International matches']]

caps

In [None]:
# Five most valuable players: order by 'Market value(EURO €)' from largest to
# smallest and keep the first five rows
high_value = df.sort_values(by='Market value(EURO €)', ascending=False).iloc[:5]

# Keep only the name and the market value
value = high_value.loc[:, ['Player', 'Market value(EURO €)']]

value

In [None]:
# Columns to compare against each other
select_columns = [
    'Age',
    'International matches',
    'Height(m)',
    'Goals',
    'Market value(EURO €)',
    'Years Played',
]

# Pairwise correlation between the selected columns
selected = df[select_columns]
correlation_matrix = selected.corr()

correlation_matrix

In [None]:
# Heatmap of the pairwise correlations, with each coefficient written in its cell
sns.heatmap(df[select_columns].corr(), annot= True)

# To change the figure size, set this before plotting:
#plt.rcParams['figure.figsize'] = (15, 7)

plt.show()

In [None]:
# Box plot of the selected columns on one shared axis
# NOTE(review): market value is orders of magnitude larger than the other
# columns, so their boxes will look flat on this shared scale
df[select_columns].boxplot(figsize=(10,6))

In [None]:

# Per-position summary: mean age, height and international matches, plus the
# total market value, ordered from the lowest to the highest total value
position_aggregations = {
    'Age': 'mean',
    'Height(m)': 'mean',
    'International matches': 'mean',
    'Market value(EURO €)': 'sum',
}
result = df.groupby('Position').agg(position_aggregations)
result = result.sort_values(by='Market value(EURO €)')

# One decimal place throughout
result = result.round(1)

# Round the matches column element by element to one decimal place as well
result['International matches'] = result['International matches'].map(
    lambda mean_matches: round(mean_matches, 1)
)

result

### 6. Data Visualization


In [None]:
# Horizontal bar chart of the top five scorers: one bar per player, bar length = goals
top_goals.plot.barh('Player', 'Goals', ylabel = 'Player', xlabel = 'Goals', color= 'lightgreen', title = ' Top 5 Super Eagles Goals Scorers')

In [None]:

# Vertical bar chart of the five most-capped players
ax = caps.plot(kind='bar', figsize=(6, 6), color='steelblue')
plt.title('Top 5 players with the most International caps')
plt.xlabel('Player')
plt.ylabel('International matches')

# Replace the default tick labels with the player names
tick_positions = range(len(caps))
plt.xticks(tick_positions, caps['Player'])

# Write each bar's value just above its top edge
for bar_index, matches in enumerate(caps['International matches']):
    ax.annotate(
        f'{matches}',
        xy=(bar_index, matches),
        color='black',
        ha='center',
        va='bottom',
    )

plt.show()


In [None]:

# Display the top-five market value table that the next chart is based on
value

In [None]:
# Create a horizontal bar chart
lb = plt.barh(value['Player'], value['Market value(EURO €)'], align='center', color='None', edgecolor='darkgreen', alpha=1)
plt.yticks(value['Player'])   # Use player names for yticks
plt.xlabel('Market value(EURO €)')   # Label for x-axis
plt.title('Top 5 players with the most International caps')

# Annotate value labels to each bar
for bar in lb:
    market = bar.get_width()  # Get the market value from the width of the bar
    label = f'{market:,.0f}'  # Format the label
    plt.annotate(label, xy=(market, bar.get_y() + bar.get_height() / 2), color='black', ha='left', va='center')

plt.show()

In [None]:
# Scatter plot: international matches (x) against years played since debut (y)
df.plot.scatter(x = 'International matches', y = 'Years Played', s= 100, title = 'Relationship between Players CAPS and Years Played')

In [None]:
# Scatter plot: international matches (x) against market value (y)
df.plot.scatter(x = 'International matches', y = 'Market value(EURO €)', s = 100, title = 'Relationship between Players CAPS and Market value(EURO €)')

In [None]:
# Count the occurrences of each category in the 'Foot' column
foot_counts = df['Foot'].value_counts()

# Plot a pie chart
foot_counts.plot.pie(autopct='%1.1f%%', figsize=(5, 5))

# Add a title
plt.title('Distribution of Super Eagles Players by Preferred Foot')

# Show the plot
plt.show()

In [None]:
# Create DataFrame
results = pd.DataFrame(result, index=['Goalkeeper', 'Defender', 'Midfield', 'Forward'])
# Plot the bar chart
results['Market value(EURO €)'].plot(kind='bar', color='skyblue', figsize=(8, 5))

# Add labels and title
plt.xlabel('Position')
plt.ylabel('Market value(EURO €)')
plt.title('Market value by Position')

# Show the plot
plt.show()

In [None]:
# Write the cleaned table to a CSV file, leaving out the index column
csv_path = r'C:\Users\ELITEX21012G2\Documents\SUperEagles\players_.csv'
df.to_csv(csv_path, index=False)

### 7. Conclusion


I exported this file to Tableau, where I built a dashboard with it to better display my analysis.

In [None]:
# Clear the data by rebinding df to a new, empty DataFrame.
# Alternative that keeps the column names: df = df.iloc[0:0]
df = pd.DataFrame()  # Reassign an empty DataFrame

# Confirm that no rows remain
len(df)