In [39]:
# get all the imports

# all the imports 
import warnings

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# HTTP requests and web scraping
import requests
from bs4 import BeautifulSoup
from io import StringIO

import urllib.parse

import os
from glob import glob

#stats analysis
from scipy import stats
from sklearn.preprocessing import StandardScaler

print("All imports work! Your environment is ready for data analysis and web scraping.")


All imports work! Your environment is ready for data analysis and web scraping.


In [40]:
#formatting for data set

pd.options.display.float_format = "{:,.2f}".format
pd.options.display.max_columns = None

pd.set_option("display.max_colwidth", None)

In [43]:
## pull the data set - Immaculate Data Version 

fifa_data = "https://raw.githubusercontent.com/fern-1210/IronHack-w3-Modual1-Project3/main/Data/Clean/The%20Immaculate%20Data.csv"
fifa = pd.read_csv(fifa_data)


# Create a copy to avoid modifying the original
fifa = fifa.copy()

# List of value columns to convert
value_columns = ['2019 Value', '2020 Value', '2021 Value', '2022 Value', '2023 Value']

# Convert each column from euros to millions
for col in value_columns:
    fifa[col] = fifa[col] / 1000000

#view info 
fifa.head()

Unnamed: 0,ID,Name,Nationality,Overall,Wage,Preferred Foot,Work Rate,Body Type,Position,Height,Weight,Year,in_2019,in_2020,in_2021,in_2022,in_2023,Age in 2019,Age in 2023,2019 Club,2020 Club,2021 Club,2022 Club,2023 Club,2019 Value,2020 Value,2021 Value,2022 Value,2023 Value,Total Clubs 2019-2023
0,176580,L. Suárez,Uruguay,84,1000,Right,High/ Medium,Unique,Left Striker,182,86,2023,1,1,1,1,1,31,35,FC Barcelona,FC Barcelona,Atlético Madrid,Atlético de Madrid,Club Nacional de Football,80.0,53.0,31.5,44.5,18.0,4
1,194765,A. Griezmann,France,83,185000,Left,Medium/ Medium,Unique,Right Striker,176,73,2023,1,1,1,1,1,27,31,Atlético Madrid,FC Barcelona,FC Barcelona,Atlético de Madrid,Atlético de Madrid,71.0,69.0,50.5,53.0,30.5,3
2,177003,L. Modrić,Croatia,88,230000,Right,Medium/ Medium,Lean (170-185),Right Centre Midfield,172,66,2023,1,1,1,1,1,32,36,Real Madrid,Real Madrid,Real Madrid,Real Madrid CF,Real Madrid CF,67.0,39.0,24.5,32.0,29.0,2
3,224334,M. Acuña,Argentina,85,46000,Left,High/ High,Stocky (170-185),Left Back,172,69,2023,1,1,1,1,1,26,30,Sporting CP,Sporting CP,Sevilla FC,Sevilla FC,Sevilla FC,19.0,20.0,22.0,37.0,46.5,2
4,192985,K. De Bruyne,Belgium,91,350000,Right,High/ High,Unique,Right Centre Midfield,181,70,2023,1,1,1,1,1,27,31,Manchester City,Manchester City,Manchester City,Manchester City,Manchester City,93.0,90.0,87.0,125.5,107.5,1


### Begin the Analysis (2023 + Wage focus)

In [44]:
# Focusing on the main data points for the analysis under DF ["fifa_foot_analysis"]
fifa_foot_analysis = fifa[[
    "ID",
    "Name", 
    "Preferred Foot",
    "Position",
    "Wage",
    "Overall",
    "2023 Value"
]].copy()

# View on the data set
display(fifa_foot_analysis.head(10))

# Check the distribution of foot preference
display("\nPreferred Foot distribution: %")
display(fifa_foot_analysis["Preferred Foot"].value_counts(normalize=True))

# Check the distribution of foot preference
display("\nPreferred Foot distribution: absolute")
display(fifa_foot_analysis["Preferred Foot"].value_counts())

Unnamed: 0,ID,Name,Preferred Foot,Position,Wage,Overall,2023 Value
0,176580,L. Suárez,Right,Left Striker,1000,84,18.0
1,194765,A. Griezmann,Left,Right Striker,185000,83,30.5
2,177003,L. Modrić,Right,Right Centre Midfield,230000,88,29.0
3,224334,M. Acuña,Left,Left Back,46000,85,46.5
4,192985,K. De Bruyne,Right,Right Centre Midfield,350000,91,107.5
5,178518,R. Nainggolan,Right,Right Centre Midfield,22000,76,3.5
6,212198,B. Fernandes,Right,Left Centre Midfield,190000,86,78.5
7,176676,M. arcelo,Left,Substitute,125000,79,9.0
8,212462,A. Telles,Left,Substitute,110000,80,18.5
9,173731,G. Bale,Left,Right Striker,14000,81,20.5


'\nPreferred Foot distribution: %'

Preferred Foot
Right   0.75
Left    0.25
Name: proportion, dtype: float64

'\nPreferred Foot distribution: absolute'

Preferred Foot
Right    2767
Left      917
Name: count, dtype: int64

### Statistical Analysis ###

In [46]:

# Split groups
left_foot = fifa_foot_analysis[fifa_foot_analysis["Preferred Foot"] == "Left"]
right_foot = fifa_foot_analysis[fifa_foot_analysis["Preferred Foot"] == "Right"]

# -----------------------------
# T-tests
# -----------------------------
t_overall, p_overall = stats.ttest_ind(
    left_foot["Overall"],
    right_foot["Overall"],
    equal_var=False
)

t_wage, p_wage = stats.ttest_ind(
    left_foot["Wage"],
    right_foot["Wage"],
    equal_var=False
)

t_value, p_value = stats.ttest_ind(
    left_foot["2023 Value"],
    right_foot["2023 Value"],
    equal_var=False
)

print("="*80)
print("STATISTICAL SIGNIFICANCE TESTS (Welch's T-Test)")
print("="*80)

print("\nOverall Rating:")
print(f"  t-statistic: {t_overall:.3f}")
print(f"  p-value: {p_overall:.4f}")
print(f"  Significant? {'YES' if p_overall < 0.05 else 'NO'}")

print("\nWage:")
print(f"  t-statistic: {t_wage:.3f}")
print(f"  p-value: {p_wage:.4f}")
print(f"  Significant? {'YES' if p_wage < 0.05 else 'NO'}")

print("\nMarket Value (2023):")
print(f"  t-statistic: {t_value:.3f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Significant? {'YES' if p_value < 0.05 else 'NO'}")

print("\nInterpretation:")
print("p < 0.05 = statistically significant difference")
print("p ≥ 0.05 = no evidence of a difference between footedness groups")

#Mulitvariant analysis 


scaler = StandardScaler()

fifa_foot_analysis[["Overall_z", "Wage_z", "Value_z"]] = scaler.fit_transform(
    fifa_foot_analysis[["Overall", "Wage", "2023 Value"]]
)

# Composite score
fifa_foot_analysis["Composite_Score"] = (
    fifa_foot_analysis["Overall_z"] +
    fifa_foot_analysis["Wage_z"] +
    fifa_foot_analysis["Value_z"]
)

left = fifa_foot_analysis[fifa_foot_analysis["Preferred Foot"] == "Left"]
right = fifa_foot_analysis[fifa_foot_analysis["Preferred Foot"] == "Right"]

t_comp, p_comp = stats.ttest_ind(
    left["Composite_Score"],
    right["Composite_Score"],
    equal_var=False
)

print("\nComposite Score (Overall + Wage + Value):")
print(f"  t-statistic: {t_comp:.3f}")
print(f"  p-value: {p_comp:.4f}")
print(f"  Significant? {'YES' if p_comp < 0.05 else 'NO'}")



STATISTICAL SIGNIFICANCE TESTS (Welch's T-Test)

Overall Rating:
  t-statistic: -0.203
  p-value: 0.8392
  Significant? NO

Wage:
  t-statistic: 0.074
  p-value: 0.9413
  Significant? NO

Market Value (2023):
  t-statistic: 0.796
  p-value: 0.4261
  Significant? NO

Interpretation:
p < 0.05 = statistically significant difference
p ≥ 0.05 = no evidence of a difference between footedness groups

Composite Score (Overall + Wage + Value):
  t-statistic: 0.247
  p-value: 0.8050
  Significant? NO
