# EXPLORATORY DATA ANALYSIS

## Import necessary packages and load data

In [22]:
#Import packages
import pandas as pd

In [23]:
#Load dataset
data = pd.read_csv('datasets/raw_data_final/cleaned_dataset.csv')

In [24]:
#Get the dimensions (rows, columns) of the dataframe
data.shape
#The dataset contains 3555 records and 113 records

(2384, 94)

In [25]:
#View the first few rows of the dataframe
data.head()

Unnamed: 0,CLUB_NAME,PLAYER_VALUE,LEAGUE_COUNTRY,CURRENT_INTERNATIONAL,AGE,HEIGHT,POSITION,FOOT,PLAYER_AGENT,OUTFITTER,...,LOST_CHALLENGES,BLOCKS_BLOCKS,SH_BLOCKS,PASS_BLOCKS,INT,TKL+INT,CLR,ERR,YEAR_BIRTH,ID
0,Manchester City,80000000.0,England,Portugal,26.0,1.87,Defender - Centre-Back,right,True,True,...,7.0,21.0,13.0,8.0,18.0,39.0,56.0,1.0,1997.0,258004
1,Manchester City,42000000.0,England,Netherlands,28.0,1.8,Defender - Centre-Back,left,True,True,...,4.0,20.0,10.0,10.0,20.0,46.0,48.0,0.0,1995.0,177476
2,Manchester City,40000000.0,England,England,29.0,1.88,Defender - Centre-Back,right,True,True,...,3.0,17.0,10.0,7.0,9.0,32.0,40.0,0.0,1994.0,186590
3,Manchester City,38000000.0,England,Switzerland,27.0,1.88,Defender - Centre-Back,right,True,True,...,13.0,27.0,9.0,18.0,16.0,52.0,33.0,0.0,1995.0,284730
4,Manchester City,25000000.0,England,Spain,29.0,1.89,Defender - Centre-Back,left,False,True,...,4.0,10.0,4.0,6.0,3.0,14.0,30.0,1.0,1994.0,176553


In [26]:
#Get general information about the dataframe, like data types and missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384 entries, 0 to 2383
Data columns (total 94 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   CLUB_NAME                      2384 non-null   object 
 1   PLAYER_VALUE                   2384 non-null   float64
 2   LEAGUE_COUNTRY                 2384 non-null   object 
 3   CURRENT_INTERNATIONAL          2384 non-null   object 
 4   AGE                            2384 non-null   float64
 5   HEIGHT                         2384 non-null   float64
 6   POSITION                       2384 non-null   object 
 7   FOOT                           2384 non-null   object 
 8   PLAYER_AGENT                   2384 non-null   bool   
 9   OUTFITTER                      2384 non-null   bool   
 10  MP_PLAYING                     2384 non-null   float64
 11  STARTS_PLAYING                 2384 non-null   float64
 12  MIN_PLAYING                    2384 non-null   f

## Change data types

In [27]:
#Convert "object" data types columns to category data type
object_columns = data.select_dtypes(include='object').columns
data[object_columns] = data[object_columns].astype('category')

In [28]:
# Transform the "YEAR_BIRTH" column to a categorical data type
data['YEAR_BIRTH'] = data['YEAR_BIRTH'].astype('category')

## Separate and visualize output variable

*Separating the output variable during EDA helps maintain data integrity, improves the accuracy of your analysis, and ensures a cleaner transition to the modeling phase. It's a good practice that can save you time and prevent potential issues later in the data analysis process.*

In [29]:
# Separate the output variable (target) from the features
output_variable = data['PLAYER_VALUE']

In [30]:
output_variable

0       80000000.0
1       42000000.0
2       40000000.0
3       38000000.0
4       25000000.0
           ...    
2379     4000000.0
2380     2800000.0
2381     1100000.0
2382     1000000.0
2383      600000.0
Name: PLAYER_VALUE, Length: 2384, dtype: float64

In [31]:
#Import numpy library
import numpy as np

# Create bins with a length of 25,000,000
bin_length = 25000000
bins = np.arange(0, output_variable.max() + bin_length, bin_length)

# Use pandas cut function to create bins and calculate percentages
output_bins = pd.cut(output_variable, bins=bins)
bin_counts = output_bins.value_counts()
percentage_players = (bin_counts / len(output_variable)) * 100

# Create a DataFrame to display the results
result_df = pd.DataFrame({'PLAYER_VALUE Bin': bin_counts.index.astype(str),
                          'Player Count': bin_counts.values,
                          'Percentage of Players': [f'{value:.2f}%' for value in percentage_players.values]}
                         )

# Display the result table
result_df

Unnamed: 0,PLAYER_VALUE Bin,Player Count,Percentage of Players
0,"(0.0, 25000000.0]",2141,89.81%
1,"(25000000.0, 50000000.0]",168,7.05%
2,"(50000000.0, 75000000.0]",47,1.97%
3,"(75000000.0, 100000000.0]",20,0.84%
4,"(100000000.0, 125000000.0]",5,0.21%
5,"(175000000.0, 200000000.0]",2,0.08%
6,"(125000000.0, 150000000.0]",1,0.04%
7,"(150000000.0, 175000000.0]",0,0.00%


### Histogram of Player Value

In [32]:
#Import plotly libraries
import plotly.express as px

# Convert output_variable to a Series
output_series = output_variable.squeeze()

# Create a histogram with a KDE line for the output_variable
hist = px.histogram(output_series, title='Histogram of Player Market Value',
                   labels={'value': 'Player Value (in €)'},
                   color_discrete_sequence=['steelblue'])

# Update the y-axis title to "Frequency"
hist.update_yaxes(title_text="Frequency")

# Remove the legend from the layout
hist.update_layout(showlegend=False, height=500)

# Show the histogram
hist.show()

#The histogram shows that the distribution of 'PLAYER_VALUE' is right skewed. Almost 90% of players has a 'PLAYER_VALUE' less than €25,000,000

### Box Plot of Player Value

In [33]:
# Create an interactive horizontal box plot with tooltips
box_plot = px.box(output_variable, orientation='h', title='Box Plot of Player Market Value',
             labels={'value': 'Player Value (in €)'}, color_discrete_sequence=['steelblue'],
             hover_name=None, hover_data=[output_variable])

# Show the plot
box_plot.show()

### Apply log transformation to output variable

In [34]:
# Apply a log transformation to the output variable
log_transformed_y = np.log(output_variable)

# Now, 'log_transformed_y' contains the output variable with the log transformation.

#Save log transformed output variable
log_transformed_y.to_csv('datasets/raw_data_final/output_variable_model.csv')

In [35]:
output_variable.tail()

2379    4000000.0
2380    2800000.0
2381    1100000.0
2382    1000000.0
2383     600000.0
Name: PLAYER_VALUE, dtype: float64

In [36]:
log_transformed_y.tail()

2379    15.201805
2380    14.845130
2381    13.910821
2382    13.815511
2383    13.304685
Name: PLAYER_VALUE, dtype: float64

### Histogram after Log Transformation

In [37]:
# Convert output_variable to a Series
output_series = log_transformed_y.squeeze()

# Create a histogram with a KDE line for the output_variable
hist = px.histogram(output_series, title='Histogram of Log Player Market Value',
                   labels={'value': 'Log Player Value'},
                   color_discrete_sequence=['steelblue'])

# Update the y-axis title to "Frequency"
hist.update_yaxes(title_text="Frequency")

# Remove the legend from the layout
hist.update_layout(showlegend=False, height=500)

# Show the histogram
hist.show()

## Separate and visualize features

*By focusing on the independent variables separately from the target variable, you can explore relationships and patterns in the data without being influenced by the outcome variable. This allows you to gain a deeper understanding of the features and their impact on the target variable.*

In [38]:
# Separate the features from the output variable
features = data.drop(columns=['PLAYER_VALUE'])

In [39]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384 entries, 0 to 2383
Data columns (total 93 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   CLUB_NAME                      2384 non-null   category
 1   LEAGUE_COUNTRY                 2384 non-null   category
 2   CURRENT_INTERNATIONAL          2384 non-null   category
 3   AGE                            2384 non-null   float64 
 4   HEIGHT                         2384 non-null   float64 
 5   POSITION                       2384 non-null   category
 6   FOOT                           2384 non-null   category
 7   PLAYER_AGENT                   2384 non-null   bool    
 8   OUTFITTER                      2384 non-null   bool    
 9   MP_PLAYING                     2384 non-null   float64 
 10  STARTS_PLAYING                 2384 non-null   float64 
 11  MIN_PLAYING                    2384 non-null   float64 
 12  GLS                            238

In [40]:
features.columns

Index(['CLUB_NAME', 'LEAGUE_COUNTRY', 'CURRENT_INTERNATIONAL', 'AGE', 'HEIGHT',
       'POSITION', 'FOOT', 'PLAYER_AGENT', 'OUTFITTER', 'MP_PLAYING',
       'STARTS_PLAYING', 'MIN_PLAYING', 'GLS', 'AST', 'G+A', 'G_MINUS_PK',
       'PK', 'PKATT', 'CRDY', 'CRDR', 'PRGC_PROGRESSION', 'PRGP_PROGRESSION',
       'PRGR_PROGRESSION', 'SH_STANDARD', 'SOT_STANDARD', 'DIST_STANDARD',
       'FK_STANDARD', 'TOUCHES_TOUCHES', 'DEF PEN_TOUCHES', 'DEF 3RD_TOUCHES',
       'MID 3RD_TOUCHES', 'ATT 3RD_TOUCHES', 'ATT PEN_TOUCHES', 'LIVE_TOUCHES',
       'ATT_TAKE', 'SUCC_TAKE', 'TKLD_TAKE', 'CARRIES_CARRIES',
       'TOTDIST_CARRIES', 'PRGDIST_CARRIES', 'PRGC_CARRIES',
       'FINAL_THIRD_CARRIES', 'CPA_CARRIES', 'MIS_CARRIES', 'DIS_CARRIES',
       'REC_RECEIVING', 'PRGR_RECEIVING', 'COMPL_STARTS', 'SUBS_SUBS',
       'UNSUB_SUBS', 'PPM_TEAM.SUCCESS', 'ONG_TEAM.SUCCESS',
       'ONGA_TEAM.SUCCESS', 'PLUS_PER__MINUS__TEAM.SUCCESS', 'CMP_TOTAL',
       'ATT_TOTAL', 'TOTDIST_TOTAL', 'PRGDIST_TOTAL', 'KP

In [41]:
#Create new variable based on player's position (attack - midfield - defense)

#features['POSITION'].unique()

# Define a custom function to categorize the positions
def get_macro_position(position):
    if 'Defender' in position:
        return 'Defense'
    elif 'midfield' in position:
        return 'Midfield'
    else:
        return 'Attack'

# Create the "MACRO_POSITION" column using the custom function
features['MACRO_POSITION'] = features['POSITION'].apply(get_macro_position)

# Display the DataFrame with the new "MACRO_POSITION" column
features[['POSITION', 'MACRO_POSITION']].head(10)

Unnamed: 0,POSITION,MACRO_POSITION
0,Defender - Centre-Back,Defense
1,Defender - Centre-Back,Defense
2,Defender - Centre-Back,Defense
3,Defender - Centre-Back,Defense
4,Defender - Centre-Back,Defense
5,Defender - Left-Back,Defense
6,Defender - Right-Back,Defense
7,Defender - Right-Back,Defense
8,Defender - Right-Back,Defense
9,midfield - Defensive Midfield,Midfield


### Bar charts for Categorical Features

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_categorical_bar_charts(data, categorical_columns):
    """
    Plot separate bar charts for a list of categorical columns in the given DataFrame.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the data.
        categorical_columns (list): A list of names of the categorical columns to be analyzed.

    Returns:
        None
    """
    num_plots = len(categorical_columns)
    rows = 3
    cols = 4

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=categorical_columns)

    for i, column in enumerate(categorical_columns):
        counts = data[column].value_counts()
        trace = go.Bar(x=counts.index, y=counts.values, marker_color='steelblue')
        row_num = (i // cols) + 1
        col_num = (i % cols) + 1
        fig.add_trace(trace, row=row_num, col=col_num)

    fig.update_layout(title='Categorical Variables Analysis',
                      xaxis=dict(title='Categories'),
                      yaxis=dict(title='Count'),
                      height=1500)
    
    # Hide the legend
    fig.update_layout(showlegend=False)

    fig.show()

In [None]:
 # Define the list of categorical columns to analyze
categorical_columns = ['CLUB_NAME', 'LEAGUE_COUNTRY', 'CURRENT_INTERNATIONAL', 
                       'POSITION', 'MACRO_POSITION', 'FOOT', 'PLAYER_AGENT', 'OUTFITTER', 'YEAR_BIRTH']

# Plot the separate bar charts for the categorical columns in a 4x3 grid
plot_categorical_bar_charts(features, categorical_columns)

### Histograms for Numerical Features

In [None]:
numerical_columns = features.drop(columns=categorical_columns, axis=1).columns

In [None]:
# Define the threshold for low variance (e.g., 0.01, but you can adjust based on your data)
variance_threshold = 10

# Calculate the variance of each numerical column
variance_values = features[numerical_columns].var()

# Create a DataFrame and sort the values in descending order
variance_df = pd.DataFrame({'Column': variance_values.index, 'Variance': variance_values.values})
variance_df = variance_df.sort_values(by='Variance', ascending=True)

# Display the sorted DataFrame
variance_df.to_csv('datasets/temp/TEMP_variance_values.csv')

In [None]:
# Identify columns with low variance
low_variance_columns = variance_values[variance_values < variance_threshold].index.tolist()

# Drop columns with low variance from the DataFrame
features_filtered = features.drop(columns=low_variance_columns)

# Now, the features_filtered DataFrame contains the relevant columns after removing low variance features.

In [None]:
features_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384 entries, 0 to 2383
Data columns (total 80 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   CLUB_NAME                      2384 non-null   category
 1   LEAGUE_COUNTRY                 2384 non-null   category
 2   CURRENT_INTERNATIONAL          2384 non-null   category
 3   AGE                            2384 non-null   float64 
 4   POSITION                       2384 non-null   category
 5   FOOT                           2384 non-null   category
 6   PLAYER_AGENT                   2384 non-null   bool    
 7   OUTFITTER                      2384 non-null   bool    
 8   MP_PLAYING                     2384 non-null   float64 
 9   STARTS_PLAYING                 2384 non-null   float64 
 10  MIN_PLAYING                    2384 non-null   float64 
 11  GLS                            2384 non-null   float64 
 12  G+A                            238

In [None]:
numerical_columns = features_filtered.drop(columns=categorical_columns, axis=1).columns
numerical_columns = numerical_columns.drop('ID')
numerical_columns

Index(['AGE', 'MP_PLAYING', 'STARTS_PLAYING', 'MIN_PLAYING', 'GLS', 'G+A',
       'PRGC_PROGRESSION', 'PRGP_PROGRESSION', 'PRGR_PROGRESSION',
       'SH_STANDARD', 'SOT_STANDARD', 'DIST_STANDARD', 'TOUCHES_TOUCHES',
       'DEF PEN_TOUCHES', 'DEF 3RD_TOUCHES', 'MID 3RD_TOUCHES',
       'ATT 3RD_TOUCHES', 'ATT PEN_TOUCHES', 'LIVE_TOUCHES', 'ATT_TAKE',
       'SUCC_TAKE', 'TKLD_TAKE', 'CARRIES_CARRIES', 'TOTDIST_CARRIES',
       'PRGDIST_CARRIES', 'PRGC_CARRIES', 'FINAL_THIRD_CARRIES', 'CPA_CARRIES',
       'MIS_CARRIES', 'DIS_CARRIES', 'REC_RECEIVING', 'PRGR_RECEIVING',
       'COMPL_STARTS', 'SUBS_SUBS', 'UNSUB_SUBS', 'ONG_TEAM.SUCCESS',
       'ONGA_TEAM.SUCCESS', 'PLUS_PER__MINUS__TEAM.SUCCESS', 'CMP_TOTAL',
       'ATT_TOTAL', 'TOTDIST_TOTAL', 'PRGDIST_TOTAL', 'KP', 'FINAL_THIRD',
       'PPA', 'CRSPA', 'FLS', 'FLD', 'OFF', 'CRS', 'TKLW', 'RECOV',
       'WON_AERIAL', 'LOST_AERIAL', 'SCA_SCA', 'GCA_GCA', 'TKL_TACKLES',
       'TKLW_TACKLES', 'DEF 3RD_TACKLES', 'MID 3RD_TACKLES', 'AT

In [None]:
def plot_histograms(df, numerical_columns):
    num_plots = len(numerical_columns)
    rows = (num_plots + 3) // 4  # Round up to the nearest integer for rows
    cols = 4
    
    # Create the subplot grid
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=numerical_columns)

    # Add histogram plots to the subplot grid
    for i, column in enumerate(numerical_columns):
        hist_trace = go.Histogram(x=df[column], name=column, marker_color='steelblue')
        row = i // cols + 1
        col = i % cols + 1
        fig.add_trace(hist_trace, row=row, col=col)

    # Update layout for better visualization
    fig.update_layout(title='Histograms of Numerical Columns',
                      showlegend=False,
                      height=4000,  # Adjust height based on the number of plots
                      barmode='overlay')

    # Update x-axis and y-axis titles
    fig.update_xaxes(title_text='Values')
    fig.update_yaxes(title_text='Frequency')

    # Show the subplot grid
    fig.show()

In [None]:
# Plot histogram plots for the specified numerical columns
plot_histograms(features, numerical_columns)

### Box Plots for Numerical Features

In [None]:
def plot_boxplots(df, numerical_columns):
    num_plots = len(numerical_columns)
    num_rows = (num_plots + 3) // 4  # Round up to the nearest integer for rows

    # Create the subplot grid
    fig = make_subplots(rows=num_rows, cols=4, subplot_titles=numerical_columns)

    # Add boxplots to the subplot grid
    for i, column in enumerate(numerical_columns):
        row = i // 4 + 1
        col = i % 4 + 1
        box_trace = go.Box(y=df[column], name=column, marker_color='steelblue')
        fig.add_trace(box_trace, row=row, col=col)

    # Update layout for better visualization
    fig.update_layout(title='Boxplots of Numerical Columns',
                      height=4000,  # Adjust height based on the number of plots
                      showlegend=False)

    # Show the subplot grid
    fig.show()

In [None]:
# Plot boxplots for the specified numerical columns
plot_boxplots(features, numerical_columns)

### Heatmap for Numerical Features

In [None]:
import numpy as np
from sklearn.cluster import KMeans

def plot_heatmaps_for_subsets(features_df, numerical_columns, n_clusters, correlation_threshold):
    # Create a correlation matrix for the numerical columns
    correlation_matrix = features_df[numerical_columns].corr().abs()

    # Use KMeans clustering to create subsets of variables
    kmeans = KMeans(n_clusters=n_clusters, random_state=1)
    subset_labels = kmeans.fit_predict(correlation_matrix)

    # Create a list to store the Plotly figures
    heatmap_figures = []

    # Store correlated variables to be dropped
    correlated_variables = []
    
    # Plot heatmap for each subset
    for i in range(n_clusters):
        subset = correlation_matrix.index[subset_labels == i].tolist()
        subset_correlation_matrix = correlation_matrix.loc[subset, subset]
        
        heatmap = go.Figure(data=go.Heatmap(
            z=subset_correlation_matrix.values,
            x=subset,
            y=subset,
            colorscale='YlGnBu',  # Set the colorscale
            colorbar=dict(title='Correlation'),  # Set the colorbar title
        ))

        heatmap.update_layout(
            title=f"Heatmap for Subset {i+1}",
            xaxis_title='Numerical Columns',
            yaxis_title='Numerical Columns',
        )

        heatmap_figures.append(heatmap)

        # Create an empty DataFrame to store correlated pairs and their correlation values
        correlated_pairs = pd.DataFrame(columns=['Variable 1', 'Variable 2', 'Correlation Value'])

        # Find pairs of variables with correlation above the threshold and store in correlated_pairs
        # Select variables with correlation above the threshold
        correlated_vars_subset = (subset_correlation_matrix > correlation_threshold).sum() > 0
    
        for var1 in correlated_vars_subset.index:
            for var2 in correlated_vars_subset.index:
                if var1 != var2 and subset_correlation_matrix.loc[var1, var2] > correlation_threshold:
                    if var1 in correlated_variables or var2 in correlated_variables:
                        continue
                    max_correlation_value = subset_correlation_matrix.loc[var1, var2]
                    correlated_pairs = correlated_pairs.append({'Variable 1': var1,
                                                                'Variable 2': var2,
                                                                'Correlation Value': max_correlation_value},
                                                            ignore_index=True)

        # Iterate through the correlated pairs DataFrame and process each pair
        for index, row in correlated_pairs.iterrows():
            var1 = row['Variable 1']
            var2 = row['Variable 2']
            max_correlation_value = row['Correlation Value']

            # Calculate variance for each variable in the pair
            var1_variance = features_df[var1].var()
            var2_variance = features_df[var2].var()

            # Select the variable with the lower variance
            variable_to_drop = var1 if var1_variance < var2_variance else var2

            # Check if the variable_to_drop is already in correlated_variables
            if variable_to_drop in correlated_variables:
                continue

            # Print the selected variables and their correlation, variances, and variable to drop
            print(f"Correlated Variables: {var1}, {var2}")
            print(f"Correlation Value: {max_correlation_value}")
            print(f"Variances: {var1_variance}, {var2_variance}")
            print(f"Variable to Drop: {variable_to_drop}")
            
            # Add the variable with lower variance to correlated_variables list
            correlated_variables.append(variable_to_drop)
            
    return heatmap_figures, correlated_variables

In [None]:
heatmap_figures, correlated_variables = plot_heatmaps_for_subsets(features_filtered, numerical_columns, n_clusters=9, correlation_threshold=0.85)

print("Number of variables to drop:", len(correlated_variables))

# Show the heatmaps (one by one)
for heatmap in heatmap_figures:
    heatmap.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

Correlated Variables: MP_PLAYING, STARTS_PLAYING
Correlation Value: 0.8849059839587551
Variances: 124.11227106433408, 125.80920713555409
Variable to Drop: MP_PLAYING
Correlated Variables: STARTS_PLAYING, MIN_PLAYING
Correlation Value: 0.9930219247052197
Variances: 125.80920713555409, 899515.9011227114
Variable to Drop: STARTS_PLAYING
Correlated Variables: MIN_PLAYING, ONG_TEAM.SUCCESS
Correlation Value: 0.8820554024404597
Variances: 899515.9011227114, 295.14064599075823
Variable to Drop: ONG_TEAM.SUCCESS
Correlated Variables: MIN_PLAYING, ONGA_TEAM.SUCCESS
Correlation Value: 0.8929380574488324
Variances: 899515.9011227114, 231.54351643492626
Variable to Drop: ONGA_TEAM.SUCCESS
Correlated Variables: PRGP_PROGRESSION, TOTDIST_CARRIES
Correlation Value: 0.8584178164858011
Variances: 2893.8773611740853, 5526360.177946704
Variable to Drop: PRGP_PROGRESSION
Correlated Variables: PRGR_PROGRESSION, PRGR_RECEIVING
Correlation Value: 1.0
Variances: 4239.729494539072, 4239.729494539072
Variable t


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

Correlated Variables: TOUCHES_TOUCHES, MID 3RD_TOUCHES
Correlation Value: 0.970316585402248
Variances: 457197.20685884485, 122772.24401715041
Variable to Drop: MID 3RD_TOUCHES
Correlated Variables: TOUCHES_TOUCHES, LIVE_TOUCHES
Correlation Value: 0.9999991905498067
Variances: 457197.20685884485, 457045.5065295791
Variable to Drop: LIVE_TOUCHES
Correlated Variables: TOUCHES_TOUCHES, CARRIES_CARRIES
Correlation Value: 0.9686438465503525
Variances: 457197.20685884485, 184294.6777284289
Variable to Drop: CARRIES_CARRIES
Correlated Variables: TOUCHES_TOUCHES, PRGDIST_CARRIES
Correlation Value: 0.901360363175316
Variances: 457197.20685884485, 1510545.8109707122
Variable to Drop: TOUCHES_TOUCHES
Correlated Variables: TOUCHES_TOUCHES, REC_RECEIVING
Correlation Value: 0.979815121871724
Variances: 457197.20685884485, 209110.13930592663
Variable to Drop: REC_RECEIVING
Correlated Variables: TOUCHES_TOUCHES, COMPL_STARTS
Correlation Value: 0.8812015840356883
Variances: 457197.20685884485, 84.995569


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

In [None]:
features_filtered = features_filtered.drop(columns=correlated_variables, axis=1)
features_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384 entries, 0 to 2383
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   CLUB_NAME                      2384 non-null   category
 1   LEAGUE_COUNTRY                 2384 non-null   category
 2   CURRENT_INTERNATIONAL          2384 non-null   category
 3   AGE                            2384 non-null   float64 
 4   POSITION                       2384 non-null   category
 5   FOOT                           2384 non-null   category
 6   PLAYER_AGENT                   2384 non-null   bool    
 7   OUTFITTER                      2384 non-null   bool    
 8   MIN_PLAYING                    2384 non-null   float64 
 9   DIST_STANDARD                  2384 non-null   float64 
 10  DEF 3RD_TOUCHES                2384 non-null   float64 
 11  ATT 3RD_TOUCHES                2384 non-null   float64 
 12  ATT PEN_TOUCHES                238

In [None]:
features_filtered.head(5)

Unnamed: 0,CLUB_NAME,LEAGUE_COUNTRY,CURRENT_INTERNATIONAL,AGE,POSITION,FOOT,PLAYER_AGENT,OUTFITTER,MIN_PLAYING,DIST_STANDARD,...,CRS,WON_AERIAL,LOST_AERIAL,ATT 3RD_TACKLES,LOST_CHALLENGES,PASS_BLOCKS,TKL+INT,YEAR_BIRTH,ID,MACRO_POSITION
0,Manchester City,England,Portugal,26.0,Defender - Centre-Back,right,True,True,1998.0,9.1,...,1.0,44.0,28.0,1.0,7.0,8.0,39.0,1997.0,258004,Defense
1,Manchester City,England,Netherlands,28.0,Defender - Centre-Back,left,True,True,1873.0,10.5,...,15.0,36.0,23.0,2.0,4.0,10.0,46.0,1995.0,177476,Defense
2,Manchester City,England,England,29.0,Defender - Centre-Back,right,True,True,1846.0,13.4,...,6.0,33.0,16.0,4.0,3.0,7.0,32.0,1994.0,186590,Defense
3,Manchester City,England,Switzerland,27.0,Defender - Centre-Back,right,True,True,2287.0,15.9,...,6.0,23.0,34.0,3.0,13.0,18.0,52.0,1995.0,284730,Defense
4,Manchester City,England,Spain,29.0,Defender - Centre-Back,left,False,True,993.0,14.9,...,6.0,24.0,19.0,1.0,4.0,6.0,14.0,1994.0,176553,Defense


### Handle outliers for Numerical Features

In [None]:
numerical_columns = features_filtered.drop(columns=categorical_columns, axis=1).columns
numerical_columns = numerical_columns.drop('ID')
numerical_columns

Index(['AGE', 'MIN_PLAYING', 'DIST_STANDARD', 'DEF 3RD_TOUCHES',
       'ATT 3RD_TOUCHES', 'ATT PEN_TOUCHES', 'ATT_TAKE', 'TOTDIST_CARRIES',
       'CPA_CARRIES', 'MIS_CARRIES', 'SUBS_SUBS', 'UNSUB_SUBS',
       'PLUS_PER__MINUS__TEAM.SUCCESS', 'TOTDIST_TOTAL', 'CRSPA', 'FLS', 'FLD',
       'OFF', 'CRS', 'WON_AERIAL', 'LOST_AERIAL', 'ATT 3RD_TACKLES',
       'LOST_CHALLENGES', 'PASS_BLOCKS', 'TKL+INT'],
      dtype='object')

In [None]:
features_filtered[numerical_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AGE,2384.0,26.154362,4.445275,17.0,23.0,26.0,29.0,41.0
MIN_PLAYING,2384.0,1328.419044,948.428121,1.0,475.25,1286.0,2108.0,3420.0
DIST_STANDARD,2384.0,17.498574,4.912862,2.8,14.2,17.2,20.5,37.5
DEF 3RD_TOUCHES,2384.0,233.50755,273.029243,0.0,35.0,127.0,341.0,1665.0
ATT 3RD_TOUCHES,2384.0,206.734899,204.506826,0.0,43.0,147.0,314.0,1270.0
ATT PEN_TOUCHES,2384.0,31.241191,37.33775,0.0,6.0,19.0,41.0,302.0
ATT_TAKE,2384.0,26.970638,31.434323,0.0,5.0,16.0,38.0,306.0
TOTDIST_CARRIES,2384.0,2648.384228,2350.821171,0.0,703.75,2149.0,3969.75,14430.0
CPA_CARRIES,2384.0,6.283138,10.896335,0.0,0.0,2.0,8.0,140.0
MIS_CARRIES,2384.0,21.728607,21.318178,0.0,5.0,16.0,31.0,139.0


In [None]:
from scipy.stats import iqr

def find_columns_with_outliers(df, columns, iqr_multiplier=1.5):
    """
    Find columns in the given DataFrame with outliers using the IQR method.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the numerical columns to evaluate.
        columns (list): A list of column names to evaluate for outliers.
        iqr_multiplier (float): The multiplier to calculate the IQR threshold. Values above this threshold will be considered outliers.

    Returns:
        list: A list containing the column names that have outliers.
    """
    columns_with_outliers = []

    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr_threshold = iqr(df[column]) * iqr_multiplier
        lower_bound = q1 - iqr_threshold
        upper_bound = q3 + iqr_threshold

        outlier_indices = (df[column] < lower_bound) | (df[column] > upper_bound)
        if outlier_indices.any():
            columns_with_outliers.append(column)

    return columns_with_outliers

iqr_multiplier = 1.5  # Set the IQR multiplier for outlier detection

columns_with_outliers = find_columns_with_outliers(features_filtered, numerical_columns, iqr_multiplier)
print("Columns with outliers:", len(columns_with_outliers))

Columns with outliers: 24


In [None]:
from scipy.stats import zscore

def handle_outliers_zscore(df, columns, zscore_threshold=3):
    """
    Handle outliers in the given DataFrame for specified columns using the Z-score method.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the numerical columns with potential outliers.
        columns (list): A list of column names for which outlier handling should be applied.
        zscore_threshold (float): The threshold for the Z-score. Values above this threshold will be considered outliers.

    Returns:
        pd.DataFrame: The DataFrame with outlier-handled values for specified columns.
    """
    outlier_handled_df = df.copy()

    for column in columns:
        z_scores = zscore(outlier_handled_df[column])
        outlier_indices = abs(z_scores) > zscore_threshold
        outlier_handled_df.loc[outlier_indices, column] = outlier_handled_df[column].median()

    return outlier_handled_df

In [None]:
zscore_threshold = 3  # Set the Z-score threshold
# Handle outliers using Z-score method
outlier_handled_zscore_df = handle_outliers_zscore(features_filtered, columns_with_outliers, zscore_threshold)

In [None]:
outlier_handled_zscore_df[numerical_columns].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
AGE,2384.0,26.142198,4.425766,17.0,23.0,26.0,29.0,39.0
MIN_PLAYING,2384.0,1328.419044,948.428121,1.0,475.25,1286.0,2108.0,3420.0
DIST_STANDARD,2384.0,17.406879,4.756441,2.8,14.2,17.2,20.4,32.1
DEF 3RD_TOUCHES,2384.0,210.530621,231.987644,0.0,35.0,127.0,318.0,1052.0
ATT 3RD_TOUCHES,2384.0,198.030201,188.181163,0.0,43.0,147.0,304.25,818.0
ATT PEN_TOUCHES,2384.0,27.82047,29.831441,0.0,6.0,19.0,38.0,143.0
ATT_TAKE,2384.0,24.305789,25.691381,0.0,5.0,16.0,35.0,121.0
TOTDIST_CARRIES,2384.0,2548.333893,2164.439514,0.0,703.75,2148.5,3878.75,9699.0
CPA_CARRIES,2384.0,4.943792,7.102584,0.0,0.0,2.0,7.0,38.0
MIS_CARRIES,2384.0,20.454279,18.896509,0.0,5.0,16.0,30.0,85.0


## Create dummies for categorical columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns to be label encoded
label_encoded_columns = ['CURRENT_INTERNATIONAL', 'CLUB_NAME', 'YEAR_BIRTH']

# List of remaining categorical columns to be one-hot encoded
one_hot_encoded_columns = [col for col in categorical_columns if col not in label_encoded_columns]

# Perform label encoding for the specified columns
label_encoder = LabelEncoder()
for col in label_encoded_columns:
    outlier_handled_zscore_df[col] = label_encoder.fit_transform(outlier_handled_zscore_df[col])

# Perform one-hot encoding for the remaining categorical columns
features_final = pd.get_dummies(outlier_handled_zscore_df, columns=one_hot_encoded_columns)

features_final.head(5)

Unnamed: 0,CLUB_NAME,CURRENT_INTERNATIONAL,AGE,MIN_PLAYING,DIST_STANDARD,DEF 3RD_TOUCHES,ATT 3RD_TOUCHES,ATT PEN_TOUCHES,ATT_TAKE,TOTDIST_CARRIES,...,MACRO_POSITION_Attack,MACRO_POSITION_Defense,MACRO_POSITION_Midfield,FOOT_both,FOOT_left,FOOT_right,PLAYER_AGENT_False,PLAYER_AGENT_True,OUTFITTER_False,OUTFITTER_True
0,52,78,26.0,1998.0,9.1,881.0,83.0,22.0,10.0,8194.0,...,0,1,0,0,0,1,0,1,0,1
1,52,68,28.0,1873.0,10.5,615.0,256.0,32.0,5.0,5455.0,...,0,1,0,0,1,0,0,1,0,1
2,52,32,29.0,1846.0,13.4,544.0,210.0,27.0,10.0,6201.0,...,0,1,0,0,0,1,0,1,0,1
3,52,91,27.0,2287.0,15.9,815.0,176.0,34.0,12.0,8443.0,...,0,1,0,0,0,1,0,1,0,1
4,52,88,29.0,993.0,14.9,335.0,166.0,13.0,4.0,4015.0,...,0,1,0,0,1,0,1,0,0,1


In [None]:
features_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2384 entries, 0 to 2383
Data columns (total 56 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   CLUB_NAME                               2384 non-null   int32  
 1   CURRENT_INTERNATIONAL                   2384 non-null   int32  
 2   AGE                                     2384 non-null   float64
 3   MIN_PLAYING                             2384 non-null   float64
 4   DIST_STANDARD                           2384 non-null   float64
 5   DEF 3RD_TOUCHES                         2384 non-null   float64
 6   ATT 3RD_TOUCHES                         2384 non-null   float64
 7   ATT PEN_TOUCHES                         2384 non-null   float64
 8   ATT_TAKE                                2384 non-null   float64
 9   TOTDIST_CARRIES                         2384 non-null   float64
 10  CPA_CARRIES                             2384 non-null   floa

## Scale numerical columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMaxScaler to the numerical columns
features_final[numerical_columns] = scaler.fit_transform(features_final[numerical_columns])

# Now, the numerical columns in features_final are scaled using MinMaxScaler.

## Save final features

In [None]:
#Save features dataframe
features_final.to_csv("datasets/raw_data_final/features_model.csv")