# Import dependencies

In [None]:
# For data science
import pandas as pd
import numpy as np
import missingno as msno

# For plot
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

# For modules
from sources import (
    get_heatmap,
)

# Getting data
Get dataset

In [None]:
# Read both CSV files from the ../data directory into DataFrames
players, season_stat = map(
    pd.read_csv,
    ("../data/Players.csv", "../data/Seasons_Stats.csv"),
)

Clean datasets

In [None]:
# Drop every row that has at least one missing value.
# The check first avoids calling dropna on a frame that is already complete.
has_missing_values = players.isna().to_numpy().any()
if has_missing_values:
    players.dropna(inplace=True)

Check for correlations between features

In [None]:
# Pairwise correlation of weight, height and born, rounded to two decimals
players[["weight", "height", "born"]].corr().round(decimals=2)

In [None]:
# Show the weight/height/born correlation matrix as a heatmap with the
# coefficient written in every cell (trailing ';' hides the Axes repr)
sns.heatmap(players[["weight", "height", "born"]].corr(), annot=True);

A significant correlation is observed between the 'weight' and 'height' features. Let's explore them.

Explore the height and weight distributions

In [None]:
# Notched, vertical box plot of the height column; the mean is drawn as a line
height_mean = players.boxplot(
    "height",
    notch=True,
    vert=True,
    showmeans=True,
    meanline=True,
)

# Label the x-axis and title the plot in a single call
height_mean.set(
    xlabel="Height",
    title="Box Plot with Mean and Confidence Interval (95%)",
)

# Render the figure
plt.show()

The mean height of players is around 200 units. Some points fall outside the whiskers of the box plot (potential outliers).

In [None]:
# Notched, vertical box plot of the weight column; the mean is drawn as a line
weight_mean = players.boxplot(
    "weight",
    notch=True,
    vert=True,
    showmeans=True,
    meanline=True,
)

# Label the x-axis and title the plot in a single call
weight_mean.set(
    xlabel="Weight",
    title="Box Plot with Mean and Confidence Interval 95%",
)

# Render the figure
plt.show()

The mean weight of players is around 95 units. The potential outliers mostly have larger values than the points within the whiskers.

In [None]:
# Pairwise plots of height and weight, colored by birth state.
# Only the first 25 distinct birth_state values (in order of appearance) are
# passed as hue levels - presumably to keep the legend manageable.
pair_plot = sns.pairplot(
    players[["height", "weight", "birth_state"]],
    hue="birth_state",
    hue_order=list(players["birth_state"].unique()[:25]),
)

# Put a title slightly above the grid (trailing ';' hides the Text repr)
pair_plot.fig.suptitle("Pair Plot of Height, Weight by Birth State", y=1.02);

It would be better to combine the correlated features into one feature, for example, a weight/height index.

In [None]:
# Scatter plot of weight against height with a fitted regression line
pair_plot = sns.lmplot(data=players, x="height", y="weight")

# Put a title slightly above the plot (trailing ';' hides the Text repr)
pair_plot.fig.suptitle("Correlation plot for Height and Weight", y=1.02);

A linear correlation between weight and height is observed. For taller players, the spread in weight is larger.

In [None]:
# Pearson correlation between weight and height
pearson = stats.pearsonr(players["weight"], players["height"])

# The first element of the result is the correlation coefficient
corr = pearson[0]

print(f"Correlation coefficient is: {corr:.2f}")

In [None]:
# Scatter plot of weight against the `born` column with a fitted regression line
pair_plot = sns.lmplot(
    x="born",
    y="weight",
    data=players,
)

# Add title to the regression plot. The plotted variables are weight and born,
# so the title names weight (height is not part of this plot).
pair_plot.fig.suptitle("Correlation plot for Weight and Born date", y=1.02);

Players born later tend to weigh more, but this tendency is not as obvious as the one between weight and height. Let's explore it.

In [None]:
# Median of the `born` column, used to split players into two groups
median_year = players["born"].median()

# Label each player by group; players born exactly in the median year
# fall into the "Before" group because the comparison is inclusive
players["period"] = np.where(
    players["born"].le(median_year),
    f"Before {int(median_year)}.",
    f"After {int(median_year)}.",
)

# Compare the weight distributions of the two groups side by side
sns.boxplot(data=players, x="period", y="weight")

# Title the box plot and render it
plt.title("Weight comparison before and after median year")
plt.show()

Well... the tendency is not significant.

Let's explore the season statistics.

In [None]:
# Visualize where values are missing in the season statistics
# (trailing ';' hides the returned object's repr)
msno.matrix(df=season_stat);

There are two empty columns and rows with no data. Let's drop them.

In [None]:
# Remove columns that contain no data at all
season_stat.dropna(axis="columns", how="all", inplace=True)

# Remove every row that still has at least one missing value.
# NOTE(review): how="any" drops partially filled rows too, not only rows that
# are completely empty - confirm this is the intended amount of filtering.
season_stat.dropna(axis="index", how="any", inplace=True)

Get heatmap

In [None]:
# Keep only the numeric columns of the season statistics
numbered_stat = season_stat.select_dtypes(include=np.number)

# Hand the numeric data to the project's heatmap helper
# (presumably it computes and plots the correlations - verify in `sources`)
get_heatmap(dataframe=numbered_stat, name="Numbered Stat", fmt=".1f")

There are strongly correlated features.

In [None]:
# Minimum absolute correlation for a pair of features to stay visible
correlation_threshold = 0.8

# Correlation matrix of the numeric season statistics
correlation_statistics = numbered_stat.corr()

# True wherever the absolute correlation is weaker than the threshold
narrow_mask = correlation_statistics.abs().lt(correlation_threshold)

# Blank out (NaN) the weak correlations, keeping only the strong ones
corr_stat_narrowed = correlation_statistics.mask(narrow_mask)

# Plot the remaining strong correlations.
# NOTE(review): here get_heatmap receives a correlation matrix, while the
# earlier call passes the raw numeric data - confirm which input the helper
# expects, since it is defined outside this notebook.
get_heatmap(dataframe=corr_stat_narrowed, name="Numbered Stat", fmt=".1f");

I am not sure what they mean, but there are several strongly correlated features in the dataset: G, GS, MP, TS, OWS, DWS, WS, FG, FGA, etc.