# Import dependencies

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import missingno as msno
from scipy import stats

# Close every open Matplotlib figure — presumably so that re-running the
# notebook does not accumulate figures from earlier cell executions.
plt.close("all")

Get datasets

In [None]:
# Load the two raw tables; both paths are relative to the current working directory.
players = pd.read_csv(filepath_or_buffer="Players.csv")
season_stat = pd.read_csv(filepath_or_buffer="Seasons_Stats.csv")

Clean datasets

In [None]:
# Remove incomplete player records: when any cell of the table is missing,
# drop (in place) every row that contains at least one missing value.
has_missing_values = players.isna().to_numpy().any()
if has_missing_values:
    players.dropna(inplace=True)

Check for correlations between features

In [None]:
# Pairwise correlations between weight, height and birth year, shown to 2 decimals
players[["weight", "height", "born"]].corr().round(2)

In [None]:
# Annotated heatmap of the same weight / height / birth-year correlation matrix
body_corr = players[["weight", "height", "born"]].corr()
sns.heatmap(body_corr, annot=True)

A substantial correlation is observed between the 'weight' and 'height' features. Let's explore them.

Explore the height and weight distributions

In [None]:
# Box plot of player height, drawn through pandas.
# showmeans + meanline draw the mean as a line inside the box, and notch=True
# notches the box around the median (Matplotlib documents the notch as the
# confidence interval around the median). Vertical orientation is the default,
# so it is not passed explicitly.
# `height_mean` holds the Axes returned by DataFrame.boxplot, not a number.
height_mean = players.boxplot(
    column="height", showmeans=True, meanline=True, notch=True
)

# The box is vertical, so the measured values run along the y-axis: label that
# axis (the x-axis already carries the column name as its tick label).
height_mean.set_ylabel("Height")
# Name the variable in the title so the figure can be read on its own.
height_mean.set_title("Height: Box Plot with Mean and Confidence Interval (95%)")

# Display the plot
plt.show()

The mean height of players is around 200 units. There are points beyond the whiskers (potential outliers).

In [None]:
# Box plot of player weight, drawn through pandas.
# showmeans + meanline draw the mean as a line inside the box, and notch=True
# notches the box around the median (Matplotlib documents the notch as the
# confidence interval around the median). Vertical orientation is the default,
# so it is not passed explicitly.
# `weight_mean` holds the Axes returned by DataFrame.boxplot, not a number.
weight_mean = players.boxplot(
    column="weight", showmeans=True, meanline=True, notch=True
)

# The box is vertical, so the measured values run along the y-axis: label that
# axis (the x-axis already carries the column name as its tick label).
weight_mean.set_ylabel("Weight")
# Name the variable in the title so the figure can be read on its own.
weight_mean.set_title("Weight: Box Plot with Mean and Confidence Interval (95%)")

# Display the plot
plt.show()

The mean weight of players is around 95 units. The potential outliers mostly have larger values than the points within the whiskers.

In [None]:
# Pairwise height/weight plots coloured by birth state. Only the first 25
# distinct states (in order of first appearance) are passed as hue levels.
first_states = players["birth_state"].unique()[:25].tolist()
body_by_state = players[["height", "weight", "birth_state"]]
sns.pairplot(body_by_state, hue="birth_state", hue_order=first_states);

It would be better to combine the correlated features into one feature, for example a weight/height index.

In [None]:
# Scatter of weight against height with a fitted regression line
sns.lmplot(data=players, x="height", y="weight")
plt.show()

A linear correlation between weight and height is observed. For taller players, the spread in weight is larger.

In [None]:
# Pearson correlation between weight and height; pearsonr returns the
# coefficient first and the p-value second, and only the coefficient is reported.
pearson = stats.pearsonr(players["weight"], players["height"])
corr, p_value = pearson

print(f"Correlation coefficient is: {corr:.2f}")

In [None]:
# Scatter of weight against birth year with a fitted regression line
sns.lmplot(data=players, x="born", y="weight")
plt.show()

Players born later tend to weigh more, but this trend is less pronounced than the one between weight and height. Let's explore it.

In [None]:
# Split the players at the median birth year and compare the weight
# distribution of the two halves.
median_year = players["born"].median()

# Period labels. Note that `<=` below puts players born exactly in the median
# year into the "Before" group.
before_label = f"Before {int(median_year)}"
after_label = f"After {int(median_year)}"

# Create a new column to indicate the period
players["period"] = np.where(
    players["born"] <= median_year,
    before_label,
    after_label,
)

# Pin the category order so the earlier period is always drawn on the left;
# without `order`, the boxes follow whichever label appears first in the data.
sns.boxplot(
    x="period",
    y="weight",
    data=players,
    order=[before_label, after_label],
)
plt.title("Weight comparison before and after median year")
plt.show()

Indeed, the trend is not substantial.

Let's explore the season statistics.

In [None]:
# Visualise where values are missing in the season statistics table
msno.matrix(season_stat)
plt.show()

There are two empty columns and rows with missing data. Let's drop them.

In [None]:
# First discard the columns that hold no values at all, then discard every
# remaining row that still has at least one missing value (both in place).
season_stat.dropna(axis="columns", how="all", inplace=True)
season_stat.dropna(axis="index", how="any", inplace=True)

Get heatmap

In [None]:
# Correlation heatmap of all numeric season statistics (lower triangle only).

# Keep only the numeric columns and correlate them pairwise
numbered_stat = season_stat.select_dtypes(include="number")
corr_stat = numbered_stat.corr()

# Boolean mask that hides the upper triangle and the diagonal. It is built
# from the matrix *shape*, not from the correlation values: a value-based mask
# such as np.triu(corr_stat) is only truthy where the correlation is non-zero,
# so an upper-triangle cell with a correlation of exactly 0 would stay visible.
mask = np.triu(np.ones_like(corr_stat, dtype=bool))

# Set size for the plot
plt.figure(figsize=(15, 15))

# Draw the heatmap; cells where the mask is True are not shown
sns.heatmap(
    corr_stat,
    mask=mask,
)

plt.title("Heatmap of Correlation Matrix")
plt.show()

There are strongly correlated features

In [None]:
# Heatmap restricted to strongly correlated feature pairs.

# Minimum absolute correlation a cell needs in order to stay visible
correlation_threshold = 0.8

# True where the absolute correlation is below the threshold
narrow_mask = np.abs(corr_stat) < correlation_threshold

# Replace those weak cells with NaN so the heatmap leaves them blank
corr_stat_narrowed = corr_stat.mask(narrow_mask)

# Create a heatmap for the narrowed correlation matrix
plt.figure(figsize=(15, 15))
sns.heatmap(corr_stat_narrowed, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# The filter keeps |corr| >= threshold (strong negative correlations included),
# so the title states exactly that rather than "Correlation > threshold".
plt.title(f"Narrowed Heatmap (|Correlation| >= {correlation_threshold})")
plt.show()

I am not sure what they all mean, but there are several strongly correlated features in the dataset: G, GS, MP, TS, OWS, DWS, WS, FG, FGA, etc.