In [1]:
from pathlib import Path
import numpy as np
from scipy.stats import ttest_ind
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Locate the processed CSVs relative to the current working directory:
# they are read from the "data" folder that sits next to that directory.
script_dir = Path.cwd()

# Read the Tigers table and keep only its numeric columns.
data_file = script_dir.parent / "data/processed_data_tigers.csv"
tigers_data = pd.read_csv(data_file).select_dtypes(include=[np.number])

# Read the Heidelberg table the same way. `data_file` is rebound here, so
# after this cell it refers to the Heidelberg CSV.
data_file = script_dir.parent / "data/processed_data_heidelberg.csv"
heidelberg_data = pd.read_csv(data_file).select_dtypes(include=[np.number])

In [18]:
# Welch t-tests comparing Jackson (Tigers data) and Jaworski (Heidelberg data)
# on points, assists and minutes played, restricted to rows where the team's
# win flag equals 1 and the player's minutes played are greater than 0.
#
# The closing interpretation is derived from the computed p-values instead of
# being a fixed text, so it cannot drift out of sync with the data.
#
# NOTE(review): three tests share one alpha with no multiple-comparison
# correction - confirm whether a correction (e.g. Bonferroni) is wanted.
ALPHA = 0.05  # significance threshold applied to each test in this cell

print("H0: The performance of the two top performer (Jackson and Jaworski) is the same in games their teams have won and where they have played")

tigers_features = ["jackson_points", "jackson_assists", "jackson_minutes_played"]
heidelberg_features = ["jaworski_points", "jaworski_assists", "jaworski_minutes_played"]

# The row filter is identical for every feature, so build it once per team.
jackson_games = tigers_data.loc[
    (tigers_data["tigers_win"] == 1) & (tigers_data["jackson_minutes_played"] > 0)
]
jaworski_games = heidelberg_data.loc[
    (heidelberg_data["heidelberg_win"] == 1) & (heidelberg_data["jaworski_minutes_played"] > 0)
]

significant, not_significant, undefined = [], [], []
for tigers_feature, heidelberg_feature in zip(tigers_features, heidelberg_features):
    jackson_performance = jackson_games[tigers_feature]
    jaworski_performance = jaworski_games[heidelberg_feature]

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_value = ttest_ind(jackson_performance, jaworski_performance, equal_var=False)

    # "jackson_minutes_played" -> "minutes_played": drop the player prefix.
    metric = tigers_feature.split('_', 1)[1]
    print(f"T-test for {metric}: T-statistic: {t_stat}, P-value: {p_value}")

    # A NaN p-value must not be read as "not significant": report it separately.
    if np.isnan(p_value):
        undefined.append(metric)
    elif p_value < ALPHA:
        significant.append(metric)
    else:
        not_significant.append(metric)

if significant:
    print(f"Significant difference between the two top performers in games their teams have won (p < {ALPHA}): {', '.join(significant)}.")
if not_significant:
    print(f"No significant difference could be shown (p >= {ALPHA}) for: {', '.join(not_significant)}.")
if undefined:
    print(f"The test could not be evaluated (p-value is NaN) for: {', '.join(undefined)}.")


H0: The performance of the two top performer (Jackson and Jaworski) is the same in games their teams have won
T-test for points: T-statistic: -0.3835158682973332, P-value: 0.7128527140469287
T-test for assists: T-statistic: -1.252748980616332, P-value: 0.24516896257799609
T-test for minutes_played: T-statistic: -2.924620954468904, P-value: 0.030627299871608046
Only the test of minutes_played has a p-value smaller than 0.05, meaning only the minutes played are significantly different between the two top performers in games their teams have won.
But in general the performance of the two top performers is not significantly different in games their teams have won.


In [23]:
# Welch t-test comparing the 3_pointer_percentage column of the two teams.
# The printed interpretation follows the computed p-value rather than
# assuming the outcome in advance.
ALPHA = 0.05  # significance threshold for this test

print("H0: The 3_pointer_percentage of both teams is the same")

tigers_3_pointer_percentage = tigers_data["3_pointer_percentage"]
heidelberg_3_pointer_percentage = heidelberg_data["3_pointer_percentage"]

# equal_var=False selects Welch's t-test (no equal-variance assumption).
t_stat, p_value = ttest_ind(tigers_3_pointer_percentage, heidelberg_3_pointer_percentage, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

if np.isnan(p_value):
    # A NaN p-value must not be read as "no difference"; say so explicitly.
    print("The p value is NaN, so the test could not be evaluated and no conclusion can be drawn about the 3_pointer_percentage of the two teams.")
elif p_value < ALPHA:
    print(f"Since the p value is smaller than {ALPHA}, we reject the null hypothesis, meaning the 3_pointer_percentage differs significantly between the two teams.")
else:
    print(f"Since the p value is not smaller than {ALPHA}, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage between the two teams.")

H0: The 3_pointer_percentage of both teams is the same
T-statistic: 1.431255042451343, P-value: 0.1571209775673478
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaining we could not show that there is a difference in the 3_pointer_percentage between the two teams.
