In [1]:
from pathlib import Path
import numpy as np
from scipy.stats import ttest_ind
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
# Resolve the CSV locations relative to the directory the notebook runs in,
# so no absolute, machine-specific path is needed.
script_dir = Path.cwd()

# Read each team's processed CSV and immediately drop every non-numeric
# column, since the t-tests in the cells below operate on numeric data only.
data_file = script_dir.parent / "data/processed_data_tigers.csv"
tigers_data = pd.read_csv(data_file).select_dtypes(include="number")

data_file = script_dir.parent / "data/processed_data_heidelberg.csv"
heidelberg_data = pd.read_csv(data_file).select_dtypes(include="number")

In [3]:
print("H0: The performance of the two top performer (Jackson and Jaworski) is the same in games their teams have won and where they have played")

# Parallel lists: entry k of each list is the same metric for the respective player.
tigers_features = ["jackson_points", "jackson_assists", "jackson_minutes_played"]
heidelberg_features = ["jaworski_points", "jaworski_assists", "jaworski_minutes_played"]

# The row filter (team won AND the player logged minutes) is the same for every
# metric, so it is applied once here instead of on each loop iteration.
tigers_wins_played = tigers_data.loc[
    (tigers_data["tigers_win"] == 1) & (tigers_data["jackson_minutes_played"] > 0)
]
heidelberg_wins_played = heidelberg_data.loc[
    (heidelberg_data["heidelberg_win"] == 1) & (heidelberg_data["jaworski_minutes_played"] > 0)
]

# NOTE(review): three tests are each run at the 0.05 level without a
# multiple-comparison correction -- confirm that this is intended.
p_values = {}
for tigers_feature, heidelberg_feature in zip(tigers_features, heidelberg_features):
    jackson_performance = tigers_wins_played[tigers_feature]
    jaworski_performance = heidelberg_wins_played[heidelberg_feature]

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_value = ttest_ind(jackson_performance, jaworski_performance, equal_var=False)

    # Strip the player prefix, e.g. "jackson_minutes_played" -> "minutes_played".
    metric = tigers_feature.split('_', 1)[1]
    p_values[metric] = p_value

    print(f"T-test for {metric}: T-statistic: {t_stat}, P-value: {p_value}")

# Build the interpretation from the computed p-values so the printed
# conclusion cannot contradict the statistics when the data changes.
significant = [metric for metric, p in p_values.items() if p < 0.05]

if not significant:
    print("None of the tests has a p-value smaller than 0.05, meaning we could not show that the performance of the two top performers is different in games their teams have won.")
elif len(significant) == len(p_values):
    print("All tests have a p-value smaller than 0.05, meaning the performance of the two top performers is significantly different in games their teams have won.")
else:
    tested = ", ".join(significant)
    readable = ", ".join(metric.replace("_", " ") for metric in significant)
    summary = f"Only the test of {tested} has a p-value smaller than 0.05, meaning only the {readable} are significantly different between the two top performers in games their teams have won."
    # Only claim "not different in general" when a minority of metrics differ.
    if 2 * len(significant) < len(p_values):
        summary += "\nBut in general the performance of the two top performers is not significantly different in games their teams have won."
    print(summary)


H0: The performance of the two top performer (Jackson and Jaworski) is the same in games their teams have won and where they have played
T-test for points: T-statistic: -0.3835158682973332, P-value: 0.7128527140469287
T-test for assists: T-statistic: -1.252748980616332, P-value: 0.24516896257799609
T-test for minutes_played: T-statistic: -2.924620954468904, P-value: 0.030627299871608046
Only the test of minutes_played has a p-value smaller than 0.05, meaning only the minutes played are significantly different between the two top performers in games their teams have won.
But in general the performance of the two top performers is not significantly different in games their teams have won.


In [4]:
print("H0: The 3_pointer_percentage of both teams is the same")

# Compare the full 3_pointer_percentage column of each team's data.
tigers_3_pointer_percentage = tigers_data["3_pointer_percentage"]
heidelberg_3_pointer_percentage = heidelberg_data["3_pointer_percentage"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_3_pointer_percentage, heidelberg_3_pointer_percentage, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the 3_pointer_percentage is significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage between the two teams.''')

H0: The 3_pointer_percentage of both teams is the same
T-statistic: 1.431255042451343, P-value: 0.1571209775673478
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage between the two teams.


In [10]:
print("H0: The 3_pointer_percentage_given_win given they win is the same of both teams")

# Restrict each team's 3_pointer_percentage to the rows flagged as a win.
tigers_3_pointer_percentage_given_win = tigers_data.loc[tigers_data["tigers_win"] == 1, "3_pointer_percentage"]
heidelberg_3_pointer_percentage_given_win = heidelberg_data.loc[heidelberg_data["heidelberg_win"] == 1, "3_pointer_percentage"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_3_pointer_percentage_given_win, heidelberg_3_pointer_percentage_given_win, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the 3_pointer_percentage given a win is significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage given a win between the two teams.''')

H0: The 3_pointer_percentage given they win is the same of both teams
T-statistic: 0.5948758260475356, P-value: 0.5717378159474669
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage given a win between the two teams.


In [12]:
print("H0: The 3_pointer_percentage_given_loss given they lose is the same of both teams")

# Restrict each team's 3_pointer_percentage to the rows flagged as a loss
# (win indicator equal to 0).
tigers_3_pointer_percentage_given_loss = tigers_data.loc[tigers_data["tigers_win"] == 0, "3_pointer_percentage"]
heidelberg_3_pointer_percentage_given_loss = heidelberg_data.loc[heidelberg_data["heidelberg_win"] == 0, "3_pointer_percentage"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_3_pointer_percentage_given_loss, heidelberg_3_pointer_percentage_given_loss, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the 3_pointer_percentage rate given a loss is significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the 3_pointer_percentage rate given a loss between the two teams.''')

H0: The 3_pointer_percentage_given_loss given they lose is the same of both teams
T-statistic: 2.0987419690520954, P-value: 0.04122912102306113
Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the 3_pointer_percentage rate given a loss is significantly different between the two teams.


In [18]:
print("H0: The number of assists of both teams is the same")

# Compare the full assists column of each team's data.
tigers_assists = tigers_data["assists"]
heidelberg_assists = heidelberg_data["assists"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_assists, heidelberg_assists, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the assists are significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the assists between the two teams.''')

H0: The number of assists of both teams is the same
T-statistic: 1.9650229218283435, P-value: 0.05362298217579887
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the assists between the two teams.


In [37]:
print("H0: The number of rebounds of both teams is the same")

# Compare the full rebounds column of each team's data.
tigers_rebounds = tigers_data["rebounds"]
heidelberg_rebounds = heidelberg_data["rebounds"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_rebounds, heidelberg_rebounds, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the rebounds are significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the rebounds between the two teams.''')

H0: The number of rebounds of both teams is the same
T-statistic: -3.256960726190201, P-value: 0.0018092156972491517
Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the rebounds are significantly different between the two teams.


In [36]:
print("H0: The number of fouls of both teams is the same")

# Compare the full fouls column of each team's data.
tigers_fouls = tigers_data["fouls"]
heidelberg_fouls = heidelberg_data["fouls"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_fouls, heidelberg_fouls, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the fouls are significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the fouls between the two teams.''')

H0: The number of fouls of both teams is the same
T-statistic: 2.8266868790334145, P-value: 0.0067334581105090415
Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the fouls are significantly different between the two teams.


In [38]:
print("H0: The number of turnovers of both teams is the same")

# Compare the full turnovers column of each team's data.
tigers_turnovers = tigers_data["turnovers"]
heidelberg_turnovers = heidelberg_data["turnovers"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_turnovers, heidelberg_turnovers, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the turnovers are significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the turnovers between the two teams.''')

H0: The number of turnovers of both teams is the same
T-statistic: 1.8891916781838352, P-value: 0.06349821658038983
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the turnovers between the two teams.


In [48]:
print("H0: The number of points in the 4th quarter is the same for both teams")

# Each team's own 4th-quarter points column (the column name carries the team prefix).
tigers_points_4th_quarter = tigers_data["tigers_points_4th_quarter"]
heidelberg_points_4th_quarter = heidelberg_data["heidelberg_points_4th_quarter"]

# equal_var=False selects Welch's t-test, which does not assume that both
# samples share the same variance.
t_stat, p_value = ttest_ind(tigers_points_4th_quarter, heidelberg_points_4th_quarter, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Choose the conclusion from the computed p-value instead of hard-coding it,
# so the printed text stays correct if the underlying data changes.
if p_value < 0.05:
    print('''Since the p value is smaller than 0.05, we can reject the null hypothesis, meaning the number of points in the 4th quarter is significantly different between the two teams.''')
else:
    print('''Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the number of points in the 4th quarter between the two teams.''')

H0: The number of points in the 4th quarter is the same for both teams
T-statistic: -0.6526010497472435, P-value: 0.5162884931137004
Since the p value is larger than 0.05, we cannot reject the null hypothesis, meaning we could not show that there is a difference in the number of points in the 4th quarter between the two teams.
