# Master Thesis Statistical Tests Code
Below is all of the code used to run the statistical tests whose results appear in my MA Thesis. 

## Chi Squared Tests for Distances 

### Men and Women

In [18]:
import numpy as np
from scipy.stats import chi2_contingency

# Data: [distances over 7.5 km, distances between 2 and 7.5 km, distances under 2 km]
men_counts = [173, 542, 482]
women_counts = [132, 488, 515]

# Create contingency table
contingency_table = np.array([men_counts, women_counts])

# Perform Chi-Square test
chi2_stat, p_val, dof, ex = chi2_contingency(contingency_table)

print(f'Chi2 Statistic: {chi2_stat}, P-value: {p_val}')

Chi2 Statistic: 7.791957436808222, P-value: 0.020323473754362217


## Residence Chi Squared

In [19]:
# Data: [distances over 7.5 km, distances between 2 and 7.5 km, distances under 2 km]
rural_counts = [294, 522, 333]
urban_counts = [36, 565, 705]

# Create contingency table
contingency_table = np.array([rural_counts, urban_counts])

# Perform Chi-Square test
chi2_stat, p_val, dof, ex = chi2_contingency(contingency_table)

print(f'Chi2 Statistic: {chi2_stat}, P-value: {p_val}')

Chi2 Statistic: 328.02925232534125, P-value: 5.879669382717215e-72


## Weighted Chi Squared test for E-bikes and Standard Bikes Distance
The weight is 60% for standard and 40% for e-bikes

In [20]:
#WEIGHTED CHI2

# Data: [distances over 7.5 km, distances between 2 and 7.5 km, distances under 2 km]
normal_bike_counts = [144, 585, 725]
e_bike_counts = [182, 483, 299]

# Population proportions
p_normal_bike = 0.60
p_e_bike = 0.40

# Adjust counts based on proportions
total_counts = np.array(normal_bike_counts) + np.array(e_bike_counts)
expected_normal_bike_counts = total_counts * p_normal_bike
expected_e_bike_counts = total_counts * p_e_bike

# Create contingency table with expected counts
contingency_table = np.array([normal_bike_counts, e_bike_counts])
expected_table = np.array([expected_normal_bike_counts, expected_e_bike_counts])

# Perform Chi-Square test with expected frequencies
chi2_stat, p_val, dof, ex = chi2_contingency(contingency_table, correction=False)

print(f'Chi2 Statistic: {chi2_stat}, P-value: {p_val}')

Chi2 Statistic: 96.04071997017141, P-value: 1.3964411546656164e-21


## ANOVA for age group distances 

In [13]:
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Example data with different sample sizes
data = {
    'distance': [29, 81, 116, 67, 361, 340, 161, 458, 403, 14, 250, 175 ],
    'age_group': ['Age_Group_1', 'Age_Group_1', 'Age_Group_1',  
                  'Age_Group_2', 'Age_Group_2', 'Age_Group_2',  
                  'Age_Group_3', 'Age_Group_3', 'Age_Group_3', 
                  'Age_Group_4', 'Age_Group_4', 'Age_Group_4']
}

df = pd.DataFrame(data)

#WELCH ANOVA 
model = ols('distance ~ age_group', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2, robust='hc3')
print(anova_table)

                  sum_sq   df         F    PR(>F)
age_group  124355.609392  3.0  2.426782  0.140594
Residual   136648.000000  8.0       NaN       NaN


## Paired T-Test and Logit Transformation for the Road Type Analysis
#### mode, age, and gender

## Mode Test

In [1]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Data
data = {
    "Road Type": ["Cycleway", "Residential", "Primary", "Secondary", "Tertiary", "Unclassified"],
    "E-bike": [0.3152327221, 0.1636107193, 0.02820874471, 0.1488011283, 0.2475317348, 0.09661495063],
    "Standard Bicycle": [0.3007975693, 0.204329662, 0.0140524117, 0.1344473984, 0.2628180782, 0.08355488036]
}

df = pd.DataFrame(data)

# Logit Transformation
df['logit_E-bike'] = np.log(df['E-bike'] / (1 - df['E-bike']))
df['logit_Standard_Bicycle'] = np.log(df['Standard Bicycle'] / (1 - df['Standard Bicycle']))

# Check for Normality using Shapiro-Wilk test
shapiro_ebike = stats.shapiro(df['logit_E-bike'])
shapiro_standard = stats.shapiro(df['logit_Standard_Bicycle'])

# Perform paired t-test
t_test_result = stats.ttest_rel(df['logit_E-bike'], df['logit_Standard_Bicycle'])

# Output results
shapiro_ebike, shapiro_standard, t_test_result

(ShapiroResult(statistic=0.9285017659902026, pvalue=0.5685824471769393),
 ShapiroResult(statistic=0.865789135939045, pvalue=0.2099297347390255),
 TtestResult(statistic=0.8684731850011399, pvalue=0.42485043706837294, df=5))

## Logit Transformation for Ages and Road Type

In [6]:
data = {
    "Cycleway": [0.2868217054, 0.2955390335, 0.2977983778, 0.2688679245],
    "Residential": [0.1395348837, 0.1338289963, 0.1402085747, 0.1933962264],
    "Primary": [0.007751937984, 0.04460966543, 0.04287369641, 0.02830188679],
    "Secondary": [0.1705426357, 0.1468401487, 0.1494785632, 0.1226415094],
    "Tertiary": [0.2868217054, 0.2639405204, 0.2618771727, 0.2216981132],
    "Unclassified": [0.1085271318, 0.1152416357, 0.1077636153, 0.1650943396]
}

# Function to perform logit transformation
def logit(p):
    return np.log(p / (1 - p))

# Apply logit transformation to the data
logit_data = {road_type: [logit(p) for p in probabilities] for road_type, probabilities in data.items()}

# Print the transformed data
for road_type, transformed_probs in logit_data.items():
    print(f"{road_type}: {transformed_probs}")

Cycleway: [-0.9108706645336643, -0.8686320026568548, -0.8578039009446883, -1.0003738492286691]
Residential: [-1.8191584435904937, -1.8675195149588633, -1.813558697896492, -1.4280914898951143]
Primary: [-4.852030263984117, -3.064169435048779, -3.1056768608250787, -3.536116699650717]
Secondary: [-1.5817863808131485, -1.7596023575846216, -1.7386966238736872, -1.967650136007352]
Tertiary: [-0.9108706645336643, -1.025587153890494, -1.0362346560594926, -1.2557978722342615]
Unclassified: [-2.105874798571718, -2.038283469066011, -2.113791021647473, -1.620801671248677]


In [17]:
data = {
    'road': [-0.91, -1.81, -4.852, -1.58, -0.91, -2.105,
            -0.86, -1.86, -3.06, -1.56, -1.02, -2.03, 
            -0.857, -1.81, -3.105, -1.73, -1.03, -2.11, 
            -1.0, -1.42, -3.536, -1.96, -1.25, -1.62],
    'age_group': ['Age_Group_1', 'Age_Group_1', 'Age_Group_1', 'Age_Group_1', 'Age_Group_1', 'Age_Group_1',
                  'Age_Group_2', 'Age_Group_2', 'Age_Group_2',  'Age_Group_2',  'Age_Group_2','Age_Group_2',
                  'Age_Group_3', 'Age_Group_3', 'Age_Group_3', 'Age_Group_3','Age_Group_3','Age_Group_3',
                  'Age_Group_4', 'Age_Group_4', 'Age_Group_4','Age_Group_4', 'Age_Group_4', 'Age_Group_4']
}

df = pd.DataFrame(data)

#WELCH ANOVA 
model = ols('road ~ age_group', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2, robust='hc3')
print(anova_table)


              sum_sq    df         F    PR(>F)
age_group   0.171805   3.0  0.053684  0.983131
Residual   21.335341  20.0       NaN       NaN


## Gender Test

In [3]:
# Normalized values

data = {
    "Road Type": ["Cycleway", "Residential", "Primary", "Secondary", "Tertiary", "Unclassified"],
    'men': [0.310707457, 0.1787762906, 0.01912045889, 0.1496175908, 0.2543021033, 0.08747609943],
    'women': [0.2966611932, 0.2074438971, 0.01915708812, 0.1308155446, 0.2621784346, 0.08374384236]
    }

df = pd.DataFrame(data)

#Logit transformation 
df['logit_men'] = np.log(df['men'] / (1 - df['men']))
df['logit_women'] = np.log(df['women'] / (1 - df['women']))

# Check for Normality using Shapiro-Wilk test
shapiro_men = stats.shapiro(df['logit_men'])
shapiro_women = stats.shapiro(df['logit_women'])

# Perform paired t-test
t_test_result = stats.ttest_rel(df['logit_men'], df['logit_women'])

# Output results
print(f'Shapiro men: {shapiro_men}, Shapiro women: {shapiro_women}, T-test P-value: {t_test_result}')


Shapiro men: ShapiroResult(statistic=0.8939977425632528, pvalue=0.33966649307330066), Shapiro women: ShapiroResult(statistic=0.8870948100097674, pvalue=0.30324449099575884), T-test P-value: TtestResult(statistic=0.15214920057090145, pvalue=0.8850181502962471, df=5)


## Residence Test

In [4]:
# Normalized values
data = {
     "Road Type": ["Cycleway", "Residential", "Primary", "Secondary", "Tertiary", "Unclassified"],
    "rural": [0.2929465301, 0.1450511945, 0.03924914676, 0.1467576792, 0.2593856655, 0.1166097838], 
    "urban": [0.3132171696, 0.2239694008, 0.003824904377, 0.134721632, 0.2566935827, 0.06757331067]

}

df = pd.DataFrame(data)

#Logit transformation 
df['logit_rural'] = np.log(df['rural'] / (1 - df['rural']))
df['logit_urban'] = np.log(df['urban'] / (1 - df['urban']))

# Check for Normality using Shapiro-Wilk test
shapiro_rural = stats.shapiro(df['logit_rural'])
shapiro_urban = stats.shapiro(df['logit_urban'])

# Perform paired t-test
t_test_result = stats.ttest_rel(df['logit_rural'], df['logit_urban'])

# Output results
print(f'Shapiro rural: {shapiro_rural}, Shapiro urban: {shapiro_urban}, T-test P-value: {t_test_result}')


Shapiro rural: ShapiroResult(statistic=0.9125089714688732, pvalue=0.45310710952500893), Shapiro urban: ShapiroResult(statistic=0.803522477992112, pvalue=0.06320820719617136), T-test P-value: TtestResult(statistic=0.9762041457309291, pvalue=0.3737971149326715, df=5)



# Urbanity Chi Squared Tests 


### Weighted Chi Squared for Urbanity Level Points

In [3]:

#WEIGHTED CHI2

# Data: [Urbanity 1, Urbanity 2, etc.]
normal_bike_counts = [817, 208, 148, 114, 229]
e_bike_counts = [349, 146, 108, 159, 258]

# Population proportions
p_normal_bike = 0.60
p_e_bike = 0.40

# Adjust counts based on proportions
total_counts = np.array(normal_bike_counts) + np.array(e_bike_counts)
expected_normal_bike_counts = total_counts * p_normal_bike
expected_e_bike_counts = total_counts * p_e_bike

# Create contingency table with expected counts
contingency_table = np.array([normal_bike_counts, e_bike_counts])
expected_table = np.array([expected_normal_bike_counts, expected_e_bike_counts])

# Perform Chi-Square test with expected frequencies
chi2_stat, p_val, dof, ex = chi2_contingency(contingency_table, correction=False)

print(f'Chi2 Statistic: {chi2_stat}, P-value: {p_val}')

Chi2 Statistic: 121.74299868410914, P-value: 2.2663885688981556e-25


### Men and Women Urbanity

##### Chi square test of independece

In [5]:
import numpy as np
from scipy.stats import chi2_contingency

# Creating the contingency table
data = np.array([
    [574, 559],  # Level 1
    [171, 172],  # Level 2
    [108, 143],   # Level 3
    [120, 138],   # Level 4
    [217, 249]    # Level 5
])

# Perform the Chi-Squared test
chi2, p, dof, expected = chi2_contingency(data)

# Print the results
print(f"Chi-Squared Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies: \n{expected}")

# Interpretation
alpha = 0.05
if p < alpha:
    print("We reject the null hypothesis (there is an association between urbanity level and total points).")
else:
    print("We fail to reject the null hypothesis (no association between urbanity level and total points).")


Chi-Squared Statistic: 6.4839495463192245
P-value: 0.16580450523265755
Degrees of Freedom: 4
Expected Frequencies: 
[[550.08975928 582.91024072]
 [166.53202774 176.46797226]
 [121.86454508 129.13545492]
 [125.26315789 132.73684211]
 [226.25051    239.74949   ]]
We fail to reject the null hypothesis (no association between urbanity level and total points).


### Chi Squared test of Independence for Rural vs Urban for Urbanity

In [4]:
import numpy as np
from scipy.stats import chi2_contingency

# Creating the contingency table
data = np.array([
    [929, 250],  # Level 1
    [208, 152],  # Level 2
    [83, 178],   # Level 3
    [49, 232],   # Level 4
    [33, 465]    # Level 5
])

# Perform the Chi-Squared test
chi2, p, dof, expected = chi2_contingency(data)

# Print the results
print(f"Chi-Squared Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies: \n{expected}")

# Interpretation
alpha = 0.05
if p < alpha:
    print("We reject the null hypothesis (there is an association between urbanity level and total points).")
else:
    print("We fail to reject the null hypothesis (no association between urbanity level and total points).")


Chi-Squared Statistic: 928.1035537506363
P-value: 1.3563609996323396e-199
Degrees of Freedom: 4
Expected Frequencies: 
[[595.2144242  583.7855758 ]
 [181.74486235 178.25513765]
 [131.7650252  129.2349748 ]
 [141.861962   139.138038  ]
 [251.41372625 246.58627375]]
We reject the null hypothesis (there is an association between urbanity level and total points).


## Clusters expected differences

In [6]:
import pandas as pd
from scipy.stats import chi2_contingency

# Data provided
data = {
    "clusterID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    "men": [27, 16, 9, 12, 10, 17, 10, 8, 8, 34, 8, 8, 10, 6, 3],
    "women": [27, 14, 14, 10, 8, 17, 6, 8, 7, 34, 7, 4, 7, 5, 7],
    "other": [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
}

# Create a DataFrame
df = pd.DataFrame(data)

# Create the contingency table
contingency_table = pd.melt(df, id_vars=["clusterID"], value_vars=["men", "women", "other"], var_name="Gender", value_name="Count")
contingency_table = contingency_table.pivot_table(index="clusterID", columns="Gender", values="Count").fillna(0)

# Perform the Chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

Chi-square Statistic: 21.673759745035312
P-value: 0.7961663336480427
Degrees of Freedom: 28
Expected Frequencies:
 [[29.04657534  0.62465753 27.32876712]
 [15.28767123  0.32876712 14.38356164]
 [11.72054795  0.25205479 11.02739726]
 [11.2109589   0.24109589 10.54794521]
 [ 9.17260274  0.19726027  8.63013699]
 [17.3260274   0.37260274 16.30136986]
 [ 8.15342466  0.17534247  7.67123288]
 [ 8.15342466  0.17534247  7.67123288]
 [ 7.64383562  0.16438356  7.19178082]
 [34.65205479  0.74520548 32.60273973]
 [ 7.64383562  0.16438356  7.19178082]
 [ 6.11506849  0.13150685  5.75342466]
 [ 9.17260274  0.19726027  8.63013699]
 [ 5.60547945  0.12054795  5.2739726 ]
 [ 5.09589041  0.10958904  4.79452055]]
