## Upload Data

In [239]:
import pandas as pd
import numpy as np

scores = pd.read_csv("capstone_data.csv")
print(scores.columns)

Index(['Team_Name_Year', 'Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long', 'Words_Short', 'Worlds_Long'],
      dtype='object')


## Handle Missing Data

In [240]:
missing_data_rows = scores[scores.isnull().any(axis=1)]
print(missing_data_rows)

          Team_Name_Year  Domestic_Short  Domestic_Long  \
4   Les_Supremes_2017/18             NaN            NaN   
11          Nova_2018/19           57.07         124.99   
24     Skyliners_2024/25           60.48         114.28   
25   Haydenettes_2024/25           62.81         124.08   
26     Rockettes_2024/25           76.64         146.95   
27   Team_Unique_2024/25           77.68         149.49   
28  Les_Supremes_2024/25           71.80         139.94   
29          Nova_2024/25           71.72         135.48   

    International_1_Short  International_1_Long  International_2_Short  \
4                   64.33                115.20                  66.70   
11                  77.94                137.92                    NaN   
24                  63.22                124.59                  59.79   
25                  74.11                141.20                  76.78   
26                  76.97                148.18                  78.63   
27                  65.8

In [241]:
short_columns = ['Domestic_Short', 'International_1_Short', 'International_2_Short', 'Nationals_Short']
long_columns  = ['Domestic_Long', 'International_1_Long', 'International_2_Long', 'Nationals_Long']

In [242]:
row = scores.loc[4]
short_average = row[short_columns].dropna().mean()
long_average = row[long_columns].dropna().mean()

In [243]:
if pd.isna(row['Domestic_Short']):
    scores.loc[4, 'Domestic_Short'] = short_average

if pd.isna(row['Domestic_Long']):
    scores.loc[4, 'Domestic_Long'] = long_average

In [244]:
row = scores.loc[11]
short_average = row[short_columns].dropna().mean()
long_average = row[long_columns].dropna().mean()

In [245]:
if pd.isna(row['International_2_Short']):
    scores.loc[11, 'International_2_Short'] = short_average

if pd.isna(row['International_2_Long']):
    scores.loc[11, 'International_2_Long'] = long_average

In [246]:
missing_data_rows = scores[scores.isnull().any(axis=1)]
print(missing_data_rows)

          Team_Name_Year  Domestic_Short  Domestic_Long  \
24     Skyliners_2024/25           60.48         114.28   
25   Haydenettes_2024/25           62.81         124.08   
26     Rockettes_2024/25           76.64         146.95   
27   Team_Unique_2024/25           77.68         149.49   
28  Les_Supremes_2024/25           71.80         139.94   
29          Nova_2024/25           71.72         135.48   

    International_1_Short  International_1_Long  International_2_Short  \
24                  63.22                124.59                  59.79   
25                  74.11                141.20                  76.78   
26                  76.97                148.18                  78.63   
27                  65.88                138.66                  73.44   
28                  77.56                147.64                  77.49   
29                  75.82                142.98                  71.96   

    International_2_Long  Nationals_Short  Nationals_Long  Words_Sh

## Seperate Data (training/testing sets, X and y)

In [247]:
from sklearn.linear_model import LinearRegression

In [248]:
print(scores["Team_Name_Year"])

0              Skyliners_2017/18
1            Haydenettes_2017/18
2     Marigold_Ice_Unity_2017/18
3            Team_Unique_2017/18
4           Les_Supremes_2017/18
5                Nexxice_2017/18
6              Skyliners_2018/19
7            Haydenettes_2018/19
8     Marigold_Ice_Unity_2018/19
9              Rockettes_2018/19
10               Nexxice_2018/19
11                  Nova_2018/19
12                 Miami_2022/23
13           Haydenettes_2022/23
14             Rockettes_2022/23
15           Team_Unique_2022/23
16          Les_Supremes_2022/23
17               Nexxice_2022/23
18             Skyliners_2023/24
19           Haydenettes_2023/24
20             Rockettes_2023/24
21           Team_Unique_2023/24
22          Les_Supremes_2023/24
23                  Nova_2023/24
24             Skyliners_2024/25
25           Haydenettes_2024/25
26             Rockettes_2024/25
27           Team_Unique_2024/25
28          Les_Supremes_2024/25
29                  Nova_2024/25
Name: Team

In [249]:
training_teams = [
    'Skyliners_2017/18', 'Haydenettes_2017/18', 'Marigold_Ice_Unity_2017/18', 
    'Team_Unique_2017/18', 'Les_Supremes_2017/18', 'Nexxice_2017/18', 
    'Skyliners_2018/19', 'Haydenettes_2018/19', 'Marigold_Ice_Unity_2018/19', 
    'Rockettes_2018/19', 'Nexxice_2018/19', 'Nova_2018/19', 'Miami_2022/23', 
    'Haydenettes_2022/23', 'Rockettes_2022/23', 'Team_Unique_2022/23', 
    'Les_Supremes_2022/23', 'Nexxice_2022/23', 'Skyliners_2023/24', 
    'Haydenettes_2023/24', 'Rockettes_2023/24', 'Team_Unique_2023/24', 
    'Les_Supremes_2023/24', 'Nova_2023/24'
]
testing_teams = [
    'Skyliners_2024/25', 'Haydenettes_2024/25', 'Rockettes_2024/25', 
    'Team_Unique_2024/25', 'Les_Supremes_2024/25', 'Nova_2024/25'
]

training = scores[~scores['Team_Name_Year'].isin(training_teams)]
testing = scores[scores['Team_Name_Year'].isin(testing_teams)]

In [250]:
print(scores.columns)

Index(['Team_Name_Year', 'Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long', 'Words_Short', 'Worlds_Long'],
      dtype='object')


In [251]:
training_data = scores[scores['Team_Name_Year'].isin(training_teams)]

In [252]:
X_training = training_data[['Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long']]
y_training = training_data[['Words_Short', 'Worlds_Long']]

## Initialize regression model, fit it to training sets, print coefficient matrix

In [253]:
from sklearn.linear_model import LinearRegression

In [254]:
from sklearn.preprocessing import StandardScaler

In [255]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_training)

In [274]:
scores_predictive_model = LinearRegression()

In [275]:
scores_predictive_model.fit(X_training, y_training)

In [258]:
print(f"Predicitve Model Coefficients: {scores_predictive_model.coef_}")

Predicitve Model Coefficients: [[-0.19039602  0.058473    0.05191213 -0.01109172  0.45697414  0.0744538
   0.74338628 -0.19280091]
 [ 0.64729789  0.18056421 -0.4362169   0.17569138  0.15550712  0.63476714
  -0.51594777  0.33426769]]


In [259]:
pretty_coefficient_matrix = np.array([
    [-0.24110394,  0.11610089,  0.52317278, -0.04158762,  0.17560229, -0.06650812,  0.74122126, -0.13747368],
    [-0.60110002,  0.33611773,  0.65602659,  0.04556856, -0.06496352,  0.25966149,  0.45102472,  0.34020815]])

In [260]:
coefficient_matrix_columns = [
    'Domestic_Short', 'Domestic_Long', 
    'International_1_Short', 'International_1_Long',
    'International_2_Short', 'International_2_Long',
    'Nationals_Short', 'Nationals_Long']
coefficient_matrix_rows = ['Worlds_Short', 'Worlds_Long']

In [261]:
Coefficient_Matrix_of_Model = pd.DataFrame(pretty_coefficient_matrix, columns = coefficient_matrix_columns, index = coefficient_matrix_rows)

In [262]:
print(Coefficient_Matrix_of_Model.round(5))

              Domestic_Short  Domestic_Long  International_1_Short  \
Worlds_Short         -0.2411        0.11610                0.52317   
Worlds_Long          -0.6011        0.33612                0.65603   

              International_1_Long  International_2_Short  \
Worlds_Short              -0.04159                0.17560   
Worlds_Long                0.04557               -0.06496   

              International_2_Long  Nationals_Short  Nationals_Long  
Worlds_Short              -0.06651          0.74122        -0.13747  
Worlds_Long                0.25966          0.45102         0.34021  


## Perform Cross Validation (negative mean squared error and r^2)

In [263]:
from sklearn.model_selection import KFold

In [264]:
k_fold = KFold(n_splits = 10, shuffle = True, random_state = 48)

In [265]:
from sklearn.model_selection import cross_val_score

In [266]:
from sklearn.metrics import make_scorer, mean_squared_error

In [267]:
rmse = make_scorer(mean_squared_error, greater_is_better = False, squared = False)

In [268]:
cross_validate_rmse = cross_val_score(scores_predictive_model, X_training, y_training, cv = k_fold, scoring = rmse)

In [269]:
cross_validate_r2 = cross_val_score(scores_predictive_model, X_training, y_training, cv = k_fold, scoring = 'r2')

In [270]:
print(f"Negative Root Mean Squared Error Average: {-cross_validate_rmse.mean():.3f}")
print(f"R^2 Average: {cross_validate_r2.mean():.3f}")

Negative Root Mean Squared Error Average: 6.802
R^2 Average: -12.220


## Use model 

In [298]:
X_testing = testing[['Domestic_Short', 'Domestic_Long', 'International_1_Short', 'International_1_Long', 
                          'International_2_Short', 'International_2_Long', 'Nationals_Short', 'Nationals_Long']]

In [299]:
Worlds_Predictions = scores_predictive_model.predict(X_testing)

In [300]:
testing_teams['Predicted_Worlds_Short'] = Worlds_Predictions[:, 0]
testing_teams['Predicted_Worlds_Long'] = Worlds_Predictions[:, 1]

TypeError: list indices must be integers or slices, not str

In [301]:
print("Predictions for 2024/25 World Championship Short and Long Program Scores:")
for index, row in X_testing.iterrows():
    print(f"Team Name and Year: {row['Team_Name_Year']}, Predicted Short Program Score: {row['Predicted_Worlds_Short']:.2f}, Predicted Long Program Score: {row['Predicted_Worlds_Long']:.2f}")

Predictions for 2024/25 World Championship Short and Long Program Scores:


KeyError: 'Team_Name_Year'