## Upload Data

In [155]:
import pandas as pd
import numpy as np

scores = pd.read_csv("capstone_data.csv")
print(scores.columns)

Index(['Team_Name_Year', 'Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long', 'Words_Short', 'Worlds_Long'],
      dtype='object')


## Handle Missing Data

In [156]:
missing_data_rows = scores[scores.isnull().any(axis=1)]
print(missing_data_rows)

          Team_Name_Year  Domestic_Short  Domestic_Long  \
4   Les_Supremes_2017/18             NaN            NaN   
11          Nova_2018/19           57.07         124.99   
24     Skyliners_2024/25           60.48         114.28   
25   Haydenettes_2024/25           62.81         124.08   
26     Rockettes_2024/25           76.64         146.95   
27   Team_Unique_2024/25           77.68         149.49   
28  Les_Supremes_2024/25           71.80         139.94   
29          Nova_2024/25           71.72         135.48   

    International_1_Short  International_1_Long  International_2_Short  \
4                   64.33                115.20                  66.70   
11                  77.94                137.92                    NaN   
24                  63.22                124.59                  59.79   
25                  74.11                141.20                  76.78   
26                  76.97                148.18                  78.63   
27                  65.8

In [157]:
short_columns = ['Domestic_Short', 'International_1_Short', 'International_2_Short', 'Nationals_Short']
long_columns  = ['Domestic_Long', 'International_1_Long', 'International_2_Long', 'Nationals_Long']

In [158]:
row = scores.loc[4]
short_average = row[short_columns].dropna().mean()
long_average = row[long_columns].dropna().mean()

In [159]:
if pd.isna(row['Domestic_Short']):
    scores.loc[4, 'Domestic_Short'] = short_average

if pd.isna(row['Domestic_Long']):
    scores.loc[4, 'Domestic_Long'] = long_average

In [160]:
row = scores.loc[11]
short_average = row[short_columns].dropna().mean()
long_average = row[long_columns].dropna().mean()

In [161]:
if pd.isna(row['International_2_Short']):
    scores.loc[11, 'International_2_Short'] = short_average

if pd.isna(row['International_2_Long']):
    scores.loc[11, 'International_2_Long'] = long_average

In [162]:
missing_data_rows = scores[scores.isnull().any(axis=1)]
print(missing_data_rows)

          Team_Name_Year  Domestic_Short  Domestic_Long  \
24     Skyliners_2024/25           60.48         114.28   
25   Haydenettes_2024/25           62.81         124.08   
26     Rockettes_2024/25           76.64         146.95   
27   Team_Unique_2024/25           77.68         149.49   
28  Les_Supremes_2024/25           71.80         139.94   
29          Nova_2024/25           71.72         135.48   

    International_1_Short  International_1_Long  International_2_Short  \
24                  63.22                124.59                  59.79   
25                  74.11                141.20                  76.78   
26                  76.97                148.18                  78.63   
27                  65.88                138.66                  73.44   
28                  77.56                147.64                  77.49   
29                  75.82                142.98                  71.96   

    International_2_Long  Nationals_Short  Nationals_Long  Words_Sh

## Seperate Data (training/testing sets, X and y)

In [163]:
from sklearn.linear_model import LinearRegression

In [164]:
print(scores["Team_Name_Year"])

0              Skyliners_2017/18
1            Haydenettes_2017/18
2     Marigold_Ice_Unity_2017/18
3            Team_Unique_2017/18
4           Les_Supremes_2017/18
5                Nexxice_2017/18
6              Skyliners_2018/19
7            Haydenettes_2018/19
8     Marigold_Ice_Unity_2018/19
9              Rockettes_2018/19
10               Nexxice_2018/19
11                  Nova_2018/19
12                 Miami_2022/23
13           Haydenettes_2022/23
14             Rockettes_2022/23
15           Team_Unique_2022/23
16          Les_Supremes_2022/23
17               Nexxice_2022/23
18             Skyliners_2023/24
19           Haydenettes_2023/24
20             Rockettes_2023/24
21           Team_Unique_2023/24
22          Les_Supremes_2023/24
23                  Nova_2023/24
24             Skyliners_2024/25
25           Haydenettes_2024/25
26             Rockettes_2024/25
27           Team_Unique_2024/25
28          Les_Supremes_2024/25
29                  Nova_2024/25
Name: Team

In [165]:
training_teams = [
    'Skyliners_2017/18', 'Haydenettes_2017/18', 'Marigold_Ice_Unity_2017/18', 
    'Team_Unique_2017/18', 'Les_Supremes_2017/18', 'Nexxice_2017/18', 
    'Skyliners_2018/19', 'Haydenettes_2018/19', 'Marigold_Ice_Unity_2018/19', 
    'Rockettes_2018/19', 'Nexxice_2018/19', 'Nova_2018/19', 'Miami_2022/23', 
    'Haydenettes_2022/23', 'Rockettes_2022/23', 'Team_Unique_2022/23', 
    'Les_Supremes_2022/23', 'Nexxice_2022/23', 'Skyliners_2023/24', 
    'Haydenettes_2023/24', 'Rockettes_2023/24', 'Team_Unique_2023/24', 
    'Les_Supremes_2023/24', 'Nova_2023/24'
]
testing_teams = [
    'Skyliners_2024/25', 'Haydenettes_2024/25', 'Rockettes_2024/25', 
    'Team_Unique_2024/25', 'Les_Supremes_2024/25', 'Nova_2024/25'
]

training = scores[~scores['Team_Name_Year'].isin(training_teams)]
testing = scores[scores['Team_Name_Year'].isin(testing_teams)]

In [166]:
print(scores.columns)

Index(['Team_Name_Year', 'Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long', 'Words_Short', 'Worlds_Long'],
      dtype='object')


In [167]:
training_data = scores[scores['Team_Name_Year'].isin(training_teams)]

In [168]:
X_training = training_data[['Domestic_Short', 'Domestic_Long',
       'International_1_Short', 'International_1_Long',
       'International_2_Short', 'International_2_Long', 'Nationals_Short',
       'Nationals_Long']]
y_training = training_data[['Words_Short', 'Worlds_Long']]

## Initialize regression model, fit it to training sets, print coefficient matrix

In [169]:
from sklearn.linear_model import LinearRegression

In [170]:
from sklearn.preprocessing import StandardScaler

In [171]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_training)

In [172]:
scores_predictive_model = LinearRegression()

In [211]:
scores_predictive_model.fit(X_scaled, y_training)

In [174]:
print(f"Predicitve Model Coefficients: {scores_predictive_model.coef_}")

Predicitve Model Coefficients: [[-1.10251247  0.78090829  0.30332006 -0.28124633  2.64271415  1.03267436
   3.45574149 -2.10195001]
 [ 3.74826116  2.41143933 -2.54879435  4.4549046   0.89930881  8.80422158
  -2.39845979  3.64424619]]


In [175]:
pretty_coefficient_matrix = np.array([
    [-0.24110394,  0.11610089,  0.52317278, -0.04158762,  0.17560229, -0.06650812,  0.74122126, -0.13747368],
    [-0.60110002,  0.33611773,  0.65602659,  0.04556856, -0.06496352,  0.25966149,  0.45102472,  0.34020815]])

In [176]:
coefficient_matrix_columns = [
    'Domestic_Short', 'Domestic_Long', 
    'International_1_Short', 'International_1_Long',
    'International_2_Short', 'International_2_Long',
    'Nationals_Short', 'Nationals_Long']
coefficient_matrix_rows = ['Worlds_Short', 'Worlds_Long']

In [177]:
Coefficient_Matrix_of_Model = pd.DataFrame(pretty_coefficient_matrix, columns = coefficient_matrix_columns, index = coefficient_matrix_rows)

In [178]:
print(Coefficient_Matrix_of_Model.round(5))

              Domestic_Short  Domestic_Long  International_1_Short  \
Worlds_Short         -0.2411        0.11610                0.52317   
Worlds_Long          -0.6011        0.33612                0.65603   

              International_1_Long  International_2_Short  \
Worlds_Short              -0.04159                0.17560   
Worlds_Long                0.04557               -0.06496   

              International_2_Long  Nationals_Short  Nationals_Long  
Worlds_Short              -0.06651          0.74122        -0.13747  
Worlds_Long                0.25966          0.45102         0.34021  


## Perform Cross Validation (negative mean squared error and r^2)

In [179]:
from sklearn.model_selection import KFold

In [231]:
k_fold = KFold(n_splits = 3, shuffle = True, random_state = 48)

In [246]:
from sklearn.model_selection import cross_val_predict

In [247]:
from sklearn.metrics import make_scorer, mean_squared_error

In [248]:
rmse = make_scorer(mean_squared_error, greater_is_better = False, squared = False)

In [249]:
y_training_short = y_training.iloc[:, 0]
y_training_long = y_training.iloc[:, 1]

In [250]:
short_cross_validate_rmse = cross_val_predict(scores_predictive_model, X_scaled, y_training_short, cv = k_fold, scoring = rmse)
long_cross_validate_rmse = cross_val_predict(scores_predictive_model, X_scaled, y_training_long, cv = k_fold, scoring = rmse)

TypeError: cross_val_predict() got an unexpected keyword argument 'scoring'

In [245]:
print(scores_predictive_model.score(X_scaled, y_training))
#this is coefficient of determination

0.8555127210971831


In [244]:
print(f"Root Mean Squared Error Average for Short Scores: {-short_cross_validate_rmse.mean():.3}")
print(f"Root Mean Squared Error Average for Long Scores: {-long_cross_validate_rmse.mean():.3}")

Root Mean Squared Error Average for Short Scores: 5.92
Root Mean Squared Error Average for Long Scores: 13.0


### Standard Deviation

In [240]:
standard_deviation_short = np.std(y_short)
standard_deviation_long = np.std(y_long)

In [204]:
print("Standard Deviation of Worlds Short Scores in Training Set:", standard_deviation_short)
print("Standard Deviation of Worlds Long Scores in Training Set:", standard_deviation_long)

Standard Deviation of Worlds Short Scores in Training Set: 5.668089956061036
Standard Deviation of Worlds Long Scores in Training Set: 15.009471864783327


## Use model 

In [192]:
X_testing = testing[['Domestic_Short', 'Domestic_Long', 'International_1_Short', 'International_1_Long', 
                    'International_2_Short', 'International_2_Long', 'Nationals_Short', 'Nationals_Long']]

In [193]:
Worlds_Predictions = scores_predictive_model.predict(X_testing)



In [194]:
testing['Predicted_Worlds_Short'] = Worlds_Predictions[:, 0]
testing['Predicted_Worlds_Long'] = Worlds_Predictions[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['Predicted_Worlds_Short'] = Worlds_Predictions[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['Predicted_Worlds_Long'] = Worlds_Predictions[:, 1]


In [195]:
print("Predictions for 2024/25 World Championship Short and Long Program Scores:")
for index, row in testing.iterrows():
    print(f"Team Name and Year: {row['Team_Name_Year']}, Predicted Short Program Score: {row['Predicted_Worlds_Short']:.2f}, Predicted Long Program Score: {row['Predicted_Worlds_Long']:.2f}")

Predictions for 2024/25 World Championship Short and Long Program Scores:
Team Name and Year: Skyliners_2024/25, Predicted Short Program Score: 306.73, Predicted Long Program Score: 2521.73
Team Name and Year: Haydenettes_2024/25, Predicted Short Program Score: 389.90, Predicted Long Program Score: 2796.09
Team Name and Year: Rockettes_2024/25, Predicted Short Program Score: 410.09, Predicted Long Program Score: 2859.79
Team Name and Year: Team_Unique_2024/25, Predicted Short Program Score: 405.42, Predicted Long Program Score: 2973.83
Team Name and Year: Les_Supremes_2024/25, Predicted Short Program Score: 409.12, Predicted Long Program Score: 2852.32
Team Name and Year: Nova_2024/25, Predicted Short Program Score: 367.03, Predicted Long Program Score: 2763.25
