# model-volleyball: Machine Learning Models

In [1]:
# module imports
import pandas as pd

In [23]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [2]:
# import data
df = pd.read_csv('../data/full_matches.csv')

In [5]:
fr = df[df["season"] == "FR"]
so = df[df["season"] == "SO"]
jr = df[df["season"] == "JR"]
sr = df[df["season"] == "SR"]

#### Per-set totals for each season

In [6]:
print('FR season:')
fr_k_p_s = round(fr['kills'].sum() / fr['sets_played'].sum(), 2)
fr_a_p_s = round(fr['aces'].sum() / fr['sets_played'].sum(), 2)
fr_d_p_s = round(fr['digs'].sum() / fr['sets_played'].sum(), 2)
fr_b_p_s = round(fr['total_blocks'].sum() / fr['sets_played'].sum(), 2)

print(f'K/S: {fr_k_p_s} | A/S: {fr_a_p_s} | D/S: {fr_d_p_s} | B/S: {fr_b_p_s}')

FR season:
K/S: 1.7 | A/S: 0.73 | D/S: 3.22 | B/S: 0.07


In [7]:
print('SO season:')
so_k_p_s = round(so['kills'].sum() / so['sets_played'].sum(), 2)
so_a_p_s = round(so['aces'].sum() / so['sets_played'].sum(), 2)
so_d_p_s = round(so['digs'].sum() / so['sets_played'].sum(), 2)
so_b_p_s = round(so['total_blocks'].sum() / so['sets_played'].sum(), 2)

print(f'K/S: {so_k_p_s} | A/S: {so_a_p_s} | D/S: {so_d_p_s} | B/S: {so_b_p_s}')

SO season:
K/S: 2.55 | A/S: 0.97 | D/S: 3.02 | B/S: 0.07


In [8]:
print('JR season:')
jr_k_p_s = round(167 / jr['sets_played'].sum(), 2)
jr_a_p_s = round(120 / jr['sets_played'].sum(), 2)
jr_d_p_s = round(250 / jr['sets_played'].sum(), 2)
jr_b_p_s = round(8 / jr['sets_played'].sum(), 2)

print(f'K/S: {jr_k_p_s} | A/S: {jr_a_p_s} | D/S: {jr_d_p_s} | B/S: {jr_b_p_s}')

JR season:
K/S: 1.59 | A/S: 1.14 | D/S: 2.38 | B/S: 0.08


In [9]:
print('SR season:')
sr_k_p_s = round(sr['kills'].sum() / sr['sets_played'].sum(), 2)
sr_a_p_s = round(sr['aces'].sum() / sr['sets_played'].sum(), 2)
sr_d_p_s = round(sr['digs'].sum() / sr['sets_played'].sum(), 2)
sr_b_p_s = round(sr['total_blocks'].sum() / sr['sets_played'].sum(), 2)

print(f'K/S: {sr_k_p_s} | A/S: {sr_a_p_s} | D/S: {sr_d_p_s} | B/S: {sr_b_p_s}')

SR season:
K/S: 2.76 | A/S: 1.26 | D/S: 4.17 | B/S: 0.05


FR season:  
K/S: 1.7 | A/S: 0.73 | D/S: 3.22 | B/S: 0.07  

SO season:  
K/S: 2.55 | A/S: 0.97 | D/S: 3.02 | B/S: 0.07  

JR season:  
K/S: 1.59 | A/S: 1.14 | D/S: 2.38 | B/S: 0.08  

SR season:  
K/S: 2.76 | A/S: 1.26 | D/S: 4.17 | B/S: 0.05  
  
---
  
JR stats for aces and blocks seem to be par on the approximate development trajectory as compared to FR, SO, and SR. However, for digs and kills, the JR stats seems massively regressed -- especially compared with FR year. Odd, as I was also team captain for SO, JR, and SR, and played literally almost every set.  

1.59 kills/set for JR is substainally below FR and SO, and it is also odd that the very next year, it jumped up by 1.17 more kills? 2.38 digs/set for JR, which is also lower than both FR and SO, and also another huge jump of 1.79 digs.

#### Percent-change improvements between seasons

In [10]:
print('FR -> SO:')
k_pct = round(((so_k_p_s - fr_k_p_s) / fr_k_p_s) * 100, 2)
a_pct = round(((so_a_p_s - fr_a_p_s) / fr_a_p_s) * 100, 2)
d_pct = round(((so_d_p_s - fr_d_p_s) / fr_d_p_s) * 100, 2)
b_pct = round(((so_b_p_s -  fr_b_p_s) / fr_b_p_s) * 100, 2)

print(f'Kpct: {k_pct} | Apct: {a_pct} | Dpct: {d_pct} | Bpct: {b_pct}')

FR -> SO:
Kpct: 50.0 | Apct: 32.88 | Dpct: -6.21 | Bpct: 0.0


In [11]:
print('SO -> JR:')
k_pct = round(((jr_k_p_s - so_k_p_s) / so_k_p_s) * 100, 2)
a_pct = round(((jr_a_p_s - so_a_p_s) / so_a_p_s) * 100, 2)
d_pct = round(((jr_d_p_s - so_d_p_s) / so_d_p_s) * 100, 2)
b_pct = round(((jr_b_p_s - so_b_p_s) / so_b_p_s) * 100, 2)

print(f'Kpct: {k_pct} | Apct: {a_pct} | Dpct: {d_pct} | Bpct: {b_pct}')

SO -> JR:
Kpct: -37.65 | Apct: 17.53 | Dpct: -21.19 | Bpct: 14.29


In [12]:
print('JR -> SR:')
k_pct = round(((sr_k_p_s - jr_k_p_s) / jr_k_p_s) * 100, 2)
a_pct = round(((sr_a_p_s - jr_a_p_s) / jr_a_p_s) * 100, 2)
d_pct = round(((sr_d_p_s - jr_d_p_s) / jr_d_p_s) * 100, 2)
b_pct = round(((sr_b_p_s - jr_b_p_s) / jr_b_p_s) * 100, 2)

print(f'Kpct: {k_pct} | Apct: {a_pct} | Dpct: {d_pct} | Bpct: {b_pct}')

JR -> SR:
Kpct: 73.58 | Apct: 10.53 | Dpct: 75.21 | Bpct: -37.5


In [13]:
print('FR -> SR:')
k_pct = round(((sr_k_p_s - fr_k_p_s) / fr_k_p_s) * 100, 2)
a_pct = round(((sr_a_p_s - fr_a_p_s) / fr_a_p_s) * 100, 2)
d_pct = round(((sr_d_p_s - fr_d_p_s) / fr_d_p_s) * 100, 2)
b_pct = round(((sr_b_p_s - fr_b_p_s) / fr_b_p_s) * 100, 2)

print(f'Kpct: {k_pct} | Apct: {a_pct} | Dpct: {d_pct} | Bpct: {b_pct}')

FR -> SR:
Kpct: 62.35 | Apct: 72.6 | Dpct: 29.5 | Bpct: -28.57


FR -> SO:  
Kpct: 50.0 | Apct: 32.88 | Dpct: -6.21 | Bpct: 0.0  

SO -> JR:  
Kpct: -37.65 | Apct: 17.53 | Dpct: -21.19 | Bpct: 14.29  

JR -> SR:  
Kpct: 73.58 | Apct: 10.53 | Dpct: 75.21 | Bpct: -37.5  

FR -> SR:  
Kpct: 62.35 | Apct: 72.6 | Dpct: 29.5 | Bpct: -28.57
  
---
  
A negative drop between SO and JR year for kills and digs, despite me attending volleyball camps and striving to improve throughout the summer before the season? There is also another HUGE jump up from JR to SR with a 74% improvement for kills and a 75% improvement for digs. I don't think the numbers show the true statistics for my JR season. 
  
Formula used: `((b-a) / a ) * 100`

#### MACHINE LEARNING

##### Summary/Overview

In [19]:
df.head()

Unnamed: 0,match_key,career_match_index,career_stage,season,season_match_number,date,day_of_week,week_of_season,days_since_last_match,match_no,...,receiving_errors,receiving_per_set,aces,aces_per_set,ace_pct,serve_attempts,serve_errors,serve_pct,points,maxpreps
0,FR_09-01_TA_1,1.0,early,FR,1,2016-09-01,Thursday,1,0,1,...,2.0,4.3,6.0,2.0,40.0,15.0,2.0,86.7,0.0,https://www.maxpreps.com/games/09-01-2016/voll...
1,FR_09-09_RCSA_1,2.0,early,FR,2,2016-09-09,Friday,2,8,1,...,0.0,6.5,2.0,1.0,33.3,6.0,0.0,100.0,0.0,https://www.maxpreps.com/games/09-09-2016/voll...
2,FR_09-09_INTERLACHEN_2,3.0,early,FR,3,2016-09-09,Friday,2,0,2,...,2.0,4.0,4.0,2.0,50.0,8.0,0.0,100.0,0.0,https://www.maxpreps.com/games/09-09-2016/voll...
3,FR_09-09_FSDB_3,4.0,early,FR,4,2016-09-09,Friday,2,0,3,...,2.0,6.0,5.0,2.5,62.5,8.0,0.0,100.0,0.0,https://www.maxpreps.com/games/09-09-2016/voll...
4,FR_09-17_AIDB_1,5.0,early,FR,5,2016-09-17,Saturday,3,8,1,...,0.0,4.0,2.0,0.7,22.2,9.0,1.0,88.9,0.0,https://www.maxpreps.com/games/09-17-2016/voll...


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 80 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   match_key               157 non-null    object 
 1   career_match_index      151 non-null    float64
 2   career_stage            151 non-null    object 
 3   season                  157 non-null    object 
 4   season_match_number     157 non-null    int64  
 5   date                    157 non-null    object 
 6   day_of_week             157 non-null    object 
 7   week_of_season          157 non-null    int64  
 8   days_since_last_match   157 non-null    int64  
 9   match_no                157 non-null    int64  
 10  total_matches_that_day  157 non-null    int64  
 11  total_sets_that_day     157 non-null    int64  
 12  multi_game_day          157 non-null    bool   
 13  first_match_of_day      71 non-null     object 
 14  last_match_of_day       71 non-null     ob

In [21]:
round(df.describe(), 2)

Unnamed: 0,career_match_index,season_match_number,week_of_season,days_since_last_match,match_no,total_matches_that_day,total_sets_that_day,same_day_opponent_num,season_opponent_num,set_count,...,receiving,receiving_errors,receiving_per_set,aces,aces_per_set,ace_pct,serve_attempts,serve_errors,serve_pct,points
count,151.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,157.0,...,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0,151.0
mean,76.0,20.34,5.22,7.39,1.69,2.38,5.89,1.03,1.24,2.78,...,9.2,1.07,3.33,2.08,0.74,16.99,7.48,0.66,64.98,2.2
std,43.73,11.75,2.51,40.18,1.15,1.68,3.41,0.16,0.47,0.74,...,7.5,1.37,2.51,2.96,1.03,18.97,6.88,0.82,41.43,4.49
min,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,38.5,10.0,3.0,0.0,1.0,1.0,3.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,76.0,20.0,6.0,2.0,1.0,1.0,4.0,1.0,1.0,3.0,...,10.0,0.0,3.7,1.0,0.3,11.1,7.0,0.0,85.7,0.0
75%,113.5,30.0,7.0,3.0,2.0,4.0,9.0,1.0,1.0,3.0,...,14.0,2.0,5.3,3.0,1.0,31.05,11.0,1.0,100.0,3.0
max,151.0,46.0,11.0,296.0,6.0,6.0,14.0,2.0,3.0,5.0,...,36.0,6.0,8.3,14.0,5.0,63.6,33.0,3.0,100.0,24.0


##### Feature Identification

X features: career stage, season match number, day of week, match type, location  
y feature: kills

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [62]:
season_order = {'FR': 0, 'SO': 1, 'JR': 2, 'SR': 3}
ml_df = df.copy()
ml_df['season_code'] = ml_df['season'].map(season_order)

In [63]:
train_df = ml_df[ml_df['season'].isin(['FR','SO'])].copy()

junior_df = ml_df[ml_df['season'] == 'JR'].copy()

feature_cols = ['season_code', 'season_match_number', 'opponent', 'match_type', 'location']

train_df_clean = train_df[train_df['kills_per_set'].notnull()]
X_train = train_df_clean[feature_cols]
y_train = train_df_clean['kills_per_set']
X_junior = junior_df[feature_cols]

In [64]:
categorical_features = ['season_code', 'opponent', 'match_type', 'location']
numeric_features = ['season_match_number']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

In [65]:
X_train_encoded = preprocessor.fit_transform(X_train)
X_junior_encoded = preprocessor.transform(X_junior)

In [66]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_encoded, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [73]:
junior_kills_pred = model.predict(X_junior_encoded)

junior_df['kills_per_set_predicted'] = model.predict(X_junior_encoded)

In [74]:
expected_avg_kills = 2.65

ml_pred_avg = junior_df['kills_per_set_predicted'].mean()
scaling_factor = 2.65 / ml_pred_avg
junior_df['kills_per_set_adjusted'] = junior_df['kills_per_set_predicted'] * scaling_factor

In [77]:
junior_df['total_kills_predicted'] = junior_df['kills_per_set_predicted'] * junior_df['sets_played']
junior_df['total_kills_adjusted'] = junior_df['kills_per_set_adjusted'] * junior_df['sets_played']

In [None]:
junior_df['kills_per_game_predicted'] = junior_df['kills_per_set_predicted'] * junior_df['sets_played']
junior_df['kills_per_game_adjusted'] = junior_df['kills_per_set_adjusted'] * junior_df['sets_played']

In [86]:
print("Predicted JR avg K/S:", round(junior_df['kills_per_set_predicted'].mean(), 3))
print("Adjusted JR avg K/S:", round(junior_df['kills_per_set_adjusted'].mean(), 3))
print()
print("Predicted JR total kills:", int(junior_df['total_kills_predicted'].sum()))
print("Adjusted JR total kills:", int(junior_df['total_kills_adjusted'].sum()))
print()
print("Predicted JR avg K/G:", round(junior_df['kills_per_game_predicted'].mean(), 3))
print("Adjusted JR avg K/G:", round(junior_df['kills_per_game_adjusted'].mean(), 3))

Predicted JR avg K/S: 1.971
Adjusted JR avg K/S: 2.65

Predicted JR total kills: 207
Adjusted JR total kills: 279

Predicted JR avg K/G: 5.328
Adjusted JR avg K/G: 7.161


In [None]:
fr_sets_total = df[df['season'] == 'FR']['kills'].sum()
so_sets_total = df[df['season'] == 'SO']['kills'].sum()
sr_sets_total = df[df['season'] == 'SR']['kills'].sum()
print(fr_sets_total + so_sets_total + sr_sets_total)

736.0
