In [3]:
import pandas as pd
import numpy as np

In [2]:
# Read each input table from the local csvs/ directory, one DataFrame per file.
# The names on the left line up, in order, with the paths listed below.
seasons, metrics, team_games, team_seasons, team_colors = (
    pd.read_csv(csv_path)
    for csv_path in (
        'csvs/seasons.csv',
        'csvs/metrics.csv',
        'csvs/team_games.csv',
        'csvs/team_seasons.csv',
        'csvs/team_colors.csv',
    )
)

In [3]:
# Show the stat names listed in the metrics table.
metrics.loc[:, 'stat_name']

0        tm_to_pct
1       opp_to_pct
2       tm_drb_pct
3       tm_orb_pct
4       tm_efg_pct
5      opp_efg_pct
6      poss_per_40
7       tm_net_eff
8       tm_off_eff
9       tm_def_eff
10     tm_ftm_rate
11    opp_ftm_rate
Name: stat_name, dtype: object

In [4]:
# Reshape team_seasons from wide (one column per stat) to long format:
# one row per season / team / stat, restricted to the stats named in metrics.
id_columns = ['season', 'tm_code', 'tm']
stat_columns = metrics['stat_name'].tolist()

team_seasons_stats_melt = team_seasons[id_columns + stat_columns].melt(
    id_vars=id_columns,
    value_vars=stat_columns,
    var_name='stat_name',
    value_name='raw_stat',
)
team_seasons_stats_melt

Unnamed: 0,season,tm_code,tm,stat_name,raw_stat
0,2018,-1,NON-D1 TMS,tm_to_pct,24.109130
1,2018,2,Abilene Christian,tm_to_pct,17.747774
2,2018,5,Akron,tm_to_pct,17.159743
3,2018,6,Alabama A&M,tm_to_pct,22.327287
4,2018,7,Alabama St.,tm_to_pct,20.739845
...,...,...,...,...,...
21139,2014,19651,High Point,opp_ftm_rate,0.219593
21140,2014,26172,A&M-Corpus Christi,opp_ftm_rate,0.314847
21141,2014,28600,Lipscomb,opp_ftm_rate,0.312573
21142,2014,28755,FGCU,opp_ftm_rate,0.222460


In [5]:
# One row per distinct (season, stat_name) pair found in the melted table,
# re-indexed from zero.
season_stats = (
    team_seasons_stats_melt
    .loc[:, ['season', 'stat_name']]
    .drop_duplicates()
    .reset_index(drop=True)
)
season_stats.head(2)

Unnamed: 0,season,stat_name
0,2018,tm_to_pct
1,2017,tm_to_pct


In [6]:
# Add season-qualified keys of the form "<season>_<code>" for the team and
# for its opponent; both share the same "<season>_" prefix.
season_prefix = team_games['season'].map(str) + '_'

team_games['season_tm'] = season_prefix + team_games['tm_code'].map(str)
team_games['season_opp'] = season_prefix + team_games['opp_code'].map(str)

# Spot-check two random rows.
team_games.sample(2)

Unnamed: 0,season,game_date,game_id,tm_code,tm,tm_div,opp_code,opp,opp_div,is_home,...,tm_efg_pct,tm_to_pct,tm_orb_pct,tm_ftm_rate,opp_efg_pct,opp_to_pct,tm_drb_pct,opp_ftm_rate,season_tm,season_opp
55369,2014,2014-12-18,576-446-2014-12-18,446,Morgan St.,1.0,576,Rider,1.0,0,...,36.363636,24.777391,30.952381,0.145455,55.813953,27.874564,76.0,0.325581,2014_446,2014_576
17347,2017,2018-01-07,782-651-2018-01-07,651,South Fla.,1.0,782,Wichita St.,1.0,0,...,41.818182,25.504782,28.571429,0.2,72.881356,17.003188,64.0,0.152542,2017_651,2017_782


In [24]:
from sklearn.linear_model import Ridge

# Ridge estimator with alpha=1 and a fitted intercept.
reg = Ridge(alpha=1, fit_intercept=True)

# Empty frame declaring the columns of the regression results table.
result_columns = [
    'season',
    'stat_name',
    'coef_name',
    'ridge_reg_coef',
    'ridge_reg_value',
]
reg_results_collection = pd.DataFrame(columns=result_columns)

reg_results_collection

Unnamed: 0,season,stat_name,coef_name,ridge_reg_coef,ridge_reg_value


In [14]:
# Preview the regression design matrix for the first (season, stat) pair only:
# the loop body runs once and then breaks.
for season, stat in zip(season_stats.season, season_stats.stat_name):
    # Games from this season, limited to the team key, tm_hca,
    # the opponent key and the stat being modelled.
    this_season_game_stat = (
        team_games
        .loc[
            team_games['season'] == season,
            ['season_tm', 'tm_hca', 'season_opp', stat],
        ]
        .reset_index()
    )
    # One-hot encode the team and opponent keys (one 0/1 column per team);
    # this frame is the X for the ridge regression.
    this_season_game_dummy_vars = pd.get_dummies(
        this_season_game_stat.loc[:, ['season_tm', 'tm_hca', 'season_opp']]
    )
    # Only the first combination is needed for the preview.
    break
this_season_game_dummy_vars

Unnamed: 0,tm_hca,season_tm_2018_-1,season_tm_2018_1004,season_tm_2018_101,season_tm_2018_1014,season_tm_2018_102,season_tm_2018_104,season_tm_2018_10411,season_tm_2018_1068,season_tm_2018_107,...,season_opp_2018_817,season_opp_2018_83,season_opp_2018_86,season_opp_2018_87,season_opp_2018_9,season_opp_2018_90,season_opp_2018_94,season_opp_2018_96,season_opp_2018_97,season_opp_2018_99
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11425,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11426,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11427,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11428,-1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Fit one ridge regression per (season, stat) combination and gather every
# fitted coefficient into a single long-format table.
#
# The per-fit result frames are collected in a list and concatenated once at
# the end, so this cell rebuilds reg_results_collection from scratch each time
# it runs: re-running it does not append duplicate rows, and the table is not
# re-copied on every iteration. It also avoids concatenating onto an empty
# placeholder frame (NOTE(review): newer pandas releases warn about that
# pattern -- confirm against the installed version).
result_columns = ['season', 'stat_name', 'coef_name', 'ridge_reg_coef', 'ridge_reg_value']
design_columns = ['season_tm', 'tm_hca', 'season_opp']
reg_results_frames = []

for season_iteration, current_stat in zip(season_stats.season, season_stats.stat_name):

    # To restrict the run to specific stats, uncomment:
    # if current_stat not in ('tm_off_eff', 'tm_def_eff'):
    #     continue

    # Games from this season: design columns plus the stat being modelled (y).
    this_season_game_stat = (
        team_games
        .loc[team_games.season == season_iteration, design_columns + [current_stat]]
        .reset_index()
    )
    # X: one 0/1 column per team key and per opponent key, plus tm_hca.
    this_season_game_dummy_vars = pd.get_dummies(this_season_game_stat[design_columns])

    reg.fit(X=this_season_game_dummy_vars, y=this_season_game_stat[current_stat])

    # Results of the current regression, one row per coefficient.
    this_reg_results = pd.DataFrame(
        {
            # Season and stat this set of results belongs to
            'season': season_iteration,
            'stat_name': current_stat,
            # Design-matrix column name (contains both season and team code)
            'coef_name': this_season_game_dummy_vars.columns.values,
            # Coefficient fitted by the ridge regression
            'ridge_reg_coef': reg.coef_,
        }
    )
    # Coefficient shifted by the fitted intercept.
    this_reg_results['ridge_reg_value'] = this_reg_results.ridge_reg_coef + reg.intercept_
    reg_results_frames.append(this_reg_results)

if reg_results_frames:
    reg_results_collection = pd.concat(reg_results_frames, ignore_index=True)
else:
    # No (season, stat) combinations: keep an empty table with the expected columns.
    reg_results_collection = pd.DataFrame(columns=result_columns)

reg_results_collection

Unnamed: 0,season,stat_name,coef_name,ridge_reg_coef,ridge_reg_value
0,2018,tm_to_pct,tm_hca,-0.541062,18.105907
1,2018,tm_to_pct,season_tm_2018_-1,4.977961,23.62493
2,2018,tm_to_pct,season_tm_2018_1004,0.72737,19.374339
3,2018,tm_to_pct,season_tm_2018_101,-1.0095,17.637469
4,2018,tm_to_pct,season_tm_2018_1014,-1.444582,17.202387
...,...,...,...,...,...
42343,2014,opp_ftm_rate,season_opp_2014_90,-0.042636,0.2215
42344,2014,opp_ftm_rate,season_opp_2014_94,0.025049,0.289184
42345,2014,opp_ftm_rate,season_opp_2014_96,0.037149,0.301284
42346,2014,opp_ftm_rate,season_opp_2014_97,-0.009089,0.255047


https://colab.research.google.com/drive/13L4b36cTrnC55ahD6dVf4-r9pzkYV-j5#scrollTo=_KI9qGgoAlUD