# Feature engineering | Sebislaw

## Libraries

In [19]:
from os.path  import join
import random
import itertools
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

## Idea
We will know which teams qualify for the tournament on March 16.\
We also know that the only parameter that decides which team wins a game is the score.

The features that may be important are:
- seed (known on March 16),
- team's quality (calculated from different data points),
- opposing team quality,
- team's coach???.

How do we calculate teams' quality?\
On average, a team's performance changes quickly. The mean autocorrelation of teams' win fraction is only 0.25 at a lag of one season, 0.1 at a lag of two seasons, and <= 0 from three seasons back onwards. So we should only look at the current season's games when calculating a team's performance.

## Data

In [20]:
# Directory holding the input CSV files, relative to the working directory.
# Built with `join` so the separator matches the host OS; a hard-coded
# '..\\..\\data' only resolves on Windows.
data_path = join('..', '..', 'data')

# Every table below is read into its own DataFrame, named after its file.
# Prefix M = men's data, prefix W = women's data.

# The Basics ------------------------------------------------------------------------
# Men
MTeams = pd.read_csv(join(data_path, 'MTeams.csv'))
MSeasons = pd.read_csv(join(data_path, 'MSeasons.csv'))
MNCAATourneySeeds = pd.read_csv(join(data_path, 'MNCAATourneySeeds.csv'))
MRegularSeasonCompactResults = pd.read_csv(join(data_path, 'MRegularSeasonCompactResults.csv'))
MNCAATourneyCompactResults = pd.read_csv(join(data_path, 'MNCAATourneyCompactResults.csv'))
# Women
WTeams = pd.read_csv(join(data_path, 'WTeams.csv'))
WSeasons = pd.read_csv(join(data_path, 'WSeasons.csv'))
WNCAATourneySeeds = pd.read_csv(join(data_path, 'WNCAATourneySeeds.csv'))
WRegularSeasonCompactResults = pd.read_csv(join(data_path, 'WRegularSeasonCompactResults.csv'))
WNCAATourneyCompactResults = pd.read_csv(join(data_path, 'WNCAATourneyCompactResults.csv'))
# Other
SampleSubmissionStage1 = pd.read_csv(join(data_path, 'SampleSubmissionStage1.csv'))
SampleSubmissionStage2 = pd.read_csv(join(data_path, 'SampleSubmissionStage2.csv'))
SeedBenchmarkStage1 = pd.read_csv(join(data_path, 'SeedBenchmarkStage1.csv'))

# Team Box Scores ------------------------------------------------------------------------
# Men
MRegularSeasonDetailedResults = pd.read_csv(join(data_path, 'MRegularSeasonDetailedResults.csv'))
MNCAATourneyDetailedResults = pd.read_csv(join(data_path, 'MNCAATourneyDetailedResults.csv'))
# Women
WRegularSeasonDetailedResults = pd.read_csv(join(data_path, 'WRegularSeasonDetailedResults.csv'))
WNCAATourneyDetailedResults = pd.read_csv(join(data_path, 'WNCAATourneyDetailedResults.csv'))

# Geography ------------------------------------------------------------------------
# All
Cities = pd.read_csv(join(data_path, 'Cities.csv'))
Conferences = pd.read_csv(join(data_path, 'Conferences.csv'))
# Men
MGameCities = pd.read_csv(join(data_path, 'MGameCities.csv'))
# Women
WGameCities = pd.read_csv(join(data_path, 'WGameCities.csv'))

# Public Rankings ------------------------------------------------------------------------
# Men
MMasseyOrdinals = pd.read_csv(join(data_path, 'MMasseyOrdinals.csv')) # men only

# Supplements ------------------------------------------------------------------------
# Men
MTeamCoaches = pd.read_csv(join(data_path, 'MTeamCoaches.csv')) # men only
MTeamConferences = pd.read_csv(join(data_path, 'MTeamConferences.csv'))
MConferenceTourneyGames = pd.read_csv(join(data_path, 'MConferenceTourneyGames.csv'))
MSecondaryTourneyTeams = pd.read_csv(join(data_path, 'MSecondaryTourneyTeams.csv'))
MSecondaryTourneyCompactResults = pd.read_csv(join(data_path, 'MSecondaryTourneyCompactResults.csv'))
# The spelling tables are decoded as cp1252 instead of pandas' default UTF-8
# (presumably they contain characters that are not valid UTF-8 -- TODO confirm).
MTeamSpellings = pd.read_csv(join(data_path, "MTeamSpellings.csv"), encoding='cp1252')
MNCAATourneySlots = pd.read_csv(join(data_path, 'MNCAATourneySlots.csv'))
MNCAATourneySeedRoundSlots = pd.read_csv(join(data_path, 'MNCAATourneySeedRoundSlots.csv')) # men only
# Women
WTeamConferences = pd.read_csv(join(data_path, 'WTeamConferences.csv'))
WConferenceTourneyGames = pd.read_csv(join(data_path, 'WConferenceTourneyGames.csv'))
WSecondaryTourneyTeams = pd.read_csv(join(data_path, 'WSecondaryTourneyTeams.csv'))
WSecondaryTourneyCompactResults = pd.read_csv(join(data_path, 'WSecondaryTourneyCompactResults.csv'))
WTeamSpellings = pd.read_csv(join(data_path, 'WTeamSpellings.csv'), encoding='cp1252')
WNCAATourneySlots = pd.read_csv(join(data_path, 'WNCAATourneySlots.csv'))

## Analysing data

### What data do we have from regular season?

In [17]:
# Work on a copy so the loaded tournament table itself is never modified.
# NOTE(review): the section heading mentions the regular season, but this
# inspects the men's NCAA tournament detailed results -- confirm which is intended.
df = MNCAATourneyDetailedResults.copy()

# Print all column names on a single line, each one followed by ', '.
# (Iterating a DataFrame's columns yields the same names as list(df).)
for column_name in df.columns:
    print(column_name, end=', ')

Season, DayNum, WTeamID, WScore, LTeamID, LScore, WLoc, NumOT, WFGM, WFGA, WFGM3, WFGA3, WFTM, WFTA, WOR, WDR, WAst, WTO, WStl, WBlk, WPF, LFGM, LFGA, LFGM3, LFGA3, LFTM, LFTA, LOR, LDR, LAst, LTO, LStl, LBlk, LPF, 

In [18]:
# Show the same table with its columns regrouped for readability:
# game context first, then the winning team's columns, then the losing team's.
df[
    # game context
    ['Season', 'DayNum', 'NumOT']
    # winning team: identity, score, location
    + ['WTeamID', 'WScore', 'WLoc']
    # winning team: box-score columns
    + ['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA',
       'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    # losing team: identity, score
    + ['LTeamID', 'LScore']
    # losing team: box-score columns
    + ['LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA',
       'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
]

Unnamed: 0,Season,DayNum,NumOT,WTeamID,WScore,WLoc,WFGM,WFGA,WFGM3,WFGA3,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1,1421,92,N,32,69,11,29,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,0,1112,80,N,31,66,7,23,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,0,1113,84,N,31,59,6,14,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,0,1141,79,N,29,53,3,7,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1,1143,76,N,27,64,7,20,...,21,15,20,10,26,16,14,5,8,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1377,2024,146,0,1301,76,N,28,60,3,13,...,20,21,26,10,27,11,9,4,5,23
1378,2024,146,0,1345,72,N,24,53,3,15,...,26,7,11,6,17,17,6,8,4,25
1379,2024,152,0,1163,86,N,31,62,10,25,...,23,9,11,7,21,9,7,2,5,15
1380,2024,152,0,1345,63,N,22,55,10,25,...,19,3,4,6,22,10,11,8,3,13


In [None]:
# Idea:
# join dfs below to make a single data frame to train on.
# then look at the results and make the label y (0 or 1).

# From 2003 (the detailed ones)
# note that here we don't care about opponent's team ID.
# There are few matches between two specific teams, so there are too few data points to see trends emerging.
# Instead it's a better idea to look at how the team performs historically vs specific play styles.
# This data should be from 2003 or more recent year onwards.
# Season	T1_TeamID	
# T1_FGMmean	T1_FGAmean	T1_FGM3mean	T1_FGA3mean T1_ORmean 
# T1_Astmean	T1_TOmean	T1_Stlmean	T1_PFmean
# T1_opponent_FGMmean	T1_opponent_FGAmean	T1_opponent_FGM3mean T1_opponent_FGA3mean	T1_opponent_ORmean
# T1_opponent_Astmean	T1_opponent_TOmean	T1_opponent_Stlmean	T1_opponent_Blkmean	T1_PointDiffmean

# The information about seed
# This data is from season 2025. We will know the seed, so we will only need to look
# at teams with 2025 seeds.
# Season	T1_TeamID	T1_seed
# + add the information about opponent's seed (their seed or seed difference)

# Data from last x days (is the team good recently)
# Season	T1_TeamID	T1_win_ratio_xd

# + add the information about team's quality
# this would be a column or multiple 