## Exploratory Data Analysis

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from supervised.automl import AutoML

### DATA SECTION 1

In [33]:
data_section_1_names = ["MTeams.csv" , "WTeams.csv", "MSeasons.csv", "WSeasons.csv",
                        "MNCAATourneySeeds.csv", "WNCAATourneySeeds.csv",
                        "MRegularSeasonCompactResults.csv", "WRegularSeasonCompactResults.csv",
                        "MNCAATourneyCompactResults.csv", "WNCAATourneyCompactResults.csv",
                        "SampleSubmissionStage1.csv"]

In [34]:
folder = "../../data/"
dfs = [None for i in range(len(data_section_1_names))]
for i, name in enumerate(data_section_1_names):
    dfs[i] = pd.read_csv(folder + name)

In [41]:
for i in range(len(data_section_1_names)):
    print(data_section_1_names[i] + " " +str(i))
    print(dfs[i].head())

MTeams.csv 0
   Season  TeamID  FirstDayNum  LastDayNum       CoachName
0    1985    1102            0         154   reggie_minton
1    1985    1103            0         154     bob_huggins
2    1985    1104            0         154  wimp_sanderson
3    1985    1106            0         154    james_oliver
4    1985    1108            0         154   davey_whitney
WTeams.csv 1
  ConfAbbrev                   Description
0      a_sun       Atlantic Sun Conference
1      a_ten        Atlantic 10 Conference
2        aac  American Athletic Conference
3        acc     Atlantic Coast Conference
4        aec       America East Conference
MSeasons.csv 2
   Season  TeamID ConfAbbrev
0    1985    1102        wac
1    1985    1103        ovc
2    1985    1104        sec
3    1985    1106       swac
4    1985    1108       swac
WSeasons.csv 3
   Season  TeamID ConfAbbrev
0    1998    3102        wac
1    1998    3103        mac
2    1998    3104        sec
3    1998    3106       swac
4    1998    

In [43]:
dfs[2]

Unnamed: 0,Season,TeamID,ConfAbbrev
0,1985,1102,wac
1,1985,1103,ovc
2,1985,1104,sec
3,1985,1106,swac
4,1985,1108,swac
...,...,...,...
13383,2025,1476,nec
13384,2025,1477,southland
13385,2025,1478,nec
13386,2025,1479,nec


### DATA SECTION 2 - Team Box Scores

In [36]:
data_section_2_names = ["MRegularSeasonDetailedResults.csv", "WRegularSeasonDetailedResults.csv",
                        "MNCAATourneyDetailedResults.csv", "WNCAATourneyDetailedResults.csv"]

In [37]:
dfs = [None for i in range(len(data_section_2_names))]
for i, name in enumerate(data_section_2_names):
    dfs[i] = pd.read_csv(folder + name)
for i in range(len(data_section_2_names)):
    print(data_section_2_names[i])
    print(dfs[i].head())

MRegularSeasonDetailedResults.csv
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  WFGM  WFGA  \
0    2003      10     1104      68     1328      62    N      0    27    58   
1    2003      10     1272      70     1393      63    N      0    26    62   
2    2003      11     1266      73     1437      61    N      0    24    58   
3    2003      11     1296      56     1457      50    N      0    18    38   
4    2003      11     1400      77     1208      71    N      0    30    61   

   ...  LFGA3  LFTM  LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  LPF  
0  ...     10    16    22   10   22     8   18     9     2   20  
1  ...     24     9    20   20   25     7   12     8     6   16  
2  ...     26    14    23   31   22     9   12     2     5   23  
3  ...     22     8    15   17   20     9   19     4     3   23  
4  ...     16    17    27   21   15    12   10     7     1   14  

[5 rows x 34 columns]
WRegularSeasonDetailedResults.csv
   Season  DayNum  WTeamID  WScore  LT

### DATA SECTION 3 - Geographic Data

In [38]:
data_section_3_names = ["Cities.csv", "MGameCities.csv", "WGameCities.csv"]
dfs = [None for i in range(len(data_section_3_names))]
for i, name in enumerate(data_section_3_names):
    dfs[i] = pd.read_csv(folder + name)
for i in range(len(data_section_3_names)):
    print(data_section_3_names[i])
    print(dfs[i].head())

Cities.csv
   CityID         City State
0    4001      Abilene    TX
1    4002        Akron    OH
2    4003       Albany    NY
3    4004  Albuquerque    NM
4    4005    Allentown    PA
MGameCities.csv
   Season  DayNum  WTeamID  LTeamID   CRType  CityID
0    2010       7     1143     1293  Regular    4027
1    2010       7     1314     1198  Regular    4061
2    2010       7     1326     1108  Regular    4080
3    2010       7     1393     1107  Regular    4340
4    2010       9     1143     1178  Regular    4027
WGameCities.csv
   Season  DayNum  WTeamID  LTeamID   CRType  CityID
0    2010      11     3103     3237  Regular    4002
1    2010      11     3104     3399  Regular    4085
2    2010      11     3110     3224  Regular    4363
3    2010      11     3111     3267  Regular    4158
4    2010      11     3119     3447  Regular    4367


### DATA SECTION 4 - Public Rankings

In [39]:
data_section_4_names = ["MMasseyOrdinals.csv"]
dfs = [None for i in range(len(data_section_4_names))]
for i, name in enumerate(data_section_4_names):
    dfs[i] = pd.read_csv(folder + name)
for i in range(len(data_section_4_names)):
    print(data_section_4_names[i])
    print(dfs[i].head())

MMasseyOrdinals.csv
   Season  RankingDayNum SystemName  TeamID  OrdinalRank
0    2003             35        SEL    1102          159
1    2003             35        SEL    1103          229
2    2003             35        SEL    1104           12
3    2003             35        SEL    1105          314
4    2003             35        SEL    1106          260


### DATA SECTION 5 - Supplements

In [40]:
data_section_5_names = ["MTeamCoaches.csv", "Conferences.csv", "MTeamConferences.csv", "WTeamConferences.csv",
                        "MConferenceTourneyGames.csv", "WConferenceTourneyGames.csv", 
                        "MSecondaryTourneyTeams.csv", "WSecondaryTourneyTeams.csv",
                        "MSecondaryTourneyCompactResults.csv", "WSecondaryTourneyCompactResults.csv",
                        #"MTeamSpellings.csv", "WTeamSpellings.csv", 
                        "MNCAATourneySlots.csv", "WNCAATourneySlots.csv",
                        "MNCAATourneySeedRoundSlots.csv"]
dfs = [None for i in range(len(data_section_5_names))]
for i, name in enumerate(data_section_5_names):
    dfs[i] = pd.read_csv(folder + name)
for i in range(len(data_section_5_names)):
    print(data_section_5_names[i])
    print(dfs[i].head())

MTeamCoaches.csv
   Season  TeamID  FirstDayNum  LastDayNum       CoachName
0    1985    1102            0         154   reggie_minton
1    1985    1103            0         154     bob_huggins
2    1985    1104            0         154  wimp_sanderson
3    1985    1106            0         154    james_oliver
4    1985    1108            0         154   davey_whitney
Conferences.csv
  ConfAbbrev                   Description
0      a_sun       Atlantic Sun Conference
1      a_ten        Atlantic 10 Conference
2        aac  American Athletic Conference
3        acc     Atlantic Coast Conference
4        aec       America East Conference
MTeamConferences.csv
   Season  TeamID ConfAbbrev
0    1985    1102        wac
1    1985    1103        ovc
2    1985    1104        sec
3    1985    1106       swac
4    1985    1108       swac
WTeamConferences.csv
   Season  TeamID ConfAbbrev
0    1998    3102        wac
1    1998    3103        mac
2    1998    3104        sec
3    1998    3106      