In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [2]:
# https://www.kaggle.com/c/mens-march-mania-2022/data

# regular season data
data_regszn = pd.read_csv('MRegularSeasonDetailedResults.csv')

# tournament data
data_tournament = pd.read_csv('MNCAATourneyDetailedResults.csv')

In [3]:
data_regszn.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,3,14,11,18,14,24,13,23,7,1,22,22,53,2,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,8,20,10,19,15,28,16,13,4,4,18,24,67,6,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,8,18,17,29,17,26,15,10,5,2,25,22,73,3,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,3,9,17,31,6,19,11,12,14,2,18,18,49,6,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,6,14,11,13,17,22,12,14,4,4,20,24,62,6,16,17,27,21,15,12,10,7,1,14


In [4]:
data_tournament.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,11,29,17,26,14,30,17,12,5,3,22,29,67,12,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,7,23,11,14,11,36,22,16,10,7,8,20,64,4,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,6,14,16,22,10,27,18,9,7,4,19,25,69,7,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,3,7,18,25,11,20,15,18,13,1,19,27,60,7,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,7,20,15,23,18,20,17,13,8,2,14,25,56,9,21,15,20,10,26,16,14,5,8,19


In [5]:
combined_data = pd.concat([data_regszn, data_tournament])

In [6]:
# extract all game stats of the winning team
# year, daynum, teamID, and numOT ignored
wins_data = combined_data[['WScore', 'WLoc', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 
                  'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

wins_data.rename(columns = {'WScore':'Score', 
                            'WLoc':'Loc',
                            'WFGM':'FGM',
                            'WFGA':'FGA',
                            'WFGM3':'FGM3',
                            'WFGA3':'FGA3',
                            'WFTM':'FTM',
                            'WFTA':'FTA',
                            'WAst':'Ast',
                            'WOR':'OR',
                            'WDR':'DR',
                            'WTO':'TO',
                            'WStl':'Stl',
                            'WBlk':'Blk',
                            'WPF':'PF'
                           }, inplace = True)

wins_data['Won'] = 1

wins_data['Loc'] = wins_data['Loc'].map({'H':1, 'N':0, 'A':-1})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wins_data.rename(columns = {'WScore':'Score',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wins_data['Won'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wins_data['Loc'] = wins_data['Loc'].map({'H':1, 'N':0, 'A':-1})


In [7]:
# do the same thing for losses
# there is no LLoc, only WLoc, so reverse it when mapping
losses_data = combined_data[['LScore', 'WLoc', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 
                  'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]

losses_data.rename(columns = {'LScore':'Score', 
                            'WLoc':'Loc',
                            'LFGM':'FGM',
                            'LFGA':'FGA',
                            'LFGM3':'FGM3',
                            'LFGA3':'FGA3',
                            'LFTM':'FTM',
                            'LFTA':'FTA',
                            'LAst':'Ast',
                            'LOR':'OR',
                            'LDR':'DR',
                            'LTO':'TO',
                            'LStl':'Stl',
                            'LBlk':'Blk',
                            'LPF':'PF'
                           }, inplace = True)

losses_data['Won'] = 0

losses_data['Loc'] = losses_data['Loc'].map({'H':-1, 'N':0, 'A':1})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  losses_data.rename(columns = {'LScore':'Score',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  losses_data['Won'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  losses_data['Loc'] = losses_data['Loc'].map({'H':-1, 'N':0, 'A':1})


In [12]:
# combine wins and losses dataframes
data = pd.concat([wins_data, losses_data])

# shuffle data
data = data.sample(frac = 1)

data.head()

Unnamed: 0,Score,Loc,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,Won
73780,68,-1,26,64,5,14,11,21,15,30,17,14,5,4,26,0
18184,72,1,27,54,4,11,14,16,10,22,13,11,5,2,21,1
51994,51,-1,18,47,5,18,10,17,10,16,8,16,4,3,15,0
40098,51,-1,17,50,3,14,14,23,13,31,10,24,2,6,22,0
72569,64,-1,24,60,5,20,11,16,17,21,14,15,6,2,19,0
