In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sqlalchemy import create_engine
from config import db_password

In [2]:
db_string = f'postgres://postgres:{db_password}@127.0.0.1:5432/postgres'
engine = create_engine(db_string)

In [3]:
school_geo = pd.read_sql_table('school_geo', con = engine)
eada = pd.read_sql_table('eada', con = engine)

In [4]:
eada.columns

Index(['unitid', 'institution_name', 'addr1_txt', 'addr2_txt', 'city_txt',
       'state_cd', 'zip_text', 'ClassificationCode', 'classification_name',
       'ClassificationOther', 'EFMaleCount', 'EFFemaleCount', 'EFTotalCount',
       'sector_cd', 'sector_name', 'RECRUITEXP_MEN', 'RECRUITEXP_TOTAL',
       'GRND_TOTAL_REVENUE', 'GRND_TOTAL_EXPENSE', 'OPEXPPERTEAM_MEN_Baseball',
       'OPEXPPERTEAM_MEN_Bskball', 'REV_MEN_Baseball', 'REV_MEN_Bskball',
       'EXP_MEN_Baseball', 'EXP_MEN_Bskball', 'lat', 'lon'],
      dtype='object')

In [5]:
school_geo[['classification_name', 'sector_name', 'EFTotalCount', 'REV_MEN_Bskball']] = eada[['classification_name', 'sector_name', 'EFTotalCount','REV_MEN_Bskball']]

In [6]:
school_geo

Unnamed: 0,unitid,institution_name,state_cd,lat,lon,region,nearest_mlb,nearest_mlb_dist,nearest_nba,nearest_nba_dist,classification_name,sector_name,EFTotalCount,REV_MEN_Bskball
0,100654,Alabama A & M University,AL,34.783,-86.569,ESC,Atlanta Braves,134.8,Atlanta Hawks,142.8,NCAA Division I-FCS,"Public, 4-year or above",4697,1591652.0
1,100706,University of Alabama in Huntsville,AL,34.725,-86.640,ESC,Atlanta Braves,136.7,Atlanta Hawks,144.5,NCAA Division II without football,"Public, 4-year or above",6382,705934.0
2,100724,Alabama State University,AL,32.364,-86.295,ESC,Atlanta Braves,149.3,Atlanta Hawks,146.1,NCAA Division I-FCS,"Public, 4-year or above",3643,992694.0
3,100751,The University of Alabama,AL,33.211,-87.546,ESC,Atlanta Braves,183.3,Atlanta Hawks,185.4,NCAA Division I-FBS,"Public, 4-year or above",29488,16028795.0
4,100760,Central Alabama Community College,AL,32.926,-85.946,ESC,Atlanta Braves,108.2,Atlanta Hawks,106.3,NJCAA Division I,"Public, 2-year",736,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1803,486840,Kennesaw State University,GA,34.038,-84.583,SA,Atlanta Braves,12.2,Atlanta Hawks,22.2,NCAA Division I-FCS,"Public, 4-year or above",23837,1511263.0
1804,486901,Milligan College,TN,36.302,-82.295,ESC,Atlanta Braves,207.0,Charlotte Hornets,110.4,NAIA Division II,"Private nonprofit, 4-year or above",734,400392.0
1805,487524,Husson University,ME,44.827,-68.793,NE,Boston Red Sox,206.6,Boston Celtics,204.4,NCAA Division III with football,"Private nonprofit, 4-year or above",2394,140394.0
1806,488785,University of Saint Katherine,CA,33.048,-117.278,PAC,San Diego Padres,24.6,Los Angeles Clippers,89.3,NCCAA Division I,"Private nonprofit, 4-year or above",163,116259.0


In [7]:
school_geo['classification_name'].value_counts()

NCAA Division III with football       212
NJCAA Division I                      187
NCAA Division III without football    159
NCAA Division II with football        151
NCAA Division II without football     126
NCAA Division I-FBS                   123
NCAA Division I-FCS                   119
NAIA Division II                      105
NJCAA Division III                     98
CCCAA                                  96
NCAA Division I without football       93
NAIA Division I                        91
NJCAA Division II                      91
Other                                  52
USCAA                                  38
NWAC                                   29
NCCAA Division II                      23
Independent                             8
NCCAA Division I                        7
Name: classification_name, dtype: int64

In [8]:
for index, row in school_geo.iterrows():
    if row['region'] == 'oth':
        school_geo.drop(index = index, inplace = True)

In [9]:
school_geo['region'].value_counts()

SA     301
MA     293
ENC    255
PAC    233
WNC    193
WSC    174
ESC    130
NE     118
MNT     89
Name: region, dtype: int64

In [10]:
school_geo.drop(columns = ['nearest_mlb', 'nearest_nba', 'unitid', 'institution_name', 'lat', 'lon', 'nearest_mlb_dist', 'state_cd'], inplace = True)

In [11]:
for index, row in school_geo.iterrows():
    if ('NCAA Division I-' in row['classification_name']) or ('NCAA Division I without football' == row['classification_name']):
        school_geo.at[index, 'classification_name'] = 'NCAA Division I'
    else: 
        school_geo.at[index, 'classification_name'] = 'not NCAA Division I'
        
school_geo.columns

Index(['region', 'nearest_nba_dist', 'classification_name', 'sector_name',
       'EFTotalCount', 'REV_MEN_Bskball'],
      dtype='object')

In [12]:
cvars = school_geo.dtypes[school_geo.dtypes == 'object'].index.tolist()
cvars

['region', 'classification_name', 'sector_name']

In [13]:
enc = OneHotEncoder(sparse = False)

encode_df = pd.DataFrame(enc.fit_transform(school_geo[cvars]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(cvars)
encode_df.head()

Unnamed: 0,region_ENC,region_ESC,region_MA,region_MNT,region_NE,region_PAC,region_SA,region_WNC,region_WSC,classification_name_NCAA Division I,classification_name_not NCAA Division I,"sector_name_Private for-profit, 2-year","sector_name_Private for-profit, 4-year or above","sector_name_Private nonprofit, 2-year","sector_name_Private nonprofit, 4-year or above","sector_name_Public, 2-year","sector_name_Public, 4-year or above"
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
school_geo = school_geo.merge(encode_df, left_index = True, right_index = True).drop(columns = cvars)
school_geo.dropna(inplace = True)
school_geo

Unnamed: 0,nearest_nba_dist,EFTotalCount,REV_MEN_Bskball,region_ENC,region_ESC,region_MA,region_MNT,region_NE,region_PAC,region_SA,region_WNC,region_WSC,classification_name_NCAA Division I,classification_name_not NCAA Division I,"sector_name_Private for-profit, 2-year","sector_name_Private for-profit, 4-year or above","sector_name_Private nonprofit, 2-year","sector_name_Private nonprofit, 4-year or above","sector_name_Public, 2-year","sector_name_Public, 4-year or above"
0,142.8,4697,1591652.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,144.5,6382,705934.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,146.1,3643,992694.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,185.4,29488,16028795.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,140.8,3547,434962.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1781,41.8,1191,205353.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1782,135.1,686,20793.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1783,109.0,11034,1589474.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1784,71.8,3991,127705.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
X = school_geo.drop(columns = ['REV_MEN_Bskball'])
y = school_geo['REV_MEN_Bskball']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 8)
X_train.shape

(1263, 19)

In [17]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
#Multiple Linear Regression
from sklearn.linear_model import LinearRegression
lin_model = LinearRegression().fit(X_train_scaled, y_train)
lin_model.score(X_test_scaled, y_test)

0.4896191328414999

In [19]:
it = 0
for col in X.columns:
    print(f'{X_train.columns[it]}: {lin_model.coef_[it]}')
    it += 1

nearest_nba_dist: 227948.46902941345
EFTotalCount: 2070545.7215250963
region_ENC: 2.1471415215653495e+19
region_ESC: 1.6317096987554468e+19
region_MA: 2.250653841945839e+19
region_MNT: 1.2397033733217194e+19
region_NE: 1.501270671236342e+19
region_PAC: 2.0490800233490653e+19
region_SA: 2.298806324939483e+19
region_WNC: 1.9534965811877495e+19
region_WSC: 1.8364252585810168e+19
classification_name_NCAA Division I: 2.325566573984022e+18
classification_name_not NCAA Division I: 2.3255665739839647e+18
sector_name_Private for-profit, 2-year: -4.7121512722408474e+17
sector_name_Private for-profit, 4-year or above: -1.3290961590175485e+18
sector_name_Private nonprofit, 2-year: -1.0519978912024923e+18
sector_name_Private nonprofit, 4-year or above: -8.242513995295696e+18
sector_name_Public, 2-year: -7.352938501154231e+18
sector_name_Public, 4-year or above: -7.798577021339779e+18


In [20]:
#Random forest regression 
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(random_state = 8, n_estimators = 20)
rf_model.fit(X_train_scaled, y_train)
rf_model.score(X_test_scaled, y_test)

0.42609983801625295

In [21]:
it = 0
for col in X.columns:
    print(f'{X_train.columns[it]}: {rf_model.feature_importances_[it]}')
    it += 1

nearest_nba_dist: 0.19594940243981865
EFTotalCount: 0.5904370115866494
region_ENC: 0.02326827989000574
region_ESC: 0.05815714456200892
region_MA: 0.013130254703080123
region_MNT: 0.00761570896465858
region_NE: 0.002058796408232414
region_PAC: 0.016065685258462612
region_SA: 0.0220170028391248
region_WNC: 0.003693013777775096
region_WSC: 0.010581933875375127
classification_name_NCAA Division I: 0.007960956386137604
classification_name_not NCAA Division I: 0.009745194524816644
sector_name_Private for-profit, 2-year: 4.712462626984959e-06
sector_name_Private for-profit, 4-year or above: 0.0012391489487703863
sector_name_Private nonprofit, 2-year: 1.6229914508755544e-05
sector_name_Private nonprofit, 4-year or above: 0.011737586980879306
sector_name_Public, 2-year: 0.00979293690750515
sector_name_Public, 4-year or above: 0.016528999569563642


In [22]:
#AdaBoost Regression
from sklearn.ensemble import AdaBoostRegressor as abr
ab_model = abr(random_state = 8, n_estimators = 20).fit(X_train_scaled, y_train)
ab_model.score(X_test_scaled, y_test)

0.23542449827530643

In [23]:
it = 0
for col in X.columns:
    print(f'{X_train.columns[it]}: {ab_model.feature_importances_[it]}')
    it += 1

nearest_nba_dist: 0.3036661092545966
EFTotalCount: 0.5237453379546722
region_ENC: 0.007751111158472185
region_ESC: 0.06394568632512258
region_MA: 0.027934176662711623
region_MNT: 0.0
region_NE: 0.0
region_PAC: 0.003392659280266457
region_SA: 0.00629458050955819
region_WNC: 0.0
region_WSC: 0.0006438503395367627
classification_name_NCAA Division I: 0.0
classification_name_not NCAA Division I: 0.034846605312250485
sector_name_Private for-profit, 2-year: 0.0
sector_name_Private for-profit, 4-year or above: 0.0
sector_name_Private nonprofit, 2-year: 0.0
sector_name_Private nonprofit, 4-year or above: 0.02744668257429606
sector_name_Public, 2-year: 0.0
sector_name_Public, 4-year or above: 0.0003332006285169016
