In [1]:
import pandas as pd 
import numpy as np

In [2]:
data_preprocessed = pd.read_csv('raw_data.csv')

In [3]:
data_preprocessed

Unnamed: 0,Player,Age,Num_Stars,USG%,TRB%,AST%,TS%,PTS,AST,REB,Wins,League_Standing,WORP
0,Giannis Antetokounmpo,25,2,37.8,20.6,33.2,0.61,30.5,5.7,12.9,27,1,9.72
1,James Harden,30,2,37.6,8.0,35.4,0.637,38.1,7.5,5.8,21,8,9.99
2,Luka Dončić,20,2,36.9,15.1,49.1,0.618,29.3,8.9,9.6,19,11,8.64
3,Trae Young,21,1,34.6,6.7,42.9,0.585,29.0,8.5,4.4,6,30,4.32
4,Kawhi Leonard,28,4,33.3,12.3,28.0,0.578,25.9,5.1,8.0,23,6,4.59
5,Russell Westbrook,31,2,32.9,11.7,33.3,0.508,24.1,7.1,8.1,21,8,1.62
6,D'Angelo Russell,23,1,32.3,5.8,35.9,0.544,22.3,6.1,3.3,8,28,1.08
7,Bradley Beal,26,1,32.1,7.1,29.1,0.557,28.3,6.8,4.9,9,25,2.43
8,Spencer Dinwiddie,26,1,31.9,5.0,35.2,0.558,22.7,6.2,2.9,16,13,1.89
9,Joel Embiid,25,2,31.8,22.4,16.9,0.59,23.3,3.2,12.6,23,7,3.51


In [4]:
pts = data_preprocessed['PTS'] > (data_preprocessed['PTS'].median() - 1)

In [5]:
ast = data_preprocessed['AST'] > (data_preprocessed['AST'].median() - 1)

In [6]:
reb = data_preprocessed['REB'] > (data_preprocessed['REB'].median() - 1)

In [7]:
wins = data_preprocessed['Wins'] > (data_preprocessed['Wins'].median() - 1)

In [8]:
standing = data_preprocessed['League_Standing'] > data_preprocessed['League_Standing'].median()

In [9]:
worp = data_preprocessed['WORP'] > (data_preprocessed['WORP'].median() - 1)

In [10]:
targets = np.where(((pts & ast & reb) | (wins & standing & worp)), 1, 0)

In [11]:
targets

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1])

In [12]:
data_preprocessed['good'] = targets

In [13]:
targets.sum() / targets.shape[0]

0.48333333333333334

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy = True, with_mean = True, with_std = True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y = None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y = None, copy = None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [15]:
data_with_targets = data_preprocessed.drop(['PTS', 'AST', 'REB', 'Wins', 'League_Standing', 'WORP'], axis = 1)

In [16]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [17]:
unscaled_inputs.columns

Index(['Player', 'Age', 'Num_Stars', 'USG%', 'TRB%', 'AST%', 'TS%'], dtype='object')

In [18]:
columns_to_scale = ['Age', 'Num_Stars', 'USG%', 'TRB%', 'AST%', 'TS%']

In [19]:
scaler = CustomScaler(columns_to_scale)

In [20]:
scaler.fit(unscaled_inputs)

CustomScaler(columns=['Age', 'Num_Stars', 'USG%', 'TRB%', 'AST%', 'TS%'],
             copy=None, with_mean=None, with_std=None)

In [21]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [22]:
scaled_inputs

Unnamed: 0,Player,Age,Num_Stars,USG%,TRB%,AST%,TS%
0,Giannis Antetokounmpo,-0.344057,-0.356034,2.619601,2.198495,0.931966,1.230405
1,James Harden,0.979238,-0.356034,2.568253,-0.402541,1.135574,2.062662
2,Luka Dončić,-1.667351,-0.356034,2.388535,1.063122,2.403492,1.477
3,Trae Young,-1.402692,-1.542816,1.798034,-0.670902,1.82969,0.459796
4,Kawhi Leonard,0.44992,2.017529,1.464272,0.485114,0.450713,0.244026
5,Russell Westbrook,1.243897,-0.356034,1.361576,0.361255,0.941221,-1.913678
6,D'Angelo Russell,-0.873375,-1.542816,1.207532,-0.85669,1.181848,-0.804002
7,Bradley Beal,-0.079398,-1.542816,1.156185,-0.58833,0.552516,-0.403285
8,Spencer Dinwiddie,-0.079398,-1.542816,1.104837,-1.021836,1.117064,-0.372461
9,Joel Embiid,-0.344057,-0.356034,1.079163,2.570071,-0.576579,0.613918


In [23]:
scaled_wo_names = scaled_inputs.iloc[:, 1:]

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [25]:
xtrain, xtest, ytrain, ytest = train_test_split(scaled_wo_names, targets, train_size = 0.8)

In [26]:
reg = LogisticRegression()

In [27]:
reg.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
reg.score(xtrain, ytrain)

0.6875

In [29]:
reg.coef_

array([[ 0.45782178, -0.04956589,  1.03366179,  0.11011748, -0.10439744,
         0.15527078]])

In [30]:
feature_name = scaled_wo_names.columns

In [31]:
summary_table = pd.DataFrame(columns = ['Feature Name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficient'])
summary_table = summary_table.sort_values('Odds_ratio', ascending = False)

In [32]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
2,USG%,1.033662,2.811342
0,Age,0.457822,1.580627
5,TS%,0.155271,1.167974
3,TRB%,0.110117,1.116409
1,Num_Stars,-0.049566,0.951642
4,AST%,-0.104397,0.900867


In [33]:
reg.score(xtest, ytest)

0.6666666666666666

In [34]:
prob = reg.predict_proba(scaled_wo_names)[:, 1]

In [35]:
unscaled_inputs['Probability'] = prob

In [36]:
unscaled_inputs.sort_values('Probability', ascending = False)

Unnamed: 0,Player,Age,Num_Stars,USG%,TRB%,AST%,TS%,Probability
1,James Harden,30,2,37.6,8.0,35.4,0.637,0.962159
0,Giannis Antetokounmpo,25,2,37.8,20.6,33.2,0.61,0.94591
10,LeBron James,35,2,31.6,12.1,51.3,0.569,0.8657
2,Luka Dončić,20,2,36.9,15.1,49.1,0.618,0.855259
4,Kawhi Leonard,28,4,33.3,12.3,28.0,0.578,0.835007
5,Russell Westbrook,31,2,32.9,11.7,33.3,0.508,0.831551
9,Joel Embiid,25,2,31.8,22.4,16.9,0.59,0.797684
13,Paul George,29,4,30.8,10.1,19.6,0.594,0.7662
12,Derrick Rose,31,2,31.1,5.0,44.0,0.55,0.743512
3,Trae Young,21,1,34.6,6.7,42.9,0.585,0.742133


In [37]:
import pickle

In [38]:
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [40]:
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)