In [34]:
# Model prediction using LogisticRegression with upsampling and using liblinear with predict proba

# Import all necessary packages
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.utils import resample

In [35]:
# Load the necessary dataframes from the csv files
train = pd.read_csv("train.csv")

# `test` is inspected by the null-value check below but was never loaded in this
# notebook, so a fresh kernel failed with a NameError.
# NOTE(review): "test.csv" is assumed to sit next to "train.csv" - confirm the path.
test = pd.read_csv("test.csv")

In [36]:
# Report the number of missing values per column, first for the training data
# and then (after a separator) for the test data
missing_in_train = train.isna().sum()
print(missing_in_train, end="\n\n-------\n\n")

missing_in_test = test.isna().sum()
print(missing_in_test)

Id_old         0
Id             0
GP             0
MIN            0
PTS            0
FGM            0
FGA            0
FG%            0
3P Made        0
3PA            0
3P%            0
FTM            0
FTA            0
FT%            0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
TARGET_5Yrs    0
dtype: int64

-------

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64


In [37]:
# Show that the target variable is imbalanced: about 83% of the rows are "1"
# (6669 occurrences out of 8000 entries)
target_column = train["TARGET_5Yrs"]

print(target_column.describe(), end="\n\n-------\n\n")
print(target_column.value_counts())

count    8000.000000
mean        0.833625
std         0.372440
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: TARGET_5Yrs, dtype: float64

-------

1    6669
0    1331
Name: TARGET_5Yrs, dtype: int64


In [38]:
# Start the resampling process: balance the two target classes by upsampling
# the minority class. (`resample` is imported with the other packages in the
# first cell instead of in the middle of the notebook.)

# Separate the 1s (majority class) and the 0s (minority class)
fiveyears = train[train.TARGET_5Yrs == 1]
lessyears = train[train.TARGET_5Yrs == 0]

# Upsample the minority class: draw rows with replacement until there are as
# many 0s as 1s; the fixed random_state keeps the draw reproducible
lessyears_upsampled = resample(lessyears, replace=True, n_samples=len(fiveyears), random_state=123)

# Combine fiveyears and lessyears_upsampled into one balanced frame.
# NOTE(review): this frame contains repeated copies of minority rows, so any
# later train/validation split has to account for those duplicates.
up_sampling = pd.concat([fiveyears, lessyears_upsampled])

In [39]:
# Confirm that both classes now have the same number of rows
up_sampling["TARGET_5Yrs"].value_counts()

1    6669
0    6669
Name: TARGET_5Yrs, dtype: int64

In [40]:
# Create the target series by moving "TARGET_5Yrs" out of the feature frame.
# `pop` removes the column from `up_sampling` in place, so the guard keeps this
# cell safe to re-run: on a second run the column is already gone and the
# previously extracted `target` is left as it is instead of raising a KeyError.
if "TARGET_5Yrs" in up_sampling.columns:
    target = up_sampling.pop("TARGET_5Yrs")

In [41]:
# Split the upsampled data into training (80%) and validation (20%) sets.
#
# The data was upsampled before this split, so the same original row can occur
# several times. A plain random row split would put copies of one row into both
# sets and the validation set would contain rows the model was fitted on.
# Each copy keeps the index label of its source row in `train` (read with the
# default unique index), so grouping by that label keeps all copies of a row on
# the same side of the split.
# NOTE(review): relies on `resample` / `pd.concat` preserving index labels - confirm.
# NOTE(review): unlike train_test_split, the rows of each split stay in frame order.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=8)
row_groups = up_sampling.index.to_numpy()
train_pos, val_pos = next(splitter.split(up_sampling, target, groups=row_groups))

X_train, X_val = up_sampling.iloc[train_pos], up_sampling.iloc[val_pos]
y_train, y_val = target.iloc[train_pos], target.iloc[val_pos]

In [42]:
# Create the LogisticRegression estimator (liblinear solver) and fit it on the
# training split in one step; `fit` returns the estimator itself
reg = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Show the fitted estimator as the cell output
reg

LogisticRegression(solver='liblinear')

In [43]:
# Save the fitted model to disk. `dump` is already imported in the first cell,
# so the duplicate import that used to sit here is not needed.
dump(reg, 'LogisticRegression_upsampling_liblinear_predict_proba_1.joblib')

['LogisticRegression_upsampling_liblinear_predict_proba_1.joblib']

In [44]:
# Export the train/validation splits as CSV files (with header, without the
# index) for use by the second notebooks
exports = (
    (X_train, 'X_train.csv'),
    (X_val, 'X_val.csv'),
    (y_train, 'y_train.csv'),
    (y_val, 'y_val.csv'),
)

for split, filename in exports:
    split.to_csv(filename, header=True, index=False)