# Initial Experimentation

## Set up Environment

In [1]:
!python -c "import pandas as pd; print(pd.__version__)"

1.2.0


In [2]:
!pip show scipy

Name: scipy
Version: 1.6.0
Summary: SciPy: Scientific Library for Python
Home-page: https://www.scipy.org
Author: None
Author-email: None
License: BSD
Location: c:\users\chris\.virtualenvs\mdsi_adsi_feb21_at1-1qc6rzga\lib\site-packages
Requires: numpy
Required-by: seaborn, scikit-learn, phik, pandas-profiling, missingno, ImageHash


In [3]:
import os
import sys
import zipfile
import sklearn
import pandas as pd
import numpy as np
from joblib import dump
from joblib import load
from pprint import pprint
from pandas_profiling import ProfileReport
from IPython.display import IFrame

# Add current project to Python Sys Path
if not os.path.abspath("../..") in sys.path:
    sys.path.append(os.path.abspath("../.."))
else:
    sys.path.remove(os.path.abspath("../.."))
    sys.path.append(os.path.abspath("../.."))

## Get Data

In [4]:
# Check & Create directories
for dir in ["../../data/processed/Chris", "../../models/Chris"]:
    if not os.path.exists(dir): os.mkdir(dir)

In [5]:
# Use bash to get data from kaggle
!kaggle competitions files -c uts-advdsi-nba-career-prediction
!kaggle competitions download -c uts-advdsi-nba-career-prediction --path ../../data/raw --force --quiet

# Unzip the data
with zipfile.ZipFile("../../data/raw/uts-advdsi-nba-career-prediction.zip", "r") as z:
    z.extractall("../../data/raw")


# Delete the zip file
if os.path.exists("../../data/raw/uts-advdsi-nba-career-prediction.zip"):
    os.remove("../../data/raw/uts-advdsi-nba-career-prediction.zip")

name                    size  creationDate         
---------------------  -----  -------------------  
sample_submission.csv  107KB  2021-01-19 10:15:06  
train.csv              679KB  2021-01-19 10:15:06  
test.csv               315KB  2021-01-19 10:15:06  


In [6]:
# Turn off table squishing
pd.options.display.max_columns = None

# Import data
data = pd.read_csv("../../data/raw/train.csv")

# Check data
# print(data.columns)
display(data)

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,10556,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,5342,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,5716,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,13790,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,5470,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,2996,32,9.2,1.8,0.7,1.8,40.3,-0.1,-0.2,23.1,0.4,0.6,65.7,0.3,1.8,1.9,0.5,0.3,0.2,0.4,1
7996,11679,54,6.0,1.8,0.7,1.4,48.7,0.1,0.1,3.1,0.2,0.4,70.1,1.0,1.1,2.0,0.1,0.0,0.3,0.3,1
7997,5537,85,28.2,10.7,4.0,9.0,45.1,0.2,0.6,23.6,2.8,3.9,69.7,1.0,2.1,3.1,3.4,1.2,0.2,1.8,1
7998,1292,39,7.7,2.5,1.0,2.3,40.1,-0.3,-0.5,13.3,0.6,0.7,74.3,0.4,0.6,0.9,0.2,0.3,0.3,0.5,1


In [7]:
# Fix column names
print(data.columns)
data.columns = data.columns.str.replace(' ', '_')
print(data.columns)

# Save training data again
data.to_csv("../../data/raw/train.csv", index=False)

# Apply same logix to test data
test = pd.read_csv("../../data/raw/test.csv")
test.columns = test.columns.str.replace(' ', '_')
test.to_csv("../../data/raw/test.csv", index=False)

Index(['Id', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3P Made', '3PA', '3P%',
       'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'TARGET_5Yrs'],
      dtype='object')
Index(['Id', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3P_Made', '3PA', '3P%',
       'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
       'TARGET_5Yrs'],
      dtype='object')


## Check Data

### Data Dictionary

In [8]:
# Direct copy & paste from:
# https://www.kaggle.com/c/uts-advdsi-nba-career-prediction/data

DataDict = \
    { "Id": "Player Identifier"
    , "GP": "Games Played"
    , "MIN": "Minutes Played"
    , "PTS": "Points Per Game"
    , "FGM": "Field Goals Made"
    , "FGA": "Field Goals Attempts"
    , "FG": "Field Goals Percent"
    , "3P_Made": "3-Points Made"
    , "3PA": "3-Points Attempts"
    , "3P": "3-Points Percent"
    , "FTM": "Free Throw Made"
    , "FTA": "Free Throw Attempts"
    , "FT": "Free Throw Percent"
    , "OREB": "Offensive Rebounds"
    , "DREB": "Defensive Rebounds"
    , "REB": "Rebounds"
    , "AST": "Assists"
    , "STL": "Steals"
    , "BLK": "Blocks"
    , "TOV": "Turnovers"
    , "TARGET_5Yrs": "Outcome: 1 if career length >= 5 years, 0 otherwise"
    }

### Data Exploration

In [9]:
# Prevent the need to re-run
if os.path.exists("InitialReport.html"):
    IFrame(src="InitialReport.html", width="100%", height=600)
else:
    # Create profile report
    profile = ProfileReport(data, title="Profile Report")
    # Export
    profile.to_file("InitialReport.html")
    # View
    profile

## Experimentation

### Data Manipulation

Steps:

1. Drop the `ID` column
1. Pop the `target` column
1. Scale the features
1. Split in to train/test

In [10]:
# Drop ID
data = data.drop(['Id'], axis=1)

# Pop targ
feat = data.iloc[:,:-1].to_numpy()
targ = data.iloc[:,-1].to_numpy()

# Scale feat
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
feat = scaler.fit_transform(feat)

# Tran/Test split
from sklearn.model_selection import train_test_split
feat_trn, feat_val, targ_trn, targ_val = train_test_split(feat, targ, test_size=0.2, random_state=6)

In [11]:
# Dump your objects
dump(data, "../../data/processed/Chris/data.joblib")
dump(feat, "../../data/processed/Chris/feat.joblib")
dump(targ, "../../data/processed/Chris/targ.joblib")
dump(feat_trn, "../../data/processed/Chris/feat_trn.joblib")
dump(feat_val, "../../data/processed/Chris/feat_val.joblib")
dump(targ_trn, "../../data/processed/Chris/targ_trn.joblib")
dump(targ_val, "../../data/processed/Chris/targ_val.joblib")
dump(scaler, "../../models/Chris/scaler.joblib")

['../../models/Chris/scaler.joblib']

### Set up Experiment Space

In [12]:
# Set DF for score dumps
pred_scor = pd.DataFrame(
    { "name": []
    , "when": []
    , "mse": []
    , "rmse": []
    , "mae": []
    , "mape": []
    , "r2": []
    }
)

In [13]:
def save_reg_perf(pred, targ, name=None, df_metrics=None, overwrite=True, print=True):
    
    # Import modules
    from sklearn.metrics import mean_squared_error as mse
    from sklearn.metrics import mean_absolute_error as mae
    from sklearn.metrics import mean_absolute_percentage_error as mape
    from sklearn.metrics import r2_score as r2

    # Ensure we're using the global object here
    global pred_scor

    # If you want to use another dataframe, go right ahead. Else, just keep it simple.
    if df_metrics==None: df = pred_scor
    else: df = data_reg_metrics
    
    # Best to define name, but if blank then make None
    if name==None: name=="None"

    # Perform calculations
    val_now = pd.Timestamp.now().strftime('%d/%b %H:%M')
    val_mse = round(mse(targ, pred), 5)
    val_rmse = round(mse(targ, pred, squared=False), 5)
    val_mae = round(mae(targ, pred), 5)
    val_mape = round(mape(targ, pred), 5)
    val_r2 = round(r2(targ, pred), 5)

    # Two different methods of updating the table. In SQL-Speak this is the difference between INSERT and UPDATE
    if overwrite and name in df["name"].to_numpy():
        df.loc[df["name"] == name, ["when"]] = val_now
        df.loc[df["name"] == name, ["mse"]] = val_mse
        df.loc[df["name"] == name, ["rmse"]] = val_rmse
        df.loc[df["name"] == name, ["mae"]] = val_mae
        df.loc[df["name"] == name, ["mape"]] = round(val_mape, 5)
        df.loc[df["name"] == name, ["r2"]] = val_r2
    else:
        new = pd.DataFrame(
            { "name": [name]
            , "when": [val_now]
            , "mse": [val_mse]
            , "rmse": [val_rmse]
            , "mae": [val_mae]
            , "mape": [val_mape]
            , "r2": [val_r2]
            }
        )
        df = df.append(new)

    # Fix Pandas indexes
    df.reindex()

    # Assign back to the global scope
    pred_scor = df

    # Print if needed
    if print:
        display(df)

    # Return
    return df

### Baseline

In [14]:
# Add current project to Python Sys Path
if not os.path.abspath("../..") in sys.path:
    sys.path.append(os.path.abspath("../.."))
else:
    sys.path.remove(os.path.abspath("../.."))
    sys.path.append(os.path.abspath("../.."))

# Get mean of target
avg = data['TARGET_5Yrs'].mean()

# Generate actuals and baseline
bas = np.resize(avg, data.shape[0])
act = np.array(data['TARGET_5Yrs'])

# Check objects
# from src.utils.objects import check_object
# check_object(act, "act")
# check_object(bas, "bas")

# Check performance
save_reg_perf(bas, act, "Raw baseline", print=False)
display(pred_scor)

Unnamed: 0,name,when,mse,rmse,mae,mape,r2
0,Raw baseline,30/Jan 22:10,0.13869,0.37242,0.27739,624623900000000.0,0.0


### Attempt #1: ElasticNet

In [15]:
# Import model
from sklearn.linear_model import ElasticNet

# Instantiate
reg = ElasticNet()

# Train
reg.fit(feat_trn, targ_trn)

# Predict on training data
pred_trn = reg.predict(feat_trn)

# Predict on validation data
pred_val = reg.predict(feat_val)

# Check performance
save_reg_perf(pred_trn, targ_trn, "ElasticNet - Defaults - within bag", print=False)
save_reg_perf(pred_val, targ_val, "ElasticNet - Defaults - out of bag", print=False)
display(pred_scor)

# Backup
dump(reg, "../../models/Chris/01_ElasticNet_Default.joblib")

Unnamed: 0,name,when,mse,rmse,mae,mape,r2
0,Raw baseline,30/Jan 22:10,0.13869,0.37242,0.27739,624623900000000.0,0.0
0,ElasticNet - Defaults - within bag,30/Jan 22:10,0.13913,0.373,0.27826,626594000000000.0,0.0
0,ElasticNet - Defaults - out of bag,30/Jan 22:10,0.13695,0.37006,0.27608,614284800000000.0,-8e-05


['../../models/Chris/01_ElasticNet_Default.joblib']

Marginally better... barely worth it...

### Attempt #2: ElasticNetCV

In [16]:
# Import model
from sklearn.linear_model import ElasticNetCV

# Instantiate
reg = ElasticNetCV()

# Train
reg.fit(feat, targ)

# Predict on validation data
pred = reg.predict(feat)

# Check performance
save_reg_perf(pred, targ, "ElasticNetCV - Defaults")

# Backup
dump(reg, "../../models/Chris/02_ElasticNetCV_Default.joblib")

Unnamed: 0,name,when,mse,rmse,mae,mape,r2
0,Raw baseline,30/Jan 22:10,0.13869,0.37242,0.27739,624623900000000.0,0.0
0,ElasticNet - Defaults - within bag,30/Jan 22:10,0.13913,0.373,0.27826,626594000000000.0,0.0
0,ElasticNet - Defaults - out of bag,30/Jan 22:10,0.13695,0.37006,0.27608,614284800000000.0,-8e-05
0,ElasticNetCV - Defaults,30/Jan 22:10,0.12821,0.35807,0.2609,577764000000000.0,0.07556


['../../models/Chris/02_ElasticNetCV_Default.joblib']

Slignt improvement. But nothing substantive yet.

### Attempt #3: Random Forest

In [17]:
# Import model
from sklearn.ensemble import RandomForestRegressor

# Instantiate
reg = RandomForestRegressor()

# Train
reg.fit(feat, targ)

# Predict on validation data
pred = reg.predict(feat)

# Check performance
save_reg_perf(pred, targ, "RandomForest - Defaults")

# Backup
dump(reg, "../../models/Chris/03_RandomForest_Default.joblib")

Unnamed: 0,name,when,mse,rmse,mae,mape,r2
0,Raw baseline,30/Jan 22:10,0.13869,0.37242,0.27739,624623900000000.0,0.0
0,ElasticNet - Defaults - within bag,30/Jan 22:10,0.13913,0.373,0.27826,626594000000000.0,0.0
0,ElasticNet - Defaults - out of bag,30/Jan 22:10,0.13695,0.37006,0.27608,614284800000000.0,-8e-05
0,ElasticNetCV - Defaults,30/Jan 22:10,0.12821,0.35807,0.2609,577764000000000.0,0.07556
0,RandomForest - Defaults,30/Jan 22:10,0.01875,0.13692,0.09825,206957300000000.0,0.86483


['../../models/Chris/03_RandomForest_Default.joblib']

**NOW** We're cookin with Gas!

### Attempt #4: Next

In [18]:
# Import model
from sklearn.ensemble import GradientBoostingRegressor

# Instantiate
reg = GradientBoostingRegressor()

# Train
reg.fit(feat, targ)

# Predict on validation data
pred = reg.predict(feat)

# Check performance
save_reg_perf(pred, targ, "GBM - Defaults")

# Backup
dump(reg, "../../models/Chris/04_GBM_Default.joblib")

Unnamed: 0,name,when,mse,rmse,mae,mape,r2
0,Raw baseline,30/Jan 22:10,0.13869,0.37242,0.27739,624623900000000.0,0.0
0,ElasticNet - Defaults - within bag,30/Jan 22:10,0.13913,0.373,0.27826,626594000000000.0,0.0
0,ElasticNet - Defaults - out of bag,30/Jan 22:10,0.13695,0.37006,0.27608,614284800000000.0,-8e-05
0,ElasticNetCV - Defaults,30/Jan 22:10,0.12821,0.35807,0.2609,577764000000000.0,0.07556
0,RandomForest - Defaults,30/Jan 22:10,0.01875,0.13692,0.09825,206957300000000.0,0.86483
0,GBM - Defaults,30/Jan 22:10,0.11536,0.33964,0.24124,543170600000000.0,0.16827


['../../models/Chris/04_GBM_Default.joblib']

## Submit to kaggle

In [19]:
# Define function to do the final processing
def final_prediction_processing(final_model_path, data_scaler_path, test_data_path, submission_data_path, check=False):
    
    # Import selected data
    final_model = load(final_model_path)
    data_scaler = load(data_scaler_path)
    test_data = pd.read_csv(test_data_path)

    # Process final data
    submission_data = test_data.pop("Id")
    test_data = data_scaler.fit_transform(test_data)

    # Get prediction
    final_prediction = final_model.predict(test_data)

    # Form final data
    submission_data = pd.concat([pd.DataFrame(submission_data), pd.DataFrame(final_prediction)], axis=1)
    submission_data.columns = ["Id", "TARGET_5Yrs"]
    submission_data["TARGET_5Yrs"] = submission_data["TARGET_5Yrs"].apply(lambda pred: 1 if pred > 0.5 else 0)

    # Check
    if check:
        display(submission_data)

    # Save to file
    submission_data.to_csv(submission_data_path, index=False)

    return submission_data

In [20]:
# Process the final data
submission_data = final_prediction_processing \
    ( final_model_path = "../../models/Chris/03_RandomForest_Default.joblib"
    , data_scaler_path = "../../models/Chris/scaler.joblib"
    , test_data_path = "../../data/raw/test.csv"
    , submission_data_path = "../../data/external/final.csv"
    )

In [21]:
# Function to call Kaggle API
def call_kaggle_api(upload_file, upload_message):
    """
    Call the Kaggle API to submit a file
    Inspiration: https://unix.stackexchange.com/questions/190495/how-to-execute-a-bash-command-in-a-python-script#answer-579917
    
    Args:
        upload_file (str): The source path where the file will be uploaded _from_.
        upload_message (str): The message that will be sent to Kaggle for this submission.

    Raises:
        Exception: CallProcessError, incase subprocess has a wobbly
        Exception: Exception, for everything else

    Returns:
        str: The confirmation string from the API call
    """

    import subprocess

    bash_call = "kaggle competitions submit --competition uts-advdsi-nba-career-prediction --file {upload_file} --message {upload_message} --quiet".format(upload_file=upload_file, upload_message=upload_message)

    try:        
        result = subprocess.run(bash_call, check=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as err:
        raise Exception(str(err.stderr.decode("utf-8")))
    except Exception as err:
        raise Exception(err)
    else:
        return result.stdout.decode("utf-8")

'Successfully submitted to [UTS AdvDSI] NBA Career Prediction'

In [None]:
# Call the API
call_kaggle_api     ( upload_file = "../../data/external/final.csv"
    , upload_message = "Ctrl-Alt-Elite_RandomForest_Defaults"
    )