# Logistic Regression (baseline)

Any player who has never competed in an NBA game prior to that year is considered a rookie in the league. The NBA presents the NBA Rookie of the Year Award to the top rookie at the conclusion of the regular season. A basketball player's transition to the NBA is a significant event. Sports analysts and fans eagerly follow the beginning of these players' careers to monitor and predict how they will perform in the future. This notebook uses Logistic Regression to predict whether or not a player will last five years in the league based on his stats.

The steps are:
1. Load Data
2. Data Cleaning and Exploration
3. Train LR Classifier
4. Evaluation
5. Feature importance
6. Prediction

In [1]:
import pandas as pd
import numpy as np
import shap
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Qt5Agg') 
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
### 1. Setup Repository

In [10]:
cd /Users/frodo/desktop/36114/project/adv_dsi_2023

C:\Users\frodo\desktop\36114\project\adv_dsi_2023


In [8]:
import os
os.getcwd()

'C:\\Users\\frodo\\desktop\\36114\\project\\adv_dsi_2023'

In [11]:
cookiecutter https://github.com/drivendata/cookiecutter-data-science

SyntaxError: invalid syntax (3580079992.py, line 1)

In [6]:
cd adv_dsi_2023

[WinError 2] 系统找不到指定的文件。: 'adv_dsi_2023'
C:\Users\frodo\desktop\36114\project\adv_dsi_2023


## 1. Load data

In [None]:
# Load the training dataset from "nba_train.csv" (path is relative to the current working directory)
df = pd.read_csv("nba_train.csv")

### Define X and Y

In [None]:
# Remove the target column 'TARGET_5Yrs' from df and keep it as y
# (pop mutates df, so the remaining columns are the features)
y = df.pop('TARGET_5Yrs')

# Use the 'Id' column as the index of df, which moves it out of the feature columns
df = df.set_index(['Id'])

### Distribution of Y

In [None]:
y.value_counts()

In [None]:
# Plot the relative frequency of each target class as a bar chart
# NOTE: `sizes` holds the return value of .plot(...), not the class ratios themselves
sizes = y.value_counts(normalize=True).plot(kind="bar", color=["red", "blue"])

### Display the first 5 rows of train data

In [None]:
df.head()

### Display the shape of train data

In [None]:
df.shape

There are 8000 observations with 20 features, and there are no missing values in the training dataset.

In [None]:
df.info()

In [None]:
df.isna().any()

In [None]:
df.isnull().any()

### Description of data

In [None]:
df.describe()

We can see that there are some negative values in our dataset. Such statistics are not normally negative, so we will handle this problem in the data preprocessing step.

## 2. Data Preparation

We will do data cleaning and data exploration together, because in our experiment data exploration goes hand in hand with the data cleaning process.

### Split data

In [None]:
# Split the dataset into train and validation sets:
# 20% of the rows are held out (named X_test / y_test here), stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42, stratify=y)

### Correlation matrix

In [None]:
#Correlation matrix
corr_matrix = X_train.corr()

In [None]:
# Draw the training-set correlation matrix as an annotated heatmap (cell values shown to 2 decimals)
fig, ax = plt.subplots(figsize=(16,10))
# `ax` is rebound to the value returned by sns.heatmap
ax = sns.heatmap(corr_matrix, annot=True, linewidths=0.5, fmt=".2f", cmap="YlGnBu")

Some variables are highly correlated with each other, so we drop several of them (`FGA`, `3PA`, `FTA` and `REB`).

In [None]:
X_train = X_train.drop(['FGA', '3PA', 'FTA', 'REB'], axis = 1)

## 3. Train model

We define a custom preprocessor to ensure that there are no negative values in our training dataset.

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces negative feature values with zero.

    ``fit`` learns nothing; ``transform`` floors every value below zero at
    zero and leaves all other values unchanged.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged, since there is nothing to learn.

        ``X`` and ``y`` are accepted but not used.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which every negative entry is set to 0.

        Parameters
        ----------
        X : object supporting ``.copy()``, ``X < 0`` and boolean-mask
            assignment (the notebook passes pandas DataFrames).

        Returns
        -------
        A new object of the same kind as ``X``; the caller's ``X`` is left
        untouched.
        """
        # Work on a copy: assigning into the caller's object would silently
        # rewrite the data passed to fit/predict (training, validation and
        # test frames alike).
        X = X.copy()
        X[X < 0] = 0
        return X

### Define training pipeline

In [None]:
# Create Pipeline: the custom preprocessor runs first, then the classifier.
# The classifier uses balanced class weights and a fixed random seed.
pipeline = Pipeline([
    ('preprocessor', CustomPreprocessor()),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42))
])

### Grid Search

In [None]:
# Specify the parameter grid for the 'classifier' pipeline step:
# 6 values of C x 2 penalties x 2 solvers = 24 candidate settings
gs_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']  # solvers that support both 'l1' and 'l2' penalties
}

# Create a GridSearchCV instance with the pipeline and parameter grid.
# Candidates are ranked by ROC AUC over 5 cross-validation folds;
# error_score='raise' makes a failing fit raise instead of being scored.
grid_search = GridSearchCV(pipeline, gs_param_grid, scoring='roc_auc', cv=5, 
                           verbose=2, n_jobs=-1, error_score='raise')

# Fit the GridSearchCV on the training split
grid_search.fit(X_train, y_train)

# Get the best parameters and the corresponding best cross-validated score
gs_best_params = grid_search.best_params_
gs_best_score = grid_search.best_score_

print("Best Parameters:", gs_best_params)
print("Best Score:", gs_best_score)

### Random Search

In [None]:
# Candidate hyperparameter values for the 'classifier' pipeline step
# (24 possible combinations; the search below samples n_iter=10 of them)
rs_param_grid = {
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear', 'saga']  # solvers that support both 'l1' and 'l2' penalties
}

# Create a RandomizedSearchCV instance with the pipeline and parameter grid.
# random_state fixes which candidates are sampled, so the reported best
# parameters and score are reproducible from run to run.
random_search = RandomizedSearchCV(pipeline, param_distributions=rs_param_grid, n_iter=10,
                                   scoring='roc_auc', cv=5, verbose=2, n_jobs=-1,
                                   error_score='raise', random_state=42)

# Fit the RandomizedSearchCV on the training split
random_search.fit(X_train, y_train)

# Get the best parameters and the corresponding best cross-validated score
rs_best_params = random_search.best_params_
rs_best_score = random_search.best_score_

print("Best Parameters:", rs_best_params)
print("Best Score:", rs_best_score)

## 4. Evaluation on validation data

We apply the trained model to the validation data to measure its performance.

In [None]:
# Drop the same four columns that were removed from X_train so the validation
# features match what the model was fitted on.
# NOTE(review): X_test is rebound here, so re-running this cell drops columns that no longer exist.
X_test = X_test.drop(['FGA', '3PA', 'FTA', 'REB'], axis = 1)

# Use the best pipeline found by the grid search (the randomized search result is not used here)
best_model = grid_search.best_estimator_
# Predicted labels for the validation split (the next cell computes them again)
y_pred = best_model.predict(X_test)

In [None]:
# Predicted labels and column-1 probabilities for the validation split
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Label-based metrics are computed from y_pred; AUC is computed from the probabilities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# Report every metric on its own line, rounded to four decimals
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    f"AUC: {auc:.4f}",
    sep="\n",
)

### Classification Report

In [None]:
print(classification_report(y_test, y_pred))

### Plot ROC curve

In [None]:
# Column-1 probabilities on the validation split, computed once and reused for
# the AUC instead of calling predict_proba a second time on the same model
y_pred_proba = best_model.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)

In [None]:
# False/true positive rates for the ROC curve; the third return value is discarded
fpr, tpr, _ = roc_curve(y_test,  y_pred_proba)
# AUC shown in the legend label
auc = roc_auc_score(y_test, y_pred_proba)

# Model ROC curve
plt.plot(fpr,tpr,label='LR AUC %0.4f' % auc, color='blue', lw = 2)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
# Upper y-limit slightly above 1.0 leaves a margin above the top of the curve
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating Curve')
plt.legend(loc="lower right")
plt.show()

## 5. Feature importance

In [None]:
# Coefficients of the fitted classifier step are used as the "feature importance".
# NOTE(review): the pipeline has no scaling step, so coefficient magnitudes depend on
# each feature's units and may not be directly comparable across features.
feature_importance = best_model.named_steps['classifier'].coef_
# Column names of the training frame, paired positionally with the coefficients below
feature_names = X_train.columns.tolist()
# Map each feature name to its coefficient (first row of coef_)
result = dict(zip(feature_names, feature_importance[0]))

### Plot the feature importances

In [None]:
# One bar per feature; bar height is the fitted coefficient for that feature
coefficients = feature_importance[0]
bar_positions = np.arange(len(coefficients))

plt.figure(figsize=(20, 10))
plt.bar(bar_positions, coefficients, align='center')
# Label each bar with its feature name, rotated so long names stay readable
plt.xticks(bar_positions, feature_names, rotation=90)
plt.xlabel('Feature Names')
plt.ylabel('Feature Importance')
plt.title('Feature Importances')
plt.show()

## 6. Prediction

Finally, we apply the trained model to the test data and upload our result to obtain a score.

### Load test data

In [None]:
# Read the test set, index it by 'Id', and drop the same four columns
# that were removed from the training features
df_test = (
    pd.read_csv("nba_test.csv")
    .set_index(['Id'])
    .drop(['FGA', '3PA', 'FTA', 'REB'], axis = 1)
)

### Obtain predict value

In [None]:
# Predict on the test set, using the same fitted pipeline (best_model) for both outputs
# Predicted class labels
y_label = best_model.predict(df_test)
# Predicted probability taken from column index 1 of predict_proba
y_score = best_model.predict_proba(df_test)[:, 1]