# Pre-processing and Training
In this notebook I complete step 4 of my capstone project, where I need to:

Create dummy or indicator features for categorical variables

Standardize the magnitude of numeric features using a scaler

Split the data into training and testing datasets


In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
# Location of the wrangled workbook. Set the CAPSTONE_DATA_PATH environment
# variable to load it from somewhere else; when the variable is unset the
# original local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'CAPSTONE_DATA_PATH',
    r'/Users/carlriemann/Documents/GitHub/Capstone-Two-EDA/DataWrangling2.0.xlsx',
)

# Read the Excel workbook into a DataFrame.
df = pd.read_excel(DATA_PATH)

In [3]:
# Check the size of the loaded frame as (rows, columns).
df.shape

(272, 86)

In [4]:
# List the column labels available in the loaded frame.
print(df.columns)

Index(['Player', 'Age', 'Age_last_year', 'Age_year_before_last', 'G',
       'G_last_year', 'G_year_before_last', 'GS', 'GS_last_year',
       'GS_year_before_last', 'MP', 'MP_last_year', 'MP_year_before_last',
       'FG', 'FG_last_year', 'FG_year_before_last', 'FGA', 'FGA_last_year',
       'FGA_year_before_last', 'FG%', 'FG%_last_year', 'FG%_year_before_last',
       '3P', '3P_last_year', '3P_year_before_last', '3PA', '3PA_last_year',
       '3PA_year_before_last', '3P%', '3P%_last_year', '3P%_year_before_last',
       '2P', '2P_last_year', '2P_year_before_last', '2PA', '2PA_last_year',
       '2PA_year_before_last', '2P%', '2P%_last_year', '2P%_year_before_last',
       'eFG%', 'eFG%_last_year', 'eFG%_year_before_last', 'FT', 'FT_last_year',
       'FT_year_before_last', 'FTA', 'FTA_last_year', 'FTA_year_before_last',
       'FT%', 'FT%_last_year', 'FT%_year_before_last', 'ORB', 'ORB_last_year',
       'ORB_year_before_last', 'DRB', 'DRB_last_year', 'DRB_year_before_last',
       '

In [5]:
# Keep only the object/category-typed columns, then report their labels.
non_numeric_frame = df.select_dtypes(include=['object', 'category'])
categorical_columns = non_numeric_frame.columns
print("Categorical Columns:", categorical_columns)

Categorical Columns: Index(['Player', 'Tm', 'Season'], dtype='object')


The main goal of this step is to clean, transform, and split the data so that it is ready for training machine learning models.

In [6]:
# Drop the 'Player' and 'Tm' columns; of the three categorical columns only
# 'Season' is kept and encoded below.
# errors='ignore' makes this cell safe to re-run: without it a second run
# raises KeyError because the columns were already removed from `df`.
df = df.drop(columns=['Player', 'Tm'], errors='ignore')

# Apply dummy encoding to the 'Season' column. drop_first=True omits the first
# level, so k season values produce k-1 indicator columns.
categorical_columns = ['Season']
df_with_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [7]:
# Columns whose magnitudes are standardized before modelling.
numerical_features = [
    'PTS', 'PTS_last_year', 'FG', 'FGA', 'FG_last_year', 'FGA_last_year',
    '2PA', '2P', '2PA_last_year', 'FT_last_year', 'FT', 'FTA',
    'FTA_last_year', '2P_last_year', 'PTS_year_before_last', 'TOV',
    'FT_year_before_last', 'FG_year_before_last', 'TOV_last_year',
    'FGA_year_before_last', 'FTA_year_before_last', 'MP', 'AST'
]

# Step 1: Work out which rows will form the training set.
# The scaler must not see test rows, otherwise test-set statistics leak into
# the features. train_test_split picks rows from the sample count and
# random_state alone, so using the same test_size=0.2 / random_state=42 as the
# train/test split cell yields exactly the rows that end up in X_train.
row_positions = np.arange(len(df_with_dummies))
train_positions, holdout_positions = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Step 2: Initialize the scaler
scaler = StandardScaler()

# Step 3: Fit the scaler on the training rows only
scaler.fit(df_with_dummies.iloc[train_positions][numerical_features])

# Step 4: Apply the training-set mean/variance to every row (train and test)
df_with_dummies[numerical_features] = scaler.transform(df_with_dummies[numerical_features])

In [8]:
# Separate the target variable from the feature matrix.
y = df_with_dummies['Points_scored_next_season']
X = df_with_dummies.drop(columns=['Points_scored_next_season'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each resulting piece.
for shape_label, split_part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(shape_label, split_part.shape)

X_train shape: (217, 83)
X_test shape: (55, 83)
y_train shape: (217,)
y_test shape: (55,)


In [9]:
# Fit a linear regression on the training split (fit() returns the fitted estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 17.110501017613228
R-squared: 0.6772721487106346
