In [12]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# feature selection
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# ML models
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBRegressor

# pipelines
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# performance metrics
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, precision_score, recall_score, r2_score

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

# import cleaned and transformed dataset
df = pd.read_csv("../data/clean_merged_df.csv")

df.columns

Index(['Unnamed: 0', 'Bail Status', 'Days in Court', 'Offence Location',
       'Date', 'Proceeding', 'Court', 'Offence Number', 'Has Lawyer',
       'Not in Custody', 'Custody Unknown', 'Is In Custody', 'Abbotsford',
       'New Westminster', 'North Vancouver', 'Port Coquitlam', 'Richmond',
       'Surrey', 'Vancouver', 'Victoria', 'Statute', 'Offence', 'Election',
       'Discharge Available', 'SS Available', 'CSO Available',
       'Maximum (Summary)(Years)', 'Maximum (Indictable)(Years)'],
      dtype='object')

In [13]:
df.drop(columns = ['Unnamed: 0', 'Bail Status', 'Offence Location',
                   'Date', 'Proceeding', 'Court', 'Offence Number',
                   'Statute', 'Offence'], inplace=True)

In [14]:
df.columns

Index(['Days in Court', 'Has Lawyer', 'Not in Custody', 'Custody Unknown',
       'Is In Custody', 'Abbotsford', 'New Westminster', 'North Vancouver',
       'Port Coquitlam', 'Richmond', 'Surrey', 'Vancouver', 'Victoria',
       'Election', 'Discharge Available', 'SS Available', 'CSO Available',
       'Maximum (Summary)(Years)', 'Maximum (Indictable)(Years)'],
      dtype='object')

Scale data

In [15]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Convert the NumPy array back to a pandas DataFrame
df_scaled = pd.DataFrame(df_scaled, columns = df.columns)

Train test split

In [16]:
# Split the data into features (X) and target (y)
X = df_scaled.drop('Days in Court', axis=1)
y = df_scaled['Days in Court']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train.dtypes

Has Lawyer                     float64
Not in Custody                 float64
Custody Unknown                float64
Is In Custody                  float64
Abbotsford                     float64
New Westminster                float64
North Vancouver                float64
Port Coquitlam                 float64
Richmond                       float64
Surrey                         float64
Vancouver                      float64
Victoria                       float64
Election                       float64
Discharge Available            float64
SS Available                   float64
CSO Available                  float64
Maximum (Summary)(Years)       float64
Maximum (Indictable)(Years)    float64
dtype: object

Run simple model - Linear Regreaaion using SKLearn

In [18]:
# Create and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Compute the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Compute the R-squared score
r2 = r2_score(y_test, y_pred)

# Compute the adjusted R-squared
adj_r2 = 1 - (1 -r2) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)

# print results
print("Mean squared error: ", mse)
print("R-squared: ", r2)
print("Adjusted R-squared: ", adj_r2)


Mean squared error:  0.7493550049630476
R-squared:  0.10455041572848622
Adjusted R-squared:  0.0675823136255338


Linear Regression using Statsmodels (which gives P values)

In [19]:
# Compute the p-values for each feature
# add a constant column to X_train (statsmodels' linear regression model doesn't automatically add this)
X_train_with_constant = sm.add_constant(X_train) 

# Train the linear regression model
LR_model = sm.OLS(y_train, X_train_with_constant)
LR_results = LR_model.fit()

# evaluate the model
p_values = LR_results.summary2().tables[1]['P>|t|']
r2 = LR_results.rsquared
adj_r2 = LR_results.rsquared_adj

# Print results
print(p_values)
print("R-squared:", r2)
print("Adjusted R-squared:", adj_r2)


const                          7.665502e-01
Has Lawyer                     2.126064e-33
Not in Custody                 4.215103e-01
Custody Unknown                4.557076e-01
Is In Custody                  8.163952e-01
Abbotsford                     7.630209e-02
New Westminster                9.781373e-03
North Vancouver                1.387660e-02
Port Coquitlam                 8.314737e-01
Richmond                       3.313304e-01
Surrey                         3.733154e-03
Vancouver                      1.728771e-01
Victoria                       5.518089e-01
Election                       6.252158e-07
Discharge Available            4.997173e-01
SS Available                   4.997173e-01
CSO Available                  4.997173e-01
Maximum (Summary)(Years)       4.490262e-03
Maximum (Indictable)(Years)    1.373527e-09
Name: P>|t|, dtype: float64
R-squared: 0.11506298767714773
Adjusted R-squared: 0.10819540554160445


In [20]:

# perform linear regression
X = df[['col1', 'col2', 'col3']]
y = df['target']
model = sm.OLS(y, sm.add_constant(X)).fit()

# print R-squared and p-values
print('R-squared:', model.rsquared)
print(model.summary())


KeyError: "None of [Index(['col1', 'col2', 'col3'], dtype='object')] are in the [columns]"

In [None]:
# create XGBoost model and fit to data
model = XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                      max_depth=5, alpha=10, n_estimators=50, seed=123)

model.fit(X_train, y_train)

# predict on test set and compute RMSE
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

# print results
print('RMSE:', rmse)

RMSE: 0.8634164260608546
