## Predicting which businesses are unlikely to pay their fines

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
# Upload the dataset only when it is not already in the working directory.
# Re-uploading a file that already exists makes Colab store the new copy under
# a suffixed name (e.g. "ML_P09_4fined_legal_entities (3).csv"), while the
# loading cell keeps reading the original un-suffixed file, so the fresh
# upload would be silently ignored and stale copies would pile up.
import os

if not os.path.exists('ML_P09_4fined_legal_entities.csv'):
    from google.colab import files  # only importable inside Google Colab
    files.upload()

Saving ML_P09_4fined_legal_entities.csv to ML_P09_4fined_legal_entities (3).csv


In [None]:
# Read the uploaded CSV into a DataFrame.
# NOTE(review): the file is decoded as ISO-8859-1, presumably because it is
# not valid UTF-8 — confirm against the data source.
csv_path = 'ML_P09_4fined_legal_entities.csv'
data = pd.read_csv(csv_path, encoding='iso-8859-1')

In [None]:
# Show (number of columns, number of rows) of the loaded frame.
data.shape[::-1]

(96, 20820)

In [None]:
# Summarise the raw target column. A notebook cell only auto-displays its last
# expression, so intermediate summaries have to be printed explicitly.
print(data['fine_paid_p'].describe())

# Keep only the rows whose target is exactly 0 or 1.
data = data[data['fine_paid_p'].isin([0, 1])]
print(data['fine_paid_p'].value_counts())

# Shuffle reproducibly, then keep the first MAX_ROWS rows: the full data set
# currently runs out of memory further down the notebook.
# (A class-balanced sample of 500 rows per class was tried as an alternative.)
MAX_ROWS = 1000
data = data.sample(frac=1, random_state=42).iloc[:MAX_ROWS, :]

# Remove the fine-count columns in a single call.
# NOTE(review): presumably these are dropped because they are derived from the
# payment outcome and would leak the target — confirm.
data = data.drop(columns=['fine_0_count', 'fine_1_count', 'fine_count'])

# Final (number of columns, number of rows) used for modelling.
len(data.columns), len(data)

(93, 1000)

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility),
# then separate the 'fine_paid_p' target from the features in each partition.
train, test = train_test_split(data, test_size=0.2, random_state=42)
(X_train, y_train), (X_test, y_test) = (
    (partition.drop(columns='fine_paid_p'), partition['fine_paid_p'])
    for partition in (train, test)
)

In [None]:
# --- Feature preprocessing ---------------------------------------------------
# Split the feature columns by dtype; numeric and text columns are imputed
# differently. Only int64/float64/object columns are used: columns of any other
# dtype do not end up in the model matrices built below.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Numeric columns without a single observed value in the training split have no
# mean. SimpleImputer skips such columns with a warning and returns fewer
# columns than `numerical_cols` lists. Removing them up front yields the same
# feature matrix, avoids the warning and keeps `numerical_cols` aligned with
# the columns of the imputed arrays.
has_observed_value = X_train[numerical_cols].notna().any(axis=0).to_numpy()
numerical_cols = numerical_cols[has_observed_value]

# Every transformer below is fitted on the training split only and then applied
# unchanged to the test split, so no test-set statistics reach the models.
numerical_imputer = SimpleImputer(strategy='mean')
X_train_numerical = numerical_imputer.fit_transform(X_train[numerical_cols])
X_test_numerical = numerical_imputer.transform(X_test[numerical_cols])

categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train_categorical = categorical_imputer.fit_transform(X_train[categorical_cols])
X_test_categorical = categorical_imputer.transform(X_test[categorical_cols])

# handle_unknown='ignore' lets the test split contain categories that were
# never seen during fitting without raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_categorical_encoded = encoder.fit_transform(X_train_categorical)
X_test_categorical_encoded = encoder.transform(X_test_categorical)

# Final model matrices: imputed numeric columns followed by the one-hot columns.
X_train_processed = np.concatenate([X_train_numerical, X_train_categorical_encoded], axis=1)
X_test_processed = np.concatenate([X_test_numerical, X_test_categorical_encoded], axis=1)

# --- Baseline ensemble models (default hyper-parameters, fixed seed) ---------
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_processed, y_train)
rf_predictions = rf_model.predict(X_test_processed)

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_processed, y_train)
gb_predictions = gb_model.predict(X_test_processed)

 'Y3...2020' 'HM...2017' 'I2...2023' 'G1...2024' 'Y3...2019' 'X2...2021'
 'Y2...2022' 'I1...2021' 'HM...2020' 'Y2...2023' 'G1...2023' 'Y3...2018'
 'HM...2013']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Y3...2020' 'HM...2017' 'I2...2023' 'G1...2024' 'Y3...2019' 'X2...2021'
 'Y2...2022' 'I1...2021' 'HM...2020' 'Y2...2023' 'G1...2023' 'Y3...2018'
 'HM...2013']. At least one non-missing value is needed for imputation with strategy='mean'.


In [None]:
# Hyper-parameter grid for XGBoost: 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
}

# Seeded binary-logistic classifier that the search clones for each candidate.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training split; fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train_processed, y_train)

# Predict the held-out rows with the best estimator found by the search.
best_xgb_model = grid_search.best_estimator_
xgb_predictions = best_xgb_model.predict(X_test_processed)

In [None]:
# Majority-class baseline: predict the most frequent training label for every
# test row. The label is taken from y_train instead of being hard-coded to 0,
# so the baseline stays meaningful if the class balance of the sample changes.
# On a tie, mode() lists the smaller label first.
majority_class = y_train.mode().iloc[0]
naive_predictions = np.full(X_test_processed.shape[0], majority_class)

In [None]:
# Score every set of predictions against the held-out labels in one pass,
# then report them in the same order: baseline first, models after.
naive_accuracy, rf_accuracy, gb_accuracy, xgb_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (naive_predictions, rf_predictions, gb_predictions, xgb_predictions)
)

print(f"Naive Predictor Accuracy: {naive_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Gradient Boosting Accuracy: {gb_accuracy}")
print(f"XGBoost Accuracy: {xgb_accuracy}")

Naive Predictor Accuracy: 0.765
Random Forest Accuracy: 0.9
Gradient Boosting Accuracy: 0.89
XGBoost Accuracy: 0.895
