In [1]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Data Science
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Others
from tqdm import tqdm
from timeit import default_timer as timer
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
sample_size = [50000, 100000, 500000, 1000000, 2000000, 3000000, 4062364]

gpu_dict = {'tree_method':'gpu_hist'}
ml_list = [RandomForestRegressor(n_jobs=-1, max_depth=8),
           ExtraTreesRegressor(n_jobs=-1, max_depth=8),
           DecisionTreeRegressor(max_depth=8),
           xgb.XGBRegressor(**gpu_dict, max_depth=8),
           LinearRegression(n_jobs=-1)]

results_dict = {'ML':[],
                'Sample_size':[],
                'Training_time':[],
                'Testing_time':[],
                'Training_score':[],
                'Testing_score':[],
                'RMSE':[],
                'MAE':[]}

In [3]:
for s in tqdm(sample_size, colour='Green'):
    for regressor in tqdm(ml_list, desc='ML Loop', colour='Blue'):

        df = pd.read_csv('2011-2022_mergedcrimerate.csv')
        df = df.drop(['Month', 'LSOA code', 'Count', 'AS Score', 'Inflation_rate', 'Indoors Score'], axis=1)
        df = df.sample(s, random_state = 2)

        X = df.drop(columns=['Crime Rate']).values
        y = df['Crime Rate'].values
        # Choose any random state
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=17)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        start = timer()
        regressor.fit(X_train, y_train)
        end = timer()

        results_dict['Training_time'].append(end-start)
        results_dict['Sample_size'].append(s)
        results_dict['ML'].append(str(regressor)[0:10])

        insample_predictions = regressor.predict(X_train)
        results_dict['Training_score'].append(r2_score(y_train,insample_predictions))

        start = timer()
        outsample_predictions = regressor.predict(X_test)
        end = timer()
        results_dict['Testing_time'].append(end-start)
        
        results_dict['Testing_score'].append(r2_score(y_test,outsample_predictions))
        results_dict['RMSE'].append(np.sqrt(mean_squared_error(y_test,outsample_predictions)))
        results_dict['MAE'].append(mean_absolute_error(y_test,outsample_predictions))

        del df, X, y, X_train, X_test, y_train, y_test, start, end, regressor, insample_predictions, outsample_predictions

ML Loop: 100%|[34m██████████[0m| 5/5 [00:39<00:00,  7.84s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [00:38<00:00,  7.76s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [01:06<00:00, 13.30s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [01:54<00:00, 22.97s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [03:42<00:00, 44.52s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [05:33<00:00, 66.73s/it]
ML Loop: 100%|[34m██████████[0m| 5/5 [07:14<00:00, 86.82s/it]
100%|[32m██████████[0m| 7/7 [20:49<00:00, 178.54s/it]


In [4]:
results_df = pd.DataFrame.from_dict(results_dict)
results_df.to_csv('Benchmark_results_allmlmodels_v2.csv', index=False)