In [1]:
import pandas as pd
import numpy as np 
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read salary data
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display sample data
crime.head()

Unnamed: 0,Year,City,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,...,Percent 75 to 84 years,Percent 85 years and over,Violent Crimes Sum,Unemployment Rate,Income,House price mean,% All Families,Crime_Rate_per_100k,Percent Home Occupied,Percent Renter Occupied
0,2010,Alameda,5.5,4.5,17.0,19.9,5.8,31.0,16.3,73981.0,...,3.8,2.1,173.0,11.1,28011.0,605675.9,9.6,233.843825,46.920853,53.079147
1,2010,Alhambra,10.1,7.4,21.7,19.6,8.9,21.7,10.5,83202.0,...,4.2,3.0,185.0,9.2,28100.0,441616.7,10.7,222.350424,38.053381,61.946619
2,2010,Anaheim,15.5,10.7,24.3,18.8,6.7,18.0,6.0,337259.0,...,3.0,1.2,1161.0,14.4,100404.0,371899.4,12.4,344.245817,48.38851,51.61149
3,2010,Antioch,7.7,7.3,26.9,31.7,8.8,13.6,4.1,102745.0,...,2.4,1.1,864.0,15.3,30970.0,181427.3,8.0,840.916833,62.767194,37.232806
4,2010,Apple Valley,1.5,7.7,33.4,26.9,11.0,10.5,9.0,69387.0,...,6.4,2.2,188.0,12.3,23900.0,138720.2,10.4,270.944125,70.690377,29.309623


In [3]:
# Define features set
X = crime.copy()
X.drop(columns=["Crime_Rate_per_100k", "City"], axis=1, inplace=True)
X.head()

Unnamed: 0,Year,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,Percent Under 5 years,...,Percent 65 to 74 years,Percent 75 to 84 years,Percent 85 years and over,Violent Crimes Sum,Unemployment Rate,Income,House price mean,% All Families,Percent Home Occupied,Percent Renter Occupied
0,2010,5.5,4.5,17.0,19.9,5.8,31.0,16.3,73981.0,6.9,...,5.8,3.8,2.1,173.0,11.1,28011.0,605675.9,9.6,46.920853,53.079147
1,2010,10.1,7.4,21.7,19.6,8.9,21.7,10.5,83202.0,4.7,...,9.0,4.2,3.0,185.0,9.2,28100.0,441616.7,10.7,38.053381,61.946619
2,2010,15.5,10.7,24.3,18.8,6.7,18.0,6.0,337259.0,7.5,...,5.2,3.0,1.2,1161.0,14.4,100404.0,371899.4,12.4,48.38851,51.61149
3,2010,7.7,7.3,26.9,31.7,8.8,13.6,4.1,102745.0,8.3,...,6.3,2.4,1.1,864.0,15.3,30970.0,181427.3,8.0,62.767194,37.232806
4,2010,1.5,7.7,33.4,26.9,11.0,10.5,9.0,69387.0,6.3,...,6.3,6.4,2.2,188.0,12.3,23900.0,138720.2,10.4,70.690377,29.309623


In [4]:
y = crime["Crime_Rate_per_100k"].values.reshape(-1, 1)
y[:5]

array([[233.84382477],
       [222.35042427],
       [344.24581701],
       [840.91683294],
       [270.94412498]])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=8)

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

In [7]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [8]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create the random forest classifier instance
regressor = RandomForestRegressor(n_estimators=1000, random_state=8)

In [10]:
# Fit the model and use .ravel()on the "y_train" data. 
regressor = regressor.fit(X_train_scaled, y_train.ravel())

In [11]:
# Making predictions using the testing data
predictions = regressor.predict(X_test_scaled)


In [12]:
# Check if 'predictions' is not 1-dimensional and reshape if necessary
if predictions.ndim != 1:
    predictions = predictions.flatten()

# Check if 'y_test' is not 1-dimensional and reshape if necessary
if y_test.ndim != 1:
    y_test = y_test.flatten()

In [13]:
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,148.842184,119.793729
1,360.342158,359.563884
2,159.626737,168.934525
3,394.407582,355.505368
4,486.940379,433.151080
...,...,...
287,262.212264,257.247350
288,275.952386,259.829255
289,85.622477,43.874572
290,522.595103,524.911548


In [14]:
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 24.495324503519022
Mean Squared Error (MSE): 2626.3947551344836
Root Mean Squared Error (RMSE): 51.24836343859659
R-squared (R²): 0.9609398008619956




In [15]:
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(mae, 2), 'degrees.')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 24.5 degrees.
Accuracy: 93.29 %.
