In [1]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Read the cleaned city-level dataset (education, age, housing and
# violent-crime columns) from a CSV located next to the notebook
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display sample data
crime.head()

Unnamed: 0,Year,City,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,...,Percent 55 to 59 years,Percent 60 to 64 years,Percent 65 to 74 years,Percent 75 to 84 years,Percent 85 years and over,Violent Crimes Sum,% All Families,House Price Mean,Percent Home Occupied,Percent Renter Occupied
0,2015,Alameda,3.6,3.6,12.3,18.7,7.3,36.0,18.5,78614.0,...,7.3,6.4,8.3,4.6,2.0,148.0,5.2,789464.7,44.102898,55.897102
1,2015,Alhambra,8.3,7.6,28.8,14.5,6.0,22.9,11.9,85572.0,...,8.9,6.0,9.0,3.8,2.6,168.0,11.8,524186.5,41.495811,58.504189
2,2015,Anaheim,14.4,11.4,23.6,21.1,6.4,17.4,5.8,350738.0,...,5.4,4.0,5.8,2.9,1.6,1271.0,11.8,479128.8,43.471912,56.528088
3,2015,Bakersfield,10.6,10.2,28.8,21.1,7.9,13.9,7.5,373627.0,...,5.4,4.5,6.1,2.5,0.9,1810.0,14.3,214306.0,57.775194,42.224806
4,2015,Baldwin Park,22.1,12.3,31.1,14.3,6.3,10.8,3.2,77056.0,...,6.1,4.8,9.0,2.8,1.2,299.0,14.4,343992.4,54.430721,45.569279


In [3]:
# Define features set: every column except the target ("Violent Crimes Sum")
# and the non-numeric "City" label. drop() returns a new frame, so `crime`
# itself is left untouched.
X = crime.drop(columns=["Violent Crimes Sum", "City"])
X.head()

Unnamed: 0,Year,Percent Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - High School Graduate (and equivalent),"Percent Population 25 years and over - Some college, no degree",Percent Population 25 years and over - Associate's degree,Percent Population 25 years and over - Bachelor's degree,Percent Population 25 years and over - Graduate or Prefessional Degree,Total population,Percent Under 5 years,...,Percent 45 to 54 years,Percent 55 to 59 years,Percent 60 to 64 years,Percent 65 to 74 years,Percent 75 to 84 years,Percent 85 years and over,% All Families,House Price Mean,Percent Home Occupied,Percent Renter Occupied
0,2015,3.6,3.6,12.3,18.7,7.3,36.0,18.5,78614.0,5.3,...,16.0,7.3,6.4,8.3,4.6,2.0,5.2,789464.7,44.102898,55.897102
1,2015,8.3,7.6,28.8,14.5,6.0,22.9,11.9,85572.0,4.5,...,14.2,8.9,6.0,9.0,3.8,2.6,11.8,524186.5,41.495811,58.504189
2,2015,14.4,11.4,23.6,21.1,6.4,17.4,5.8,350738.0,7.1,...,13.4,5.4,4.0,5.8,2.9,1.6,11.8,479128.8,43.471912,56.528088
3,2015,10.6,10.2,28.8,21.1,7.9,13.9,7.5,373627.0,7.9,...,11.2,5.4,4.5,6.1,2.5,0.9,14.3,214306.0,57.775194,42.224806
4,2015,22.1,12.3,31.1,14.3,6.3,10.8,3.2,77056.0,6.0,...,14.6,6.1,4.8,9.0,2.8,1.2,14.4,343992.4,54.430721,45.569279


In [4]:
# Target: the violent-crime totals as an (n, 1) column vector
y = crime["Violent Crimes Sum"].to_numpy().reshape(-1, 1)
y[:5]

array([[ 148.],
       [ 168.],
       [1271.],
       [1810.],
       [ 299.]])

In [5]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
# Create the StandardScaler instance (unfitted at this point; it is fitted
# on the training features in the next cell)
scaler = StandardScaler()

In [7]:
# Fit the Standard Scaler with the training data only, so that no statistics
# from the test split influence the scaling
X_scaler = scaler.fit(X_train)

In [8]:
# Scale both the training and the testing features with the scaler that was
# fitted on the training data
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [9]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=1000)

In [10]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [11]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [12]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

labels = [f"Class {i}" for i in range(len(cm))]

cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [13]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Class 5,Class 6,Class 7,Class 8,Class 9,...,Class 219,Class 220,Class 221,Class 222,Class 223,Class 224,Class 225,Class 226,Class 227,Class 228
Class 0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Class 224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Class 225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 226,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Class 227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Accuracy Score : 0.006535947712418301
Classification Report
              precision    recall  f1-score   support

        43.0       0.00      0.00      0.00         1
        56.0       0.00      0.00      0.00         0
        65.0       0.00      0.00      0.00         0
        68.0       0.00      0.00      0.00         1
        69.0       0.00      0.00      0.00         0
        72.0       0.00      0.00      0.00         1
        86.0       0.00      0.00      0.00         0
        91.0       0.00      0.00      0.00         1
        95.0       0.00      0.00      0.00         0
       100.0       0.00      0.00      0.00         0
       105.0       0.00      0.00      0.00         1
       108.0       0.00      0.00      0.00         1
       109.0       0.00      0.00      0.00         2
       110.0       0.00      0.00      0.00         2
       112.0       0.00      0.00      0.00         1
       114.0       0.00      0.00      0.00         1
       117.0       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
