In [8]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [9]:
# Read the cleaned city-level dataset (it includes the "Violent Crimes Sum"
# column used as the target later on) from a CSV in the working directory
file_path = Path("All_data_cleaned.csv")
crime = pd.read_csv(file_path)

# Display sample data
crime.head()

Unnamed: 0,Year,City,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",...,75 to 84 years,Percent 75 to 84 years,85 years and over,Percent 85 years and over,Violent Crimes Sum,% All Families,House Price Mean,Total,Total Occupied,Renter occupied
0,2015,Alameda,2089,3.6,2089,3.6,7242,12.3,11010,18.7,...,3599,4.60%,1541,2.00%,148.0,5.2,789464.7,30710,13544,17166
1,2015,Alhambra,5402,8.3,4923,7.6,18760,28.8,9435,14.5,...,3237,3.80%,2210,2.60%,168.0,11.8,524186.5,29362,12184,17178
2,2015,Anaheim,32506,14.4,25762,11.4,53477,23.6,47633,21.1,...,10167,2.90%,5608,1.60%,1271.0,11.8,479128.8,99991,43468,56523
3,2015,Bakersfield,23326,10.6,22434,10.2,63568,28.8,46592,21.1,...,9168,2.50%,3276,0.90%,1810.0,14.3,214306.0,114383,66085,48298
4,2015,Baldwin Park,11082,22.1,6156,12.3,15636,31.1,7172,14.3,...,2120,2.80%,933,1.20%,299.0,14.4,343992.4,18541,10092,8449


In [10]:
# Columns that must be numeric but can arrive as text because their values
# carry thousands separators (",") or a trailing percent sign (e.g. "4.60%").
columns_to_convert = [
    'Total population', 'Under 5 years', 'Percent Under 5 years',
    '5 to 9 years', 'Percent 5 to 9 years', '10 to 14 years',
    'Percent 10 to 14 years', '15 to 19 years', 'Percent 15 to 19 years',
    '20 to 24 years', 'Percent 20 to 24 years', '25 to 34 years',
    'Percent 25 to 34 years', '35 to 44 years', 'Percent 35 to 44 years',
    '45 to 54 years', 'Percent 45 to 54 years', '55 to 59 years',
    'Percent 55 to 59 years', '60 to 64 years', 'Percent 60 to 64 years',
    '65 to 74 years', 'Percent 65 to 74 years', '75 to 84 years',
    'Percent 75 to 84 years', '85 years and over', 'Percent 85 years and over',
    'Total', 'Total Occupied', 'Renter occupied'
]

# Coerce every listed column to float. The string clean-up is skipped for
# columns that are already numeric, so this cell can be re-run safely: the
# `.str` accessor raises on float columns, which is what a second run of the
# unguarded version hits.
for column in columns_to_convert:
    values = crime[column]
    if not pd.api.types.is_numeric_dtype(values):
        # Drop "," and "%" so the remaining text parses as a number.
        values = values.str.replace(',|%', '', regex=True)
    crime[column] = values.astype(float)

In [11]:
# Feature set: every column of `crime` except "Violent Crimes Sum" and "City".
# drop() returns a new frame, so `crime` itself is left unchanged.
X = crime.drop(columns=["Violent Crimes Sum", "City"])
X.head()

Unnamed: 0,Year,Population 25 years and over - Less than 9th grade,Percent Population 25 years and over - Less than 9th grade,Population 25 years and over - 9th to12th (No Diploma),Percent Population 25 years and over - 9th to12th (No Diploma),Population 25 years and over - High School Graduate (and equivalent),Percent Population 25 years and over - High School Graduate (and equivalent),"Population 25 years and over - Some college, no degree","Percent Population 25 years and over - Some college, no degree",Population 25 years and over - Associate's degree,...,Percent 65 to 74 years,75 to 84 years,Percent 75 to 84 years,85 years and over,Percent 85 years and over,% All Families,House Price Mean,Total,Total Occupied,Renter occupied
0,2015,2089,3.6,2089,3.6,7242,12.3,11010,18.7,4306,...,8.3,3599.0,4.6,1541.0,2.0,5.2,789464.7,30710.0,13544.0,17166.0
1,2015,5402,8.3,4923,7.6,18760,28.8,9435,14.5,3932,...,9.0,3237.0,3.8,2210.0,2.6,11.8,524186.5,29362.0,12184.0,17178.0
2,2015,32506,14.4,25762,11.4,53477,23.6,47633,21.1,14397,...,5.8,10167.0,2.9,5608.0,1.6,11.8,479128.8,99991.0,43468.0,56523.0
3,2015,23326,10.6,22434,10.2,63568,28.8,46592,21.1,17529,...,6.1,9168.0,2.5,3276.0,0.9,14.3,214306.0,114383.0,66085.0,48298.0
4,2015,11082,22.1,6156,12.3,15636,31.1,7172,14.3,3162,...,9.0,2120.0,2.8,933.0,1.2,14.4,343992.4,18541.0,10092.0,8449.0


In [12]:
# Target as a single-column 2-D array built from "Violent Crimes Sum"
# (double brackets select a one-column frame, giving shape (n_rows, 1)).
y = crime[["Violent Crimes Sum"]].to_numpy()
y[:5]

array([[ 148.],
       [ 168.],
       [1271.],
       [1810.],
       [ 299.]])

In [13]:
# Hold out 25% of the rows for testing. The fixed random_state keeps the same
# rows in the train and test sets on every run, so results can be reproduced
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [14]:
# Create the StandardScaler instance (it is fitted in a separate step below)
scaler = StandardScaler()

In [15]:
# Fit the Standard Scaler with the training data only (X_test is not passed to
# fit). X_scaler is bound to whatever fit() returns and is the object used to
# transform both splits afterwards.
X_scaler = scaler.fit(X_train)

In [16]:
# Scale both the training and the testing features with the scaler that was
# fitted on the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Create the random forest classifier instance.
# random_state pins the forest's internal randomness so repeated runs give the
# same model; n_jobs=-1 builds the 10,000 trees in parallel on all cores.
# NOTE(review): the target "Violent Crimes Sum" holds counts (148.0, 168.0,
# 1271.0, ...), so a classifier treats every distinct count as its own class.
# In the recorded run this produced a 236x236 confusion matrix with per-class
# support of 0 or 1 and an accuracy of 0.0. RandomForestRegressor with
# regression metrics is most likely what is wanted -- confirm, then switch the
# import, this cell and the evaluation cells together.
rf_model = RandomForestClassifier(n_estimators=10000, random_state=42, n_jobs=-1)

In [24]:
# Fit the model on the scaled training features. "y_train" comes from a target
# that was built as a column vector, so .ravel() flattens it to 1-D before it
# is passed to fit(). rf_model is rebound to fit()'s return value.
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

In [25]:
# Making predictions using the scaled testing data
predictions = rf_model.predict(X_test_scaled)

In [26]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# One positional "Class <i>" name per matrix row, reused for the columns.
# These are row/column positions, not the actual target values.
labels = [f"Class {i}" for i in range(cm.shape[0])]

# Wrap the raw matrix in a labelled frame for display
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)

In [27]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
# zero_division=0 still reports 0.00 for classes that have no true or no
# predicted samples, but without the repeated `_warn_prf` warnings that the
# default setting prints for each of them.
print(classification_report(y_test, predictions, zero_division=0))

Confusion Matrix


Unnamed: 0,Class 0,Class 1,Class 2,Class 3,Class 4,Class 5,Class 6,Class 7,Class 8,Class 9,...,Class 226,Class 227,Class 228,Class 229,Class 230,Class 231,Class 232,Class 233,Class 234,Class 235
Class 0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Class 231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Class 232,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Class 234,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Accuracy Score : 0.0
Classification Report
              precision    recall  f1-score   support

        39.0       0.00      0.00      0.00       1.0
        41.0       0.00      0.00      0.00       1.0
        43.0       0.00      0.00      0.00       0.0
        49.0       0.00      0.00      0.00       1.0
        52.0       0.00      0.00      0.00       1.0
        65.0       0.00      0.00      0.00       0.0
        68.0       0.00      0.00      0.00       1.0
        70.0       0.00      0.00      0.00       1.0
        72.0       0.00      0.00      0.00       0.0
        82.0       0.00      0.00      0.00       0.0
        85.0       0.00      0.00      0.00       0.0
        86.0       0.00      0.00      0.00       1.0
       100.0       0.00      0.00      0.00       1.0
       102.0       0.00      0.00      0.00       1.0
       105.0       0.00      0.00      0.00       1.0
       109.0       0.00      0.00      0.00       0.0
       110.0       0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
