In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/personal_security_network/extended_synthetic_data_diverse.csv')

# Handle missing values, if any, by carrying the previous row's value forward.
# DataFrame.ffill() is the supported spelling: fillna(method='ffill') is
# deprecated in pandas >= 2.1 and emits a FutureWarning. Re-assigning instead of
# using inplace=True gives the same frame and keeps the cell safe to re-run.
data = data.ffill()

  data.fillna(method='ffill', inplace=True)


In [3]:
# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,user_id,latitude,longitude,crime_reported,time_of_day,area_type,safety_label,crime_type,user_age,user_gender,distance_to_police,weather_condition,day_of_week,special_event_nearby,historical_crime_rate,cctv_coverage
0,user_1,26.725088,78.457332,no,morning,urban,safe,none,51,male,4.717173,rainy,Wednesday,no,0.340278,yes
1,user_2,10.207755,74.523902,no,afternoon,urban,unsafe,none,28,male,1.163624,rainy,Wednesday,no,0.602282,yes
2,user_3,29.288297,88.970762,yes,afternoon,urban,safe,vandalism,47,male,12.563149,foggy,Monday,no,0.281701,no
3,user_4,35.072736,93.704798,no,morning,suburban,unsafe,none,53,male,14.224204,foggy,Friday,no,0.201563,yes
4,user_5,8.767616,95.38528,yes,morning,suburban,unsafe,vandalism,44,female,10.959109,sunny,Monday,no,0.631122,yes


In [4]:
# Label encoding for categorical features.
# Each column gets its OWN fitted encoder, stored in `label_encoders`, so the
# string -> integer mapping of every feature survives this cell. Re-fitting one
# shared encoder per column (the previous approach) discards all mappings but
# the last, which makes it impossible to encode new raw inputs consistently or
# to inverse_transform a column later.
categorical_cols = ['crime_reported', 'time_of_day', 'area_type', 'crime_type',
                    'user_gender', 'weather_condition', 'day_of_week',
                    'special_event_nearby', 'cctv_coverage']

label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder  # keep the fitted mapping for this column

# Kept under the original name: a fresh encoder that is fitted on the target
# ('safety_label') further down the notebook.
label_encoder = LabelEncoder()

In [5]:
# Feature matrix: every column except the target 'safety_label'
X = data.drop(columns='safety_label')

# Target vector: integer-encoded 'safety_label' (safe = 0, unsafe = 1)
y = label_encoder.fit_transform(data['safety_label'])

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Columns to standardize: everything in X_train except 'user_id'.
# NOTE: 'user_id' is not among the label-encoded columns, so it is simply left
# out of the model inputs here rather than scaled.
numeric_features = [name for name in X_train.columns if name != 'user_id']

# Learn mean/variance on the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train[numeric_features])
X_train_scaled = scaler.transform(X_train[numeric_features])
X_test_scaled = scaler.transform(X_test[numeric_features])

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Build a 100-tree Random Forest (seeded for reproducibility) and fit it on the
# scaled training split; fit() returns the estimator itself.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class predictions for the held-out test split
y_pred_rf = rf_model.predict(X_test_scaled)



In [9]:
# Fraction of test rows the Random Forest classified correctly
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.6986666666666667


In [10]:
# Per-class precision / recall / F1 for the Random Forest predictions
rf_report = classification_report(y_test, y_pred_rf)
print("Classification Report:\n", rf_report)


Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.99      0.82      2099
           1       0.44      0.01      0.03       901

    accuracy                           0.70      3000
   macro avg       0.57      0.50      0.42      3000
weighted avg       0.62      0.70      0.58      3000



In [11]:
# Rows are true classes, columns are predicted classes
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", rf_confusion)


Confusion Matrix:
 [[2084   15]
 [ 889   12]]


In [12]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score

# Build a binary-logistic XGBoost classifier (seeded) and fit it on the scaled
# training split; fit() returns the estimator itself.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
).fit(X_train_scaled, y_train)

# Class predictions for the held-out test split
y_pred_xgb = xgb_model.predict(X_test_scaled)




In [13]:
# Fraction of test rows XGBoost classified correctly
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", xgb_accuracy)


XGBoost Accuracy: 0.6456666666666667


In [14]:
# Per-class precision / recall / F1 for the XGBoost predictions
xgb_report = classification_report(y_test, y_pred_xgb)
print("Classification Report for XGBoost:\n", xgb_report)

Classification Report for XGBoost:
               precision    recall  f1-score   support

           0       0.70      0.87      0.78      2099
           1       0.28      0.11      0.16       901

    accuracy                           0.65      3000
   macro avg       0.49      0.49      0.47      3000
weighted avg       0.57      0.65      0.59      3000



In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Randomized search: 50 sampled configurations, 3-fold CV each, all CPU cores
rf_random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_search.fit(X_train_scaled, y_train)

# Take the best estimator found by the search and score it on the test split
best_rf = rf_random_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_scaled)
tuned_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)

print("Best Parameters for Random Forest:", rf_random_search.best_params_)
print("Accuracy after tuning:", tuned_rf_accuracy)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 10, 'bootstrap': True}
Accuracy after tuning: 0.6993333333333334


In [16]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for XGBoost (2 x 2 x 2 x 2 = 16 combinations)
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[6, 10],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1.0],
)

# Exhaustive grid search with 3-fold CV, using all CPU cores
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
xgb_grid_search.fit(X_train_scaled, y_train)

# Take the best estimator found by the search and score it on the test split
best_xgb = xgb_grid_search.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test_scaled)
tuned_xgb_accuracy = accuracy_score(y_test, y_pred_best_xgb)

print("Best Parameters for XGBoost:", xgb_grid_search.best_params_)
print("Accuracy after tuning:", tuned_xgb_accuracy)


Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Accuracy after tuning: 0.6996666666666667


In [17]:
import joblib

# Artifact locations (relative to the current working directory)
rf_model_path = 'random_forest_model.pkl'
scaler_path = 'scaler.pkl'

# Persist the tuned Random Forest and the fitted scaler for later use in the API
joblib.dump(best_rf, rf_model_path)
joblib.dump(scaler, scaler_path)

# Round-trip: reload both artifacts from disk
model = joblib.load(rf_model_path)
scaler = joblib.load(scaler_path)




In [18]:
# Example: score one new observation. The values must already be integer-encoded
# the same way as the training data and listed in the training column order.
new_data = [
    [20.0, 77.0, 1, 2, 0, 3, 35, 1, 3.0, 1, 5, 0, 0.2, 1]  # Features example
]

# Wrap the sample in a DataFrame carrying the column names the scaler was fitted
# with. This avoids sklearn's "X does not have valid feature names" warning and
# fails loudly (ValueError) if the sample has the wrong number of features.
# Falls back to unnamed columns when the scaler exposes no feature names.
feature_names = getattr(scaler, 'feature_names_in_', None)
new_frame = pd.DataFrame(new_data, columns=feature_names)

# Apply the same standardization that was used for training
scaled_data = scaler.transform(new_frame)

# Make the prediction; class 0 is 'safe' and class 1 is 'unsafe' per the target encoding
prediction = model.predict(scaled_data)
print('Prediction:', 'safe' if prediction[0] == 0 else 'unsafe')

Prediction: safe


