In [None]:


import numpy as np 
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import os
# Enumerate every file found under the Kaggle input directory so the dataset
# paths used when loading the data can be confirmed.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)



# Loading data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb


In [None]:
# Both CSV files live in the same competition folder; build each path from it.
competition_dir = '/kaggle/input/crime-cast-forecasting-crime-categories'
data_train = pd.read_csv(f'{competition_dir}/train.csv')
data_test = pd.read_csv(f'{competition_dir}/test.csv')

 # Data Visualization

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Preview the first five rows of the test data.
data_test.head(5)

**Columns in dataset**

****

In [None]:
# Column names of the training data as a plain Python list.
print(data_train.columns.tolist())



In [None]:
# Column names of the test data as a plain Python list.
print(data_test.columns.tolist())

**Shape of datasets**

In [None]:
# Report (rows, columns) for each split, followed by a blank line.
for label, frame in (
    ("Shape of training data:", data_train),
    ("Shape of test data:", data_test),
):
    print(label, frame.shape, "\n")


**Columns analysis**

In [None]:
# Column dtypes and non-null counts for the training data.
data_train.info()

In [None]:
# Column dtypes and non-null counts for the test data.
data_test.info()

In [None]:
# Summary statistics of the training data.
data_train.describe()

In [None]:
# Count missing values per column in each split before printing them.
train_null_counts = data_train.isna().sum()
test_null_counts = data_test.isna().sum()

print("Null values in training dataset:\n\n", train_null_counts, "\n")
print("Null values in test dataset:\n\n", test_null_counts)

Features having null values are:
* Cross_Street
* Modus_Operandi
* Victim_Sex
* Victim_Descent
* Premise_Description
* Weapon_Used_Code
* Weapon_Description


In [None]:
# Number of training rows per crime category (the prediction target).
data_train['Crime_Category'].value_counts()

* **Property crimes are the most frequent crime category.**

**Data Visualization & Descriptive Statistics**

In [None]:
# Summary statistics of the training data, shown next to the plots below.
data_train.describe()

In [None]:
# Draw the training-data histograms (50 bins each) on a 20x15 figure with a shared title.
data_train.hist(bins=50, figsize=(20, 15))
plt.suptitle('Distribution of Numeric Features', fontsize=16)
plt.show()

In [None]:
# Count, for every numeric column, the rows lying outside the 1.5 * IQR fences.
numeric_columns = data_train.select_dtypes(include=['number']).columns

for col in numeric_columns:
    column_values = data_train[col]
    first_quartile = column_values.quantile(0.25)
    third_quartile = column_values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    below_fence = column_values < first_quartile - fence_width
    above_fence = column_values > third_quartile + fence_width
    outliers = data_train[below_fence | above_fence]
    print(f"{col}: {len(outliers)} outliers")




**Conclusion**
* The Victim_Age contains negative values and is highly imbalanced
* Most frequent Weapon_Used_Code is 400
* There are few entries with latitude and longitude =0
* There are a lot of outliers in Latitude & Weapon_Used_Code
* Features having null values are:
  * Cross_Street
  * Modus_Operandi
  * Victim_Sex
  * Victim_Descent
  * Premise_Description
  * Weapon_Used_Code
  * Weapon_Description



In [None]:
# Number of distinct latitude values in the training data.
data_train['Latitude'].nunique()

# Data Preprocessing & Cleaning 

**Since Weapon_Used_Code has so many outliers and null values, we drop it.**

In [None]:
# Remove the weapon code feature from both splits so their schemas stay aligned.
weapon_code_column = 'Weapon_Used_Code'
data_train = data_train.drop(weapon_code_column, axis=1)
data_test = data_test.drop(weapon_code_column, axis=1)


**Since Cross_Street & Weapon_Description have more than 50% null values, we drop them.**

In [None]:
# Drop both mostly-empty columns from each split in a single call.
sparse_columns = ['Cross_Street', 'Weapon_Description']
data_train = data_train.drop(columns=sparse_columns)
data_test = data_test.drop(columns=sparse_columns)



In [None]:
# Shapes after dropping columns: training first, then test.
for frame in (data_train, data_test):
    print(frame.shape)

**Some rows have Latitude and Longitude equal to 0 even though their Location refers to a place with different coordinates, so we drop those rows.**

In [None]:
# Keep only rows where neither coordinate is exactly 0.
has_zero_coordinate = (data_train['Latitude'] == 0) | (data_train['Longitude'] == 0)
data_train = data_train[~has_zero_coordinate]


**Imputing negative age values with 0**

In [None]:
# A negative age is not a valid value, so floor it at 0 (missing values are left as-is).
# The same rule is applied to the test set: previously only the training set was cleaned,
# so negative test ages reached the scaler untouched.
data_train['Victim_Age'] = data_train['Victim_Age'].clip(lower=0)
data_test['Victim_Age'] = data_test['Victim_Age'].clip(lower=0)

**Removing outliers from latitude**

In [None]:
# Tukey fences for Latitude: anything on or beyond 1.5 * IQR from the quartiles is removed.
latitude_q1 = data_train['Latitude'].quantile(0.25)
latitude_q3 = data_train['Latitude'].quantile(0.75)
latitude_iqr = latitude_q3 - latitude_q1

lower_bound = latitude_q1 - 1.5 * latitude_iqr
upper_bound = latitude_q3 + 1.5 * latitude_iqr

# Strict inequalities on both sides, i.e. the fences themselves are excluded.
inside_fences = data_train['Latitude'].between(lower_bound, upper_bound, inclusive='neither')
data_train = data_train[inside_fences]
data_train.shape

**Handling date columns**


In [None]:
# Parse both date columns of both splits using the 12-hour timestamp layout of the raw files.
date_format = '%m/%d/%Y %I:%M:%S %p'
for frame in (data_train, data_test):
    for date_column in ('Date_Reported', 'Date_Occurred'):
        frame[date_column] = pd.to_datetime(frame[date_column], format=date_format)




**Handling missing values**

In [None]:
# Simple imputation rules:
#   * Modus_Operandi / Premise_Description -> explicit 'Unknown' placeholder
#   * Victim_Sex / Victim_Descent          -> most frequent value
# The most frequent value is learned from the training set only and reused for the test
# set, so both splits are imputed with the same value and no test-set statistic is used.
placeholder_columns = ['Modus_Operandi', 'Premise_Description']
mode_columns = ['Victim_Sex', 'Victim_Descent']

train_modes = {column: data_train[column].mode()[0] for column in mode_columns}

for frame in (data_train, data_test):
    for column in placeholder_columns:
        frame[column] = frame[column].fillna('Unknown')
    for column in mode_columns:
        frame[column] = frame[column].fillna(train_modes[column])

print("Number of missing or null values in training dataset",data_train.isnull().sum().sum())
print("Number of missing or null values in test dataset",data_test.isnull().sum().sum())


# Feature Engineering

**For date-day**

In [None]:
# Derive calendar features for both splits. Columns are added in the same order for each
# frame so the two schemas stay aligned.
for frame in (data_train, data_test):
    occurred = frame['Date_Occurred']
    frame['Day_Occurred'] = occurred.dt.day
    frame['WeekDay_Occurred'] = occurred.dt.weekday
    frame['Month'] = occurred.dt.month
    frame['Day_Rep'] = frame['Date_Reported'].dt.day
    frame['DayOfYear'] = occurred.dt.dayofyear
    # Time_Occurred is divided by 100 and truncated to get the hour
    # (presumably an HHMM-style number - verify against the raw data).
    frame['Hour'] = (frame['Time_Occurred'] / 100).astype('int64')

In [None]:
# Flag rows whose Status is one of the two codes treated as an arrest (1) versus anything else (0).
arrest_codes = ['AA', 'JA']
data_train['IsArrest'] = data_train['Status'].isin(arrest_codes).astype('int64')
data_test['IsArrest'] = data_test['Status'].isin(arrest_codes).astype('int64')

**Gap in days between when the crime occurred and when it was reported**

In [None]:
# Whole days elapsed between the occurrence and the report, for both splits.
for frame in (data_train, data_test):
    reporting_delay = frame['Date_Reported'] - frame['Date_Occurred']
    frame['Gap_days'] = reporting_delay.dt.days

In [None]:
# Count training rows whose Gap_days falls outside the 1.5 * IQR fences.
gap_days = data_train['Gap_days']
first_quartile = gap_days.quantile(0.25)
third_quartile = gap_days.quantile(0.75)
fence_width = 1.5 * (third_quartile - first_quartile)

is_outlier = (gap_days < first_quartile - fence_width) | (gap_days > third_quartile + fence_width)
outliers = data_train[is_outlier]
print(f"Gap_days: {len(outliers)} outliers")


**Dropping redundant columns**

In [None]:
# Raw columns that have been replaced by engineered features or are not used by the models.
redundant_columns = [
    'Date_Reported', 'Date_Occurred', 'Time_Occurred',
    'Location', 'Area_Name', 'Premise_Description', 'Status_Description',
]
data_train = data_train.drop(columns=redundant_columns)
data_test = data_test.drop(columns=redundant_columns)


**Capping outliers in the newly made column**

In [None]:
# Fences for Gap_days computed from the training data.
gap_q1 = data_train['Gap_days'].quantile(0.25)
gap_q3 = data_train['Gap_days'].quantile(0.75)
gap_iqr = gap_q3 - gap_q1

lower_bound = gap_q1 - 1.5 * gap_iqr
upper_bound = gap_q3 + 1.5 * gap_iqr


def cap_to_fences(gap):
    """Return `gap` unchanged if it lies within the fences, otherwise the nearest fence."""
    if gap < lower_bound:
        return lower_bound
    if gap > upper_bound:
        return upper_bound
    return gap


data_train['Gap_days'] = data_train['Gap_days'].apply(cap_to_fences)




In [None]:
# Cap the test-set Gap_days with fences learned from the TRAINING set, not from the test
# set itself. Deriving the fences from test quartiles clipped the two splits to different
# ranges and used test-set statistics in preprocessing.
# NOTE: the training column has normally been capped already by the previous cell; capping
# at the 1.5 * IQR fences is expected to leave its quartiles (and so these fences) unchanged.
Q1 = data_train['Gap_days'].quantile(0.25)
Q3 = data_train['Gap_days'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Values below/above the fences are replaced by the fence; missing values stay missing.
data_test['Gap_days'] = data_test['Gap_days'].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Numeric columns remaining after feature engineering (candidates for scaling).
numeric_columns = data_train.select_dtypes(include='number').columns
print(numeric_columns)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Numeric columns to rescale.
numerical_features = [
    'Latitude', 'Longitude', 'Area_ID', 'Reporting_District_no', 'Part 1-2',
    'Victim_Age', 'Premise_Code', 'Day_Occurred', 'WeekDay_Occurred',
    'Month', 'Day_Rep', 'DayOfYear', 'Hour', 'IsArrest', 'Gap_days',
]

# Learn each column's minimum and maximum from the training data only, then apply that
# same fitted transformation to both splits.
scaler = MinMaxScaler().fit(data_train[numerical_features])
data_train[numerical_features] = scaler.transform(data_train[numerical_features])
data_test[numerical_features] = scaler.transform(data_test[numerical_features])

# Scaling replaces values in place, so the shapes should be unchanged.
for frame in (data_train, data_test):
    print(frame.shape)


**Modus Operandi**

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer


# Modus_Operandi holds whitespace-separated codes; turn each row into a list of codes.
train_codes = data_train['Modus_Operandi'].fillna('').str.split()
test_codes = data_test['Modus_Operandi'].fillna('').str.split()

# Multi-hot encode the code lists. The vocabulary is learned from the training rows and
# reused for the test rows.
mlb = MultiLabelBinarizer()
train_modus_encoded = mlb.fit_transform(train_codes)
test_modus_encoded = mlb.transform(test_codes)

train_modus_df = pd.DataFrame(train_modus_encoded, columns=mlb.classes_)
test_modus_df = pd.DataFrame(test_modus_encoded, columns=mlb.classes_)

# Replace the raw column with its indicator columns. The index is reset first so the rows
# line up positionally with the freshly built indicator frames.
data_train = pd.concat(
    [data_train.drop(columns='Modus_Operandi').reset_index(drop=True), train_modus_df],
    axis=1,
)
data_test = pd.concat(
    [data_test.drop(columns='Modus_Operandi').reset_index(drop=True), test_modus_df],
    axis=1,
)

print(data_train.shape)


**One-Hot Encoding on categorical features**

In [None]:

categorical_features = ['Victim_Sex', 'Victim_Descent', 'Status']

# One-hot encode each split independently ...
train_dummies = pd.get_dummies(data_train, columns=categorical_features)
test_dummies = pd.get_dummies(data_test, columns=categorical_features)

# ... then force the test frame onto the training frame's columns: columns missing from
# test are created and filled with 0, columns present only in test are discarded.
data_train_encoded, data_test_encoded = train_dummies.align(
    test_dummies, join='left', axis=1, fill_value=0
)

print("Shape of the training data:", data_train_encoded.shape)
print("Shape of the test data:", data_test_encoded.shape)

data_train_encoded = data_train_encoded.drop(columns=categorical_features, errors='ignore')
# The alignment above also gave the test frame a Crime_Category column; remove it again.
data_test_encoded = (
    data_test_encoded
    .drop(columns=categorical_features, errors='ignore')
    .drop(columns=['Crime_Category'])
)
data_test_encoded.head()

**Target feature**

In [None]:
# Class distribution of the target after cleaning and encoding.
data_train_encoded['Crime_Category'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer code mapping from the training target, then encode it.
label_encoder = LabelEncoder().fit(data_train_encoded['Crime_Category'])
data_train_encoded['Crime_Category_encoded'] = label_encoder.transform(
    data_train_encoded['Crime_Category']
)

# Show which integer code was assigned to each category name.
class_names = label_encoder.classes_
category_mapping = dict(zip(class_names, label_encoder.transform(class_names)))

print(category_mapping)


In [None]:
# Sanity check before modelling: final shapes and remaining missing values.
print(data_train_encoded.shape)
print(data_test_encoded.shape)

# Print the totals explicitly: as bare expressions in the middle of a cell the null counts
# were computed but never displayed, and the trailing `data_test_encoded.drop` was only an
# attribute reference (never called) that became the cell's output.
print("Number of missing or null values in encoded training dataset", data_train_encoded.isnull().sum().sum())
print("Number of missing or null values in encoded test dataset", data_test_encoded.isnull().sum().sum())

# Applying Models 

In [None]:
# Separate the features from the target (both its text and its encoded form), then hold
# out 20% of the rows for evaluation with a fixed seed.
target_columns = ['Crime_Category', 'Crime_Category_encoded']
y = data_train_encoded['Crime_Category_encoded']
X = data_train_encoded.drop(columns=target_columns)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Standardize using statistics from the training split only; the held-out split is
# transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


**Logistic Regression(baseline model)**

In [None]:
# NOTE: this standalone Logistic Regression baseline is disabled (every line is commented
# out) and is kept for reference only; nothing in this cell runs.
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, accuracy_score



# # Initialize and train the Logistic Regression model with adjusted parameters
# model = LogisticRegression(solver='saga', C=0.1, tol=1e-4, max_iter=1000, random_state=42, n_jobs=-1)
# model.fit(X_train, y_train)

# # Predict on the validation set
# y_val_pred = model.predict(X_test)

# # Evaluate the model
# print("Validation Accuracy:", accuracy_score(y_test, y_val_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_val_pred))



# y_test_pred = model.predict(data_test_encoded)



In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate models, each paired with the hyperparameter values to search over.
# Seeded estimators use random_state=42 for reproducibility.

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt_params = dict(
    criterion=['gini'],
    max_depth=[None, 10],
    min_samples_split=[2],
)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf_params = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10],
    min_samples_split=[2],
)

# XGBoost
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_params = dict(
    n_estimators=[50, 100],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6],
)

# KNN
knn = KNeighborsClassifier()
knn_params = dict(
    n_neighbors=[5, 7],
    weights=['uniform'],
    p=[2],
)

# Logistic Regression
log_reg = LogisticRegression(solver='saga', random_state=42)
log_reg_params = dict(
    C=[0.1, 1],
    penalty=['l2'],
    max_iter=[100],
)


# **Hyperparameter Tuning**

In [None]:
def run_random_search(estimator, param_distributions, features, target, n_iter=5):
    """Fit a randomized hyperparameter search for one estimator.

    Parameters
    ----------
    estimator : unfitted scikit-learn compatible classifier.
    param_distributions : dict mapping parameter names to lists of candidate values.
    features, target : training features and labels passed to ``fit``.
    n_iter : maximum number of parameter settings to sample.

    Returns
    -------
    The fitted RandomizedSearchCV (3-fold CV, accuracy scoring, random_state=42).
    """
    # Every search space here is a small grid of lists. Requesting more samples than the
    # grid contains only makes scikit-learn warn and evaluate the whole grid, so cap
    # n_iter at the grid size up front.
    grid_size = 1
    for candidates in param_distributions.values():
        grid_size *= len(candidates)

    search = RandomizedSearchCV(
        estimator,
        param_distributions,
        n_iter=min(n_iter, grid_size),
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
    )
    search.fit(features, target)
    return search


# Tree-based models are tuned on the unstandardized features.
dt_random = run_random_search(dt, dt_params, X_train, y_train)
dt_best = dt_random.best_estimator_

rf_random = run_random_search(rf, rf_params, X_train, y_train)
rf_best = rf_random.best_estimator_

xgb_random = run_random_search(xgb_model, xgb_params, X_train, y_train)
xgb_best = xgb_random.best_estimator_

# KNN and Logistic Regression are tuned on the standardized features.
knn_random = run_random_search(knn, knn_params, X_train_scaled, y_train)
knn_best = knn_random.best_estimator_

log_reg_random = run_random_search(log_reg, log_reg_params, X_train_scaled, y_train)
log_reg_best = log_reg_random.best_estimator_


# **Comparing Models**

In [None]:
# Predict once per model on the held-out split. KNN and Logistic Regression were fit on
# standardized features, so they receive the standardized hold-out set.
dt_pred = dt_best.predict(X_test)
rf_pred = rf_best.predict(X_test)
xgb_pred = xgb_best.predict(X_test)
knn_pred = knn_best.predict(X_test_scaled)
log_reg_pred = log_reg_best.predict(X_test_scaled)

models = {
    'Decision Tree': dt_best,
    'Random Forest': rf_best,
    'XGBoost': xgb_best,
    'KNN': knn_best,
    'Logistic Regression': log_reg_best
}

# Reuse the predictions computed above instead of predicting a second time inside the loop.
predictions = {
    'Decision Tree': dt_pred,
    'Random Forest': rf_pred,
    'XGBoost': xgb_pred,
    'KNN': knn_pred,
    'Logistic Regression': log_reg_pred
}

# Accuracy and Classification Report
for name, preds in predictions.items():
    print(f"{name} - Accuracy: {accuracy_score(y_test, preds)}")
    print(f"{name} - Classification Report:\n{classification_report(y_test, preds)}\n")


# **Submission**


In [None]:
# Model used for the final predictions.
best_model = xgb_best

y_test_pred = best_model.predict(data_test_encoded)

# Decode the predicted integer codes with the encoder that produced the training labels.
# A hand-written code -> name table can silently go out of sync with the encoder (for
# example if a category is missing after cleaning); the fitted encoder cannot.
inverse_mapping = dict(enumerate(label_encoder.classes_))
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

submission = pd.DataFrame({
    # IDs are the row labels of the encoded test frame shifted by one.
    'ID': data_test_encoded.index+1,
    'Crime_Category': y_test_pred_labels
})

submission.to_csv('submission.csv', index=False)
print("Submission file created: 'submission.csv'")
submission.head()