<a href="https://colab.research.google.com/github/gulsahus/MyFirstRepo/blob/main/Random_Forest_%26_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Data Loading and Initial Exploration**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer, LabelEncoder
from xgboost import XGBClassifier


## Load Data

In [None]:
# Read the pickled DataFrame from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere (e.g. on Colab) until it is pointed at the local copy of the file.
# NOTE(review): unpickling executes code embedded in the file - only load pickles
# from a trusted source.
file_path = '/Users/gulsah/Desktop/df8.pkl'
df = pd.read_pickle(file_path)

# Preview the first rows as a quick sanity check of the load
preview = df.head()
print(preview)

## Inspect Data

In [None]:
# Report the dimensions of the loaded DataFrame
rows = len(df.index)
columns = len(df.columns)
print(f'The DataFrame has {rows} rows and {columns} columns.')

## Filter Data
Restrict the DataFrame to the years 2022, 2023, and 2024 to keep the models' running time manageable.


In [None]:
# Keep only the rows whose calendar year is one of the selected years.
# (The same analysis was repeated for the 2014-2018 and 2019-2024 time frames.)
desired_years = [2024, 2023, 2022]
in_desired_years = df['CalYear'].isin(desired_years)
df = df.loc[in_desired_years]

# **Data Preprocessing**

## Define Columns for Transformations

In [None]:
# Column groups, one list per kind of transformation.

# Column earmarked for min-max scaling
min_max_cols = [
    "NumCalls",
]

# Columns earmarked for standard scaling (coerced to numeric further down)
std_scale_cols = [
    "proj_Air_Distance_rounded",
    "proj_PastIncidentsCount",
    'proj_CityCenter',
    'proj_PeakHours',
]

# Categorical columns that get one-hot encoded
one_hot_cols = [
    "proj_CalMonth",
    "IncidentGroup",
    "StopCodeDescription",
    "PropertyCategory",
    "IncGeo_BoroughName",
    "proj_CalWeekday",
    "proj_WindDescription",
    "SpecialServiceType",
]

# Periodic columns that get a sine/cosine encoding
cyclical_cols = [
    'proj_Day_of_Year',
    'HourOfCall',
]

## Apply One-Hot Encoding
Encode the categorical variables using one-hot encoding and concatenate the results to the DataFrame.

In [None]:
# Apply One-Hot Encoding to the categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Build the encoded frame on df's own index. df was row-filtered by year above, so
# its index is no longer guaranteed to be 0..n-1; a default RangeIndex here would make
# the column-wise concat below align on mismatched labels, pairing encoded values
# with the wrong rows and adding NaN-filled rows.
encoded_df = pd.DataFrame(
    encoder.fit_transform(df[one_hot_cols]),
    columns=encoder.get_feature_names_out(one_hot_cols),
    index=df.index,
)

# Drop original one-hot encoded columns and concatenate encoded columns
df = df.drop(one_hot_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)

## Cyclical Transformations
Apply sine and cosine transformations to time-based features to capture cyclical patterns.

In [None]:
# Encode the periodic features as sine/cosine pairs.
# The day of year is scaled by a 366-day period and the hour of call by a 24-hour period.
day_angle = 2 * np.pi * df['proj_Day_of_Year'] / 366
hour_angle = 2 * np.pi * df['HourOfCall'] / 24

df['day_of_year_sin'] = np.sin(day_angle)
df['day_of_year_cos'] = np.cos(day_angle)
df['hour_cos'] = np.cos(hour_angle)
df['hour_sin'] = np.sin(hour_angle)


## Convert Columns to Numeric

In [None]:
# Coerce the standard-scale columns to numeric dtype; values that cannot be parsed become NaN
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce')
    for name in std_scale_cols
})


## Select Final Columns
Identify the final set of columns to be used for analysis.

In [None]:
# Assemble the modelling columns: scaled numeric columns, the one-hot columns,
# the raw travel time, and the sine/cosine encodings.
travel_time_and_cyclical = ['TravelTimeSeconds', 'day_of_year_sin', 'day_of_year_cos', 'hour_cos', 'hour_sin']
final_columns = [
    *min_max_cols,
    *std_scale_cols,
    *encoded_df.columns,
    *travel_time_and_cyclical,
]
df_final = df.loc[:, final_columns]


## Handle Missing Values

In [None]:
# Handle missing values after all transformations.
# TravelTimeSeconds is the quantity the classification target is derived from, so a
# row without it has no label. Mean-imputing it would fabricate labels; drop those
# rows instead and impute only the remaining (feature) gaps.
df_labeled = df_final.dropna(subset=['TravelTimeSeconds'])

# Fill remaining NaNs with the column mean. The result is rebuilt as a DataFrame
# with the original column names and a fresh 0..n-1 index.
imputer = SimpleImputer(strategy='mean')
df_final_imputed = pd.DataFrame(imputer.fit_transform(df_labeled), columns=df_labeled.columns)


# **Feature Engineering**


## Convert Travel Time
Convert travel time from seconds to minutes.

In [None]:
# Express the travel time in minutes (seconds divided by 60)
seconds_per_minute = 60
df_final_imputed['TravelTimeMinutes'] = df_final_imputed['TravelTimeSeconds'] / seconds_per_minute

## Visualize Distributions
Plot histograms and box plots to visualize the distribution of travel times.

In [None]:
# Histogram (with KDE overlay) of travel time in minutes
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_final_imputed['TravelTimeMinutes'], bins=50, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of TravelTimeMinutes')
hist_ax.set_xlabel('TravelTimeMinutes')
hist_ax.set_ylabel('Frequency')
plt.show()

# Box plot of the same variable to expose spread and outliers
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_final_imputed['TravelTimeMinutes'], ax=box_ax)
box_ax.set_title('Box Plot of TravelTimeMinutes')
box_ax.set_xlabel('TravelTimeMinutes')
plt.show()


## Check for Repeated Values





In [None]:
# Count how often each travel-time value occurs, most frequent first
value_frequencies = df_final_imputed['TravelTimeMinutes'].value_counts()
travel_time_counts = value_frequencies.sort_values(ascending=False)

# Show the ten most frequent values
print(travel_time_counts.iloc[:10])


## Log Transformation (optional)

In [None]:
# Compress the travel-time scale with log(1 + x)
df_final_imputed['TravelTimeMinutesLog'] = df_final_imputed['TravelTimeMinutes'].pipe(np.log1p)

## Discretize Travel Time

Bin the log-transformed travel time into categories.

In [None]:
k_bins = 8

# Quantile strategy: bin edges are chosen so each bin holds roughly the same number of rows
discretizer = KBinsDiscretizer(n_bins=k_bins, encode='ordinal', strategy='quantile')

# fit_transform returns an (n_samples, 1) float array of bin indices. Flatten it and
# cast to int so the labels read 'Bin1'...'Bin8' rather than 'Bin1.0'...'Bin8.0'.
bin_indices = discretizer.fit_transform(df_final_imputed[['TravelTimeMinutesLog']]).ravel().astype(int)

# Convert the 0-based bin indices to 1-based string labels
df_final_imputed['TravelTimeCategoryLog'] = [f'Bin{index + 1}' for index in bin_indices]


## Visualize Binned Data
Display summary counts of the binned categories and print the bin edges.


In [None]:
# How many rows landed in each travel-time category
bin_counts_log = df_final_imputed['TravelTimeCategoryLog'].value_counts()
print("Summary of TravelTimeCategoryLog bin counts:")
print(bin_counts_log)

# List each bin's lower and upper edge. The discretizer was fitted on
# TravelTimeMinutesLog, so the edges are on that log-transformed scale.
bin_edges_log = discretizer.bin_edges_[0]
edge_pairs = zip(bin_edges_log[:-1], bin_edges_log[1:])
for bin_number, (lower_edge, upper_edge) in enumerate(edge_pairs, start=1):
    print(f"Bin {bin_number}: {lower_edge} to {upper_edge}")

## Drop Unnecessary Columns
Remove columns that are no longer needed after transformations.

In [None]:
# Remove helper columns that are no longer needed; names that are not present are skipped
columns_to_drop = ['proj_Day_of_Year', 'HourOfCall', 'TravelTimeSeconds', 'TravelTimeMinutes']
df_final_imputed = df_final_imputed.drop(columns=columns_to_drop, errors='ignore')


## Final NaN Check
Ensure there are no remaining NaN values in the DataFrame.

In [None]:
# Check for any remaining NaN values in the dataset
nan_counts = df_final_imputed.isnull().sum()
nan_counts_filtered = nan_counts[nan_counts > 0]
if not nan_counts_filtered.empty:
    print("Columns with NaN values after all preprocessing:")
    print(nan_counts_filtered)
    # Handle any remaining NaNs. Mean imputation only accepts numeric input, and the
    # frame now also holds the string label column TravelTimeCategoryLog, so impute
    # the numeric columns only instead of passing the whole frame (which would raise).
    numeric_cols = df_final_imputed.select_dtypes(include='number').columns
    df_final_imputed = df_final_imputed.copy()
    df_final_imputed[numeric_cols] = imputer.fit_transform(df_final_imputed[numeric_cols])
else:
    print("No NaN values found in the DataFrame after all preprocessing.")


# **Model Preparation**

## Prepare Data for Modeling

In [None]:
# Model preparation: separate the features from the class label.
target_col = 'TravelTimeCategoryLog'

# Every travel-time column must stay out of the features. TravelTimeMinutesLog is the
# exact variable the target bins were cut from, and the seconds/minutes columns are
# monotone transforms of it; leaving any of them in X leaks the label to the models.
# errors='ignore' because some of these may already have been dropped upstream.
leakage_cols = ['TravelTimeSeconds', 'TravelTimeMinutes', 'TravelTimeMinutesLog']

X = df_final_imputed.drop(columns=[target_col]).drop(columns=leakage_cols, errors='ignore')
y = df_final_imputed[target_col]


## Train-Test Split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


## Encode Target Variable

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


## Scale Features

In [None]:
# Fit the min-max scaler on the training features only, then apply it to both splits.
# The scaled arrays are wrapped back into DataFrames so the column names are kept.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)


## Feature Selection - Variance Threshold & SelectKBest
Remove low-variance features with a variance threshold, then keep the top-scoring features with SelectKBest (ranked by mutual information with the target).

In [None]:
# Step 1: drop features whose variance (on the scaled training data) is below 0.05
threshold = 0.05
sel_variance = VarianceThreshold(threshold=threshold)
sel_variance.fit(X_train_scaled)
variance_mask = sel_variance.get_support()
X_train_filtered = X_train_scaled.loc[:, variance_mask]
X_test_filtered = X_test_scaled.loc[:, variance_mask]


def _seeded_mutual_info(features, target):
    """Score features against a class-label target with a fixed seed for repeatable rankings."""
    return mutual_info_classif(features, target, random_state=42)


# Step 2: keep the features with the highest mutual information with the target.
# The target is a class label, so the classification estimator is the matching scorer
# (mutual_info_regression is meant for continuous targets).
# SelectKBest raises if k exceeds the number of available features, so cap k.
k_best = min(20, X_train_filtered.shape[1])
sel_kbest = SelectKBest(score_func=_seeded_mutual_info, k=k_best)
sel_kbest.fit(X_train_filtered, y_train_encoded)
kbest_mask = sel_kbest.get_support()

# Applying the selection to the train and test sets
X_train_selected = X_train_filtered.loc[:, kbest_mask]
X_test_selected = X_test_filtered.loc[:, kbest_mask]

# Retrieve the selected feature names
selected_features_final = X_train_filtered.columns[kbest_mask]


## Visualize Selected Features

In [None]:

# Draw the SelectKBest support mask as a single-row image, one cell per candidate feature
support_row = sel_kbest.get_support().reshape(1, -1)
plt.matshow(support_row, cmap='gray_r')
plt.xlabel('Features Index')
plt.yticks([])
plt.show()

# List the names of the features that were kept
print("Selected features:", list(selected_features_final))

# **Modeling**


## RandomForest Model
Train a RandomForest model on the selected features and evaluate its performance using a classification report.

In [None]:

# Fit a RandomForest on the selected training features (seeded for repeatability)
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_selected, y_train_encoded)

# Predict on the test set and map the integer predictions back to the bin labels
y_pred_rf_encoded = model_rf.predict(X_test_selected)
y_pred_rf = label_encoder.inverse_transform(y_pred_rf_encoded)

# Per-class precision / recall / F1 against the true test labels
rf_report = classification_report(y_test, y_pred_rf)
print("RandomForest Classification Report")
print(rf_report)

## XGBoost Model
Train an XGBoost model on the selected features and evaluate its performance using a classification report.

In [None]:

# Model training and evaluation with XGBoost.
# The use_label_encoder argument is deprecated in recent XGBoost releases and only
# triggers warnings there, so it is no longer passed. The labels fed to fit() are
# already integer-encoded by label_encoder, so no internal label encoding is needed.
model_xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')
model_xgb.fit(X_train_selected, y_train_encoded)
y_pred_xgb_encoded = model_xgb.predict(X_test_selected)

# Decode the predicted labels for XGBoost back to the original bin names
y_pred_xgb = label_encoder.inverse_transform(y_pred_xgb_encoded)

# Classification report for XGBoost
print("XGBoost Classification Report")
print(classification_report(y_test, y_pred_xgb))

# **Feature Importances**

## Random Forest Feature Importance

In [None]:
# Pair each selected feature with the importance the fitted RandomForest assigned to it
rf_importances = model_rf.feature_importances_
rf_feature_names = selected_features_final

# Tabulate and order from most to least important
rf_importances_df = (
    pd.DataFrame({'Feature': rf_feature_names, 'Importance': rf_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the most important feature is on top
rf_fig, rf_ax = plt.subplots(figsize=(10, 6))
rf_ax.barh(rf_importances_df['Feature'], rf_importances_df['Importance'])
rf_ax.set_xlabel('Importance')
rf_ax.set_title('RandomForest Feature Importances')
rf_ax.invert_yaxis()
plt.show()


## XGBoost Feature Importance

In [None]:

# Pair each selected feature with the importance the fitted XGBoost model assigned to it
xgb_importances = model_xgb.feature_importances_
xgb_feature_names = selected_features_final

# Tabulate and order from most to least important
xgb_importances_df = (
    pd.DataFrame({'Feature': xgb_feature_names, 'Importance': xgb_importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart; the y-axis is inverted so the most important feature is on top
xgb_fig, xgb_ax = plt.subplots(figsize=(10, 6))
xgb_ax.barh(xgb_importances_df['Feature'], xgb_importances_df['Importance'])
xgb_ax.set_xlabel('Importance')
xgb_ax.set_title('XGBoost Feature Importances')
xgb_ax.invert_yaxis()
plt.show()
