# Classification/Decision trees

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

import pandas as pd
import joblib
import numpy as np
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from scipy import stats

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


Preprocessing the data

In [7]:
# Load the raw data; the path is relative to the notebook's working directory.
data = pd.read_csv('../Data Analysis/siren_data_train.csv')

# Preprocessing data: discard every row that contains at least one missing value.
data = data.dropna()

# Euclidean distance between the coordinate pair held in positional columns (1, 2)
# and the pair held in positional columns (6, 7), computed for all rows at once
# instead of looping over rows with scalar .iloc lookups.
# NOTE(review): presumably these are (near_x, near_y) and (xcoor, ycoor) -- confirm
# against the CSV header before relying on the positional indices.
point_a = data.iloc[:, [1, 2]].to_numpy(dtype=float)
point_b = data.iloc[:, [6, 7]].to_numpy(dtype=float)
distances = np.linalg.norm(point_a - point_b, axis=1).tolist()

# Positional column 4 is cast to int and used as the "heard" label.
heard = data.iloc[:, 4].astype(int).tolist()

# Two-column working frame with a fresh 0..n-1 index, so the positional row numbers
# returned by np.where below are also valid index labels for DataFrame.drop.
df = pd.DataFrame({'distance': distances, 'heard': heard})

# Removing outliers: a row is flagged when any of its columns lies more than
# `threshold` standard deviations away from that column's mean.
z = np.abs(stats.zscore(df))
threshold = 2
outlier_indices = np.where(z > threshold)[0]
df_no_outliers = df.drop(outlier_indices, axis=0).dropna()

print("original shape: ", df.shape)
print("new shape: ", df_no_outliers.shape)

# NOTE(review): within this cell df_no_outliers is only used for the shape report
# above; `data` below still contains the rows flagged as outliers. Confirm whether
# the filtered rows were meant to be removed from the modelling data as well.

# Attach the distance feature and drop the raw coordinate/identifier columns it replaces.
data = (
    data
    .assign(distance=df['distance'].values)
    .drop(columns=["near_angle", "near_x", "near_y", "xcoor", "ycoor", "near_fid"])
)

original shape:  (5710, 2)
new shape:  (5462, 2)


Training the model on the training data and evaluating it on the validation data

In [13]:
# Define input and output: every column except the label is used as a feature.
X = data.drop(columns=["heard"])
y = data["heard"]

# Create a RandomForestClassifier. The forest is seeded (same value as the split
# below) so that re-running the cell reproduces the reported numbers; without a
# random_state every run grows a different forest.
model = RandomForestClassifier(random_state=42)

# Randomly split the dataset into training (80%) and validation (20%) sets.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier on the training data.
model.fit(X_train, y_train)

# Make predictions on the held-out validation data.
y_pred = model.predict(X_val)

# Perform 5-fold cross-validation on the full dataset (X, y), i.e. including the
# validation rows. cross_val_score fits fresh clones of the estimator, so the
# model fitted above is not modified by this call.
scores = cross_val_score(model, X, y, cv=5)

# Print the mean cross-validated accuracy.
print(f"Mean accuracy: {scores.mean()}")

# Confusion matrix for the validation split (last expression, so it is rendered as the cell output).
pd.crosstab(y_val, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Mean accuracy: 0.9161120840630472


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,227,63,290
1,36,816,852
All,263,879,1142
