# Import modules

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Process the data

Import data

In [2]:
# Read the training set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='siren_data_train.csv')

Replace the coordinates with a distance in the dataset and drop unnecessary attributes

In [3]:
# Euclidean distance between the (near_x, near_y) point and the
# (xcoor, ycoor) point, computed for every row in one vectorised call.
# Columns are addressed by name so the cell does not depend on the
# column order of the CSV file.
near_xy = data[['near_x', 'near_y']].to_numpy(dtype=float)
coor_xy = data[['xcoor', 'ycoor']].to_numpy(dtype=float)

# Add the new feature, then drop the raw coordinate columns together with
# the other attributes that are not used as predictors.
data = (
    data
    .assign(distance=np.linalg.norm(near_xy - coor_xy, axis=1))
    .drop(columns=['near_fid', 'near_x', 'near_y', 'near_angle', 'xcoor', 'ycoor'])
)

data.head()

Unnamed: 0,heard,building,noise,in_vehicle,asleep,no_windows,age,distance
0,1,0,0,0,0,0,59,901.283517
1,1,0,0,0,0,0,29,972.00626
2,1,0,0,0,0,0,32,872.340924
3,1,0,0,0,0,0,36,257.804449
4,1,0,0,0,0,0,55,529.686791


Clean up outliers

In [4]:
# Only `distance` is screened for outliers. Note that `age` is not binary
# either (see the data.head() output), but it is deliberately left as-is here.
z = np.abs(stats.zscore(data.distance))
threshold = 2

# Translate the outlier mask into index *labels* before dropping:
# DataFrame.drop works on labels, not on row positions, so using positions
# would only be correct while `data` still has its default 0..n-1 index.
outlier_indices = data.index[np.asarray(z > threshold)]

# Remove the outlier rows and any row with missing values; no in-place
# mutation, so re-running the cell always starts from `data`.
data_no_outliers = data.drop(index=outlier_indices).dropna()

print("original shape: ", data.shape)
print("new shape: ", data_no_outliers.shape)

original shape:  (5710, 8)
new shape:  (5462, 8)


# Train the model

Split the data

In [5]:
# Target: the `heard` column. Predictors: every remaining column.
y = data_no_outliers['heard']
X = data_no_outliers.drop(columns=['heard'])

# Hold out 20% of the rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)


Create the model

In [6]:
# Baseline model: logistic regression (liblinear solver) fitted on the
# training split.
logreg = LogisticRegression(solver='liblinear', max_iter=1000).fit(X_train, y_train)

# Predictions on the held-out split and on the training split itself,
# so the two accuracies can be compared.
y_pred = logreg.predict(X_val)
y_hat = logreg.predict(X_train)

print('Accuracy_val:', accuracy_score(y_val, y_pred))
print('Accuracy_train:', accuracy_score(y_train, y_hat))

# 5-fold cross-validated score over the full (outlier-free) dataset.
score = cross_val_score(estimator=logreg, X=X, y=y, cv=5)

print('crossval:', score.mean(), 'standard deviation', score.std())

Accuracy_val: 0.9139981701738334
Accuracy_train: 0.9217212176699473
crossval: 0.9188942957012489 standard deviation 0.0019659767157300698


Tune the model using grid search

In [7]:
# Hyper-parameter grid: regularisation strength C and penalty type.
# Both 'l1' and 'l2' are valid here because `logreg` uses the liblinear solver.
# NOTE(review): the recorded run selected C=1000, the largest value in the
# grid - consider extending the grid upwards to confirm the optimum.
param_grid = {
    'C': [0.1, 1, 5, 10, 1000],
    'penalty': ['l1', 'l2'],
}

# Exhaustive search over the grid with 5-fold cross-validation, using
# `logreg` as the estimator and all available CPU cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# The search is run on the full (outlier-free) dataset X, y.
grid_search.fit(X, y)

best_params = grid_search.best_params_

print("Best parameters: ", best_params)

print("Best score: ", grid_search.best_score_)

Best parameters:  {'C': 1000, 'penalty': 'l1'}
Best score:  0.9236540220986699
