In [1]:
#imports only
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RandomizedSearchCV
from scipy import stats
import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.discriminant_analysis as skl_da
import sklearn.neighbors as skl_nb
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
import seaborn as sns
from pandas.plotting import scatter_matrix


In [2]:
file_path = 'siren_data_train.csv'
data = pd.read_csv(file_path)

#Split the binary columns into zeroes and ones and use that going forwards
#data = pd.get_dummies(data, columns=["building","noise", "in_vehicle", "asleep","no_windows"])
data_copy = data.copy()

data.head()

Unnamed: 0,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,noise,in_vehicle,asleep,no_windows,age
0,2712,1998301.0,9011692.0,-171.588672,1,0,1999193.0,9011824,0,0,0,0,59
1,2721,1928907.0,8954624.0,-51.208102,1,0,1928298.0,8955382,0,0,0,0,29
2,297,2026384.0,8256164.0,39.018754,1,0,2025706.0,8255615,0,0,0,0,32
3,739,1743184.0,8052652.0,15.046022,1,0,1742935.0,8052585,0,0,0,0,36
4,1852,1350375.0,7909850.0,144.60317,1,0,1350807.0,7909543,0,0,0,0,55


In [3]:
# create age groups

#new bins based on hearing with age
new_age_bins = [0, 40, 50, 60, 70, float('inf')]
new_age_labels = ['0-39', '40-49', '50-59', '60-69', '70+']

# Create age groups column with new bins
data['age_group'] = pd.cut(data['age'], bins=new_age_bins, labels=new_age_labels, right=False)


#create new age_groups--------
#new bins based on hearing with age
new_age_bins = [0, 30, 60, float('inf')]
new_age_labels = ['Young', 'Middle-aged', 'Elderly']

# Create age groups column with new bins
data['new_age_group'] = pd.cut(data['age'], bins=new_age_bins, labels=new_age_labels, right=False)





In [4]:
#create distance

data['dist'] = np.sqrt((data['xcoor'] - data['near_x'])**2 + (data['ycoor'] - data['near_y'])**2)


In [5]:
# create distance group


# Add distance groups
# Create bins for distance ranging from 0 to 3000 with a step size of 100
step_size = 500
distance_bins = np.arange(0, 3100, step_size)

# Create labels for distance bins
distance_labels = [f'{i}-{i+step_size}' for i in range(0, 3000, step_size)]
# Append a bin edge for values greater than 3000
distance_bins = np.append(distance_bins, np.inf)
distance_labels.append('>3000')

# Assign each distance value to a corresponding bin
data['distance_groups'] = pd.cut(data['dist'], bins=distance_bins, labels=distance_labels, right=False)

In [6]:
dummies = pd.get_dummies(data, columns=["age_group", "new_age_group", "distance_groups"])

keep_these_original_columns_for_plots = data[["distance_groups", "age_group"]]

data = pd.concat([keep_these_original_columns_for_plots, dummies], axis=1)

data_copy = data.copy()

In [7]:
data

Unnamed: 0,distance_groups,age_group,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
0,500-1000,50-59,2712,1.998301e+06,9.011692e+06,-171.588672,1,0,1999193.0,9011824,...,0,1,0,0,1,0,0,0,0,0
1,500-1000,0-39,2721,1.928907e+06,8.954624e+06,-51.208102,1,0,1928298.0,8955382,...,1,0,0,0,1,0,0,0,0,0
2,500-1000,0-39,297,2.026384e+06,8.256164e+06,39.018754,1,0,2025706.0,8255615,...,0,1,0,0,1,0,0,0,0,0
3,0-500,0-39,739,1.743184e+06,8.052652e+06,15.046022,1,0,1742935.0,8052585,...,0,1,0,1,0,0,0,0,0,0
4,500-1000,50-59,1852,1.350375e+06,7.909850e+06,144.603170,1,0,1350807.0,7909543,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5705,0-500,0-39,5,2.008871e+06,8.255775e+06,-176.234663,1,0,2009140.0,8255793,...,1,0,0,1,0,0,0,0,0,0
5706,1000-1500,40-49,4069,1.981871e+06,8.270452e+06,45.691415,1,0,1981119.0,8269682,...,0,1,0,0,0,1,0,0,0,0
5707,500-1000,60-69,2170,1.463760e+06,8.074997e+06,-175.473118,1,0,1464308.0,8075040,...,0,0,1,0,1,0,0,0,0,0
5708,0-500,0-39,1591,1.479843e+06,7.526377e+06,142.958054,1,0,1480125.0,7526164,...,0,1,0,1,0,0,0,0,0,0


In [8]:
#random sampling
def split_train_test(data, test_ratio, random_state): 
    shuffled_indices = np.random.permutation(len(data)) if random_state is None else np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

#stratified sampling, where we create a sample that takes the distribution of age into account
def stratified_sampling(data, test_ratio, important_data_column):
    split = StratifiedShuffleSplit(n_splits=1, test_size = test_ratio, random_state=42)
    for train_index, test_index in split.split(data, important_data_column):
        strat_train_set = data.loc[train_index]
        strat_test_set = data.loc[test_index]
    return strat_train_set, strat_test_set


#create samples
#train, test = stratified_sampling(data, 0.2, data["age"])
train, test = split_train_test(data, 0.2, random_state=42)

#print(len(test_set))
#print(len(train_set))

In [9]:
X_train = train.drop(columns=['heard', "distance_groups", "age_group"]) 
y_train = train['heard']

# Extract features and target for testing set
X_test = test.drop(columns=['heard', "distance_groups", "age_group"])
y_test = test['heard']



In [10]:
X_train

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,3488,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4450,2732,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5582,2857,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,...,0,0,1,0,0,1,0,0,0,0
219,766,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4299,693,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,742,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,...,1,0,0,1,0,0,0,0,0,0
5191,72,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,...,0,0,1,1,0,0,0,0,0,0
5226,536,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5390,3037,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,...,0,1,0,0,0,1,0,0,0,0


# LDA 

In [11]:
import sklearn.discriminant_analysis as skl_da

In [12]:
X_train

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,3488,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4450,2732,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5582,2857,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,...,0,0,1,0,0,1,0,0,0,0
219,766,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4299,693,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,742,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,...,1,0,0,1,0,0,0,0,0,0
5191,72,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,...,0,0,1,1,0,0,0,0,0,0
5226,536,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5390,3037,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [13]:
lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

predict_prob_lda = lda_model.predict_proba(X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999478 0.000522]
 [0.000396 0.999604]
 [0.037137 0.962863]
 [0.944505 0.055495]
 [0.007328 0.992672]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      233   30
1       57  822 

Accuracy: 0.924


In [14]:
X_train

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,3488,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4450,2732,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5582,2857,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,...,0,0,1,0,0,1,0,0,0,0
219,766,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4299,693,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,742,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,...,1,0,0,1,0,0,0,0,0,0
5191,72,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,...,0,0,1,1,0,0,0,0,0,0
5226,536,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5390,3037,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,...,0,1,0,0,0,1,0,0,0,0


#These columns give Accuracy of 91.6
Index(['near_fid', 'near_x', 'near_y', 'near_angle', 'building', 'xcoor',
       'ycoor', 'noise', 'in_vehicle', 'asleep', 'no_windows', 'age', 'dist',
       'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000', 'distance_groups_>3000'],
      dtype='object')

# what happens if I remove "Age" ? 

In [15]:
X_train_wo_age = X_train.copy().drop(columns=["age"], axis=1)
y_train = train['heard']

# Extract features and target for testing set
X_test_wo_age = X_test.copy().drop(columns=["age"], axis=1)
y_test = test['heard']

In [16]:
X_train_wo_age

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,3488,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4450,2732,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5582,2857,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,...,0,0,1,0,0,1,0,0,0,0
219,766,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4299,693,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,742,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,...,1,0,0,1,0,0,0,0,0,0
5191,72,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,...,0,0,1,1,0,0,0,0,0,0
5226,536,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5390,3037,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [17]:
lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(X_train_wo_age, y_train)

In [18]:
predict_prob_lda = lda_model.predict_proba(X_test_wo_age)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999355 0.000645]
 [0.000437 0.999563]
 [0.042341 0.957659]
 [0.951142 0.048858]
 [0.008125 0.991875]]


In [19]:
X_test_wo_age

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
1700,2256,1.484834e+06,8.292286e+06,-154.867455,0,1526880.0,8312011,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2495,2726,1.921416e+06,8.952792e+06,-11.453870,1,1921031.0,8952870,0,0,0,...,0,1,0,1,0,0,0,0,0,0
561,204,2.006886e+06,8.248473e+06,99.358200,0,2007140.0,8246933,1,0,0,...,0,1,0,0,0,0,1,0,0,0
5612,3978,2.009112e+06,8.281005e+06,33.102340,0,2007588.0,8280011,1,1,0,...,0,0,1,0,0,0,1,0,0,0
5278,1569,1.470975e+06,7.500691e+06,-126.023459,1,1471361.0,7501222,1,0,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2754,1512,1.445986e+06,7.480849e+06,37.366301,0,1445671.0,7480608,0,0,0,...,0,1,0,1,0,0,0,0,0,0
803,221,1.990484e+06,8.258627e+06,-127.274753,0,1991303.0,8259703,0,0,0,...,0,1,0,0,0,1,0,0,0,0
857,87,2.007504e+06,8.253508e+06,60.149539,1,2007438.0,8253393,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2963,830,1.657722e+06,8.061634e+06,-86.881132,0,1657073.0,8073553,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [20]:
prediction_lda = np.empty(len(X_test_wo_age), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      234   30
1       56  822 

Accuracy: 0.925


In [21]:
#Without age, it drops to 91.4%

# what happens if I remove "distance" ? 

In [22]:
# Set these:

columns_to_remove = ['dist']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test_original = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict
predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.996716 0.003284]
 [0.000611 0.999389]
 [0.042331 0.957669]
 [0.941049 0.058951]
 [0.007595 0.992405]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      235   42
1       55  810 

Accuracy: 0.915


# testing with Sequential Feature selector



In [23]:
from sklearn.feature_selection import SequentialFeatureSelector

In [24]:



estimator = lda_model

# Instantiate SequentialFeatureSelector
selector = SequentialFeatureSelector(estimator,
                                     direction='forward',  # or 'backward'
                                     scoring='accuracy',  # or any other scoring metric
                                     n_jobs=-1)

# Fit the selector to your training data
selector.fit(X_train, y_train)

# Get the selected feature indices
selected_features_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_features_names = X_train.columns[selected_features_indices]

# Print the selected feature names
print("Selected Features:", selected_features_names)



Selected Features: Index(['near_fid', 'near_x', 'near_y', 'near_angle', 'building', 'xcoor',
       'ycoor', 'noise', 'in_vehicle', 'asleep', 'no_windows',
       'new_age_group_Middle-aged', 'distance_groups_2500-3000',
       'distance_groups_>3000'],
      dtype='object')


In [25]:
# Testing with the columns that was recommended by sequential feature selector

In [26]:
X_train.columns

Index(['near_fid', 'near_x', 'near_y', 'near_angle', 'building', 'xcoor',
       'ycoor', 'noise', 'in_vehicle', 'asleep', 'no_windows', 'age', 'dist',
       'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly',
       'distance_groups_0-500', 'distance_groups_500-1000',
       'distance_groups_1000-1500', 'distance_groups_1500-2000',
       'distance_groups_2000-2500', 'distance_groups_2500-3000',
       'distance_groups_>3000'],
      dtype='object')

In [27]:
columns_to_remove = ["age", "dist", 'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000']
X_train_sequential = X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']

# Extract features and target for testing set
X_test_sequential = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

In [28]:
lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(X_train_sequential, y_train)

In [29]:
predict_prob_lda = lda_model.predict_proba(X_test_sequential)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.993911 0.006089]
 [0.00139  0.99861 ]
 [0.025512 0.974488]
 [0.8533   0.1467  ]
 [0.008768 0.991232]]


In [30]:
prediction_lda = np.empty(len(X_test_sequential), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      234   41
1       56  811 

Accuracy: 0.915


# Building my own sequential feature selector


In [31]:

columns = X_train.columns
for column in columns:
    column_to_remove = column
    print("Column that was removed: ", column)
    columns_to_remove = [column]
    X_train_temp = X_train.copy().drop(columns=columns_to_remove, axis=1)
    y_train_temp = train['heard']
    
    # Extract features and target for testing set
    X_test_temp = X_test.copy().drop(columns=columns_to_remove, axis=1)
    y_test_temp = test['heard']

    lda_model = skl_da.LinearDiscriminantAnalysis()
    lda_model.fit(X_train_temp, y_train_temp)

    predict_prob_lda = lda_model.predict_proba(X_test_temp)

    prediction_lda = np.empty(len(X_test_temp), dtype=object)
    prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
    #print("Five first predictions: ")
    #print(prediction_lda[0:5], "\n")
    
    
    # Confusion matrix
    #print("Confusion matrix: \n")
    #print(pd.crosstab(prediction_lda, y_test_temp), "\n")
    
    #Accuracy
    print(f"Accuracy: {np.mean(prediction_lda == y_test_temp):.3f}")

    #print("The class order in the model: ")
    #print(lda_model.classes_)

    #print("Examples of predicted probabilities for the above classes: ")
    #with np.printoptions(suppress=True, precision=6):
        #print(predict_prob_lda[0:5])



Column that was removed:  near_fid
Accuracy: 0.924
Column that was removed:  near_x
Accuracy: 0.924
Column that was removed:  near_y
Accuracy: 0.924
Column that was removed:  near_angle
Accuracy: 0.922
Column that was removed:  building
Accuracy: 0.924
Column that was removed:  xcoor
Accuracy: 0.924
Column that was removed:  ycoor
Accuracy: 0.924
Column that was removed:  noise
Accuracy: 0.915
Column that was removed:  in_vehicle
Accuracy: 0.910
Column that was removed:  asleep
Accuracy: 0.920
Column that was removed:  no_windows
Accuracy: 0.914
Column that was removed:  age
Accuracy: 0.925
Column that was removed:  dist
Accuracy: 0.915
Column that was removed:  age_group_0-39
Accuracy: 0.924
Column that was removed:  age_group_40-49
Accuracy: 0.924
Column that was removed:  age_group_50-59
Accuracy: 0.924
Column that was removed:  age_group_60-69
Accuracy: 0.924
Column that was removed:  age_group_70+
Accuracy: 0.924
Column that was removed:  new_age_group_Young
Accuracy: 0.924
Column

# What happens if I only have original features + distance? No distance groups and age groups

In [32]:
# Set these:

columns_to_remove = ['new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly','age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999285 0.000715]
 [0.000593 0.999407]
 [0.020705 0.979295]
 [0.871994 0.128006]
 [0.006607 0.993393]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      232   32
1       58  820 

Accuracy: 0.921


# What happens if I remove distance entirely?

In [33]:
# Set these:

columns_to_remove = ['dist',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.996626 0.003374]
 [0.000652 0.999348]
 [0.022697 0.977303]
 [0.900917 0.099083]
 [0.010508 0.989492]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      235   41
1       55  811 

Accuracy: 0.916


# What happens if I only have "dist" and no distance groups? 

In [34]:
# Set these:

columns_to_remove = [
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999467 0.000533]
 [0.000422 0.999578]
 [0.020039 0.979961]
 [0.907187 0.092813]
 [0.010105 0.989895]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      232   29
1       58  823 

Accuracy: 0.924


# What happens if I limit the age groups to 3? And only have regular distance (no distance groups)

In [35]:
X_train

Unnamed: 0,near_fid,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,3488,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4450,2732,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,...,0,1,0,0,0,1,0,0,0,0
5582,2857,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,...,0,0,1,0,0,1,0,0,0,0
219,766,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4299,693,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,742,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,...,1,0,0,1,0,0,0,0,0,0
5191,72,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,...,0,0,1,1,0,0,0,0,0,0
5226,536,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5390,3037,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [36]:
# Set these:

columns_to_remove = ['age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

      near_fid        near_x        near_y  near_angle  building      xcoor  \
2707      3488  1.539340e+06  7.445810e+06  -94.448289         0  1539356.0   
4450      2732  1.920911e+06  8.955261e+06  -26.953477         0  1919838.0   
5582      2857  2.263289e+06  9.309547e+06   94.793487         0  2263376.0   
219        766  1.801783e+06  8.093233e+06 -126.466310         1  1802002.0   
4299       693  1.896830e+06  8.263999e+06  157.136154         0  1898035.0   
...        ...           ...           ...         ...       ...        ...   
3772       742  1.727620e+06  8.054662e+06 -148.946005         0  1727716.0   
5191        72  2.009944e+06  8.251584e+06  164.094575         0  2010188.0   
5226       536  1.966241e+06  8.366161e+06   63.700957         0  1965821.0   
5390      3037  2.416546e+06  9.823889e+06  -52.356561         0  2415633.0   
860        355  1.964276e+06  8.217796e+06  102.085092         1  1964509.0   

        ycoor  noise  in_vehicle  asleep  ...  new_

## only three age groups and no age

In [37]:
# Set these:

columns_to_remove = ['age', 'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

      near_fid        near_x        near_y  near_angle  building      xcoor  \
2707      3488  1.539340e+06  7.445810e+06  -94.448289         0  1539356.0   
4450      2732  1.920911e+06  8.955261e+06  -26.953477         0  1919838.0   
5582      2857  2.263289e+06  9.309547e+06   94.793487         0  2263376.0   
219        766  1.801783e+06  8.093233e+06 -126.466310         1  1802002.0   
4299       693  1.896830e+06  8.263999e+06  157.136154         0  1898035.0   
...        ...           ...           ...         ...       ...        ...   
3772       742  1.727620e+06  8.054662e+06 -148.946005         0  1727716.0   
5191        72  2.009944e+06  8.251584e+06  164.094575         0  2010188.0   
5226       536  1.966241e+06  8.366161e+06   63.700957         0  1965821.0   
5390      3037  2.416546e+06  9.823889e+06  -52.356561         0  2415633.0   
860        355  1.964276e+06  8.217796e+06  102.085092         1  1964509.0   

        ycoor  noise  in_vehicle  asleep  ...  new_

## Multiple age groups, age and dist

In [38]:
X_train.columns

Index(['near_fid', 'near_x', 'near_y', 'near_angle', 'building', 'xcoor',
       'ycoor', 'noise', 'in_vehicle', 'asleep', 'no_windows', 'age', 'dist',
       'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly',
       'distance_groups_0-500', 'distance_groups_500-1000',
       'distance_groups_1000-1500', 'distance_groups_1500-2000',
       'distance_groups_2000-2500', 'distance_groups_2500-3000',
       'distance_groups_>3000'],
      dtype='object')

In [39]:
# Set these:

columns_to_remove = ['new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

      near_fid        near_x        near_y  near_angle  building      xcoor  \
2707      3488  1.539340e+06  7.445810e+06  -94.448289         0  1539356.0   
4450      2732  1.920911e+06  8.955261e+06  -26.953477         0  1919838.0   
5582      2857  2.263289e+06  9.309547e+06   94.793487         0  2263376.0   
219        766  1.801783e+06  8.093233e+06 -126.466310         1  1802002.0   
4299       693  1.896830e+06  8.263999e+06  157.136154         0  1898035.0   
...        ...           ...           ...         ...       ...        ...   
3772       742  1.727620e+06  8.054662e+06 -148.946005         0  1727716.0   
5191        72  2.009944e+06  8.251584e+06  164.094575         0  2010188.0   
5226       536  1.966241e+06  8.366161e+06   63.700957         0  1965821.0   
5390      3037  2.416546e+06  9.823889e+06  -52.356561         0  2415633.0   
860        355  1.964276e+06  8.217796e+06  102.085092         1  1964509.0   

        ycoor  noise  in_vehicle  asleep  ...  age_

# Remove fid on best performing variant, with no distance groups and multiple age groups, no age

In [40]:
X_train.columns

Index(['near_fid', 'near_x', 'near_y', 'near_angle', 'building', 'xcoor',
       'ycoor', 'noise', 'in_vehicle', 'asleep', 'no_windows', 'age', 'dist',
       'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly',
       'distance_groups_0-500', 'distance_groups_500-1000',
       'distance_groups_1000-1500', 'distance_groups_1500-2000',
       'distance_groups_2000-2500', 'distance_groups_2500-3000',
       'distance_groups_>3000'],
      dtype='object')

In [41]:
# Set these:

columns_to_remove = ['near_fid', 'age']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999338 0.000662]
 [0.000441 0.999559]
 [0.041048 0.958952]
 [0.954552 0.045448]
 [0.008155 0.991845]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      234   30
1       56  822 

Accuracy: 0.925


# Remove fid and coordinate columns on best performing variant, with no distance groups and multiple age groups

In [42]:
# Set these:

columns_to_remove = [ 'near_fid', 'ycoor', 'age']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999383 0.000617]
 [0.000442 0.999558]
 [0.041431 0.958569]
 [0.95452  0.04548 ]
 [0.008001 0.991999]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      234   30
1       56  822 

Accuracy: 0.925


# new features

In [43]:
corr_matrix = data.corr()
corr_matrix["heard"].sort_values(ascending=False)

heard                        1.000000
distance_groups_0-500        0.317595
distance_groups_500-1000     0.208581
age_group_0-39               0.197279
new_age_group_Young          0.156237
new_age_group_Middle-aged    0.112645
distance_groups_1000-1500    0.083388
building                     0.063283
age_group_40-49              0.063257
distance_groups_1500-2000    0.028361
near_angle                   0.006788
distance_groups_2000-2500    0.004256
xcoor                        0.002301
near_x                      -0.003943
age_group_50-59             -0.004487
distance_groups_2500-3000   -0.015979
age_group_60-69             -0.060252
near_fid                    -0.120875
near_y                      -0.121353
ycoor                       -0.121918
asleep                      -0.139777
age_group_70+               -0.233675
new_age_group_Elderly       -0.239107
no_windows                  -0.239178
age                         -0.288908
noise                       -0.400162
in_vehicle  

In [44]:
# Assuming 'data' is your DataFrame containing the 'noise' and 'in_vehicle' features
# Assuming 'data' is your DataFrame containing the 'noise' and 'in_vehicle' features
data['noise_in_vehicle'] = (data['noise'] & data['in_vehicle']).apply(lambda x: 1 if x else 0)
data['noise_not_in_vehicle'] = (data['noise'] & ~data['in_vehicle']).apply(lambda x: 1 if x else 0)
data['not_noise_in_vehicle'] = (~data['noise'] & data['in_vehicle']).apply(lambda x: 1 if x else 0)
data['not_noise_not_in_vehicle'] = (~data['noise'] & ~data['in_vehicle']).apply(lambda x: 1 if x else 0)


data['in_vehicle_noise'] = data['noise'] + data['in_vehicle']  # Addition


In [45]:
X_train = train.drop(columns=['heard', "distance_groups", "age_group"]) 
y_train = train['heard']

# Extract features and target for testing set
X_test = test.drop(columns=['heard', "distance_groups", "age_group"])
y_test = test['heard']

In [46]:
# correlation between noise and in_vehicle, likely noise is from vehicle or music in vehicle

In [47]:
data

Unnamed: 0,distance_groups,age_group,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,...,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000,noise_in_vehicle,noise_not_in_vehicle,not_noise_in_vehicle,not_noise_not_in_vehicle,in_vehicle_noise
0,500-1000,50-59,2712,1.998301e+06,9.011692e+06,-171.588672,1,0,1999193.0,9011824,...,0,0,0,0,0,0,0,0,-1,0
1,500-1000,0-39,2721,1.928907e+06,8.954624e+06,-51.208102,1,0,1928298.0,8955382,...,0,0,0,0,0,0,0,0,-1,0
2,500-1000,0-39,297,2.026384e+06,8.256164e+06,39.018754,1,0,2025706.0,8255615,...,0,0,0,0,0,0,0,0,-1,0
3,0-500,0-39,739,1.743184e+06,8.052652e+06,15.046022,1,0,1742935.0,8052585,...,0,0,0,0,0,0,0,0,-1,0
4,500-1000,50-59,1852,1.350375e+06,7.909850e+06,144.603170,1,0,1350807.0,7909543,...,0,0,0,0,0,0,0,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5705,0-500,0-39,5,2.008871e+06,8.255775e+06,-176.234663,1,0,2009140.0,8255793,...,0,0,0,0,0,0,0,0,-1,0
5706,1000-1500,40-49,4069,1.981871e+06,8.270452e+06,45.691415,1,0,1981119.0,8269682,...,1,0,0,0,0,0,0,0,-1,0
5707,500-1000,60-69,2170,1.463760e+06,8.074997e+06,-175.473118,1,0,1464308.0,8075040,...,0,0,0,0,0,0,0,0,-1,0
5708,0-500,0-39,1591,1.479843e+06,7.526377e+06,142.958054,1,0,1480125.0,7526164,...,0,0,0,0,0,0,0,0,-1,0


In [48]:
corr_matrix = data.corr()
corr_matrix["heard"].sort_values(ascending=False)

heard                        1.000000
not_noise_not_in_vehicle     0.501489
distance_groups_0-500        0.317595
distance_groups_500-1000     0.208581
age_group_0-39               0.197279
new_age_group_Young          0.156237
new_age_group_Middle-aged    0.112645
distance_groups_1000-1500    0.083388
building                     0.063283
age_group_40-49              0.063257
distance_groups_1500-2000    0.028361
near_angle                   0.006788
distance_groups_2000-2500    0.004256
xcoor                        0.002301
near_x                      -0.003943
age_group_50-59             -0.004487
distance_groups_2500-3000   -0.015979
age_group_60-69             -0.060252
near_fid                    -0.120875
near_y                      -0.121353
ycoor                       -0.121918
asleep                      -0.139777
age_group_70+               -0.233675
new_age_group_Elderly       -0.239107
no_windows                  -0.239178
not_noise_in_vehicle        -0.254710
noise_not_in

In [49]:
#the new correlation matrix seem to have given effect

In [50]:
# Set these:

columns_to_remove = [ 'near_fid', 'ycoor', 'age', 'noise', 'in_vehicle']


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.997923 0.002077]
 [0.001952 0.998048]
 [0.011789 0.988211]
 [0.054077 0.945923]
 [0.001814 0.998186]]
Five first predictions: 
[0 1 1 1 1] 

Confusion matrix: 

heard    0    1
row_0          
0      225   39
1       65  813 

Accuracy: 0.909


In [51]:
# Set these:

columns_to_remove = ['near_fid', 'age', 'noise', 'in_vehicle', 'in_vehicle_noise' ]


#Let these be
current_X_train= X_train.copy().drop(columns=columns_to_remove, axis=1)
y_train = train['heard']
#print(current_X_train)

current_X_test = X_test.copy().drop(columns=columns_to_remove, axis=1)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

prediction_lda = np.empty(len(current_X_test), dtype=object)
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

KeyError: "['in_vehicle_noise'] not found in axis"