In [3]:
#imports only
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RandomizedSearchCV
from scipy import stats
import sklearn.preprocessing as skl_pre
import sklearn.linear_model as skl_lm
import sklearn.discriminant_analysis as skl_da
import sklearn.neighbors as skl_nb
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
import seaborn as sns
from pandas.plotting import scatter_matrix


In [4]:
# Read the siren training set from a CSV located next to the notebook.
file_path = 'siren_data_train.csv'
data = pd.read_csv(file_path)

# Independent copy of the frame exactly as loaded.
data_copy = data.copy()

# Preview the first rows.
data.head()

Unnamed: 0,near_fid,near_x,near_y,near_angle,heard,building,xcoor,ycoor,noise,in_vehicle,asleep,no_windows,age
0,2712,1998301.0,9011692.0,-171.588672,1,0,1999193.0,9011824,0,0,0,0,59
1,2721,1928907.0,8954624.0,-51.208102,1,0,1928298.0,8955382,0,0,0,0,29
2,297,2026384.0,8256164.0,39.018754,1,0,2025706.0,8255615,0,0,0,0,32
3,739,1743184.0,8052652.0,15.046022,1,0,1742935.0,8052585,0,0,0,0,36
4,1852,1350375.0,7909850.0,144.60317,1,0,1350807.0,7909543,0,0,0,0,55


In [5]:
# Bucket `age` at two granularities. Both binnings are left-closed
# (right=False): an age equal to a lower edge belongs to the bin that starts
# at that edge.

# Five-level grouping -> column 'age_group'.
fine_age_bins = [0, 40, 50, 60, 70, float('inf')]
fine_age_labels = ['0-39', '40-49', '50-59', '60-69', '70+']
data['age_group'] = pd.cut(
    data['age'],
    bins=fine_age_bins,
    labels=fine_age_labels,
    right=False,
)

# Three-level grouping -> column 'new_age_group'.
new_age_bins = [0, 30, 60, float('inf')]
new_age_labels = ['Young', 'Middle-aged', 'Elderly']
data['new_age_group'] = pd.cut(
    data['age'],
    bins=new_age_bins,
    labels=new_age_labels,
    right=False,
)





In [6]:
# Euclidean distance between the (xcoor, ycoor) point and the
# (near_x, near_y) point of every row.
delta_x = data['xcoor'] - data['near_x']
delta_y = data['ycoor'] - data['near_y']
data['dist'] = np.sqrt(delta_x**2 + delta_y**2)


In [7]:
# Create distance groups: equal-width bins of `step_size` up to `max_distance`,
# plus one open-ended bin for everything beyond the last finite edge.
step_size = 500
max_distance = 3000

# Finite edges 0, step_size, 2*step_size, ... (the last edge is >= max_distance).
finite_edges = np.arange(0, max_distance + step_size, step_size)

# Labels are derived from the edges themselves, so the number of labels always
# equals the number of bins, whatever step_size is chosen.
distance_labels = [
    f'{int(low)}-{int(high)}'
    for low, high in zip(finite_edges[:-1], finite_edges[1:])
]
distance_labels.append(f'>{int(finite_edges[-1])}')

# Open-ended last bin: [last finite edge, inf).
distance_bins = np.append(finite_edges, np.inf)

# Assign each distance to its bin; right=False makes the bins left-closed.
data['distance_groups'] = pd.cut(data['dist'], bins=distance_bins, labels=distance_labels, right=False)

In [8]:
# Indicator columns for each of the four (noise, in_vehicle) combinations.
# The flags are compared against 1 and 0 explicitly, so any other value
# yields 0 in all four columns.
noise_on = data['noise'] == 1
noise_off = data['noise'] == 0
vehicle_on = data['in_vehicle'] == 1
vehicle_off = data['in_vehicle'] == 0

data['noise_in_vehicle'] = (noise_on & vehicle_on).astype(int)
data['noise_not_in_vehicle'] = (noise_on & vehicle_off).astype(int)
data['not_noise_in_vehicle'] = (noise_off & vehicle_on).astype(int)
data['not_noise_not_in_vehicle'] = (noise_off & vehicle_off).astype(int)


In [9]:
# get_dummies replaces the encoded columns, so set aside the two categorical
# originals that the plots still need.
keep_these_original_columns_for_plots = data[["distance_groups", "age_group"]]

# One-hot encode the three binned features.
dummies = pd.get_dummies(data, columns=["age_group", "new_age_group", "distance_groups"])

# Categorical originals first, then the encoded frame; near_fid is discarded
# (author's note: it makes no sense to have it).
data = pd.concat(
    [keep_these_original_columns_for_plots, dummies],
    axis=1,
).drop(columns=['near_fid'])

# Refresh the snapshot with the engineered frame.
data_copy = data.copy()

In [10]:
# Display the engineered frame (long frames are rendered truncated).
data

Unnamed: 0,distance_groups,age_group,near_x,near_y,near_angle,heard,building,xcoor,ycoor,noise,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
0,500-1000,50-59,1.998301e+06,9.011692e+06,-171.588672,1,0,1999193.0,9011824,0,...,False,True,False,False,True,False,False,False,False,False
1,500-1000,0-39,1.928907e+06,8.954624e+06,-51.208102,1,0,1928298.0,8955382,0,...,True,False,False,False,True,False,False,False,False,False
2,500-1000,0-39,2.026384e+06,8.256164e+06,39.018754,1,0,2025706.0,8255615,0,...,False,True,False,False,True,False,False,False,False,False
3,0-500,0-39,1.743184e+06,8.052652e+06,15.046022,1,0,1742935.0,8052585,0,...,False,True,False,True,False,False,False,False,False,False
4,500-1000,50-59,1.350375e+06,7.909850e+06,144.603170,1,0,1350807.0,7909543,0,...,False,True,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5705,0-500,0-39,2.008871e+06,8.255775e+06,-176.234663,1,0,2009140.0,8255793,0,...,True,False,False,True,False,False,False,False,False,False
5706,1000-1500,40-49,1.981871e+06,8.270452e+06,45.691415,1,0,1981119.0,8269682,0,...,False,True,False,False,False,True,False,False,False,False
5707,500-1000,60-69,1.463760e+06,8.074997e+06,-175.473118,1,0,1464308.0,8075040,0,...,False,False,True,False,True,False,False,False,False,False
5708,0-500,0-39,1.479843e+06,7.526377e+06,142.958054,1,0,1480125.0,7526164,0,...,False,True,False,True,False,False,False,False,False,False


In [11]:
#random sampling
def split_train_test(data, test_ratio, random_state):
    """Randomly partition ``data`` into a train frame and a test frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected by position.
    test_ratio : float
        Fraction of rows for the test frame; the count is
        ``int(len(data) * test_ratio)``.
    random_state : int or None
        Seed for a dedicated ``RandomState``. ``None`` draws the permutation
        from NumPy's global random generator instead.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    n_rows = len(data)
    if random_state is None:
        order = np.random.permutation(n_rows)
    else:
        order = np.random.RandomState(random_state).permutation(n_rows)

    # The leading slice of the permutation is the test set, the rest is train.
    n_test = int(n_rows * test_ratio)
    test_part = data.iloc[order[:n_test]]
    train_part = data.iloc[order[n_test:]]
    return train_part, test_part

#stratified sampling, where we create a sample that takes the distribution of age into account
def stratified_sampling(data, test_ratio, important_data_column, random_state=42):
    """Split ``data`` into train/test frames, stratified on a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split.
    test_ratio : float
        Passed to ``StratifiedShuffleSplit`` as ``test_size``.
    important_data_column : array-like
        Values to stratify on, one per row of ``data``.
    random_state : int or None, default 42
        Seed forwarded to ``StratifiedShuffleSplit``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=random_state)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_index, test_index = next(split.split(data, important_data_column))
    # The splitter returns row POSITIONS, so select with .iloc. Using .loc
    # would look the numbers up as index labels, which is only correct when
    # the frame has a default RangeIndex.
    return data.iloc[train_index], data.iloc[test_index]


# Build the train/test partitions: plain random split, 20% test, fixed seed.
# Stratified alternative, kept for reference:
#   train, test = stratified_sampling(data, 0.2, data["age"])
train, test = split_train_test(data, test_ratio=0.2, random_state=42)

In [12]:
# 'heard' is the target. The two categorical columns are excluded from the
# features; their one-hot encoded counterparts remain.
non_feature_columns = ['heard', "distance_groups", "age_group"]

# Training features / target.
X_train = train.drop(columns=non_feature_columns)
y_train = train['heard']

# Test features / target.
X_test = test.drop(columns=non_feature_columns)
y_test = test['heard']



In [13]:
# Display the training feature matrix (rows keep their shuffled original index).
X_train

Unnamed: 0,near_x,near_y,near_angle,building,xcoor,ycoor,noise,in_vehicle,asleep,no_windows,...,new_age_group_Young,new_age_group_Middle-aged,new_age_group_Elderly,distance_groups_0-500,distance_groups_500-1000,distance_groups_1000-1500,distance_groups_1500-2000,distance_groups_2000-2500,distance_groups_2500-3000,distance_groups_>3000
2707,1.539340e+06,7.445810e+06,-94.448289,0,1539356.0,7446015,0,0,0,0,...,False,True,False,True,False,False,False,False,False,False
4450,1.920911e+06,8.955261e+06,-26.953477,0,1919838.0,8955807,0,0,0,0,...,False,True,False,False,False,True,False,False,False,False
5582,2.263289e+06,9.309547e+06,94.793487,0,2263376.0,9308506,0,0,0,0,...,False,False,True,False,False,True,False,False,False,False
219,1.801783e+06,8.093233e+06,-126.466310,1,1802002.0,8093529,0,0,0,0,...,False,True,False,True,False,False,False,False,False,False
4299,1.896830e+06,8.263999e+06,157.136154,0,1898035.0,8263491,0,0,0,0,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1.727620e+06,8.054662e+06,-148.946005,0,1727716.0,8054720,1,0,0,0,...,True,False,False,True,False,False,False,False,False,False
5191,2.009944e+06,8.251584e+06,164.094575,0,2010188.0,8251514,0,0,0,0,...,False,False,True,True,False,False,False,False,False,False
5226,1.966241e+06,8.366161e+06,63.700957,0,1965821.0,8365312,0,1,0,0,...,False,True,False,False,True,False,False,False,False,False
5390,2.416546e+06,9.823889e+06,-52.356561,0,2415633.0,9825073,0,0,0,0,...,False,True,False,False,False,True,False,False,False,False


# LDA 

In [14]:
import sklearn.discriminant_analysis as skl_da

In [15]:
# Baseline: LDA fitted on every column of X_train.
lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

# One probability column per entry of lda_model.classes_.
predict_prob_lda = lda_model.predict_proba(X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5. np.where builds the array
# directly, so no pre-allocation is needed.
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix (rows: predictions, columns: true 'heard')
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.99956  0.00044 ]
 [0.000378 0.999622]
 [0.042121 0.957879]
 [0.889914 0.110086]
 [0.008937 0.991063]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      233   29
1       57  823 

Accuracy: 0.925


# what happens if I remove "Age" ? 

In [16]:
# Set these:

columns_to_remove = ['age']


#Let these be
# DataFrame.drop returns a new frame, so X_train / X_test stay untouched.
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
# Assign the target under the name the evaluation below actually uses.
y_test = test['heard']
y_test_original = y_test  # legacy alias, kept in case other cells refer to it

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict
predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.999451 0.000549]
 [0.000418 0.999582]
 [0.0477   0.9523  ]
 [0.904104 0.095896]
 [0.00992  0.99008 ]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      234   29
1       56  823 

Accuracy: 0.926


# what happens if I remove "distance" ? 

In [17]:
# Set these:

columns_to_remove = ['dist']


#Let these be
# DataFrame.drop returns a new frame, so X_train / X_test stay untouched.
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
# Assign the target under the name the evaluation below actually uses.
y_test = test['heard']
y_test_original = y_test  # legacy alias, kept in case other cells refer to it

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict
predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

The class order in the model: 
[0 1]
Examples of predicted probabilities for the above classes: 
[[0.99721  0.00279 ]
 [0.000585 0.999415]
 [0.047377 0.952623]
 [0.885381 0.114619]
 [0.009319 0.990681]]
Five first predictions: 
[0 1 1 0 1] 

Confusion matrix: 

heard    0    1
row_0          
0      235   40
1       55  812 

Accuracy: 0.917


# Testing with scikit-learn's SequentialFeatureSelector



In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:



# Use a fresh, unfitted LDA estimator instead of whichever `lda_model` an
# earlier cell happened to leave behind, so this cell does not depend on the
# order in which the experiment cells were run.
estimator = skl_da.LinearDiscriminantAnalysis()

# Instantiate SequentialFeatureSelector
selector = SequentialFeatureSelector(estimator,
                                     direction='forward',  # or 'backward'
                                     scoring='accuracy',  # or any other scoring metric
                                     n_jobs=-1)

# Fit the selector to the training data only
selector.fit(X_train, y_train)

# Positions of the selected features
selected_features_indices = selector.get_support(indices=True)

# Names of the selected features
selected_features_names = X_train.columns[selected_features_indices]

# Print the selected feature names
print("Selected Features:", selected_features_names)

In [None]:
# Testing with the columns that were recommended by the sequential feature selector

In [None]:
# List the feature names available in X_train.
X_train.columns

In [None]:
# Columns excluded for this run; everything else in X_train is kept.
columns_to_remove = [
    "age",
    "dist",
    'age_group_0-39',
    'age_group_40-49',
    'age_group_50-59',
    'age_group_60-69',
    'age_group_70+',
    'distance_groups_0-500',
    'distance_groups_500-1000',
    'distance_groups_1000-1500',
    'distance_groups_1500-2000',
    'distance_groups_2000-2500',
    'distance_groups_2500-3000',
]

# Training features / target (drop returns a new frame).
X_train_sequential = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

# Test features / target.
X_test_sequential = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

In [None]:
# Fit LDA on the reduced feature matrix.
lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(X=X_train_sequential, y=y_train)

In [None]:
# Class probabilities for the test rows; one column per entry of classes_.
predict_prob_lda = lda_model.predict_proba(X_test_sequential)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
first_five_probabilities = predict_prob_lda[:5]
with np.printoptions(suppress=True, precision=6):
    print(first_five_probabilities)

In [None]:
# Threshold the first class' probability at 0.5. np.where builds the array
# directly, so no pre-allocation is needed.
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# Building my own sequential feature selector


In [None]:

# Leave-one-feature-out ablation: for every column of X_train, refit LDA
# without that column and report the resulting test accuracy.
#
# Loop-local names (ablation_*) are used on purpose so the loop does not
# overwrite `lda_model`, `columns_to_remove`, `predict_prob_lda` or
# `prediction_lda`, which other cells read.
#
# NOTE(review): accuracy is measured on the test split; choosing features
# from these numbers leaks test information. Consider a validation split.
y_train_temp = train['heard']
y_test_temp = test['heard']

columns = X_train.columns
for column in columns:
    print("Column that was removed: ", column)
    X_train_temp = X_train.drop(columns=[column])
    X_test_temp = X_test.drop(columns=[column])

    ablation_model = skl_da.LinearDiscriminantAnalysis()
    ablation_model.fit(X_train_temp, y_train_temp)

    ablation_prob = ablation_model.predict_proba(X_test_temp)
    ablation_prediction = np.where(ablation_prob[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard

    #Accuracy
    print(f"Accuracy: {np.mean(ablation_prediction == y_test_temp):.3f}")



# What happens if I only have original features + distance? No distance groups and age groups

In [None]:
# Set these:

# Every age-group and distance-group dummy. 'distance_groups_>3000' is
# included so that no distance group is left in the feature matrix.
# NOTE(review): the four noise/in_vehicle combination columns are engineered
# too and are still kept here - confirm whether they count as "original".
columns_to_remove = ['new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly','age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000', 'distance_groups_>3000']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# What happens if I remove distance entirely?

In [None]:
# Set these:

# The raw distance plus ALL seven distance-group dummies, including the
# first ('0-500') and the open-ended last ('>3000') group.
# NOTE(review): the coordinate columns (near_x, near_y, xcoor, ycoor) are
# still present - confirm that is intended for this experiment.
columns_to_remove = ['dist',
       'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000', 'distance_groups_>3000']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# What happens if I only have "dist" and no distance groups? 

In [None]:
# Set these:

# ALL seven distance-group dummies, including the first ('0-500') and the
# open-ended last ('>3000') group, so that only 'dist' carries distance.
columns_to_remove = [
       'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000', 'distance_groups_>3000']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# What happens if I limit the age groups to 3? And only have regular distance (no distance groups)

In [None]:
# Display the full training feature matrix before choosing columns to drop.
X_train

In [None]:
# Set these:

# The five-level age dummies plus ALL seven distance-group dummies (including
# '0-500' and '>3000'), leaving the three-level age groups and the raw 'dist'.
columns_to_remove = ['age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+', 'distance_groups_0-500',
       'distance_groups_500-1000', 'distance_groups_1000-1500',
       'distance_groups_1500-2000', 'distance_groups_2000-2500',
       'distance_groups_2500-3000', 'distance_groups_>3000']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

## only three age groups and no age

In [None]:
# Set these:

# Raw age and the five-level age dummies; the three-level groups remain.
columns_to_remove = ['age', 'age_group_0-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70+']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

## Multiple age groups, age and dist

In [None]:
# List the feature names available in X_train.
X_train.columns

In [None]:
# Set these:

# The three-level age dummies and the raw age; the five-level groups remain.
# NOTE(review): the section heading says "Multiple age groups, age and dist",
# yet 'age' is removed here - confirm which of the two is intended.
columns_to_remove = ['new_age_group_Young',
       'new_age_group_Middle-aged', 'new_age_group_Elderly', 'age']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']
print(current_X_train)

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# Remove fid on best performing variant, with no distance groups and multiple age groups, no age

In [None]:
# List the feature names available in X_train.
X_train.columns

In [None]:
# Set these:

# NOTE(review): only 'age' is removed. The distance-group and three-level
# age-group dummies are still in the feature matrix, which may not match the
# "no distance groups" variant named in the section heading - confirm.
columns_to_remove = ['age']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

# Remove fid and coordinate columns on best performing variant, and test to remove 3 age groups, performs worse than multiple and 3 age groups

In [None]:
# List the feature names available in X_train.
X_train.columns

In [None]:
# Set these:

# Raw age, the four coordinate columns and the angle to the nearest point.
columns_to_remove = ['near_y', 'age', 'near_x', 'xcoor', 'ycoor', 'near_angle']


#Let these be
current_X_train = X_train.drop(columns=columns_to_remove)
y_train = train['heard']

current_X_test = X_test.drop(columns=columns_to_remove)
y_test = test['heard']

#LDA

lda_model = skl_da.LinearDiscriminantAnalysis()
lda_model.fit(current_X_train, y_train)

#Predict

predict_prob_lda = lda_model.predict_proba(current_X_test)

print("The class order in the model: ")
print(lda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
with np.printoptions(suppress=True, precision=6):
    print(predict_prob_lda[0:5])

# Threshold the first class' probability at 0.5 (no pre-allocation needed).
prediction_lda = np.where(predict_prob_lda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_lda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_lda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_lda == y_test):.3f}")

In [None]:
# Correlation heatmap of all training features on a wide canvas.
fig, ax = plt.subplots(figsize=(16, 6))

# Colour scale fixed to [-1, 1]; per-cell annotations are switched off.
# The returned Axes is kept in `heatmap` so further decorations can be added.
heatmap = sns.heatmap(X_train.corr(), vmin=-1, vmax=1, annot=False, ax=ax)

# `pad` is the distance between the title and the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

# QDA 

In [None]:
# Columns excluded from the QDA feature matrix. The list is spelled out here
# instead of reusing whatever `columns_to_remove` the previously executed cell
# left behind; it equals the value that variable holds when the notebook is
# run top to bottom.
# Earlier candidates, kept for reference:
#   ['asleep', 'distance_groups_>3000', 'noise', 'age_group_50-59', 'distance_groups_2500-3000']
#   ['asleep', 'distance_groups_>3000', 'distance_groups_2500-3000']
qda_columns_to_remove = ['near_y', 'age', 'near_x', 'xcoor', 'ycoor', 'near_angle']

X_train_QDA = X_train.drop(columns=qda_columns_to_remove)
X_test_QDA = X_test.drop(columns=qda_columns_to_remove)



In [None]:
# List the feature names that go into the QDA model.
X_train_QDA.columns

In [None]:
# Fit QDA on the reduced feature matrix.
qda_model = skl_da.QuadraticDiscriminantAnalysis()
qda_model.fit(X=X_train_QDA, y=y_train)

In [None]:
# Class probabilities for the test rows; one column per entry of classes_.
predict_prob_qda = qda_model.predict_proba(X_test_QDA)

print("The class order in the model: ")
print(qda_model.classes_)

print("Examples of predicted probabilities for the above classes: ")
first_five_probabilities = predict_prob_qda[:5]
with np.printoptions(suppress=True, precision=6):
    print(first_five_probabilities)

In [None]:
# Threshold the first class' probability at 0.5. np.where builds the array
# directly, so no pre-allocation is needed.
prediction_qda = np.where(predict_prob_qda[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard
print("Five first predictions: ")
print(prediction_qda[0:5], "\n")


# Confusion matrix
print("Confusion matrix: \n")
print(pd.crosstab(prediction_qda, y_test), "\n")

#Accuracy
print(f"Accuracy: {np.mean(prediction_qda == y_test):.3f}")

In [None]:

# Leave-one-feature-out ablation for QDA: for every column of X_train_QDA,
# refit QDA without that column and report the resulting test accuracy.
#
# Loop-local names (ablation_*) are used on purpose: the models fitted here
# are QDA models, and the loop must not overwrite `lda_model`,
# `columns_to_remove`, `predict_prob_lda` or `prediction_lda`.
#
# NOTE(review): accuracy is measured on the test split; choosing features
# from these numbers leaks test information. Consider a validation split.
y_train_temp = train['heard']
y_test_temp = test['heard']

columns = X_train_QDA.columns
for column in columns:
    print("Column that was removed: ", column)
    X_train_temp = X_train_QDA.drop(columns=[column])
    X_test_temp = X_test_QDA.drop(columns=[column])

    ablation_model = skl_da.QuadraticDiscriminantAnalysis()
    ablation_model.fit(X_train_temp, y_train_temp)

    ablation_prob = ablation_model.predict_proba(X_test_temp)
    ablation_prediction = np.where(ablation_prob[:, 0] >= 0.5, 0, 1) #0 is not heard, 1 is heard

    #Accuracy
    print(f"Accuracy: {np.mean(ablation_prediction == y_test_temp):.3f}")

