In [21]:
import io
import os
import sqlite3
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from joblib import Parallel, delayed #Paralleize calculation
from psycopg2.extensions import register_adapter, AsIs
from scipy.signal import butter, lfilter
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine, Column, Integer, ARRAY, MetaData, Table, Text, TypeDecorator, LargeBinary, BLOB
from sqlalchemy.dialects.postgresql import ARRAY as PG_ARRAY


In [2]:
# IMPORTANT: backend switch read by the connection setup and by get_target_data.
database_sqlite3 = True # True -> use the sqlite3 file, False -> use the PostgreSQL server

In [48]:
# Custom adapter function for postgre
def adapt_numpy_ndarray(numpy_array):
    """Adapt a NumPy array for psycopg2 by wrapping its list form in ``AsIs``.

    Args:
        numpy_array: array to adapt. It is converted with ``list(...)``, so only
            the outermost axis becomes a Python list; the elements stay whatever
            iterating the array yields.

    Returns:
        An ``AsIs`` wrapper around that list.
    """
    # NOTE(review): presumably AsIs writes str() of the list verbatim into the
    # SQL text - confirm that this renders a literal PostgreSQL accepts for the
    # target column (element repr may differ between NumPy versions).
    return AsIs(list(numpy_array))
# Register the postgre-adapter (done unconditionally, also when sqlite3 is selected)
register_adapter(np.ndarray, adapt_numpy_ndarray)

# Decode one sqlite BLOB cell back into the NumPy object stored in it
def convert_binary_to_array(binary_data):
    """Deserialize a bytes payload with ``np.load``.

    Args:
        binary_data: raw bytes of a serialized NumPy object, or ``None``.

    Returns:
        Whatever ``np.load`` reconstructs from the bytes, or ``None`` when
        ``binary_data`` is ``None``.
    """
    if binary_data is None:
        return None
    # NOTE(review): allow_pickle=True makes np.load unpickle embedded objects,
    # which can execute arbitrary code - only use on database files you trust.
    return np.load(io.BytesIO(binary_data), allow_pickle=True)

# Database connection parameters and alchemy engine.
# Every value can be overridden through an environment variable so that real
# credentials never have to be stored in the notebook; the fallbacks are the
# previous local-development values, so an unconfigured run behaves as before.
dbname = os.environ.get('BATHUNTING_DB_NAME', 'bathunting')
user = os.environ.get('BATHUNTING_DB_USER', 'python')
password = os.environ.get('BATHUNTING_DB_PASSWORD', 'python_password')
host = os.environ.get('BATHUNTING_DB_HOST', 'localhost')
port = os.environ.get('BATHUNTING_DB_PORT', '5432')

# Extra WHERE fragment that is prepended to the target filter (postgres only).
query_flavour = ''
if database_sqlite3:
    # sqlite3: arrays are stored as BLOBs in column 'arr'
    engine = create_engine('sqlite:///batcallsv14.db')
    table_name = 'batcalls'
    array_col = 'arr'
else:
    # postgres: arrays are stored natively in column 'new_arr'
    # quote_plus keeps user names / passwords containing '@', ':' or '/' from
    # corrupting the connection URL.
    engine = create_engine(
        f'postgresql+psycopg2://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{dbname}'
    )
    table_name = 'batcall'
    array_col = 'new_arr'
    query_flavour = '10 < ANY(new_arr) and'

def get_target_data(target, limit=0, no_target=False):
    """Load the rows of one target class from the configured table.

    Args:
        target: class id to select. Coerced with ``int()`` before it is placed
            into the SQL text, so arbitrary strings cannot alter the query.
        limit: maximum number of rows; values <= 0 disable the LIMIT clause.
            Also coerced with ``int()``.
        no_target: when True only the array column is queried and the result
            is expanded into one DataFrame column per array element.

    Returns:
        A DataFrame with the columns ``target`` and ``new_arr``; with
        ``no_target=True`` a DataFrame built from the list of arrays instead.

    Raises:
        ValueError / TypeError: if ``target`` or ``limit`` is not integer-like.
    """
    target_id = int(target)
    row_limit = int(limit)
    limit_clause = "" if row_limit <= 0 else f"LIMIT {row_limit}"
    selected = array_col if no_target else f"target, {array_col}"
    query = f"SELECT {selected} FROM {table_name} where {query_flavour} target = {target_id} {limit_clause}"
    df = pd.read_sql_query(query, engine)
    if database_sqlite3:
        # sqlite delivers the arrays as BLOBs: decode them and expose the result
        # under the same column name the postgres backend uses.
        decoded = df[array_col].apply(convert_binary_to_array)
        df = df.drop(columns=array_col).assign(new_arr=decoded)
    if no_target:
        df = pd.DataFrame(df['new_arr'].tolist())
    return df

def get_data(targets=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18], limit=0, no_target=False):
    """Fetch several target classes concurrently and stack the results.

    Args:
        targets: iterable of class ids; one query is issued per id.
        limit: per-target row limit, forwarded to ``get_target_data``.
        no_target: forwarded to ``get_target_data``.

    Returns:
        The per-target DataFrames concatenated with ``pd.concat`` (the original
        per-query indices are kept, so index values repeat across targets).
    """
    fetch_one = delayed(get_target_data)
    jobs = (fetch_one(target, limit, no_target) for target in targets)
    # Thread-based workers; n_jobs=-3 is joblib's "all CPUs but two" setting.
    frames = Parallel(n_jobs=-3, prefer="threads")(jobs)
    return pd.concat(frames)

def get_targets():
    """Return the distinct (target, bat) pairs of the configured table.

    Returns:
        DataFrame with the columns ``target`` and ``bat``, ordered by target.
    """
    pairs_query = f"SELECT target, bat FROM {table_name} group by target, bat order by target"
    return pd.read_sql_query(pairs_query, engine)

def get_shape(nested_list):
    """Determine the shape of a nested list / ndarray by following first elements.

    Only the first element of every level is inspected, so ragged inputs are
    NOT detected; the result describes the "first-element path" only.

    Args:
        nested_list: a (possibly nested) list or ``np.ndarray``; any other
            object is treated as a scalar.

    Returns:
        A tuple with one length per nesting level, e.g. ``(2, 3)``. Scalars and
        0-d arrays give ``()``; an empty container ends the shape with ``0``.
        If indexing unexpectedly fails, a descriptive error string is returned
        instead (kept for backward compatibility).
    """
    try:
        shape = []
        current = nested_list
        while isinstance(current, (list, np.ndarray)):
            # A 0-d array has no length; it is a scalar for our purposes.
            if isinstance(current, np.ndarray) and current.ndim == 0:
                break
            size = len(current)
            shape.append(size)
            # Nothing to descend into - previously this hit IndexError and
            # reported a valid empty container as "irregular".
            if size == 0:
                break
            current = current[0]
        return tuple(shape)
    except (TypeError, IndexError) as e:
        # Defensive fallback for containers whose indexing fails
        return f"Irregular shape - nested lists are not of equal size. \n ERROR: {e}"
    
# Get data to work with
def get_features_and_targets(limit=500, scaler=StandardScaler(), categorical=True):
    """Load samples for all default targets and split them into features and labels.

    Args:
        limit: maximum number of rows fetched per target (passed to ``get_data``).
        scaler: object with ``fit_transform``; pass ``None`` to skip scaling.
            The default instance is created once at definition time and is
            therefore shared (and re-fitted) by every call that uses it.
        categorical: when True the labels are returned one-hot encoded with
            19 classes; when False they stay a DataFrame with a ``target`` column.

    Returns:
        ``(features, labels)``. ``features`` is the output of
        ``scaler.fit_transform`` or, without scaler, a DataFrame with one column
        per array element. ``labels`` is a float32 array of shape
        (n_samples, 19) if ``categorical`` else a one-column DataFrame.

    Raises:
        IndexError: if ``categorical`` is True and a target id is >= 19.
    """
    data = get_data(limit=limit)
    features = pd.DataFrame(data["new_arr"].tolist())
    if scaler is not None:
        features = scaler.fit_transform(features)

    labels = pd.DataFrame(data["target"])
    if categorical:
        # One-hot encode with NumPy: the previously used `to_categorical` is not
        # imported anywhere in this notebook and raised NameError.
        class_ids = labels["target"].to_numpy(dtype=int)
        labels = np.eye(19, dtype="float32")[class_ids]
    return features, labels

def vogl_conversion(df):
    """Turn every row of ``df`` into a scaled log-magnitude spectrogram.

    Per row: standardise (subtract mean, divide by std), take the magnitude of
    an STFT (n_fft=512, hop_length=32), convert with 10*log10, replace
    non-finite values via ``np.nan_to_num``, min-max scale, reshape to
    (257, 138, 1) and crop to the first 256 rows and last 128 columns.

    Args:
        df: DataFrame whose rows are iterated with ``iterrows``.
            Assumes each row is a float-valued signal whose STFT has exactly
            257 * 138 entries - TODO confirm the expected signal length.

    Returns:
        ``np.array`` over the per-row results, each of shape (256, 128, 1).

    NOTE(review): ``librosa`` is not among the imports visible at the top of
    this notebook; without it this function raises NameError.
    """
    data_reshaped = []
    for _,data in df.iterrows():
        # Normalize (in place, on the row object yielded by iterrows)
        data -= np.mean(data)
        # NOTE(review): a constant row has std 0 - division then yields NaN/inf
        data /= np.std(data)
        # Spectrogram parameters were taken over from the professor's reference
        # code and have not been verified independently.
        # Calculate spectrogram with FFT
        stft = np.abs(librosa.stft(np.array(data), n_fft=512, hop_length=32))
        stft = 10 * np.log10(stft)
        # NOTE(review): log10(0) gives -inf, which nan_to_num presumably maps to
        # a huge negative number that then dominates the min-max scaling below.
        stft = np.nan_to_num(stft)
        # Scale between [0,1] and reduce shape if needed
        stft = (stft - np.min(stft)) / (np.max(stft) - np.min(stft))
        stft = np.reshape(stft, (257, 138, 1))
        # Keep the first 256 rows and the last 128 columns
        stft = stft[:256, -128: , :]
        data_reshaped.append(stft)
    return np.array(data_reshaped)


In [49]:
# Load the scaled features together with plain integer labels (no one-hot).
df, labels = get_features_and_targets(categorical=False)

# A float n_components keeps as many components as needed for 99 % explained variance.
pca = PCA(n_components=0.99).fit(df)
df_pca = pca.transform(df)
print(f'Number of principal components: {pca.n_components_}')

# NOTE(review): scaler and PCA are fitted on all samples before the split, so
# the test set has already influenced the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    df_pca,
    labels['target'],
    test_size=0.2,
    random_state=42,
)

Number of principal components: 803


## KNN
... ist für diese Daten praktisch unbrauchbar:
- schlechte Genauigkeit
- der Recall kommt so gut wie nie über 50 %
- Rauschen wird kaum erkannt
- der F1-Score liegt für die meisten Klassen weit unter 0.5

In [50]:
# Sweep the neighbourhood size from 1 to 99 and report test-set quality for each value.
for k in range(1, 100):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Overall accuracy first ...
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy (n={k}):", accuracy)
    # ... then precision / recall / F1 per class
    print(classification_report(y_test, y_pred))

Accuracy (n=1): 0.44432194046306506
              precision    recall  f1-score   support

           0       0.34      0.35      0.35       108
           1       0.92      0.92      0.92        63
           2       0.42      0.36      0.39        98
           3       0.28      0.14      0.19       112
           4       0.60      0.55      0.57        91
           5       0.31      0.33      0.32        90
           6       0.29      0.36      0.32       109
           7       0.84      0.66      0.74        71
           8       0.90      0.89      0.90        74
           9       0.57      0.69      0.63        98
          10       0.43      0.38      0.40       102
          11       0.45      0.44      0.45       108
          12       0.07      0.11      0.08       103
          13       0.30      0.38      0.33        98
          14       0.26      0.23      0.24       103
          15       0.51      0.37      0.43       115
          16       0.52      0.72      0.61  

In [9]:
# Minimal PCA walkthrough on a toy matrix with three samples and two features.
# Note: this cell rebinds the names `scaler` and `pca`.
X = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 6]]))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# A float n_components selects enough components to reach that share of explained variance
pca = PCA(n_components=0.999).fit(X_scaled)
X_pca = pca.transform(X_scaled)
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Components:", pca.components_)


Explained variance ratio: [1.]
Components: [[0.70710678 0.70710678]]
