# ELEC 478 Final Code Submission
Grace Wang and Didi Zhou

## Load in Packages

In [11]:
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from tqdm import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection
from sklearn.svm import SVC
# from sklearn.cluster import SpectralClustering
from sklearn.ensemble import RandomForestClassifier
# from imblearn.ensemble import BalancedRandomForestClassifier
# from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
# from sklearn.ensemble import BaggingClassifier
from xgboost import plot_tree
import random
from sklearn.impute import KNNImputer
import math



## Data Cleaning & Feature Engineering

### Data Imputation for Morph Embeddings
Use kNN imputation to fill in the missing morph embeddings (some pre-synaptic neurons have none).

In [None]:
# load in training data on each ADP / potential synapse
data = pd.read_csv("../Data/train_data.csv")

# load in morph embedding df
morph_embeddings = pd.read_csv("../Data/morph_embeddings.csv")

# merge ADP data with morph embedding data, once with every embedding column
# prefixed "pre_" and once prefixed "post_"; no `on=` is given, so pandas joins
# on whichever columns the two frames have in common after the rename
full_data = (
    data.merge(
        morph_embeddings.rename(columns=lambda x: "pre_" + x),
        how="left",
        validate="m:1",
        copy=False,
    )
    .merge(
        morph_embeddings.rename(columns=lambda x: "post_" + x),
        how="left",
        validate="m:1",
        copy=False,
    )
)

# get only the morph embeddings back out (some pre-morph embeddings
# will be null)
morph_embed = full_data.filter(regex="_morph_emb_")

# complete the imputation
imputer = KNNImputer(n_neighbors=2)
imputed_morph_embed = imputer.fit_transform(morph_embed)

# fit_transform returns a bare NumPy array, so the column labels have to be put
# back explicitly; without them the CSV is written with columns 0..N and the
# "pre_morph_emb_" / "post_morph_emb_" filters applied to it later match nothing.
# NOTE(review): assumes no embedding column is entirely missing -- KNNImputer
# drops such columns by default, which would make the relabeling fail.
imputed_morph_embed_df = pd.DataFrame(
    imputed_morph_embed,
    columns=morph_embed.columns,
    index=morph_embed.index,
)
csv_imputed_morph_embed_df = imputed_morph_embed_df.copy()

# save to CSV
csv_imputed_morph_embed_df.to_csv('../Data/imputed_morph_embed.csv', index=False)


In [10]:
# preview the first five rows of the imputed morph embedding file on disk
(
    pd.read_csv('Data/imputed_morph_embed.csv')
    .head(n=5)
)

Unnamed: 0,pre_morph_emb_0,pre_morph_emb_1,pre_morph_emb_2,pre_morph_emb_3,pre_morph_emb_4,pre_morph_emb_5,pre_morph_emb_6,pre_morph_emb_7,pre_morph_emb_8,pre_morph_emb_9,...,post_morph_emb_22,post_morph_emb_23,post_morph_emb_24,post_morph_emb_25,post_morph_emb_26,post_morph_emb_27,post_morph_emb_28,post_morph_emb_29,post_morph_emb_30,post_morph_emb_31
0,0.373316,0.209818,-0.546946,0.630883,0.832248,-0.983688,1.085743,-0.395466,-1.151271,-0.495176,...,-1.064851,-0.816273,-0.215224,-0.598382,0.545335,-0.525224,0.171648,1.022962,-0.645146,-0.687774
1,0.373316,0.209818,-0.546946,0.630883,0.832248,-0.983688,1.085743,-0.395466,-1.151271,-0.495176,...,-1.064851,-0.816273,-0.215224,-0.598382,0.545335,-0.525224,0.171648,1.022962,-0.645146,-0.687774
2,0.373316,0.209818,-0.546946,0.630883,0.832248,-0.983688,1.085743,-0.395466,-1.151271,-0.495176,...,-1.064851,-0.816273,-0.215224,-0.598382,0.545335,-0.525224,0.171648,1.022962,-0.645146,-0.687774
3,0.373316,0.209818,-0.546946,0.630883,0.832248,-0.983688,1.085743,-0.395466,-1.151271,-0.495176,...,-1.064851,-0.816273,-0.215224,-0.598382,0.545335,-0.525224,0.171648,1.022962,-0.645146,-0.687774
4,0.373316,0.209818,-0.546946,0.630883,0.832248,-0.983688,1.085743,-0.395466,-1.151271,-0.495176,...,-1.064851,-0.816273,-0.215224,-0.598382,0.545335,-0.525224,0.171648,1.022962,-0.645146,-0.687774


## General Cleaning & Feature Engineering

In [12]:
def cleaner(train, feature, imp_morph):
    """
    Build the modelling table: read the three CSVs, attach the per-neuron
    vectors to every row, derive the engineered features and standardize
    the numeric columns.

    inputs:
    - train: path to the training data CSV
    - feature: path to the feature weight CSV
    - imp_morph: path to the imputed morph embedding CSV; its rows are paired
      with the training rows through the row index (it receives the "ID"
      column of the training data before being merged on "ID")

    outputs:
    - data: the merged and feature engineered dataframe; every numeric column
      except ID, pre_nucleus_id and post_nucleus_id is standardized
    """

    def _as_vector(row):
        # collapse one row of scalar columns into a single NumPy array
        return np.array(row)

    def _row_distance(first, second):
        # row-wise Euclidean distance between two array-valued columns
        return lambda row: math.dist(row[first], row[second])

    data = pd.read_csv(train)

    ############## CONCAT FEATURE WEIGHT DATA ##############

    # one row of feature weights per neuron
    neuron_fw = pd.read_csv(feature)

    # pack the feature_weight_i columns into one array column, then drop them
    fw_columns = neuron_fw.filter(regex="feature_weight_").columns
    neuron_fw["feature_weights"] = (
        neuron_fw[fw_columns].sort_index(axis=1).apply(_as_vector, axis=1)
    )
    neuron_fw = neuron_fw.drop(columns=fw_columns)

    # attach the weights twice: first for the pre- neuron, then for the post- neuron
    for prefix in ("pre_", "post_"):
        data = data.merge(
            neuron_fw.rename(columns=lambda name, prefix=prefix: prefix + name),
            how="left",
            validate="m:1",
            copy=False,
        )

    ############## CONCAT IMPUTED MORPH EMBEDDINGS ################
    imputed = pd.read_csv(imp_morph)

    # pack the pre- and post- embedding columns into one array column each
    embedding_specs = (
        ("pre_morph_embeddings", "pre_morph_emb_"),
        ("post_morph_embeddings", "post_morph_emb_"),
    )
    for vector_col, pattern in embedding_specs:
        imputed[vector_col] = (
            imputed.filter(regex=pattern).sort_index(axis=1).apply(_as_vector, axis=1)
        )

    # the scalar embedding columns are no longer needed
    imputed = imputed.drop(columns=imputed.filter(regex="_morph_emb_").columns)

    # give the embeddings the IDs of the training rows and join on them
    imputed["ID"] = data["ID"]
    data = data.merge(imputed, on="ID")

    ############## FE: SIMILARITY BETWEEN PRE- AND POST- VECTORS ##############
    data["me_similarity"] = data.apply(row_morph_similarity, axis=1)
    data["fw_similarity"] = data.apply(row_feature_similarity, axis=1)

    ############## FE: COMBINE COORDINATES INTO ARRAYS ##############
    # order matters: each regex is evaluated against the columns present so far
    coordinate_specs = (
        ("axonal_coords", "axonal_coor_"),
        ("dendritic_coords", "dendritic_coor_"),
        ("pre_rf_coords", "pre_rf_[xy]"),
        ("post_rf_coords", "post_rf_[xy]"),
        ("pre_nucleus_coords", "pre_nucleus_[xyz]"),
        ("post_nucleus_coords", "post_nucleus_[xyz]"),
        ("pre_nucleus_xy", "pre_nucleus_[xy]"),
        ("post_nucleus_xy", "post_nucleus_[xy]"),
    )
    for target, pattern in coordinate_specs:
        data = coord_column(data, target, pattern)

    ############## FE: RF Distance ##############
    data = coord_rf(data)
    data["rf_distance"] = data.apply(rfdistance, axis=1)

    ############## FE: ONE HOT ENCODE BRAIN AREA ##############
    data = one_hot('pre_brain_area', data, '_pre')
    data = one_hot('post_brain_area', data, '_post')

    ############## FE: BRAIN COMPARTMENT GROUPING ##############
    data = area_cols(data)

    ############## FE: MINICOLUMNS ##############
    # distance between the two nuclei using only their x/y coordinates
    data["minicol_dist"] = data[["pre_nucleus_xy", "post_nucleus_xy"]].apply(
        _row_distance("pre_nucleus_xy", "post_nucleus_xy"), axis=1
    )

    ############## FE: DISTANCE FROM PRE-SYNAPTIC NUCLEUS TO AXON ##############
    data["nuclei_adp_dist"] = data[["pre_nucleus_coords", "axonal_coords"]].apply(
        _row_distance("pre_nucleus_coords", "axonal_coords"), axis=1
    )

    ############## STANDARDIZE ALL NUMERIC DATA #############
    # identifier columns are excluded; each remaining column gets its own scaler
    identifier_cols = ['ID', 'pre_nucleus_id', 'post_nucleus_id']
    numeric_names = data.select_dtypes(include='number').drop(columns=identifier_cols).columns
    for name in numeric_names:
        as_column_vector = np.array(data[name]).reshape(-1, 1)
        data[name] = StandardScaler().fit_transform(as_column_vector)

    return data

def row_feature_similarity(row):
    """
    Cosine similarity between the pre- and post- feature weight vectors of a row

    Inputs: row - a row holding the arrays "pre_feature_weights" and
            "post_feature_weights"
    Outputs: the dot product of the two vectors divided by the product of
             their Euclidean norms
    """
    pre_vec, post_vec = row["pre_feature_weights"], row["post_feature_weights"]
    dot_product = np.sum(pre_vec * post_vec)
    norm_product = np.linalg.norm(pre_vec) * np.linalg.norm(post_vec)
    return dot_product / norm_product

def row_morph_similarity(row):
    """
    Cosine similarity between the pre- and post- morph embeddings of a row

    Inputs: row - a row holding the arrays "pre_morph_embeddings" and
            "post_morph_embeddings"
    Outputs: the dot product of the two embeddings divided by the product of
             their Euclidean norms
    """
    pre_vec, post_vec = row["pre_morph_embeddings"], row["post_morph_embeddings"]
    dot_product = np.sum(pre_vec * post_vec)
    norm_product = np.linalg.norm(pre_vec) * np.linalg.norm(post_vec)
    return dot_product / norm_product


def coord_column(df, new_col, old_cols):
    """
    Function that packs several coordinate columns into one array-valued column
    Inputs:
        - df: the data frame (modified in place: new_col is added to it)
        - new_col: name of the column that will hold one array per row
        - old_cols: regex selecting the columns to pack; the selected columns
          are ordered by column name before being combined
    Outputs:
        - df: the same data frame, now carrying new_col
    """
    selected = df.filter(regex=old_cols)
    ordered = selected.sort_index(axis=1)
    df[new_col] = ordered.apply(lambda values: np.array(values), axis=1)
    return df

def coord_rf(df):
    """
    Function that packs the rf x/y columns of the pre- and post- neuron
    (readout location of deep learning model) into array columns
    Inputs:
        - df: the data frame
    Outputs:
        - df: the data frame with "pre_rf_coords_xy" and "post_rf_coords_xy" added
    """
    rf_specs = (
        ("pre_rf_coords_xy", "pre_rf_[xy]"),
        ("post_rf_coords_xy", "post_rf_[xy]"),
    )
    for target, pattern in rf_specs:
        df = coord_column(df, target, pattern)
    return df

def rfdistance(row):
    """
    Function that measures how far apart the two rf locations of a row are
    Inputs:
        - row: a row describing one ADP, holding "pre_rf_coords_xy" and
          "post_rf_coords_xy"
    Outputs:
        - the Euclidean distance between the two coordinate arrays
    """
    return math.dist(row["pre_rf_coords_xy"], row["post_rf_coords_xy"])

def one_hot(column, df, suffix=''):
    """
    Function for one-hot encoding

    Inputs:
        - column: the column to be one-hot encoded
        - df: the dataframe (the indicator columns are added to it in place)
        - suffix: an optional suffix to be added to the column when it is one-hot encoded
    
    Outputs:
        - df: a new dataframe with one 0/1 indicator column per category,
          named str(category) + suffix, and without the original column.
          Rows whose value is missing get 0 in every indicator column.
    """
    # a missing value is not a category: it never compares equal to anything,
    # and concatenating it with the suffix would raise a TypeError
    cats = df[column].dropna().unique()

    for cat in cats:
        # str() so that non-string categories (e.g. integer codes) also work
        new_col = str(cat) + suffix
        df[new_col] = (df[column] == cat).astype('int')

    df = df.drop(columns=column)
    return df

def area_cols(df):
    """
    Function that flags which compartment group each row belongs to
    Inputs:
        - df: the data frame, with a "compartment" column (modified in place)
    Outputs:
        - df: the same data frame with 0/1 columns "area1", "area2" and "area3";
          a compartment outside all three groups gets 0 everywhere
    """
    # compartment values that make up each group
    groups = {
        "area1": ["basal", "soma"],
        "area2": ["axon", "apical", "oblique", "apical_shaft"],
        "area3": ["apical_tuft"],
    }
    compartment = df["compartment"]
    for flag_col, members in groups.items():
        df[flag_col] = compartment.isin(members).astype('int')
    return df

