In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer, StandardScaler

from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform

from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

import time
from collections import defaultdict

In [None]:
# Colab-only setup: mount Google Drive so the project files are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; later cells use relative 'publish_dataframes/...' paths.
%cd '/content/drive/MyDrive/Colab_Notebooks/oai/TKR_twin'

**Load and Fuse OAI + KNOAP Challenge Data**

In [None]:
# Set base path for files
base_path = '/content/drive/MyDrive/Colab_Notebooks/oai/TKR_twin/'

# Read CSV files
all_clin = pd.read_csv(base_path + 'OAI_all_knees_data.csv')
clean_oai = pd.read_csv(base_path + 'publish_dataframes/pca_modes_and_demos_all_timepoints_12082023.csv')

# Process all_clin: split the 'Knee' key on "_" into an id part and a side part,
# and spell the side out as 'RIGHT'/'LEFT' so it can be joined on ['id', 'side'].
all_clin[['id', 'side']] = all_clin['Knee'].str.split("_", expand=True)
all_clin['side'] = all_clin['side'].replace({'R': 'RIGHT', 'L': 'LEFT'})
all_clin['id'] = pd.to_numeric(all_clin['id'])
# NOTE(review): these columns are presumably duplicated in clean_oai - confirm.
all_clin = all_clin.drop(columns=['Knee', 'Age', 'Gender', 'BMI', 'Postmeno', 'KL'])

# Derive and process tkr from clean_oai
tkr = clean_oai[['id', 'side', 'visit', 'total_or_partial']]

# Process tkr_ids and tkr_right: ids with a recorded (non-null) 'total_or_partial'
tkr_ids = tkr[tkr['total_or_partial'].notna()]['id']
tkr_right = tkr[tkr['side'] == 'RIGHT']
tkr_right_ids = tkr_right[tkr_right['total_or_partial'].notna()]['id']

# Merge clean_oai and all_clin into oai_extra, and rename a column
oai_extra = pd.merge(clean_oai, all_clin, on=['id', 'side']).rename(columns={'max_kl': 'oa_prog'})

# Filter and process right side data.
# Take an explicit copy: modifying a filtered slice through a column selection
# (`right[col].fillna(..., inplace=True)`) is chained assignment, which warns on
# pandas 2.x and has no effect under Copy-on-Write.
right = oai_extra[oai_extra['side'] == 'RIGHT'].copy()
right['total_or_partial'] = right['total_or_partial'].fillna(0)
# Binary flag: 0 where 'total_or_partial' is 0 (including filled NaN), 1 otherwise.
right['tkr'] = (right['total_or_partial'] != 0).astype(int)

# Process baseline_clean_oai: keep only the baseline visit ('V00')
baseline_clean_oai = right[right['visit'] == 'V00']


**Output Dataframes:**

*oai_extra*:
'publish_dataframes/oai_knoap_allTimepoints_12082023.csv'

*baseline_clean_oai*:
'publish_dataframes/oa_incidence_tkr_pc_modes_and_demos_baseline_rightside_12082023.csv'

**Choose Relevant PC mode, clinical factor, and outcome target columns**

In [None]:
# Named (non-PC-mode) columns: 7 identifier/outcome columns first, then the
# clinical factors and questionnaire scores.
# NOTE(review): 'KL' and 'BMI' are dropped from all_clin before the merge, so
# they must be supplied by clean_oai - confirm.
specific_cols = [
    'id', 'side', 'pred_kl', 'KL', 'oa_prog', 'total_or_partial', 'tkr',
    'hisp', 'race', 'gender', 'Varus', 'Tenderness', 'Injury_history',
    'Mild_symptoms', 'Heberden', 'Crepitus', 'Morning_stiffness',
    'age', 'height', 'weight', 'BMI', 'womac_pain', 'womac_adl',
    'womac_stiff', 'womac_total', 'koos_pain', 'koos_symptom', 'koos_func', 'koos_qol'
]

# 3 prefixes x 3 parts x 10 modes = 90 column names such as 'bs_fem_pc1'
pattern_based_cols_1 = [
    f"{prefix}_{part}_pc{num}"
    for prefix in ('bs', 't2', 'thick')
    for part in ('fem', 'pat', 'tib')
    for num in range(1, 11)
]

# 2 prefixes x 10 modes = 20 column names such as 'med_pc1'
pattern_based_cols_2 = [
    f"{prefix}_pc{num}"
    for prefix in ('med', 'lat')
    for num in range(1, 11)
]

# Named columns first, then the 110 PC-mode columns (later cells rely on the
# PC modes being the trailing 110 columns).
all_columns = [*specific_cols, *pattern_based_cols_1, *pattern_based_cols_2]

# Restrict the baseline frame to those columns with a fresh 0..n-1 index
oai_particular = baseline_clean_oai[all_columns].reset_index(drop=True)

**Output Dataframes**:

*oai_particular*:
'publish_dataframes/stats_matching_targets_demos_pcmodes_12082023.csv'

**Drop rows/samples with missingness in PC mode features**

In [None]:
# The PC-mode features are the trailing 110 columns
columns_to_check = oai_particular.iloc[:, -110:]

# Rows that have a NaN in at least one of those 110 columns
df_with_missing = oai_particular[columns_to_check.isna().any(axis=1)]

# Complement: rows with all 110 PC-mode values present
df_no_missing = oai_particular.dropna(subset=columns_to_check.columns)

**Output Dataframes**:

*df_no_missing*:
'publish_dataframes/before_matching_no_pcmode_missingness_group.csv'

*df_with_missing*:
'publish_dataframes/before_matching_pcmode_missingness_group.csv'

**Drop columns with more than 5% missingness**

In [None]:
# Percentage of missing values in each column that is not one of the trailing 110 PC modes
missing_percentage = df_no_missing.iloc[:, :-110].isna().mean().mul(100)

# Remove every column whose missingness exceeds 5%
columns_to_drop = missing_percentage.loc[missing_percentage.gt(5)].index
df_no_missing_dropped = df_no_missing.drop(columns=columns_to_drop)

'Varus' and 'koos_func' dropped

**Perform Multiple Imputation on missing data, grouped by the target variable**

In [None]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def impute_by_group(df, group_col):
    """
    Impute missing covariates separately within each level of ``group_col``.

    The column layout is positional:
      * columns ``[:7]``     - passed through unchanged,
      * columns ``[7:16]``   - categorical covariates (RandomForestClassifier imputer),
      * columns ``[16:-110]``- continuous covariates (RandomForestRegressor imputer),
      * columns ``[-110:]``  - passed through unchanged.

    Parameters:
    df (pd.DataFrame): Frame with the column layout described above.
    group_col (str): Column whose levels define the groups imputed independently.

    Returns:
    pd.DataFrame: Imputed frame with the original column order, sorted by index.
        Rows whose ``group_col`` is NaN are not returned, because
        ``DataFrame.groupby`` skips NaN keys by default.
    """
    # Columns for imputation
    categorical_columns = df.columns[7:16]    # Categorical variable columns for imputation
    continuous_columns = df.columns[16:-110]  # Continuous variable columns for imputation

    # Initialize IterativeImputer for continuous and categorical data
    imputer_cont = IterativeImputer(estimator=RandomForestRegressor(), initial_strategy='median', max_iter=40, random_state=0)
    imputer_cat = IterativeImputer(estimator=RandomForestClassifier(), initial_strategy='most_frequent', max_iter=40, random_state=0)

    imputed_parts = []

    # Each group is fitted and transformed on its own rows only
    for _, group in df.groupby(group_col):
        # Fix: the original called the undefined name `imputer_num` here (NameError)
        continuous_data_imputed = imputer_cont.fit_transform(group[continuous_columns])
        categorical_data_imputed = imputer_cat.fit_transform(group[categorical_columns])

        # The imputers return plain arrays; restore column labels and the group's row index
        continuous_data_imputed_df = pd.DataFrame(continuous_data_imputed, columns=continuous_columns, index=group.index)
        categorical_data_imputed_df = pd.DataFrame(categorical_data_imputed, columns=categorical_columns, index=group.index)

        # Reassemble in the original column order: leading 7 | categorical | continuous | trailing 110
        combined_data = pd.concat(
            [group.iloc[:, :7], categorical_data_imputed_df, continuous_data_imputed_df, group.iloc[:, -110:]],
            axis=1,
        )
        imputed_parts.append(combined_data)

    # Concatenate all parts into one DataFrame and restore the original row order
    return pd.concat(imputed_parts).sort_index()

# Work on a copy of df_no_missing_dropped (rows with complete PC modes, sparse columns removed).
# NOTE: `df` stays the un-imputed reference frame used by the KS sensitivity cells below.
df = df_no_missing_dropped.copy()

# Impute within each level of the 'tkr' outcome
df_imputed_tkr = impute_by_group(df, 'tkr')

# Impute within each level of the 'oa_prog' outcome
df_imputed_oa_inc = impute_by_group(df, 'oa_prog')


**Output Dataframes**:

*df_imputed_oa_inc*:
'publish_dataframes/oa_inc_multiple_imputation_filled.csv'

*df_imputed_tkr*:
'publish_dataframes/tkr_multiple_imputation_filled.csv'

**Sensitivity Analysis - comparing the distributions of variables before and after multiple imputation**

In [None]:
import pandas as pd
from scipy.stats import ks_2samp

def ks_test_by_group_to_excel(original_df, imputed_df, columns_to_test, group_col, excel_file_path):
    """
    Compare pre- and post-imputation distributions per group with a two-sample KS test.

    For every level of ``group_col`` and every column in ``columns_to_test`` the
    observed (non-NaN) values of ``original_df`` are tested against the values of
    ``imputed_df`` for the same group.

    Parameters:
    original_df (pd.DataFrame): Frame before imputation (may contain NaN).
    imputed_df (pd.DataFrame): Frame after imputation.
    columns_to_test (list): Column names to test.
    group_col (str): Column defining the groups.
    excel_file_path (str): Destination of the Excel report.

    Returns:
    pd.DataFrame: One row per (group, column) with 'Group', 'Column',
        'KS Statistic' and 'P-Value'. Both statistics are NaN when either sample
        is empty (for example a column with no observed values in a group).
    """
    ks_test_results = []

    for name, group in original_df.groupby(group_col):
        group_imputed = imputed_df[imputed_df[group_col] == name]

        for column in columns_to_test:
            original_data = group[column].dropna()  # observed values only
            imputed_data = group_imputed[column]    # observed + imputed values

            if original_data.empty or imputed_data.empty:
                # ks_2samp raises on empty input; record the gap instead of
                # aborting the whole report.
                ks_statistic, p_value = float('nan'), float('nan')
            else:
                ks_statistic, p_value = ks_2samp(original_data, imputed_data)

            ks_test_results.append({
                'Group': name,
                'Column': column,
                'KS Statistic': ks_statistic,
                'P-Value': p_value
            })

    # Explicit columns keep the header stable even when there are no results
    results_df = pd.DataFrame(ks_test_results, columns=['Group', 'Column', 'KS Statistic', 'P-Value'])

    # Save the results to an Excel file
    results_df.to_excel(excel_file_path, index=False)

    return results_df

# df is the original (un-imputed) DataFrame and df_imputed_tkr is the DataFrame after imputation.
# Columns to test: positions 7:-110 of df_imputed_tkr, i.e. the columns that went through imputation.
columns_to_test = df_imputed_tkr.columns[7:-110].tolist()

# File path for the Excel output (relative to the current working directory)
excel_file_path = 'publish_dataframes/tkr_ks_test_results.xlsx'

# Run the per-group KS comparison for the 'tkr' outcome and save results to Excel
ks_results_tkr = ks_test_by_group_to_excel(df, df_imputed_tkr, columns_to_test, 'tkr', excel_file_path)

# Print a preview of the results
print(ks_results_tkr.head())


In [None]:
# OA Incidence Target Column:

# Columns to test: positions 7:-110 of df_imputed_oa_inc, i.e. the columns that went through imputation.
columns_to_test = df_imputed_oa_inc.columns[7:-110].tolist()

# File path for the Excel output (relative to the current working directory)
excel_file_path = 'publish_dataframes/oa_inc_ks_test_results.xlsx'

# Run the per-group KS comparison for the 'oa_prog' outcome and save results to Excel
ks_results_oa_inc = ks_test_by_group_to_excel(df, df_imputed_oa_inc, columns_to_test, 'oa_prog', excel_file_path)

# Print a preview of the results
print(ks_results_oa_inc.head())

**Output Statistics**:

*ks_results_oa_inc*: 'publish_dataframes/oa_inc_ks_test_results.xlsx'

*ks_results_tkr*: 'publish_dataframes/tkr_ks_test_results.xlsx'


**Standardizing Data**

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_columns(df, start_col_idx, end_col_idx):
    """
    Return a copy of ``df`` in which a positional range of columns is standardized.

    The scaler is fitted on, and applied to, the same rows of ``df``; the input
    frame itself is left untouched.

    Parameters:
    df (pd.DataFrame): The dataframe to be scaled.
    start_col_idx (int): Position of the first column to scale.
    end_col_idx (int): Position one past the last column to scale (exclusive;
        negative values count from the end, as in normal slicing).

    Returns:
    pd.DataFrame: A copy of the dataframe with the selected columns scaled.
    """
    target_columns = df.columns[start_col_idx:end_col_idx]

    scaled = df.copy()
    # Fit and transform in one step on the selected block of columns
    scaled[target_columns] = StandardScaler().fit_transform(df[target_columns])
    return scaled

# Standardize the continuous covariates (column positions 16:-110) of the tkr-imputed frame
transformed_tkr = scale_columns(df_imputed_tkr, 16, -110)

# Same column positions for the oa_prog-imputed frame; adjust if the column layout changes
transformed_oa_inc = scale_columns(df_imputed_oa_inc, 16, -110)


In [None]:
# Keep only knees whose 'pred_kl' is 0 or 1, i.e. patients who did not have OA
# to start with but may progress.

transformed_oa_inc_control = transformed_oa_inc[transformed_oa_inc['pred_kl'].isin([0, 1])]

transformed_oa_inc_control['pred_kl'].unique()   # array([1., 0.])

**Output Dataframes**:

*transformed_oa_inc_control*:
'publish_dataframes/control_oa_inc_standardized_df.csv'



*transformed_tkr*: 'publish_dataframes/tkr_standardized_df.csv'


**Prepare Dataframes for Matching**

In [None]:
import pandas as pd

def prepare_dataframe_for_matching(dataframe, target_flag_name, covariates_range, id_column_name):
    """
    Build the matching input: id column, target flag, then the covariate block.

    Parameters:
    dataframe (pd.DataFrame): The original DataFrame.
    target_flag_name (str): Name of the column holding the target flag.
    covariates_range (slice): Positional slice selecting the covariate columns.
    id_column_name (str): Name of the column holding the ID.

    Returns:
    pd.DataFrame: New frame with columns ordered as [id, target flag, covariates...];
        the row index of ``dataframe`` is preserved.
    """
    # Order matters: downstream cells expect the id first and the target second
    pieces = [
        dataframe[id_column_name],
        dataframe[target_flag_name],
        dataframe.iloc[:, covariates_range],
    ]
    return pd.concat(pieces, axis=1)

# TKR outcome: id + 'tkr' flag + covariates at column positions 7:-110 of the standardized frame
tkr_covariate_df = prepare_dataframe_for_matching(transformed_tkr, 'tkr', slice(7, -110), 'id')

# OA-incidence outcome: control at baseline - no oa at baseline group (pred_kl 0 or 1)
oa_inc_covariate_df = prepare_dataframe_for_matching(transformed_oa_inc_control, 'oa_prog', slice(7, -110), 'id')

# Optional: reset the row index of the prepared frames (left disabled)
# tkr_covariate_df.reset_index(inplace=True, drop=True)
# oa_inc_covariate_df.reset_index(inplace=True, drop=True)

**Output Dataframes**:

*oa_inc_covariate_df*: 'publish_dataframes/oa_inc_covariate_df.csv'


*tkr_covariate_df*: 'publish_dataframes/tkr_covariate_df.csv'

In [None]:
# Store ids as strings in both covariate frames (perform_matching applies the same cast to its input).
tkr_covariate_df['id'] = tkr_covariate_df['id'].astype('str')
oa_inc_covariate_df['id'] = oa_inc_covariate_df['id'].astype('str')

**R in Python**

In [None]:
!pip install --upgrade rpy2==3.5.1

In [None]:
from rpy2.robjects import r

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
# install.packages("MatchIt", repos="http://cran.us.r-project.org")
# install.packages("Matching", repos="http://cran.us.r-project.org")
# install.packages("rgenoud", repos="http://cran.us.r-project.org")

In [None]:

def calculate_distance(input_dataframe, method='euclidean', exclude_columns=None):
    """
    Compute a pairwise distance matrix in R (``stats::dist``) and leave it in the R session.

    The frame is written to 'temp_covariates_data.csv' in the working directory,
    read back by R, and the resulting matrix is stored in the R global
    ``euclidean_dist_matrix`` (the name is fixed regardless of ``method``).

    Parameters:
    input_dataframe (pd.DataFrame): Rows to compare. Every column that is written
        out takes part in the distance, so identifiers and outcome columns should
        be removed first (see ``exclude_columns``).
    method (str): Distance name forwarded to R's ``dist(method = ...)``.
    exclude_columns (str or iterable of str, optional): Columns to drop before
        computing the distance. ``None`` (default) keeps every column.

    Returns:
    str: Name of the R object holding the distance matrix.
    """
    covariates = input_dataframe
    if exclude_columns:
        if isinstance(exclude_columns, str):
            exclude_columns = [exclude_columns]
        # drop() returns a new frame, so the caller's data is not modified
        covariates = input_dataframe.drop(columns=list(exclude_columns))

    # Convert the DataFrame to CSV for R
    temp_covariates_csv = 'temp_covariates_data.csv'
    covariates.to_csv(temp_covariates_csv, index=False)

    # R script to calculate distance
    r_script = f'''
    df_covariates <- read.csv("{temp_covariates_csv}")
    euclidean_dist_matrix <- as.matrix(dist(df_covariates, method = "{method}"))
    '''

    # Execute R code
    r(r_script)

    # Return the name of the R object holding the distance matrix
    return 'euclidean_dist_matrix'

In [None]:

def _to_r_literal(value):
    """Render a Python value as R source text (True -> TRUE, None -> NULL, else str(value))."""
    if isinstance(value, (bool, np.bool_)):
        return 'TRUE' if value else 'FALSE'
    if value is None:
        return 'NULL'
    return str(value)


def perform_matching(input_dataframe, target_column, output_csv, output_matched_csv, method='nearest', params=None,  distance_method=None):
    """
    Run MatchIt (in R, via rpy2) on a covariate frame and save the matched cohort.

    The frame is exchanged with R through 'temp_input_data.csv' in the working
    directory. The model formula is ``<target_column> ~ . - id``: every column
    except the identifier is used as a covariate.

    Parameters:
    input_dataframe (pd.DataFrame): Must contain an 'id' column, ``target_column``
        and the covariates. It is not modified.
    target_column (str): Name of the binary treatment/outcome column.
    output_csv (str): Path of the final matched-cohort CSV.
    output_matched_csv (str): Path of the intermediate CSV written by R.
    method (str): MatchIt ``method`` argument (e.g. 'nearest').
    params (dict, optional): Extra ``matchit()`` arguments. Python ``True``/``False``/
        ``None`` are converted to R ``TRUE``/``FALSE``/``NULL``; any other value is
        inserted as ``str(value)``, so R literals passed as strings (e.g. 'TRUE')
        keep working.
    distance_method (str, optional): If given, a covariate distance matrix is
        computed with ``calculate_distance`` and passed as ``distance``.

    Returns:
    pd.DataFrame: The matched data as read back from ``output_matched_csv``.
    """
    print("Starting matching process...")

    # Work on a copy so the caller's frame keeps its original 'id' dtype
    matching_df = input_dataframe.copy()
    matching_df['id'] = matching_df['id'].astype('str')

    # Save the DataFrame as a CSV file for R
    temp_csv = 'temp_input_data.csv'
    matching_df.to_csv(temp_csv, index=False)
    print("Dataframe saved as CSV for R.")

    # Extra matchit() arguments, rendered as R source
    matchit_args = [f"{key} = {_to_r_literal(value)}" for key, value in (params or {}).items()]

    # Check if a specific distance calculation is needed
    if distance_method:
        # The distance must be computed on covariates only: including the id or
        # the outcome would let them dominate or leak into the matching.
        covariates_only = matching_df.drop(columns=['id', target_column])
        distance_matrix_var = calculate_distance(covariates_only, method=distance_method)
        matchit_args.append(f"distance = {distance_matrix_var}")

    # Each extra argument carries its own leading comma, so the call never ends
    # with a dangling ", " when there are no extras.
    extra_args = "".join(f", {arg}" for arg in matchit_args)

    # Prepare the R script with dynamic target column and custom parameters
    r_script = f'''
    library(MatchIt)
    library(rgenoud)
    library(Matching)

    print("R libraries loaded. Reading data into R...")

    # Read the data into R
    input_data_r <- read.csv("{temp_csv}")

    print("Data read into R. Starting matching process with method '{method}' and custom parameters...")

    # Perform matching using MatchIt with specified method and custom parameters.
    # "- id" keeps the identifier out of the covariate set.
    m.out <- matchit({target_column} ~ . - id, data = input_data_r, method = "{method}"{extra_args})

    print("Matching completed. Retrieving matched data...")

    # Retrieve the matched data with IDs
    matched_data <- get_matches(m.out, id = "new_id")

    print("Matched data retrieved. Writing to CSV...")

    # Write the matched data to a CSV file
    write.csv(matched_data, "{output_matched_csv}", row.names = FALSE)
    '''

    # Execute R code including library loading
    print("Executing R code for matching...")
    r(r_script)

    print("R processing completed. Reading matched data back into Pandas...")

    # Read the matched data back into a Pandas DataFrame
    matched_data_df = pd.read_csv(output_matched_csv)
    matched_data_df.to_csv(output_csv, index=False)

    print("Matching process completed.")
    return matched_data_df

In [None]:
# Nearest Neighbor - Propensity Score Example (TKR outcome, MatchIt method='nearest', no extra params)
# Reads the covariate frame back from disk, so the CSV must have been saved beforehand.
input_df = pd.read_csv('publish_dataframes/tkr_covariate_df.csv')
output_csv = 'publish_dataframes/test_tkr_NN_noRep_df.csv'
output_matched_csv = 'publish_dataframes/tkr_temp_matched_data.csv'

perform_matching(input_df, 'tkr', output_csv, output_matched_csv,
                 method='nearest')

#**************************************

# Genetic Twins Example
# NOTE(review): as written this call falls back to the default method='nearest';
# presumably method='genetic' was intended - confirm.
# input_df = pd.read_csv('publish_dataframes/tkr_covariate_df.csv')
# output_csv = 'publish_dataframes/tkr_Genetic_Twins_noRep_df.csv'
# output_matched_csv = 'publish_dataframes/tkr_temp_matched_data.csv'

# perform_matching(input_df, 'tkr', output_csv, output_matched_csv)

#**************************************

# CEM Example
# Specify CEM parameters
# NOTE(review): `perform_matching2` is not defined in the cells shown here;
# presumably `perform_matching` - confirm.
# cem_params = {
#     'cuts': 30,
#     'M': 1.0,
#     'weighting': 'TRUE',
# }
# perform_matching2(input_df, 'tkr', output_csv, output_matched_csv,
#                  method='cem', params=cem_params)

#**************************************

# Euclidean Distance with Replacement Example
# params values are interpolated into the R call, so R literals passed as
# strings ('TRUE'/'FALSE') are the safest spelling.
# params_dict = {'replace': 'TRUE'}
# perform_matching(input_df, 'tkr', output_csv, output_matched_csv,
#                  method='nearest', params=params_dict, distance_method='euclidean')


**Output Dataframes**:

*nearest neighbor, no replacement*:
'publish_dataframes/oa_inc_matchit_nearestNeightborMethod_noReplacement_df.csv'
'publish_dataframes/tkr_matchit_nearestNeightborMethod_noReplacement_df.csv'

*nearest neighbor, replacement*:
'publish_dataframes/oa_inc_matchit_nearestNeightborMethod_Replacement_df.csv'
'publish_dataframes/tkr_matchit_nearestNeightborMethod_Replacement_df.csv'


*euclidean distance matrix, replacement*:
'publish_dataframes/oa_inc_matchit_EuclideanDistanceMatrixMethod_Replacement_df.csv'
'publish_dataframes/tkr_matchit_EuclideanDistanceMatrixMethod_Replacement_df.csv'

*euclidean distance matrix, no replacement*:
'publish_dataframes/oa_inc_matchit_EuclideanDistanceMatrixMethod_noReplacement_df.csv'
'publish_dataframes/tkr_matchit_EuclideanDistanceMatrixMethod_noReplacement_df.csv'

Chosen Matchit Method: Nearest Neighbors, no Replacement - matches found via Propensity Score

**Alternative: t-SNE 3D dimensionality reduction + Euclidean distance matching**

In [None]:
def perform_tsne(df, tsne_cols, tsne_params):
    """
    Embed a block of columns with t-SNE and attach the embedding to a copy of ``df``.

    :param df: DataFrame to be processed (left unmodified).
    :param tsne_cols: Positional column selector (e.g. a slice) passed to ``df.iloc[:, ...]``.
    :param tsne_params: Keyword arguments forwarded to ``TSNE``. The embedding is
        written to exactly three columns, so ``n_components`` must be 3.
    :return: Copy of ``df`` with 'tsne-one', 'tsne-two' and 'tsne-three' added.
    """
    embedding = TSNE(**tsne_params).fit_transform(df.iloc[:, tsne_cols])

    result = df.copy()
    result[['tsne-one', 'tsne-two', 'tsne-three']] = embedding
    return result


In [None]:
def calculate_distances(df, id_col):
    """
    Calculates the pairwise Euclidean distances between rows in t-SNE space.

    :param df: DataFrame with 'tsne-one', 'tsne-two' and 'tsne-three' columns.
    :param id_col: The column name that uniquely identifies each row.
    :return: Square DataFrame of distances whose index and columns are the ids,
        in row order.
    :raises ValueError: if ``id_col`` contains duplicate values, because the
        matrix could then not be labelled one-to-one with the rows.
    """
    ids = df[id_col]
    if not ids.is_unique:
        raise ValueError(
            f"Column '{id_col}' must be unique to label the distance matrix; "
            f"found {int(ids.duplicated().sum())} duplicate value(s)."
        )

    # One label per row, in row order, so label i always refers to row i
    labels = ids.to_numpy()
    dist_df = pd.DataFrame(
        squareform(pdist(df[['tsne-one', 'tsne-two', 'tsne-three']])),
        columns=labels,
        index=labels
    )
    return dist_df


In [None]:
def find_closest_pairs(df, target_col, dist_df, id_col):
    """
    For every target row (``target_col == 1``) find the nearest row with a
    different target value, and return the pairs in long format.

    :param df: Original DataFrame with target information.
    :param target_col: Target column name.
    :param dist_df: Square DataFrame of distances labelled by id on both axes.
    :param id_col: The column name that uniquely identifies each row.
    :return: DataFrame with columns ['distance', 'id', target_col]. Each pair
        contributes two rows sharing the same distance: the target row
        (``target_col`` = 1) and its closest match (``target_col`` = 0). A match
        may be reused by several target rows (matching with replacement).
    """
    # Use the caller-supplied id column, then normalise its name to 'id' for
    # the merges below (the original hard-coded 'id' and ignored id_col here).
    target_vals = df[[id_col, target_col]].rename(columns={id_col: 'id'})
    target_ids = target_vals.loc[target_vals[target_col] == 1, 'id'].tolist()

    # Nothing to match: return an empty frame with the usual columns
    if not target_ids:
        return pd.DataFrame(columns=['distance', 'id', target_col])

    # Columns = target rows, index = every row
    progressing_dist = dist_df[target_ids]

    # Long format: one row per (any row, target row) pair. Columns are named
    # positionally so that named axes on dist_df cannot break the renaming.
    progressing_long = progressing_dist.stack().reset_index()
    progressing_long.columns = ['id', 'pt2', 'distance']

    # Attach the target value of both members of each pair
    target_long = progressing_long.merge(target_vals, on='id').rename(columns={'id':'pt1_id', 'pt2':'id', target_col: target_col + '_pt1'})
    target_2pat = target_long.merge(target_vals, on='id').rename(columns={'id':'pt2_id', target_col: target_col + '_pt2'})

    # Keep only pairs whose members have different target values
    min_sort_target = (target_2pat.sort_values(['pt1_id','distance'], ascending=True)[['pt1_id','pt2_id','distance', target_col+'_pt1', target_col+'_pt2']])
    min_sort_target['diff_pair'] = (min_sort_target[target_col+'_pt1'] != min_sort_target[target_col+'_pt2'])

    # For each target row ('pt2_id') keep its closest differing partner
    closest_pairs = min_sort_target[min_sort_target['diff_pair']].sort_values('distance').groupby('pt2_id').first().reset_index()

    # Dropping the 'diff_pair' column as it's no longer needed
    closest_pairs = closest_pairs.drop('diff_pair', axis=1)

    # Prepare for melting: the two id columns ('pt2_id', 'pt1_id')
    id_cols = [col for col in closest_pairs.columns if col.endswith('_id')]

    # Melting the DataFrame to long format: one row per pair member
    twins_melt = pd.melt(closest_pairs, id_vars='distance', value_vars=id_cols, value_name='id')

    # 'pt1_id' rows are the matched partners (0); 'pt2_id' rows are the target rows (1)
    twins_melt[target_col] = np.where(twins_melt['variable'] == 'pt1_id', 0, 1)

    # Dropping the 'variable' column as it's no longer needed
    twins_melt = twins_melt.drop('variable', axis=1)

    return twins_melt


In [None]:
# t-SNE settings: 3 components are required because perform_tsne writes exactly three columns.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter` - confirm against the installed version.
tsne_params = {'n_components': 3, 'perplexity': 50, 'n_iter': 4000, 'random_state' : 42}

# Reload the standardized control cohort from disk (the CSV must have been saved beforehand).
transformed_oa_inc_control = pd.read_csv('publish_dataframes/control_oa_inc_standardized_df.csv')

# NOTE: rebinding `df` replaces the un-imputed frame that the imputation / KS cells call `df`.
df = transformed_oa_inc_control
target_col = 'oa_prog'    # target column
tsne_cols = slice(7, 27)  # column slice for t-SNE (positions 7..26; assumes the saved CSV keeps the in-memory column order - TODO confirm)

# Perform t-SNE transformation
tsne_df = perform_tsne(df, tsne_cols, tsne_params)

# Calculate distances
dist_df = calculate_distances(tsne_df, 'id')

# Find closest pairs
matched_oa_inc_df = find_closest_pairs(df, target_col, dist_df, 'id')

# Saving is left disabled:
# matched_oa_inc_df.to_csv('publish_dataframes/oa_inc_matchit_TSNE_EuclideanDist_Replacement_df.csv', index=False)


In [None]:
# t-SNE settings: 3 components are required because perform_tsne writes exactly three columns.
# NOTE(review): recent scikit-learn releases rename TSNE's `n_iter` to `max_iter` - confirm against the installed version.
tsne_params = {'n_components': 3, 'perplexity': 65, 'n_iter': 4000, 'random_state' : 42}  # perp is sqrt(N) -> sqrt(4283) ~ 65

# Reload the standardized TKR cohort from disk (the CSV must have been saved beforehand).
transformed_tkr = pd.read_csv('publish_dataframes/tkr_standardized_df.csv')

# NOTE: rebinding `df` (and tsne_df / dist_df below) overwrites the values from the OA-incidence cell.
df = transformed_tkr
target_col = 'tkr'    # target column
tsne_cols = slice(7, 27)  # column slice for t-SNE (positions 7..26; assumes the saved CSV keeps the in-memory column order - TODO confirm)

# Perform t-SNE transformation
tsne_df = perform_tsne(df, tsne_cols, tsne_params)

# Calculate distances
dist_df = calculate_distances(tsne_df, 'id')

# Find closest pairs
matched_tkr_df = find_closest_pairs(df, target_col, dist_df, 'id')

# Saving is left disabled:
# matched_tkr_df.to_csv('publish_dataframes/tkr_matchit_TSNE_EuclideanDist_Replacement_df.csv', index=False)

**Output Dataframes**:

*matched_oa_inc_df*:
'publish_dataframes/rand_state_oa_inc_matchit_TSNE_EuclideanDist_Replacement_df.csv'

*matched_tkr_df*:

'publish_dataframes/rand_state_tkr_matchit_TSNE_EuclideanDist_Replacement_df.csv'

In [None]:
# OA Inc matched cohort size:
# matched_oa_inc_df[matched_oa_inc_df['oa_prog']==0]['id'].nunique()
# matched_oa_inc_df[matched_oa_inc_df['oa_prog']==1]['id'].nunique()
# control: 319
# oa inc: 357

# TKR matched cohort size:
# matched_tkr_df[matched_tkr_df['tkr']==0]['id'].nunique()
# matched_tkr_df[matched_tkr_df['tkr']==1]['id'].nunique()
# control: 233
# tkr: 253