In [8]:
import io #importing the 'io' module - io module provides a set of tools for working with streams of data. Makes it easier to work with input and output streams and to handle different types of data
import os #importing the 'os' module - allows you to perform system-related tasks such as: file and directory operations, working with paths, environment variables, process management, working directory
import pandas as pd #importing the Pandas library and giving it the alias 'pd' - Pandas library helps with data manipulation and analysis, simplifies working with structured data
import plotly.express as px #loading a specific part of the Plotly library (used to create charts and graphs), alias as px: gives it the nickname px for convenience when creating visualizations
import plotly.io as pio #loading the Plotly IO module (used to control how Plotly visualizations are displayed and saved - options include resolution, interactive behavior), alias gives it a simpler name when calling the function
import fastparquet #make sure that this is installed in your anaconda environment
import numpy as np
import statistics as stat
import matplotlib.pyplot as plt
from scipy import stats
import array
from IPython.display import display
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from feature_engine.selection import DropCorrelatedFeatures
from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import cosine

In [2]:
#loads the file into the label "data"
filename='/Users/cooperross/Downloads/Captone Project/Coding files/Unconfirmed 561666.crdownload' #edit the filename here for testing other .parquet files

data=pd.read_parquet(filename, engine='fastparquet') 

# REMOVE UNNECESSARY COLUMNS (Ryan's code)
data2 = data.loc[:,~data.columns.str.contains('BoundingBox')]
data2 = data2.loc[:,~data2.columns.str.contains('Center')]
data2 = data2.loc[:,~data2.columns.str.contains('Neighbors')]
data2 = data2.loc[:,~data2.columns.str.contains('Location')]
data2 = data2.loc[:,~data2.columns.str.contains('Euler')]
data2 = data2.loc[:,~data2.columns.str.contains('Manders')]
data2 = data2.loc[:,~data2.columns.str.contains('FormFactor')]
data2 = data2.loc[:,~data2.columns.str.contains('Count')]
data2 = data2.loc[:,~data2.columns.str.contains('Image')]
data2 = data2.loc[:,~data2.columns.str.contains('mito_tubeness')]
data2 = data2.loc[:,~data2.columns.str.contains('Overflow')]
data2 = data2.loc[:,~data2.columns.str.contains('MinIntensity')]

#Separate out the metadata columns
metadata_cols = data2.loc[:,data2.columns.str.contains('Metadata')]
data2 = data2.loc[:,~data2.columns.str.contains('Metadata')]

# KEEP ONLY NUMERICAL DATA
data_numerical=data2._get_numeric_data() #removing columns that have non quantitiative data

# MAD-NORMALIZATION
df_normalized = (0.6745 * (data_numerical - data_numerical.median(axis=0))) / stats.median_abs_deviation(data_numerical, axis=0)
df_normalized = df_normalized.dropna(axis=1)
df_normalized

# REMOVE OUTLIERS (Ryan's slides)
n_MAD = 500
threshold = 0.6745 * n_MAD
rows_to_keep = (np.abs(df_normalized) < threshold).all(axis=1)
df_normalized = df_normalized[rows_to_keep]
df_normalized # "add in the meta-data" - sam ???

# REMOVE HIGHLY CORRELATED COLUMNS (you will need to pip install feature engine in the conda environment)
corrs = DropCorrelatedFeatures(variables=None, method='spearman', threshold=0.90) #set the function to use later (Ryan's code)
df_normalized.shape
dropped_correlated = corrs.fit_transform(df_normalized)
dropped_correlated.shape

(382, 717)

In [3]:
# FINAL PAIRWISE CORRELATION PROTOTYPE 1 ("EITHER / OR" METHOD)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows strongly correlated with a reference row ("either / or" rule).

    Every other row of the frame is compared with the reference row using
    Pearson and Spearman correlation. A row is kept when the absolute value
    of *either* coefficient exceeds ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the absolute correlation values.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman))`` entry per match, in
        ascending row order, where ``row`` is the positional index of the
        matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)

        if abs(pearson_corr) > threshold or abs(spearman_corr) > threshold:
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append((i, (i, pearson_corr), (i, spearman_corr)))

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by whichever of |Pearson| / |Spearman| is larger, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: max(abs(match[1][1]), abs(match[2][1])),
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson and Spearman Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair in correlated_rows:
    # Each pair is (row, coefficient); only the coefficient is shown, rounded to 5 places
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    print(f"Row {row_num}: Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}")

Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 3
Correlated Rows with Pearson and Spearman Correlations:
Row 142: Pearson Correlation = 0.9656, Spearman Correlation = 0.95552
Row 215: Pearson Correlation = 0.94713, Spearman Correlation = 0.88771
Row 23: Pearson Correlation = 0.9237, Spearman Correlation = 0.7859


In [4]:
# FINAL PAIRWISE CORRELATION PROTOTYPE 1 ("AND" METHOD)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows strongly correlated with a reference row ("and" rule).

    Every other row of the frame is compared with the reference row using
    Pearson and Spearman correlation. A row is kept only when the absolute
    values of *both* coefficients exceed ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the absolute correlation values.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman))`` entry per match, in
        ascending row order, where ``row`` is the positional index of the
        matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)

        if abs(pearson_corr) > threshold and abs(spearman_corr) > threshold:
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append((i, (i, pearson_corr), (i, spearman_corr)))

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by whichever of |Pearson| / |Spearman| is larger, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: max(abs(match[1][1]), abs(match[2][1])),
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson and Spearman Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair in correlated_rows:
    # Each pair is (row, coefficient); only the coefficient is shown, rounded to 5 places
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    print(f"Row {row_num}: Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}")


Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 1
Correlated Rows with Pearson and Spearman Correlations:
Row 142: Pearson Correlation = 0.9656, Spearman Correlation = 0.95552


In [7]:
# FINAL PAIRWISE CORRELATION PROTOTYPE 2 ("AVERAGE" METHOD)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows strongly correlated with a reference row ("average" rule).

    Every other row of the frame is compared with the reference row using
    Pearson and Spearman correlation. A row is kept when the mean of the two
    absolute coefficients exceeds ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the averaged absolute correlation.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman))`` entry per match, in
        ascending row order, where ``row`` is the positional index of the
        matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)

        if (abs(pearson_corr) + abs(spearman_corr)) / 2 > threshold:
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append((i, (i, pearson_corr), (i, spearman_corr)))

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by the mean of |Pearson| and |Spearman|, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: (abs(match[1][1]) + abs(match[2][1])) / 2,
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson and Spearman Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair in correlated_rows:
    # The average is taken from the unrounded coefficients, then everything is rounded for display
    avg_corr = round((abs(pearson_pair[1]) + abs(spearman_pair[1])) / 2, 5)
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    print(f"Row {row_num}: Average Correlation = {avg_corr}, Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}")


Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 2
Correlated Rows with Pearson and Spearman Correlations:
Row 142: Average Correlation = 0.96056, Pearson Correlation = 0.9656, Spearman Correlation = 0.95552
Row 215: Average Correlation = 0.91742, Pearson Correlation = 0.94713, Spearman Correlation = 0.88771


In [13]:
# FINAL PAIRWISE CORRELATION PROTOTYPE ("EITHER / OR" METHOD WITH COSINE)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows similar to a reference row ("either / or" rule with cosine).

    Every other row of the frame is compared with the reference row using
    Pearson correlation, Spearman correlation and cosine similarity
    (``1 - cosine distance``). A row is kept when the absolute value of
    *any* of the three measures exceeds ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the absolute value of each measure.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman), (row, cosine))`` entry
        per match, in ascending row order, where ``row`` is the positional
        index of the matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)
        cosine_corr = 1 - cosine(user_row, compared_row)  # Cosine similarity

        if (abs(pearson_corr) > threshold
                or abs(spearman_corr) > threshold
                or abs(cosine_corr) > threshold):
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append(
                (i, (i, pearson_corr), (i, spearman_corr), (i, cosine_corr))
            )

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by the largest of |Pearson|, |Spearman| and |cosine|, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: max(abs(match[1][1]), abs(match[2][1]), abs(match[3][1])),
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson, Spearman, and Cosine Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair, cosine_pair in correlated_rows:
    # Each pair is (row, value); only the value is shown, rounded to 5 places
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    cosine_corr = round(cosine_pair[1], 5)
    print(f"Row {row_num}: Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}, Cosine Similarity = {cosine_corr}")


Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 3
Correlated Rows with Pearson, Spearman, and Cosine Correlations:
Row 142: Pearson Correlation = 0.9656, Spearman Correlation = 0.95552, Cosine Similarity = 0.96757
Row 215: Pearson Correlation = 0.94713, Spearman Correlation = 0.88771, Cosine Similarity = 0.94204
Row 23: Pearson Correlation = 0.9237, Spearman Correlation = 0.7859, Cosine Similarity = 0.92222


In [10]:
# FINAL PAIRWISE CORRELATION PROTOTYPE ("AND" METHOD WITH COSINE)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows similar to a reference row ("and" rule with cosine).

    Every other row of the frame is compared with the reference row using
    Pearson correlation, Spearman correlation and cosine similarity
    (``1 - cosine distance``). A row is kept only when the absolute values
    of *all three* measures exceed ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the absolute value of each measure.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman), (row, cosine))`` entry
        per match, in ascending row order, where ``row`` is the positional
        index of the matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)
        cosine_corr = 1 - cosine(user_row, compared_row)  # Cosine similarity

        if (abs(pearson_corr) > threshold
                and abs(spearman_corr) > threshold
                and abs(cosine_corr) > threshold):
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append(
                (i, (i, pearson_corr), (i, spearman_corr), (i, cosine_corr))
            )

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by the largest of |Pearson|, |Spearman| and |cosine|, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: max(abs(match[1][1]), abs(match[2][1]), abs(match[3][1])),
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson, Spearman, and Cosine Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair, cosine_pair in correlated_rows:
    # Each pair is (row, value); only the value is shown, rounded to 5 places
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    cosine_corr = round(cosine_pair[1], 5)
    print(f"Row {row_num}: Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}, Cosine Similarity = {cosine_corr}")

Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 1
Correlated Rows with Pearson, Spearman, and Cosine Correlations:
Row 142: Pearson Correlation = 0.9656, Spearman Correlation = 0.95552, Cosine Similarity = 0.96757


In [11]:
# FINAL PAIRWISE CORRELATION PROTOTYPE ("AVERAGE" METHOD WITH COSINE)
def find_correlated_rows(user_input_row, threshold=0.9, data=None):
    """Find rows similar to a reference row ("average" rule with cosine).

    Every other row of the frame is compared with the reference row using
    Pearson correlation, Spearman correlation and cosine similarity
    (``1 - cosine distance``). A row is kept when the mean of the three
    absolute values exceeds ``threshold``.

    Note: this notebook defines several variants under this same name, so the
    most recently executed cell decides which rule is active.

    Parameters
    ----------
    user_input_row : int
        Positional (``iloc``) index of the reference row. Negative values
        count from the end.
    threshold : float, default 0.9
        Cut-off applied to the averaged absolute value of the measures.
    data : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``df_normalized``.

    Returns
    -------
    list of tuple
        One ``(row, (row, pearson), (row, spearman), (row, cosine))`` entry
        per match, in ascending row order, where ``row`` is the positional
        index of the matching row.

    Raises
    ------
    IndexError
        If ``user_input_row`` is outside the frame (raised by ``iloc``).
    """
    frame = df_normalized if data is None else data
    num_rows = len(frame)
    user_row = frame.iloc[user_input_row]
    if user_input_row < 0:
        # Normalise so the reference row is still skipped in the loop below.
        user_input_row += num_rows

    correlated_rows = []
    for i in range(num_rows):
        if i == user_input_row:
            continue
        compared_row = frame.iloc[i]

        pearson_corr, _ = pearsonr(user_row, compared_row)
        spearman_corr, _ = spearmanr(user_row, compared_row)
        cosine_corr = 1 - cosine(user_row, compared_row)  # Cosine similarity

        if (abs(pearson_corr) + abs(spearman_corr) + abs(cosine_corr)) / 3 > threshold:
            # Label the match with its real position `i`: the reference row is
            # skipped, so a running counter would be one too low after it.
            correlated_rows.append(
                (i, (i, pearson_corr), (i, spearman_corr), (i, cosine_corr))
            )

    return correlated_rows

# Example usage: ask which row to investigate and collect its matches.
user_input = int(input("Enter the row number you'd like to investigate: "))

# Order the matches by the mean of |Pearson|, |Spearman| and |cosine|, strongest first
correlated_rows = sorted(
    find_correlated_rows(user_input),
    key=lambda match: (abs(match[1][1]) + abs(match[2][1]) + abs(match[3][1])) / 3,
    reverse=True,
)

# Display the results: a three-line summary, then one line per matching row
print(
    f"Original User Input Row: {user_input}",
    f"Total Quantity of Correlated Rows: {len(correlated_rows)}",
    "Correlated Rows with Pearson, Spearman, and Cosine Correlations:",
    sep="\n",
)
for row_num, pearson_pair, spearman_pair, cosine_pair in correlated_rows:
    # The average is taken from the unrounded values, then everything is rounded for display
    avg_corr = round((abs(pearson_pair[1]) + abs(spearman_pair[1]) + abs(cosine_pair[1])) / 3, 5)
    pearson_corr, spearman_corr = round(pearson_pair[1], 5), round(spearman_pair[1], 5)
    cosine_corr = round(cosine_pair[1], 5)
    print(f"Row {row_num}: Average Correlation = {avg_corr}, Pearson Correlation = {pearson_corr}, Spearman Correlation = {spearman_corr}, Cosine Similarity = {cosine_corr}")

Enter the row number you'd like to investigate:  333


Original User Input Row: 333
Total Quantity of Correlated Rows: 2
Correlated Rows with Pearson, Spearman, and Cosine Correlations:
Row 142: Average Correlation = 0.9629, Pearson Correlation = 0.9656, Spearman Correlation = 0.95552, Cosine Similarity = 0.96757
Row 215: Average Correlation = 0.92563, Pearson Correlation = 0.94713, Spearman Correlation = 0.88771, Cosine Similarity = 0.94204
