### This file calculates the correlations for the female data features.

In [19]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

#### Importing and cleaning data

In [21]:
# Importing imputed participant data on features we're interested in
dfFemale = pd.read_csv("/gpfs/home/ezhu/Reprocess_NewData/Imputation (4)/final_female_data.csv")

In [24]:
# Reorganizing the data
dfFemale = dfFemale.sort_values(by=['eid'])
dfFemale = dfFemale.reset_index()
dfFemale = dfFemale.drop(columns = ["Unnamed: 0", "index"], axis=1)

In [25]:
# Splitting the data into categorical and continuous features based on their column names
df_contF = pd.DataFrame()
df_catF = pd.DataFrame()
df_catF_cols = []
for i in tqdm(dfFemale.columns):
    if "_" in str(i):
        df_catF_cols.append(i)
        df_catF = pd.concat([df_catF, dfFemale[i]], axis=1)

100%|██████████| 519/519 [00:01<00:00, 466.29it/s] 


#### Calculating Cramer's V for Categorical Variables

In [26]:
# Categorical-Categorical
# --> use corrected Cramer's V
def cramers_V(var1,var2) :
    crosstab =np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building
    stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test
    obs = np.sum(crosstab) # Number of observations
    mini = min(crosstab.shape)-1 # Take the minimum value between the columns and the rows of the cross table - why mini = 0??????????????
    return np.sqrt(stat/(obs*mini))

In [27]:
df_Categ_F_Clean = df_catF.loc[:, (df_catF != 0).any(axis=0)]
print(df_Categ_F_Clean.shape)

(1624, 239)


In [29]:
columns = list(df_Categ_F_Clean.columns)
rows= []

for var1 in tqdm(columns):
    col = []
    for var2 in columns:
        cramers = cramers_V(df_Categ_F_Clean[var1], df_Categ_F_Clean[var2]) # Cramer's V test
        col.append(round(cramers,2)) # Keeping of the rounded value of the Cramer's V  
    rows.append(col)

cramers_results = np.array(rows)

100%|██████████| 239/239 [13:14<00:00,  3.32s/it]


In [30]:
dfCramer_F = pd.DataFrame(cramers_results, columns = df_Categ_F_Clean.columns, index = df_Categ_F_Clean.columns)

In [31]:
dfCramer_F.to_csv("fem_cramersv_correlation_matrix.csv") 

#### Calculating Pearson's Correlation for Continuous Variables

In [38]:
# Continuous-Continuous
# --> use Pearson's
df_contF = dfFemale.drop(columns, axis=1)
corr_mat = df_contF.corr(method='pearson')

# pearson's correlation value will be NaN if there is no standard deviation in one of the columns
# so, this would indicate no correlation - so i replaced the NaN with 0
corr_mat = corr_mat.fillna(0) 

dfPearson_F = corr_mat.copy()

In [40]:
dfPearson_F = dfPearson_F.drop(['eid'], axis=1)
dfPearson_F = dfPearson_F.drop(['eid'], axis=0)

In [42]:
dfPearson_F.to_csv("fem_pearson_correlation_matrix.csv")