<a href="https://colab.research.google.com/github/dennistay1981/Resources/blob/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Data_science_approaches_to_mental_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Descriptive analytics: cross-tabulating metaphor sources and targets

In [None]:
#Import Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# Load the source/target annotations used for the descriptive analysis
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Descriptive.csv')

# Contingency table: one row per SOURCE category, one column per TARGET category
crosstab = pd.crosstab(data['SOURCE'], data['TARGET'])

# Observed cell counts as a plain array
observed = crosstab.to_numpy()

# Chi-square test of independence; `expected` holds the cell counts expected
# under independence of rows and columns
chi2, p, dof, expected = chi2_contingency(observed)

# Pearson residual per cell: (observed - expected) scaled by sqrt(expected)
residuals = (observed - expected) / np.sqrt(expected)

# Cramer's V with a small-sample correction: phi-squared and the table
# dimensions are each shrunk by a term in 1 / (n - 1) before taking the ratio
n = observed.sum()
r, k = observed.shape
phi2 = chi2 / n
phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))  # clipped so it cannot go negative
rcorr = r - (r - 1) ** 2 / (n - 1)
kcorr = k - (k - 1) ** 2 / (n - 1)
cramer_v = np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))

# Build the text shown in each heatmap cell: "<observed count> (<residual rounded to 2 decimals>)"
count_labels = pd.DataFrame(observed, index=crosstab.index, columns=crosstab.columns).astype(str)
residual_labels = pd.DataFrame(residuals, index=crosstab.index, columns=crosstab.columns).round(2).astype(str)
combined_data = count_labels + ' (' + residual_labels + ')'

# (row positions, column positions) of residuals whose magnitude exceeds 1.96,
# the threshold used here for flagging a cell as statistically significant
significant_residuals = np.where(np.abs(residuals) > 1.96)

# Append "*" to the label of every flagged cell in one vectorised step
flagged = np.zeros(residuals.shape, dtype=bool)
flagged[significant_residuals] = True
combined_data = combined_data.where(~flagged, combined_data + "*")


# Marginal totals of the observed table (per SOURCE row, per TARGET column)
row_totals = observed.sum(axis=1)
col_totals = observed.sum(axis=0)

# Residuals drive the cell colours; combined_data supplies the cell text
residual_frame = pd.DataFrame(residuals, index=crosstab.index, columns=crosstab.columns)
n_rows, n_cols = len(crosstab.index), len(crosstab.columns)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(residual_frame, annot=combined_data, fmt='', cmap='Blues', center=0,
            cbar=True, annot_kws={"size": 16}, ax=ax)

# Shared text styling for the marginal totals
totals_style = dict(ha='center', va='center', fontsize=16, color='gray')

# Row totals: just past the right edge of the grid, vertically centred on each row
for row_pos, row_total in enumerate(row_totals):
    ax.text(n_cols + 0.1, row_pos + 0.5, row_total, **totals_style)

# Column totals: just below the bottom edge of the grid, centred on each column
for col_pos, col_total in enumerate(col_totals):
    ax.text(col_pos + 0.5, n_rows + 0.4, col_total, **totals_style)

plt.title('Observed Frequencies and Pearson Residuals (* = statistically significant)')
plt.xlabel('TARGET')
plt.ylabel('SOURCE')
plt.tight_layout()
plt.show()

# Report the test statistics computed for the table
print(f"Chi-square statistic: {chi2:.2f}")
print(f"P-value: {p:.3f}")
print(f"Degrees of freedom: {dof}")
print(f"Cramer's V: {cramer_v:.3f}")

Diagnostic analytics: association rule mining (ARM)


In [None]:
#Import Python libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, apriori

# Load the data for association rule mining
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Diagnostic.csv')

# Separate frames for rows flagged ASD == 'Y' and ASD == 'N'
asd_flag = data['ASD']
data_ASD = data.loc[asd_flag == 'Y']
data_NASD = data.loc[asd_flag == 'N']


def _to_item_list(cell):
    """Split one comma-separated 'sources' entry into a list of items."""
    # NOTE(review): items are not stripped, so "a, b" would yield " b" -- confirm the CSV has no spaces after commas
    return cell.split(',')


# One transaction (list of items) per row: ASD only, non-ASD only, and the full dataset
transactions_ASD = data_ASD['sources'].apply(_to_item_list)
transactions_NASD = data_NASD['sources'].apply(_to_item_list)
transactions_all = data['sources'].apply(_to_item_list)

def mine_association_rules(transactions, min_support=0.05, max_len=3):
    """Run the encode -> apriori -> association-rules pipeline on one set of transactions.

    Parameters
    ----------
    transactions : iterable of list of str
        One list of items per transaction.
    min_support : float, default 0.05
        Minimum share of transactions an itemset (and a rule's combined
        antecedent + consequent) must appear in to be retained.
    max_len : int, default 3
        Maximum number of items in a retained itemset.

    Returns
    -------
    tuple
        ``(encoder, onehot, frequent_itemsets, rules)``: the fitted
        TransactionEncoder, the one-hot encoded DataFrame, the frequent
        itemsets, and the rules sorted by confidence, lift, then support
        (all descending) with antecedents/consequents as comma-joined strings.
    """
    # Identify the unique items and one-hot encode every transaction
    fitted_encoder = TransactionEncoder().fit(transactions)
    onehot = pd.DataFrame(fitted_encoder.transform(transactions), columns=fitted_encoder.columns_)

    # Frequent itemsets via the Apriori algorithm
    frequent_itemsets = apriori(onehot, min_support=min_support, max_len=max_len, use_colnames=True)

    # All rules whose antecedent and consequent items jointly meet the support threshold
    rules = association_rules(frequent_itemsets, metric="support", min_threshold=min_support)

    # Replace frozensets with strings. Items are sorted first because the
    # iteration order of a frozenset of strings can differ between kernel
    # sessions, which would make multi-item labels irreproducible.
    for side in ('antecedents', 'consequents'):
        rules[side] = rules[side].apply(lambda items: ','.join(sorted(items)))

    rules = rules.sort_values(['confidence', 'lift', 'support'], ascending=False)
    return fitted_encoder, onehot, frequent_itemsets, rules


# Mine rules separately for ASD, non-ASD, and the overall dataset
# (items of max length 3 that jointly appear in at least 5% of transactions)
encoder, onehot_ASD, frequent_itemsets_ASD, rules_ASD = mine_association_rules(transactions_ASD)
encoder2, onehot_NASD, frequent_itemsets_NASD, rules_NASD = mine_association_rules(transactions_NASD)
encoder3, onehot_all, frequent_itemsets_all, rules_all = mine_association_rules(transactions_all)



Predictive analytics: survival analysis (Kaplan-Meier estimation and Cox regression)


In [None]:
#Install lifelines
!pip install lifelines

In [None]:
#Import Python libraries
from lifelines import KaplanMeierFitter
import pandas as pd
import seaborn as sns

# pyplot is imported here so this cell does not rely on an earlier section having been run
import matplotlib.pyplot as plt

#Import data
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Predictive.csv')

#Initiate and fit. durations=time, event_observed= event of interest, 1 (occurred) or 0 (not)
kmf = KaplanMeierFitter()
kmf.fit(durations=data['Turns'], event_observed=data['Attrition'])

#Cumulative distribution function of survived turns by Initiator and Approach.
#Each grouping gets its own panel: drawn on one shared axes the two sets of
#curves overlap and the second legend replaces the first.
fig, (ax_initiator, ax_approach) = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
sns.ecdfplot(data, x='Turns', hue='Initiator', ax=ax_initiator)
ax_initiator.set_title('ECDF of turns by INITIATOR')
sns.ecdfplot(data, x='Turns', hue='Approach', ax=ax_approach)
ax_approach.set_title('ECDF of turns by APPROACH')
fig.tight_layout()
#Render and finish this figure so later plots in the cell start on a fresh one
plt.show()





# Plot survival function with subgroups and median lines (INITIATOR)
import matplotlib.pyplot as plt
from lifelines.plotting import add_at_risk_counts

# Start a new figure: plt.subplot(111) would hand back the axes that is already
# current (if any), drawing these curves on top of an earlier plot in the cell.
fig, ax = plt.subplots()

# Therapist-initiated rows
therapist_rows = data.loc[data.Initiator == 'T']
kmf_A = KaplanMeierFitter()
kmf_A.fit(durations=therapist_rows.Turns, event_observed=therapist_rows.Attrition, label='Therapist')
kmf_A.plot_survival_function(ax=ax)
ax.axvline(kmf_A.median_survival_time_, linestyle='--', color='blue', label=f'Median time: {kmf_A.median_survival_time_:.1f}')

# Client-initiated rows
client_rows = data.loc[data.Initiator == 'C']
kmf_B = KaplanMeierFitter()
kmf_B.fit(durations=client_rows.Turns, event_observed=client_rows.Attrition, label='Client')
kmf_B.plot_survival_function(ax=ax)
ax.axvline(kmf_B.median_survival_time_, linestyle='--', color='orange', label=f'Median time: {kmf_B.median_survival_time_:.1f}')

ax.legend()
ax.set_ylabel('Probability')
ax.set_xlabel('Timeline (turns)')
ax.set_xticks(range(0, 26))
ax.set_title('Survival curve by INITIATOR')

#to add risk counts table
add_at_risk_counts(kmf_A, kmf_B, ax=ax, fontsize=6)
plt.tight_layout()
plt.show()


"""
Plot survival function with subgroups and median lines  (APPROACH)
"""
from lifelines.plotting import add_at_risk_counts

ax = plt.subplot(111)

# Rows for each therapy approach, selected once and reused for durations and events
cbt_rows = data.loc[data.Approach == 'CBT']
pa_rows = data.loc[data.Approach == 'PA']

# CBT: fit, draw the curve, then mark its median survival time
kmf_A = KaplanMeierFitter()
kmf_A.fit(durations=cbt_rows.Turns, event_observed=cbt_rows.Attrition, label='CBT')
ax = kmf_A.plot_survival_function(ax=ax)
ax.axvline(kmf_A.median_survival_time_, linestyle='--', color='blue', label=f'Median time: {kmf_A.median_survival_time_:.1f}')

# PA: same steps on the same axes
kmf_B = KaplanMeierFitter()
kmf_B.fit(durations=pa_rows.Turns, event_observed=pa_rows.Attrition, label='PA')
ax = kmf_B.plot_survival_function(ax=ax)
ax.axvline(kmf_B.median_survival_time_, linestyle='--', color='orange', label=f'Median time: {kmf_B.median_survival_time_:.1f}')

# Labels, ticks and legend for the shared axes
plt.legend()
plt.ylabel('Probability')
plt.xlabel('Timeline (turns)')
plt.xticks(range(0, 26))
plt.title('Survival curve by APPROACH')

# Table of at-risk counts beneath the plot
add_at_risk_counts(kmf_A, kmf_B, ax=ax, fontsize=6)
plt.tight_layout()
plt.show()


# Show the event table of the overall Kaplan-Meier fit (`kmf`, fitted on all rows):
# one row per event time with removed / observed / censored / entrance / at_risk counts.
# It is rendered only because it is the last expression in the cell.
kmf.event_table

Unnamed: 0_level_0,removed,observed,censored,entrance,at_risk
event_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,4,4,0,200,200
1.0,8,4,4,0,196
2.0,12,12,0,0,188
3.0,18,15,3,0,176
4.0,19,19,0,0,158
5.0,23,22,1,0,139
6.0,22,22,0,0,116
7.0,22,22,0,0,94
8.0,20,20,0,0,72
9.0,18,18,0,0,52


In [None]:
#Import Python libraries
from lifelines import KaplanMeierFitter
import pandas as pd
import seaborn as sns

# pyplot is imported here so this cell does not rely on an earlier section having been run
import matplotlib.pyplot as plt

#Import data
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Predictive.csv')

#Initiate and fit survival function. durations=time, event_observed= event of interest, 1 (occurred) or 0 (not)
kmf = KaplanMeierFitter()
kmf.fit(durations=data['Turns'], event_observed=data['Attrition'])

#Plot survival function, keeping hold of the axes so the median line and
#labels are attached to this plot explicitly
ax = kmf.survival_function_.plot()
ax.axvline(kmf.median_survival_time_, linestyle='--', color='blue', label=f'Median time: {kmf.median_survival_time_:.1f}')
ax.set_title('Survival Curve estimated with Kaplan-Meier Fitter')
ax.legend()
plt.show()

#Summary. A bare expression is rendered only when it is the last line of a
#cell, so the table is displayed explicitly (display is provided by the
#Jupyter/IPython kernel).
display(kmf.event_table)


import matplotlib.pyplot as plt
from lifelines.plotting import add_at_risk_counts


def plot_survival_by_group(frame, column, groups, title, median_colors=('blue', 'orange')):
    """Plot Kaplan-Meier survival curves with median lines for subgroups of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``column`` together with the 'Turns' (duration) and
        'Attrition' (event indicator) columns.
    column : str
        Column whose values define the subgroups.
    groups : dict
        Maps each value of ``column`` to plot to its legend label; subgroups
        are drawn in the dict's insertion order.
    title : str
        Title of the figure.
    median_colors : sequence of str, default ('blue', 'orange')
        Colours of the dashed median lines, reused cyclically when there are
        more subgroups than colours.

    Returns
    -------
    list of KaplanMeierFitter
        The fitted estimators, in the order of ``groups``.
    """
    # A fresh figure per call, so curves never land on a previously used axes
    fig, ax = plt.subplots()

    fitters = []
    for position, (value, label) in enumerate(groups.items()):
        subset = frame.loc[frame[column] == value]
        fitter = KaplanMeierFitter()
        fitter.fit(durations=subset['Turns'], event_observed=subset['Attrition'], label=label)
        fitter.plot_survival_function(ax=ax)
        # Dashed vertical line at this subgroup's median survival time
        ax.axvline(fitter.median_survival_time_, linestyle='--',
                   color=median_colors[position % len(median_colors)],
                   label=f'Median time: {fitter.median_survival_time_:.1f}')
        fitters.append(fitter)

    ax.legend()
    ax.set_ylabel('Probability')
    ax.set_xlabel('Timeline (turns)')
    ax.set_xticks(range(0, 26))
    ax.set_title(title)

    #to add risk counts table
    add_at_risk_counts(*fitters, ax=ax, fontsize=6)
    fig.tight_layout()
    plt.show()
    return fitters


# Plot survival function with subgroups and median lines (INITIATOR)
kmf_A, kmf_B = plot_survival_by_group(
    data, 'Initiator', {'T': 'Therapist', 'C': 'Client'}, 'Survival curve by INITIATOR')

# Plot survival function with subgroups and median lines (APPROACH)
kmf_A, kmf_B = plot_survival_by_group(
    data, 'Approach', {'CBT': 'CBT', 'PA': 'PA'}, 'Survival curve by APPROACH')


# COX'S PROPORTIONAL-HAZARDS MODEL
# A regression model of the relationship between survival time and one or more
# predictor variables (covariates). It works with both categorical and
# numerical predictors. Here the duration is 'Turns', the event is 'Attrition',
# and the predictors are 'Initiator' and 'Approach', so the summary shows how
# each predictor is associated with the hazard of the event.

#Fit the selected data to CoxPHFitter and specify survival duration and event columns.
from lifelines import CoxPHFitter

# Initialize and fit the model; the formula restricts the predictors to the two named columns
coxph = CoxPHFitter()
coxph.fit(data, duration_col='Turns', event_col='Attrition', formula="Initiator + Approach")

# Coefficient table (coef, exp(coef) = hazard ratio, confidence intervals, p-values) and fit statistics
coxph.print_summary()





0,1
model,lifelines.CoxPHFitter
duration col,'Turns'
event col,'Attrition'
baseline estimation,breslow
number of observations,200
number of events observed,192
partial log-likelihood,-808.34
time fit was run,2024-11-07 02:40:29 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
Initiator[T.T],0.75,2.11,0.16,0.44,1.05,1.55,2.86,0.0,4.78,<0.005,19.09
Approach[T.PA],-0.22,0.81,0.15,-0.51,0.08,0.6,1.08,0.0,-1.44,0.15,2.74

0,1
Concordance,0.63
Partial AIC,1620.68
log-likelihood ratio test,27.81 on 2 df
-log2(p) of ll-ratio test,20.06
