<a href="https://colab.research.google.com/github/dennistay1981/Resources/blob/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Data_science_approaches_to_mental_health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Descriptive analytics: cross-tabulating metaphor sources and targets

In [None]:
#Import Python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

#Import data
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Descriptive.csv')

# Cross-tabulation
crosstab = pd.crosstab(data['SOURCE'], data['TARGET'])

# Observed frequencies
observed = crosstab.values

# Expected frequencies
chi2, p, dof, expected = chi2_contingency(observed)

# Pearson's residuals
residuals = (observed - expected) / np.sqrt(expected)

# Cramer's V
n = observed.sum()
phi2 = chi2 / n
r, k = observed.shape
phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
rcorr = r - ((r - 1)**2) / (n - 1)
kcorr = k - ((k - 1)**2) / (n - 1)
cramer_v = np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

# Heatmap with observed frequencies
plt.figure(figsize=(12, 8))
sns.heatmap(crosstab, annot=True, fmt='d', cmap='Blues', cbar=True)
plt.title('Cross-tabulation of Metaphor Sources and Targets')
plt.xlabel('Target')
plt.ylabel('Source')
plt.show()

# Heatmap with Pearson's residuals
plt.figure(figsize=(12, 8))
sns.heatmap(residuals, annot=True, fmt=".2f", cmap='Reds', cbar=True)
plt.title('Pearson Residuals')
plt.xlabel('Target')
plt.ylabel('Source')
plt.show()

print(f"Chi-square statistic: {chi2:.2f}")
print(f"P-value: {p:.3f}")
print(f"Degrees of freedom: {dof}")
print(f"Cramer's V: {cramer_v:.3f}")

Diagnostic (ARM)


In [22]:
#Import Python libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, apriori

#Import data
data = pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/Code%20and%20data%20in%20publications/Chapter%3A%20Data%20science%20approaches%20to%20metaphor%20and%20mental%20health/Diagnostic1.csv')

#Split ASD and non-ASD individuals into two dataframes
data_ASD = data.loc[data['ASD']=='Y']
data_NASD = data.loc[data['ASD']=='N']

#convert transactions into lists: one for ASD, one for Non-ASD, and one for the overall dataset
transactions_ASD = data_ASD['sources'].apply(lambda t: t.split(','))
transactions_NASD = data_NASD['sources'].apply(lambda t: t.split(','))
transactions_all = data['sources'].apply(lambda t: t.split(','))

# Instantiate transaction encoder and identify unique items in transactions
encoder = TransactionEncoder().fit(transactions_ASD)
encoder2 = TransactionEncoder().fit(transactions_NASD)
encoder3 = TransactionEncoder().fit(transactions_all)


# One-hot encode transactions
onehot_ASD = encoder.transform(transactions_ASD)
onehot_NASD = encoder2.transform(transactions_NASD)
onehot_all = encoder3.transform(transactions_all)

# Convert one-hot encoded data to DataFrame
onehot_ASD = pd.DataFrame(onehot_ASD, columns = encoder.columns_)
onehot_NASD = pd.DataFrame(onehot_NASD, columns = encoder2.columns_)
onehot_all = pd.DataFrame(onehot_all, columns = encoder3.columns_)


# Compute frequent itemsets using the Apriori algorithm
# i.e. what are the items (max length 3) that (jointly) appear in at least 5% of transactions
frequent_itemsets_ASD = apriori(onehot_ASD, min_support = 0.05, max_len = 3, use_colnames = True)  #minimum support value, maximum itemset length to be retained
frequent_itemsets_NASD = apriori(onehot_NASD, min_support = 0.05, max_len = 3, use_colnames = True)
frequent_itemsets_all = apriori(onehot_all, min_support = 0.05, max_len = 3, use_colnames = True)

# Compute all association rules for frequent_itemsets,
# limiting only to rules with support > 0.05
# i.e. both antecedent and consequent items jointly appear in at least 5% of transactions
rules_ASD = association_rules(frequent_itemsets_ASD, metric = "support", min_threshold = 0.05)
rules_NASD = association_rules(frequent_itemsets_NASD, metric = "support", min_threshold = 0.05)
rules_all = association_rules(frequent_itemsets_all, metric = "support", min_threshold = 0.05)

# Replace frozen sets with strings
rules_ASD['antecedents'] = rules_ASD['antecedents'].apply(lambda a: ','.join(list(a)))
rules_ASD['consequents'] = rules_ASD['consequents'].apply(lambda a: ','.join(list(a)))
rules_NASD['antecedents'] = rules_NASD['antecedents'].apply(lambda a: ','.join(list(a)))
rules_NASD['consequents'] = rules_NASD['consequents'].apply(lambda a: ','.join(list(a)))
rules_all['antecedents'] = rules_all['antecedents'].apply(lambda a: ','.join(list(a)))
rules_all['consequents'] = rules_all['consequents'].apply(lambda a: ','.join(list(a)))

# Sort rules by confidence, then lift, then support in descending order
rules_ASD = rules_ASD.sort_values(['confidence', 'lift', 'support'], ascending=[False, False, False])
rules_NASD = rules_NASD.sort_values(['confidence', 'lift', 'support'], ascending=[False, False, False])
rules_all = rules_all.sort_values(['confidence', 'lift', 'support'], ascending=[False, False, False])



  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
37,"movement,war/conflict",space/location,0.056738,0.368794,0.056738,1.0,2.711538,0.035813,inf,0.669173
42,"space/location,sight/vision",physical activity,0.134752,0.468085,0.106383,0.789474,1.686603,0.043308,2.526596,0.470492
31,"movement,physical activity",space/location,0.085106,0.368794,0.06383,0.75,2.033654,0.032443,2.524823,0.555556
44,"sight/vision,physical activity",space/location,0.156028,0.368794,0.106383,0.681818,1.848776,0.048841,1.983789,0.543978
8,movement,space/location,0.148936,0.368794,0.099291,0.666667,1.807692,0.044364,1.893617,0.525
50,"war/conflict,physical activity",space/location,0.085106,0.368794,0.056738,0.666667,1.807692,0.025351,1.893617,0.488372
30,"movement,space/location",physical activity,0.099291,0.468085,0.06383,0.642857,1.373377,0.017353,1.489362,0.301837
38,"space/location,war/conflict",movement,0.092199,0.148936,0.056738,0.615385,4.131868,0.043006,2.212766,0.834961
1,entity,space/location,0.092199,0.368794,0.056738,0.615385,1.668639,0.022735,1.641135,0.441406
16,space/location,physical activity,0.368794,0.468085,0.22695,0.615385,1.314685,0.054323,1.382979,0.379213
