In [None]:
# Sources
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics
from scipy import stats

In [None]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored there by path. This relies on the google.colab package, so the cell
# is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the full tabular export, then split it by visit using `eventname`.
df = pd.read_csv(
    '/content/drive/MyDrive/Junior/Dyanne JP/ABCD_Release4.0_Tabular_Dataset.csv'
)

# One frame per assessment wave: baseline and the 2-year follow-up.
df_baseline = df.loc[df['eventname'].eq('baseline_year_1_arm_1')]
df_2year = df.loc[df['eventname'].eq('2_year_follow_up_y_arm_1')]

In [None]:
# Put each participant's baseline and 2-year rows side by side, keyed on
# `subjectkey`. Columns present in both waves get the `_baseline` / `_2year`
# suffix. A left join keeps every baseline participant, matched or not.
baseline_2year = df_baseline.merge(
    df_2year,
    how='left',
    on='subjectkey',
    suffixes=('_baseline', '_2year'),
)

In [None]:
# Keep only participants that actually have a 2-year follow-up row: after the
# left merge, unmatched baseline rows have NaN in `eventname_2year`.
# The result is rebound instead of mutated in place so the cell does not
# silently alter an object that other cells may still reference.
baseline_2year = baseline_2year.dropna(subset=['eventname_2year'])

In [None]:
# Uncorrected NIH Toolbox score columns that must be present at both waves
# for a participant to be kept (one baseline/2-year pair per line).
check = [
    'nihtbx_picvocab_uncorrected_baseline', 'nihtbx_picvocab_uncorrected_2year',
    'nihtbx_flanker_uncorrected_baseline', 'nihtbx_flanker_uncorrected_2year',
    'nihtbx_pattern_uncorrected_baseline', 'nihtbx_pattern_uncorrected_2year',
    'nihtbx_picture_uncorrected_baseline', 'nihtbx_picture_uncorrected_2year',
    'nihtbx_reading_uncorrected_baseline', 'nihtbx_reading_uncorrected_2year',
    'nihtbx_cryst_uncorrected_baseline', 'nihtbx_cryst_uncorrected_2year',
]

# Both steps return new frames. The previous version called
# `dropna(..., inplace=True)` on a frame derived from `baseline_2year`, which
# mutates a derived object and can trigger pandas' SettingWithCopyWarning.
cleaned = (
    baseline_2year
    # keep rows with every required score; shape recorded on an earlier run: (7172,1055)
    .dropna(subset=check)
    # drop columns that only have NaN values; shape recorded on an earlier run: (7172,892)
    .dropna(axis=1, how='all')
)

print('Number of participants with all 5 test scores')
print(cleaned.shape[0])

In [None]:
# List the columns whose dtype is not numeric, to see what still needs
# encoding or exclusion before numeric processing.
non_numeric_columns = cleaned.select_dtypes(exclude='number').columns
print("Non-numeric columns:", list(non_numeric_columns))

In [None]:
# Count the participants whose recorded sex is the same at both waves.
cleaned['sex_baseline'].eq(cleaned['sex_2year']).sum()

In [None]:
# Only the baseline sex column is kept; the 2-year copy is removed.
cleaned = cleaned.drop(columns=['sex_2year'])

In [None]:
# Recode sex as a number (M -> 0, F -> 1); any other value is left as it is.
cleaned = cleaned.assign(
    sex_baseline=cleaned['sex_baseline'].replace({'M': 0, 'F': 1})
)

In [None]:
from sklearn.impute import SimpleImputer

def medianimpute(df):
    """Return a copy of ``df`` with missing numeric values filled by column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every numeric column (as selected by
        ``select_dtypes(include=[np.number])``) has its NaN entries replaced
        by that column's median and is cast to float64, mirroring the float
        array the earlier scikit-learn based version wrote back. Non-numeric
        columns are returned unchanged.

    Notes
    -----
    A numeric column with no observed values has no median and is left as
    all-NaN. The earlier ``SimpleImputer`` based version dropped such columns
    from its output, which made the assignment back to ``numeric_cols`` fail
    with a shape mismatch; it also errored when the frame had no numeric
    columns at all. Both cases are handled here.
    """
    # Create a copy of the DataFrame to avoid changing the original data
    df_imputed = df.copy()

    # Identify numeric columns by data type
    numeric_cols = df_imputed.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Nothing to impute.
        return df_imputed

    numeric = df_imputed[numeric_cols]

    # DataFrame.median skips NaN by default; a column with only NaN yields a
    # NaN median, so fillna leaves that column untouched.
    medians = numeric.median()
    df_imputed[numeric_cols] = numeric.fillna(medians).astype('float64')

    return df_imputed

In [None]:
imputed = medianimpute(cleaned)

In [None]:
# Split participants into three groups by their `income_baseline` value:
#   low  : 1 <= income < 7
#   med  : income == 7
#   high : 7 < income <= 10
# Values outside 1..10 fall into none of the groups.
# NOTE(review): `imputed` comes from medianimpute, so any income value that was
# missing has already been replaced by the column median before grouping -
# confirm that this is intended.
df_low = imputed.loc[
    imputed['income_baseline'].ge(1) & imputed['income_baseline'].lt(7)
]
df_med = imputed.loc[imputed['income_baseline'].isin([7])]
df_high = imputed.loc[
    imputed['income_baseline'].gt(7) & imputed['income_baseline'].le(10)
]

In [None]:
low_diff = pd.DataFrame({
    'low_diff_picvocab': df_low['nihtbx_picvocab_uncorrected_2year'] - df_low['nihtbx_picvocab_uncorrected_baseline'],
    'low_diff_flanker': df_low['nihtbx_flanker_uncorrected_2year'] - df_low['nihtbx_flanker_uncorrected_baseline'],
    'low_diff_pattern': df_low['nihtbx_pattern_uncorrected_2year'] - df_low['nihtbx_pattern_uncorrected_baseline'],
    'low_diff_picture': df_low['nihtbx_picture_uncorrected_2year'] - df_low['nihtbx_picture_uncorrected_baseline'],
    'low_diff_reading': df_low['nihtbx_reading_uncorrected_2year'] - df_low['nihtbx_reading_uncorrected_baseline']
})

high_diff = pd.DataFrame({
    'high_diff_picvocab': df_high['nihtbx_picvocab_uncorrected_2year'] - df_high['nihtbx_picvocab_uncorrected_baseline'],
    'high_diff_flanker': df_high['nihtbx_flanker_uncorrected_2year'] - df_high['nihtbx_flanker_uncorrected_baseline'],
    'high_diff_pattern': df_high['nihtbx_pattern_uncorrected_2year'] - df_high['nihtbx_pattern_uncorrected_baseline'],
    'high_diff_picture': df_high['nihtbx_picture_uncorrected_2year'] - df_high['nihtbx_picture_uncorrected_baseline'],
    'high_diff_reading': df_high['nihtbx_reading_uncorrected_2year'] - df_high['nihtbx_reading_uncorrected_baseline']
})

df_low = pd.concat([df_low, low_diff], axis=1)
df_high = pd.concat([df_high, high_diff], axis=1)

In [None]:
# For every test and income group, separate the participants whose score went
# down (diff < 0) from those whose score went up (diff > 0). Participants with
# a change of exactly 0 are in neither subset.

# picvocab
low_picvocab_neg = df_low.loc[df_low['low_diff_picvocab'].lt(0)]
high_picvocab_neg = df_high.loc[df_high['high_diff_picvocab'].lt(0)]
low_picvocab_pos = df_low.loc[df_low['low_diff_picvocab'].gt(0)]
high_picvocab_pos = df_high.loc[df_high['high_diff_picvocab'].gt(0)]

# flanker
low_flanker_neg = df_low.loc[df_low['low_diff_flanker'].lt(0)]
high_flanker_neg = df_high.loc[df_high['high_diff_flanker'].lt(0)]
low_flanker_pos = df_low.loc[df_low['low_diff_flanker'].gt(0)]
high_flanker_pos = df_high.loc[df_high['high_diff_flanker'].gt(0)]

# pattern
low_pattern_neg = df_low.loc[df_low['low_diff_pattern'].lt(0)]
high_pattern_neg = df_high.loc[df_high['high_diff_pattern'].lt(0)]
low_pattern_pos = df_low.loc[df_low['low_diff_pattern'].gt(0)]
high_pattern_pos = df_high.loc[df_high['high_diff_pattern'].gt(0)]

# picture
low_picture_neg = df_low.loc[df_low['low_diff_picture'].lt(0)]
high_picture_neg = df_high.loc[df_high['high_diff_picture'].lt(0)]
low_picture_pos = df_low.loc[df_low['low_diff_picture'].gt(0)]
high_picture_pos = df_high.loc[df_high['high_diff_picture'].gt(0)]

# reading
low_reading_neg = df_low.loc[df_low['low_diff_reading'].lt(0)]
high_reading_neg = df_high.loc[df_high['high_diff_reading'].lt(0)]
low_reading_pos = df_low.loc[df_low['low_diff_reading'].gt(0)]
high_reading_pos = df_high.loc[df_high['high_diff_reading'].gt(0)]

In [None]:
def drop_non_numeric(df):
    """Return only the numeric-dtype columns of ``df``.

    The input frame is not modified; the selected columns keep their
    original order.
    """
    numeric_only = df.select_dtypes(include='number')
    return numeric_only

In [None]:
# Shapiro-Wilk normality test on `low_diff_picvocab` (df_low group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_low['low_diff_picvocab']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Shapiro-Wilk normality test on `high_diff_picvocab` (df_high group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_high['high_diff_picvocab']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Mann-Whitney U test: does the picvocab score change differ between the
# df_low and df_high groups? Significance threshold is 0.05.
u_stat, p_value = stats.mannwhitneyu(
    df_low['low_diff_picvocab'],
    df_high['high_diff_picvocab'],
)
print(f"U-statistic: {u_stat}, P-value: {p_value}")
print(
    "There is a significant difference between the two distributions."
    if p_value < 0.05
    else "There is no significant difference between the two distributions."
)

In [None]:
# Shapiro-Wilk normality test on `low_diff_flanker` (df_low group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_low['low_diff_flanker']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Shapiro-Wilk normality test on `high_diff_flanker` (df_high group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_high['high_diff_flanker']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Mann-Whitney U test: does the flanker score change differ between the
# df_low and df_high groups? Significance threshold is 0.05.
u_stat, p_value = stats.mannwhitneyu(
    df_low['low_diff_flanker'],
    df_high['high_diff_flanker'],
)
print(f"U-statistic: {u_stat}, P-value: {p_value}")
print(
    "There is a significant difference between the two distributions."
    if p_value < 0.05
    else "There is no significant difference between the two distributions."
)

In [None]:
# Shapiro-Wilk normality test on `low_diff_picture` (df_low group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_low['low_diff_picture']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Shapiro-Wilk normality test on `high_diff_picture` (df_high group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_high['high_diff_picture']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Mann-Whitney U test: does the picture score change differ between the
# df_low and df_high groups? Significance threshold is 0.05.
u_stat, p_value = stats.mannwhitneyu(
    df_low['low_diff_picture'],
    df_high['high_diff_picture'],
)
print(f"U-statistic: {u_stat}, P-value: {p_value}")
print(
    "There is a significant difference between the two distributions."
    if p_value < 0.05
    else "There is no significant difference between the two distributions."
)

In [None]:
# Shapiro-Wilk normality test on `low_diff_pattern` (df_low group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_low['low_diff_pattern']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Shapiro-Wilk normality test on `high_diff_pattern` (df_high group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_high['high_diff_pattern']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Mann-Whitney U test: does the pattern score change differ between the
# df_low and df_high groups? Significance threshold is 0.05.
u_stat, p_value = stats.mannwhitneyu(
    df_low['low_diff_pattern'],
    df_high['high_diff_pattern'],
)
print(f"U-statistic: {u_stat}, P-value: {p_value}")
print(
    "There is a significant difference between the two distributions."
    if p_value < 0.05
    else "There is no significant difference between the two distributions."
)

In [None]:
# Shapiro-Wilk normality test on `low_diff_reading` (df_low group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_low['low_diff_reading']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Shapiro-Wilk normality test on `high_diff_reading` (df_high group).
# NOTE(review): p > 0.05 only means normality was not rejected. The scipy
# docs also caution that the p-value may be inaccurate for N > 5000.
w_stat, p_value = stats.shapiro(
    df_high['high_diff_reading']
)
print(f"W-statistic: {w_stat}, P-value: {p_value}")
print(
    "The data follows a normal distribution."
    if p_value > 0.05
    else "The data does not follow a normal distribution."
)

In [None]:
# Mann-Whitney U test: does the reading score change differ between the
# df_low and df_high groups? Significance threshold is 0.05.
u_stat, p_value = stats.mannwhitneyu(
    df_low['low_diff_reading'],
    df_high['high_diff_reading'],
)
print(f"U-statistic: {u_stat}, P-value: {p_value}")
print(
    "There is a significant difference between the two distributions."
    if p_value < 0.05
    else "There is no significant difference between the two distributions."
)