In [None]:
# Sources
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statistics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only); the dataset is
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the full tabular release, then split it into one frame per assessment wave
# using the `eventname` label.
df = pd.read_csv('/content/drive/MyDrive/Junior/Dyanne JP/ABCD_Release4.0_Tabular_Dataset.csv')
df_baseline = df.loc[df['eventname'].eq('baseline_year_1_arm_1')]
df_2year = df.loc[df['eventname'].eq('2_year_follow_up_y_arm_1')]

In [None]:
# Distinct participants overall, at baseline, and at the 2-year follow-up.
# A cell only renders its final expression, so the three counts are gathered
# into one tuple; written as separate statements only the last would be shown.
# Previously observed: all participants = 11879, baseline = 11876, 2 year = 10414
(
    df['subjectkey'].nunique(),
    df_baseline['subjectkey'].nunique(),
    df_2year['subjectkey'].nunique(),
)

In [None]:
# One row per baseline record, with the matching 2-year columns attached
# (left join on participant, so baseline rows without follow-up get NaN).
baseline_2year = df_baseline.merge(
    df_2year,
    how='left',
    on='subjectkey',
    suffixes=('_baseline', '_2year'),
)

In [None]:
# Rows where the left join found a 2-year record (eventname_2year is not NaN)
both = baseline_2year['eventname_2year'].notna().sum()
print('Number of participants with baseline and 2-year data:')
print(both)

In [None]:
# Keep only participants that have a 2-year follow-up record
baseline_2year = baseline_2year.dropna(subset=['eventname_2year'])

In [None]:
# Uncorrected NIH Toolbox score columns that must be present at both waves.
# NOTE(review): the list holds six measures (five tasks plus `cryst`), while the
# message printed below says "5" - confirm the wording is intended.
check = ['nihtbx_picvocab_uncorrected_baseline','nihtbx_picvocab_uncorrected_2year','nihtbx_flanker_uncorrected_baseline',
         'nihtbx_flanker_uncorrected_2year','nihtbx_pattern_uncorrected_baseline','nihtbx_pattern_uncorrected_2year',
         'nihtbx_picture_uncorrected_baseline','nihtbx_picture_uncorrected_2year','nihtbx_reading_uncorrected_baseline',
         'nihtbx_reading_uncorrected_2year','nihtbx_cryst_uncorrected_baseline','nihtbx_cryst_uncorrected_2year']

# Step 1: keep rows with every score in `check`      -> previously (7172,1055)
# Step 2: drop columns that only have NaN values     -> previously (7172,892)
# The two steps are chained instead of using inplace=True on the intermediate
# frame, which avoids mutating a derived frame in place (a source of
# SettingWithCopyWarning) and keeps the cell safe to re-run.
cleaned = baseline_2year.dropna(subset=check).dropna(axis=1, how='all')

print('Number of participants with all 5 test scores')
print(cleaned.shape[0])

In [None]:
# List the columns whose dtype is not numeric
non_numeric_columns = cleaned.select_dtypes(exclude='number').columns
print("Non-numeric columns:", list(non_numeric_columns))

In [None]:
# Number of rows where the recorded sex is the same at both waves
cleaned['sex_baseline'].eq(cleaned['sex_2year']).sum()

In [None]:
# Only the baseline sex column is kept
cleaned = cleaned.drop(columns='sex_2year')

In [None]:
# Recode sex as a number: 'M' -> 0, 'F' -> 1 (any other value is left unchanged)
cleaned['sex_baseline'] = cleaned['sex_baseline'].replace(to_replace=['M', 'F'], value=[0, 1])

In [None]:
from sklearn.impute import SimpleImputer

def medianimpute(df):
    """Return a copy of ``df`` with NaNs in numeric columns replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the same columns in the same order. Numeric columns
        are returned as float64 with missing values filled by that column's
        median (computed over the non-missing values). Non-numeric columns are
        untouched.

    Notes
    -----
    Computed with pandas rather than ``SimpleImputer``: the imputer discards
    columns that contain only missing values, so its output could have fewer
    columns than ``numeric_cols`` and the assignment back into the frame would
    fail. Here an all-NaN numeric column is simply left as NaN, and a frame
    with no numeric columns is returned unchanged.
    """
    # Create a copy of the DataFrame to avoid changing the original data
    df_imputed = df.copy()

    # Identify numeric columns by data type
    numeric_cols = df_imputed.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        return df_imputed

    # Cast to float64 so integer columns end up with the same dtype the
    # array-based imputer output produced when it was assigned back
    numeric = df_imputed[numeric_cols].astype('float64')

    # median() skips NaN; fillna() matches the medians to columns by name
    df_imputed[numeric_cols] = numeric.fillna(numeric.median())

    return df_imputed

In [None]:
# Median-impute every numeric column of the cleaned frame. The medians are
# computed over all rows of `cleaned`, i.e. before any train/test split.
imputed = medianimpute(cleaned)

In [None]:
# Distinct baseline family IDs in the imputed sample (missing IDs are not counted)
print('Number of families in the sample')
imputed['family_id_baseline'].nunique(dropna=True)

In [None]:
# nest family in ABCD study site for LME random effect
# The key is "<site>_<family id>". An earlier assignment built the same column
# from `abcd_site_baseline`, but it was overwritten on the very next line and so
# never had any effect; that dead store is removed and the effective
# definition (2-year site) is kept.
# NOTE(review): confirm the 2-year site, not the baseline site, is the intended nesting.
imputed['site_family'] = imputed['abcd_site_2year'].astype(str) + "_" + imputed['family_id_baseline'].astype(str)

In [None]:
# Split participants by baseline income code: low = [1, 7), medium = 7, high = (7, 10]
df_low = imputed.loc[imputed['income_baseline'].ge(1) & imputed['income_baseline'].lt(7)]
df_med = imputed.loc[imputed['income_baseline'].eq(7)]
df_high = imputed.loc[imputed['income_baseline'].gt(7) & imputed['income_baseline'].le(10)]

In [None]:
# Change score for each test: 2-year score minus baseline score, computed
# separately for the low- and high-income groups.
low_diff = pd.DataFrame({
    f'low_diff_{test}': df_low[f'nihtbx_{test}_uncorrected_2year'] - df_low[f'nihtbx_{test}_uncorrected_baseline']
    for test in ('picvocab', 'flanker', 'pattern', 'picture', 'reading')
})

high_diff = pd.DataFrame({
    f'high_diff_{test}': df_high[f'nihtbx_{test}_uncorrected_2year'] - df_high[f'nihtbx_{test}_uncorrected_baseline']
    for test in ('picvocab', 'flanker', 'pattern', 'picture', 'reading')
})

# Attach the change scores to each group. Change-score columns left over from a
# previous run of this cell are dropped first; otherwise re-running would append
# duplicate column names, and selecting e.g. df_low['low_diff_picvocab'] would
# return a DataFrame instead of a Series and break the filters that follow.
df_low = pd.concat([df_low.drop(columns=low_diff.columns, errors='ignore'), low_diff], axis=1)
df_high = pd.concat([df_high.drop(columns=high_diff.columns, errors='ignore'), high_diff], axis=1)

In [None]:
# For every test and income group, separate participants whose score decreased
# (change < 0) from those whose score increased (change > 0).
# Rows with a change of exactly 0 fall in neither subset.

# picvocab
low_picvocab_neg = df_low.loc[df_low['low_diff_picvocab'].lt(0)]
high_picvocab_neg = df_high.loc[df_high['high_diff_picvocab'].lt(0)]
low_picvocab_pos = df_low.loc[df_low['low_diff_picvocab'].gt(0)]
high_picvocab_pos = df_high.loc[df_high['high_diff_picvocab'].gt(0)]

# flanker
low_flanker_neg = df_low.loc[df_low['low_diff_flanker'].lt(0)]
high_flanker_neg = df_high.loc[df_high['high_diff_flanker'].lt(0)]
low_flanker_pos = df_low.loc[df_low['low_diff_flanker'].gt(0)]
high_flanker_pos = df_high.loc[df_high['high_diff_flanker'].gt(0)]

# pattern
low_pattern_neg = df_low.loc[df_low['low_diff_pattern'].lt(0)]
high_pattern_neg = df_high.loc[df_high['high_diff_pattern'].lt(0)]
low_pattern_pos = df_low.loc[df_low['low_diff_pattern'].gt(0)]
high_pattern_pos = df_high.loc[df_high['high_diff_pattern'].gt(0)]

# picture
low_picture_neg = df_low.loc[df_low['low_diff_picture'].lt(0)]
high_picture_neg = df_high.loc[df_high['high_diff_picture'].lt(0)]
low_picture_pos = df_low.loc[df_low['low_diff_picture'].gt(0)]
high_picture_pos = df_high.loc[df_high['high_diff_picture'].gt(0)]

# reading
low_reading_neg = df_low.loc[df_low['low_diff_reading'].lt(0)]
high_reading_neg = df_high.loc[df_high['high_diff_reading'].lt(0)]
low_reading_pos = df_low.loc[df_low['low_diff_reading'].gt(0)]
high_reading_pos = df_high.loc[df_high['high_diff_reading'].gt(0)]

In [None]:
def drop_non_numeric(df):
    """Return a new frame holding only the numeric-dtype columns of ``df``."""
    numeric_only = df.select_dtypes(include=np.number)
    return numeric_only

In [None]:
# Column names used as model features: cells below select `<subset>[include]` to build X.
# The `_baseline` / `_2year` suffixes are the ones added by the baseline/2-year merge.
# No `nihtbx_*` score column is listed, so the regression targets are not among the features.
include = ["KSADS_BD_y_baseline", "KSADS_Anx_y_baseline", "KSADS_Suicide_y_baseline", "KSADS_Sleep_y_baseline", "KSADS_total_y_baseline",
                "KSADS_BD_y_2year", "KSADS_Anx_y_2year", "KSADS_Eat_y_2year", "KSADS_Suicide_y_2year", "KSADS_Sleep_y_2year", "KSADS_total_y_2year",
                "KSADS_BD_p_baseline", "KSADS_Anx_p_baseline", "KSADS_Eat_p_baseline", "KSADS_Suicide_p_baseline", "KSADS_Sleep_p_baseline", "KSADS_total_p_baseline",
                "KSADS_BD_p_2year", "KSADS_Anx_p_2year", "KSADS_Eat_p_2year", "KSADS_Suicide_p_2year", "KSADS_Sleep_p_2year", "KSADS_total_p_2year", "KSADS_Subst_p_baseline",
                "KSADS_Subst_p_2year",'cbcl_internal_baseline','cbcl_internal_2year','cbcl_external_baseline','cbcl_external_2year','cbcl_totprob_baseline','cbcl_totprob_2year',
                'upps_premeditation_baseline','upps_premeditation_2year','upps_perseverence_baseline','upps_perseverence_2year','upps_sensation_baseline','upps_sensation_2year',
                'upps_negative_baseline','upps_negative_2year','upps_positive_baseline','upps_positive_2year','BIS_baseline','BIS_2year','BAS_RR_baseline','BAS_RR_2year',
                'BAS_Drive_baseline','BAS_Drive_2year','BAS_Fun_baseline','BAS_Fun_2year','sleep_disturb_total_baseline','sleep_disturb_total_2year','ELS_total_baseline',
                'ELS_total_2year','rh_adi_wsum1_baseline','rh_adi_wsum1_2year','rh_adi_wsum2_baseline','rh_adi_wsum2_2year',
                "risk_alcohol_p_baseline", "risk_cigarette_p_baseline", "risk_electro_nicotine_p_baseline", "risk_marijuana_p_baseline", "risk_drug_p_baseline",
                "risk_med_legal_p_baseline", "risk_med_marijuana_p_baseline", "risk_med_marijuana_pre_p_baseline", "risk_family_med_marijuana_p_baseline",
                "risk_alcohol_p_2year", "risk_cigarette_p_2year", "risk_electro_nicotine_p_2year", "risk_marijuana_p_2year", "risk_drug_p_2year",
                "risk_med_legal_p_2year", "risk_med_marijuana_p_2year", "risk_med_marijuana_pre_p_2year", "risk_family_med_marijuana_p_2year",
                "risk_alcohol_y_2year", "risk_cigarette_y_2year", "risk_electro_nicotine_y_2year", "risk_marijuana_y_2year", "risk_med_legal_y_2year", "risk_med_marijuana_y_2year",
                "risk_med_marijuana_pre_y_2year", "risk_family_med_marijuana_y_2year", "risk_drug_y_2year", "risk_gas_y_2year", "risk_pain_y_2year", "risk_anix_y_2year",
                "risk_stimulant_y_2year","good_school_baseline", "good_parent1_baseline", "good_parent2_baseline", "good_parent_baseline",
                "good_school_2year", "prenatal_tobacco_before_baseline", "prenatal_alcohol_max_before_baseline", "prenatal_alcohol_avg_before_baseline", "prenatal_alcohol_eff_before_baseline",
                "prenatal_marijuana_before_baseline", "prenatal_cocaine_before_baseline", "prenatal_heroin_before_baseline", "prenatal_oxycontin_before_baseline", "prenatal_tobacco_after_baseline",
                "prenatal_alcohol_max_after_baseline", "prenatal_alcohol_avg_after_baseline", "prenatal_alcohol_eff_after_baseline", "prenatal_marijuana_after_baseline", "prenatal_cocaine_after_baseline",
                "prenatal_heroin_after_baseline", "prenatal_oxycontin_after_baseline", "prenatal_weeks_baseline","screentime_wkday_tv_baseline", "screentime_wkday_videos_baseline", "screentime_wkday_games_baseline",
                "screentime_wkday_texting_baseline", "screentime_wkday_sns_baseline", "screentime_wkday_videochat_baseline", "screentime_wkend_tv_baseline", "screentime_wkend_videos_baseline", "screentime_wkend_games_baseline",
                "screentime_wkend_texting_baseline", "screentime_wkend_sns_baseline", "screentime_wkend_videochat_baseline", "screentime_wkday_y_baseline", "screentime_wkend_y_baseline", "screentime_wkday_p_baseline",
                "screentime_wkend_p_baseline","screentime_maturegames_baseline", "screentime_rmovies_baseline","screentime_addict_p_2year", "screentime_risk_p_2year", "screentime_maturegames_2year",
                "screentime_rmovies_2year",'bpm_total_y_2year','bpm_total_t_baseline','bpm_total_t_2year',"prosocial_y_baseline", "prosocial_p_baseline", "detention_baseline", "detention_rea_baseline",
                "friends_boys_baseline", "friends_girls_baseline", "Cfriends_boys_baseline", "Cfriends_girls_baseline", "friends_same_baseline", "friends_diff_baseline", "Cfriends_diff_baseline", "Cfriends_same_baseline",
                "prosocial_y_2year", "prosocial_p_2year", "friends_boys_2year", "friends_girls_2year", "Cfriends_boys_2year", "Cfriends_girls_2year", "friends_same_2year", "friends_diff_2year", "Cfriends_diff_2year", "Cfriends_same_2year",
                "cpeur2_baseline", "eaeur1_baseline", "depeur4_baseline", "mddeur6_baseline", "depmulti_baseline", "bmieur4_baseline", "bmimulti_baseline", "insomniaeur6_baseline", "snoringeur1_baseline", "iqeur2_baseline", "happieur4_baseline",
                "ghappieur2_baseline", "ghappimeaneur1_baseline", "ghappihealth6_baseline", "alcdep_eurauto_baseline", "alcdep_afrauto_baseline", "alcdep_metaauto_baseline", "asdauto_baseline", "aspauto_baseline", "bipauto_baseline", "cannabisauto_baseline",
                "crossauto_baseline", "drinkauto_baseline", "edauto_baseline", "neuroticismauto_baseline", "ocdauto_baseline", "risk4pcauto_baseline", "risktolauto_baseline", "scz_eurauto_baseline", "scz_easauto_baseline", "scz_metaauto_baseline", "smokerauto_baseline",
                "worryauto_baseline", "anxietyauto_baseline", "ptsdeur4_baseline", "ptsdmeta6_baseline", "adhdeur6_baseline","euro_baseline", "sex_baseline", "race_g_baseline", "parent_identity_baseline", "demo_brthdat_v2_baseline", "demo_sex_v2_baseline",
                "gender_identity_baseline", "parent_age_baseline", "foreign_born_family_baseline", "married_baseline", "high_educ_baseline", "high_educ2_baseline", "income_baseline", "foreign_born_baseline", "religion_prefer_baseline", "gay_parent_baseline",
                "gay_youth_baseline", "race_ethnicity_baseline", "age_baseline", "family_adversity_baseline", "height_baseline", "weight_baseline","vol_baseline", "bmi_baseline", "total_ratio_baseline", "history_ratio_baseline",
                "euro_2year", "race_g_2year", "gay_youth_2year", "age_2year","height_2year", "weight_2year","vol_2year", "bmi_2year"]

In [None]:
# Feature matrix: the `include` columns for low-income participants whose picvocab score fell
X = low_picvocab_neg.loc[:, include]

In [None]:
# low SES - picvocab - negative
# Grid-search a random-forest regressor for the 2-year picvocab score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_picvocab_neg[include]
y = low_picvocab_neg['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for high-income participants whose picvocab score fell
X = high_picvocab_neg.loc[:, include]

In [None]:
###

In [None]:
# high SES - picvocab - negative
# Grid-search a random-forest regressor for the 2-year picvocab score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_picvocab_neg[include]
y = high_picvocab_neg['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# high SES - picvocab - negative
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_picvocab_neg[include]
y = high_picvocab_neg['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for low-income participants whose picvocab score rose
X = low_picvocab_pos.loc[:, include]

In [None]:
###

In [None]:
# low SES - picvocab - positive
# Grid-search a random-forest regressor for the 2-year picvocab score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_picvocab_pos[include]
y = low_picvocab_pos['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# low SES - picvocab - positive
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_picvocab_pos[include]
y = low_picvocab_pos['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for high-income participants whose picvocab score rose
X = high_picvocab_pos.loc[:, include]

In [None]:
###

In [None]:
# high SES - picvocab - positive
# Grid-search a random-forest regressor for the 2-year picvocab score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_picvocab_pos[include]
y = high_picvocab_pos['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# high SES - picvocab - positive
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_picvocab_pos[include]
y = high_picvocab_pos['nihtbx_picvocab_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for low-income participants whose flanker score fell
X = low_flanker_neg.loc[:, include]

In [None]:
###

In [None]:
# low SES - flanker - negative
# Grid-search a random-forest regressor for the 2-year flanker score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_flanker_neg[include]
y = low_flanker_neg['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# low SES - flanker - negative
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_flanker_neg[include]
y = low_flanker_neg['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for high-income participants whose flanker score fell
X = high_flanker_neg.loc[:, include]

In [None]:
# high SES - flanker - negative
# Grid-search a random-forest regressor for the 2-year flanker score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_flanker_neg[include]
y = high_flanker_neg['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# high SES - flanker - negative
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_flanker_neg[include]
y = high_flanker_neg['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for low-income participants whose flanker score rose
X = low_flanker_pos.loc[:, include]

In [None]:
# low SES - flanker - positive
# Grid-search a random-forest regressor for the 2-year flanker score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_flanker_pos[include]
y = low_flanker_pos['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# low SES - flanker - positive
# Same model as the plain grid for this subset, but the grid also searches `bootstrap`.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = low_flanker_pos[include]
y = low_flanker_pos['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2'],
              'bootstrap': [True,False]}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix: the `include` columns for high-income participants whose flanker score rose
X = high_flanker_pos.loc[:, include]

In [None]:
# high SES - flanker - positive
# Grid-search a random-forest regressor for the 2-year flanker score, then
# report feature importances and hold-out error.

# Features are built in this cell, next to the target, so X and y always come
# from the same subset no matter which other cell last assigned the global X.
X = high_flanker_pos[include]
y = high_flanker_pos['nihtbx_flanker_uncorrected_2year']

# Splitting the dataset into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Code adapted from Precept 5 Classification Methods and 7 Decision Trees
parameters = {'n_estimators': [50,100,200],
              'max_depth': [None,10,20,30],
              'min_samples_split': [2,5,10],
              'min_samples_leaf': [1,2,4],
              'max_features':['sqrt','log2']}

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validation over every grid combination, scored by negated MSE
grid_search = GridSearchCV(estimator=rf, param_grid=parameters, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

model_rf = grid_search.best_estimator_

# Feature importance, largest first
feature_importances = model_rf.feature_importances_
features = list(X.columns)
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Evaluate on the held-out rows with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - pattern - negative model in the next cell:
# low_pattern_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_pattern_neg[include]

In [None]:
# low SES - pattern - negative
# Random-forest regression on the 2-year pattern score; X comes from the preceding cell.
y = low_pattern_neg['nihtbx_pattern_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - pattern - negative model in the next cell:
# high_pattern_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_pattern_neg[include]

In [None]:
# high SES - pattern - negative
# Random-forest regression on the 2-year pattern score; X comes from the preceding cell.
y = high_pattern_neg['nihtbx_pattern_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - pattern - positive model cell that follows:
# low_pattern_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_pattern_pos[include]

In [None]:
#####

In [None]:
# low SES - pattern - positive
# Random-forest regression on the 2-year pattern score; X comes from an earlier cell.
y = low_pattern_pos['nihtbx_pattern_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - pattern - positive model in the next cell:
# high_pattern_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_pattern_pos[include]

In [None]:
# high SES - pattern - positive
# Random-forest regression on the 2-year pattern score; X comes from the preceding cell.
y = high_pattern_pos['nihtbx_pattern_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - picture - negative model in the next cell:
# low_picture_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_picture_neg[include]

In [None]:
# low SES - picture - negative
# Random-forest regression on the 2-year picture score; X comes from the preceding cell.
y = low_picture_neg['nihtbx_picture_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - picture - negative model in the next cell:
# high_picture_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_picture_neg[include]

In [None]:
# high SES - picture - negative
# Random-forest regression on the 2-year picture score; X comes from the preceding cell.
y = high_picture_neg['nihtbx_picture_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - picture - positive model in the next cell:
# low_picture_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_picture_pos[include]

In [None]:
# low SES - picture - positive
# Random-forest regression on the 2-year picture score; X comes from the preceding cell.
y = low_picture_pos['nihtbx_picture_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - picture - positive model in the next cell:
# high_picture_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_picture_pos[include]

In [None]:
# high SES - picture - positive
# Random-forest regression on the 2-year picture score; X comes from the preceding cell.
y = high_picture_pos['nihtbx_picture_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - reading - negative model in the next cell:
# low_reading_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_reading_neg[include]

In [None]:
# low SES - reading - negative
# Random-forest regression on the 2-year reading score; X comes from the preceding cell.
y = low_reading_neg['nihtbx_reading_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - reading - negative model in the next cell:
# high_reading_neg indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_reading_neg[include]

In [None]:
# high SES - reading - negative
# Random-forest regression on the 2-year reading score; X comes from the preceding cell.
y = high_reading_neg['nihtbx_reading_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the low SES - reading - positive model in the next cell:
# low_reading_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = low_reading_pos[include]

In [None]:
# low SES - reading - positive
# Random-forest regression on the 2-year reading score; X comes from the preceding cell.
y = low_reading_pos['nihtbx_reading_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Feature matrix for the high SES - reading - positive model in the next cell:
# high_reading_pos indexed by `include`, which is defined outside this excerpt
# (presumably a list of predictor column names - verify).
X = high_reading_pos[include]

In [None]:
# high SES - reading - positive
# Random-forest regression on the 2-year reading score; X comes from the preceding cell.
y = high_reading_pos['nihtbx_reading_uncorrected_2year']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyperparameter grid
# (code adapted from Precept 5 Classification Methods and 7 Decision Trees)
parameters = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestRegressor(random_state=12)

# 5-fold cross-validated grid search, scored by negative MSE, on all cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Model with the best cross-validated parameter combination
model_rf = grid_search.best_estimator_

# Feature importances, largest first; show the top 20
feature_importances = model_rf.feature_importances_
features = X.columns.tolist()
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
print(importance_df.head(20))

# Held-out performance with regression metrics
y_pred = model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")