In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data.preprocess import VCTEDataHandler, DemographicsDataHandler, ColumnsTextDataHandler
from src.models.classifier import XGBClassifier
import pickle

In [4]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None

vcte_handler = VCTEDataHandler()
demographics_handler = DemographicsDataHandler()
columns_text_handler = ColumnsTextDataHandler()

# Column-name mapping (grouped by section) and the raw survey table.
parsed_dict = columns_text_handler.parse_txt_file_to_dict('data/column_names.txt')
df_cleaned = pd.read_csv('data/raw/NhanesPrepandemicCleaned.csv')

# Collapse the per-section mappings into one rename table; on a key clash the
# section iterated last wins.
flat_dict = {}
for columns in parsed_dict.values():
    flat_dict.update(columns)

df_cleaned = df_cleaned.rename(columns=flat_dict)

# VCTE filtering: keep acceptable exams, then only rows flagged as good scans.
df_vcte_filtered = vcte_handler.get_acceptable_vcte(df_cleaned)
good_scan_mask = df_vcte_filtered['isGoodFibroScan'] == 1
df_vcte_filtered = df_vcte_filtered[good_scan_mask]

# Apply the at-risk MASH labelling once per cutoff
# (presumably yields isAtRiskMASH35 / isAtRiskMASH67 -- confirm in VCTEDataHandler).
for mash_cutoff in (0.35, 0.67):
    df_vcte_filtered = vcte_handler.add_at_risk_mash(df_vcte_filtered, cutoff=mash_cutoff)

# Demographics processing, then keep only rows with the high-alcohol flag equal to 0.
df_vcte_filtered = demographics_handler.process_demographics_data(df_vcte_filtered)
low_alcohol_mask = df_vcte_filtered['isHighAlcoholConsumptionGT'] == 0
df_vcte_filtered = df_vcte_filtered[low_alcohol_mask]

# Five predictor columns followed by the two label columns.
cols = [
    'ALANINE AMINOTRANSFERASE (ALT) (U/L)',
    'GAMMA GLUTAMYL TRANSFERASE (GGT) (IU/L)',
    'PLATELET COUNT (1000 CELLS/UL)',
    'AGE IN YEARS AT SCREENING',
    'BODY MASS INDEX (KG/M**2)',
    'isAtRiskMASH35',
    'isAtRiskMASH67',
]

# Write the selected columns to disk (the index is written as the first CSV column).
subset_for_export = df_vcte_filtered[cols]
subset_for_export.to_csv('data/processed/NhanesPrepandemicSubset.csv')

Unique RESPONDENT SEQUENCE NUMBER dropped after NaN filter: 5862
Unique RESPONDENT SEQUENCE NUMBER dropped after age filter: 1308


In [7]:
def stratified_split(df, target, test_size=0.2, random_state=None):
    """
    Partition a DataFrame into train and test portions, stratifying on the
    target column so both portions keep the same class proportions.

    Parameters:
    df: pandas DataFrame
        Table holding every feature column plus the target column.
    target: string
        Name of the column in `df` to use as the label.
    test_size: float, default 0.2
        Fraction of the rows assigned to the test portion.
    random_state: int, default None
        Seed forwarded to `train_test_split`; None leaves the shuffle unseeded.

    Returns:
    X_train, X_test, y_train, y_test: tuple of pandas DataFrame and Series
        Feature frames and label series for the train and test portions.
    """
    labels = df[target]
    features = df.drop(target, axis="columns")

    # Stratify on the labels themselves to keep the class ratio in both portions.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return train_features, test_features, train_labels, test_labels

In [11]:
# Modelling frame: an independent copy of the selected columns without the
# 0.35-cutoff label, leaving isAtRiskMASH67 as the only label column.
df = df_vcte_filtered[cols].copy().drop(columns='isAtRiskMASH35')

In [18]:
# Summary statistics for the predictors and the isAtRiskMASH67 label; the
# `count` row shows how many non-null values each column has.
df.describe()

Unnamed: 0,ALANINE AMINOTRANSFERASE (ALT) (U/L),GAMMA GLUTAMYL TRANSFERASE (GGT) (IU/L),PLATELET COUNT (1000 CELLS/UL),AGE IN YEARS AT SCREENING,BODY MASS INDEX (KG/M**2),isAtRiskMASH67
count,4772.0,4771.0,4926.0,5156.0,5105.0,5156.0
mean,21.557418,29.013624,243.524158,52.17242,29.415279,0.011443
std,16.154043,35.420854,65.407947,18.219141,7.128276,0.106368
min,2.0,2.0,8.0,18.0,14.6,0.0
25%,13.0,14.0,200.0,37.0,24.5,0.0
50%,17.0,20.0,236.0,55.0,28.2,0.0
75%,25.0,30.0,279.0,67.0,33.0,0.0
max,420.0,646.0,818.0,80.0,86.2,1.0


In [20]:
# Class balance of the label column (number of rows per label value).
df.value_counts('isAtRiskMASH67')

isAtRiskMASH67
0    5097
1      59
Name: count, dtype: int64

In [21]:
target_column = 'isAtRiskMASH67'

# Fixed seed: without it the shuffle is unseeded, so the train/test partition
# (and every metric computed from it) changes on each run of the notebook.
RANDOM_STATE = 42

# Stratified 80/20 split (stratified_split's default test_size is 0.2).
X_train, X_test, y_train, y_test = stratified_split(
    df, target_column, random_state=RANDOM_STATE
)

In [None]:
# XGBClassifier here is the project wrapper from src.models.classifier, not
# xgboost's class of the same name.
# NOTE(review): n_folds=5 presumably sets the cross-validation fold count used
# by optimize() -- confirm in the wrapper.
model = XGBClassifier(n_folds=5)
# NOTE(review): best_params is not passed to fit() later; presumably optimize()
# also stores the tuned parameters on the model -- confirm.
best_params = model.optimize(X_train, y_train)

In [None]:
# Fit the wrapper on the training split only; the test split stays untouched.
model.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out test split. evaluate() returns two values; only
# `metrics` is displayed as the cell output.
# NOTE(review): results_df presumably holds per-row predictions -- confirm.
metrics, results_df = model.evaluate(X_test, y_test)
metrics

In [None]:
# Persist `model.model` (presumably the underlying fitted estimator held by the
# project wrapper -- confirm). The context manager guarantees the file handle
# is flushed and closed even if pickling raises; the original bare open() call
# left the handle open.
with open("xgboost_mashai_67.pkl", "wb") as model_file:
    pickle.dump(model.model, model_file)