### Dictionary

In [1]:
import pandas as pd
import numpy as np
data_dict = pd.read_csv('data_dictionary.csv')
df = pd.read_csv('train.csv')

In [2]:
import re
def clean_name(s):
    return re.sub(r'[^A-Za-z0-9]', '', s)

In [3]:
def onehotencSeasons(df):
    for idx,row in data_dict.iterrows():
        if row['Values'] =='Spring, Summer, Fall, Winter':
            field = row['Field']
            dummies = pd.get_dummies(df[field], prefix = field)
            df = pd.concat([df,dummies],axis=1)
            df =  df.drop(columns = [field])
    return df

### Basic_Demos{5-22} Age


In [4]:
# Physical Measures

def clean_physicalMeasures(df):
    measures = {
        'PhysicalBMI': (13,30),
        'PhysicalHeight': (40,75),
        'PhysicalWeight': (22,350),
        'PhysicalWaistCircumference': (16,52),
        'PhysicalDiastolicBP': (30,120),
        'PhysicalHeartRate': (20,230),
        'PhysicalSystolicBP': (60,250)
    }

    for col, (min_val, max_val) in measures.items():
        df[col] = pd.to_numeric(df[col], errors='coerce')

        df.loc[(df[col] < min_val) | (df[col] > max_val), col] = np.nan

        df[col] = df.groupby(['BasicDemosAge', 'BasicDemosSex'])[col]\
                    .transform(lambda x: x.fillna(x.mean()))

    return df
        

In [5]:
#FitnessGram Vitals and Treadmill

def clean_fgc(df):
    
    df['FitnessEnduranceMaxStage'] = df['FitnessEnduranceMaxStage'].fillna(0)
    df['FitnessEnduranceMaxStage'] = df['FitnessEnduranceMaxStage'].apply(lambda x: 0 if (x>12 or x<1) else x)
    df['FitnessEnduranceTimeMins'] = df['FitnessEnduranceTimeMins'].fillna(0)
    df['FitnessEnduranceTimeSec'] = df['FitnessEnduranceTimeSec'].fillna(0)
    
    fgc_cag = {
        'FGCFGCCU',
        'FGCFGCGSND',
        'FGCFGCGSD',
        'FGCFGCPU',
        'FGCFGCSRL',
        'FGCFGCSRR',
        'FGCFGCTL'
    }

    for col in fgc_cag:
        df[col] = df[col].fillna(0)
        df[f'{col}Zone'] = pd.to_numeric(df[f'{col}Zone'],errors = 'coerce')
        df[f'{col}ZoneMissing'] = df[f'{col}Zone'].isna().astype(int)

    return df
        

In [6]:
#Bio-electric Impedance Analysis

def clean_bia(df):
    bia_ranges = {
        'BIABIABMC':  (0.7, 8.8),
        'BIABIABMI':  (10, 50),
        'BIABIABMR':  (600, 3500),
        'BIABIADEE':  (800, 6000),
        'BIABIAECW':  (1.5, 25),
        'BIABIAFFM':  (10, 200),
        'BIABIAFFMI': (10, 26),
        'BIABIAFMI':  (1, 20),
        'BIABIAFat':  (3, 60),
        'BIABIAICW':  (2, 35),
        'BIABIALDM':  (5, 45),
        'BIABIALST':  (15, 150),
        'BIABIASMM':  (10, 120),
        'BIABIATBW':  (20, 120)
    }

    for col, (min_val, max_val) in bia_ranges.items():
        df[col] = pd.to_numeric(df[col], errors='coerce')

        df.loc[(df[col] < min_val) | (df[col] > max_val), col] = np.nan

        df[col] = df.groupby(['BasicDemosAge', 'BasicDemosSex'])[col]\
                    .transform(lambda x: x.fillna(x.mean()))

    return df




In [None]:
from sklearn.preprocessing import StandardScaler

df normalise(df):

    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

In [11]:
def preprocess(df):
    
    df.columns = [clean_name(col) for col in df.columns]
    data_dict['Field'] = [clean_name(val) for val in data_dict['Field']]
    df = onehotencSeasons(df)
    df = clean_physicalMeasures(df)
    df = clean_fgc(df)
    df = clean_bia(df)
    
    #CGAS
    df['CGASCGASScore'] = df['CGASCGASScore'].replace({999: 99})
    # Physical Activity Questionnaire (Adolescents)
    df['PAQAPAQATotal'] = df['PAQAPAQATotal'].fillna(0)
    # Physical Activity Questionnaire (Children)
    df['PAQCPAQCTotal'] = df['PAQCPAQCTotal'].fillna(0)
    # PCIAT data
    for i in range(1, 21):
        df[f'PCIATPCIAT{i:02d}'] = df[f'PCIATPCIAT{i:02d}'].fillna(0)
    
    #PCIAT categorisation
    df['PCIATPCIATTotal'] = df['PCIATPCIATTotal'].clip(0, 100)
    df['PCIATPCIATTotal'] = pd.cut(
        df['PCIATPCIATTotal'],
        bins=[0, 30, 49, 79, 100],
        labels=[1, 2, 3, 4],
    )
    
    df['PCIATPCIATTotal'] = df['PCIATPCIATTotal'].notna().astype(int)
    df['PCIATPCIATTotal'] = df['PCIATPCIATTotal'].notna().astype(int)
    df['SDSSDSTotalRawMissing'] = df['SDSSDSTotalRaw'].isna().astype(int)
    df['SDSSDSTotalTMissing'] = df['SDSSDSTotalT'].isna().astype(int)
    df['PreIntEduHxcomputerinternethoursdayMissing'] = df['PreIntEduHxcomputerinternethoursday'].isna().astype(int)

    df = normalise(df)
    
    return df

    

In [12]:
train_df = preprocess(df)

In [13]:
from sklearn.preprocessing import StandardScaler

numeric_columns = train_df.select_dtypes(include=['float64', 'int64']).columns
x_numeric = train_df[numeric_columns]


# Step 2: Scale the data
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_numeric)


In [14]:
train_df.head()

Unnamed: 0,id,BasicDemosAge,BasicDemosSex,CGASCGASScore,PhysicalBMI,PhysicalHeight,PhysicalWeight,PhysicalWaistCircumference,PhysicalDiastolicBP,PhysicalHeartRate,...,FGCFGCTLZoneMissing,FGCFGCSRRZoneMissing,FGCFGCGSDZoneMissing,FGCFGCCUZoneMissing,FGCFGCGSNDZoneMissing,FGCFGCPUZoneMissing,FGCFGCSRLZoneMissing,SDSSDSTotalRawMissing,SDSSDSTotalTMissing,PreIntEduHxcomputerinternethoursdayMissing
0,00008ff9,5,0,51.0,16.877316,46.0,50.8,22.888889,70.590164,89.145161,...,0,0,1,0,1,0,0,1,1,0
1,000fd460,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,...,0,0,1,0,1,0,0,0,0,0
2,00105258,10,1,71.0,16.648696,56.5,75.6,27.557692,65.0,94.0,...,0,0,0,0,0,0,0,0,0,0
3,00115b9f,9,0,71.0,18.292347,56.0,81.6,25.918182,60.0,97.0,...,0,0,1,0,1,0,0,0,0,0
4,0016bb22,18,1,,21.466623,63.632632,141.947368,32.428571,73.0,79.157895,...,1,1,1,1,1,1,1,1,1,1


In [15]:
train_df['sii'] = pd.to_numeric(train_df['sii'], errors = 'coerce')

unlabeled = train_df[train_df['sii'].isna()]
labeled = train_df[train_df['sii'].notna()]
labeled['sii'] = labeled['sii'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled['sii'] = labeled['sii'].astype(int)


In [None]:
pip install catboost

In [16]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    loss_function='MAE',      
    iterations=2000,
    depth=8,
    learning_rate=0.03,
    eval_metric='MAE',
    random_seed=42,
    verbose=False
)

X_labeled = labeled.drop(columns=['sii','id'])
y_labeled = labeled['sii']

model.fit(X_labeled, y_labeled)

<catboost.core.CatBoostRegressor at 0x7e2980101d30>

In [21]:
X_unlabeled = unlabeled.drop(columns=['sii','id'])
y_unlabeled = model.predict(X_unlabeled)

In [22]:
unlabeled['sii'] = y_unlabeled
combined = pd.concat([labeled, unlabeled], ignore_index=True)
X_final = combined.drop(columns=['sii','id'])
y_final = combined['sii']

final_model = CatBoostRegressor(
    loss_function='MAE',
    iterations=2000,
    depth=8,
    learning_rate=0.02,
    verbose=False
)

final_model.fit(X_final, y_final)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled['sii'] = y_unlabeled


<catboost.core.CatBoostRegressor at 0x7e296e9ffed0>

In [None]:
testdf = pd.read_csv("test.csv")
test_df = preprocess(testdf)

In [None]:
y_pred = final_model.predict(test_df)