In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import balanced_accuracy_score

# Load training and test datasets
train_data = pd.read_csv('C:\\Users\\pc\\Documents\\DAT158\\train.csv')
test_data = pd.read_csv('C:\\Users\\pc\\Documents\\DAT158\\test.csv')

# Inspect the data
print(train_data.head())
print(train_data.describe())
print(train_data.info())

   Id  HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  HeartDiseaseorAttack  \
0   1       1         1          1   21       0       0                     0   
1   2       1         0          1   26       1       0                     0   
2   3       1         1          1   29       0       0                     1   
3   4       1         1          1   27       0       0                     0   
4   5       1         1          1   26       1       0                     0   

   PhysActivity  Fruits  ...  NoDocbcCost  GenHlth  MentHlth  PhysHlth  \
0             1       1  ...            0        4         0         0   
1             1       1  ...            0        3         0         0   
2             0       0  ...            0        3        15         5   
3             1       1  ...            0        2         0         0   
4             0       0  ...            0        2         0         0   

   DiffWalk  Sex  Age  Education  Income  Target  
0         0    0 

In [2]:
# Fill missing values with mean (if any)
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Feature scaling
scaler = StandardScaler()
features = ['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Income']
train_data[features] = scaler.fit_transform(train_data[features])
test_data[features] = scaler.transform(test_data[features])

# Create a new feature for total bad health days
train_data['TotalBadHealthDays'] = train_data['MentHlth'] + train_data['PhysHlth']
test_data['TotalBadHealthDays'] = test_data['MentHlth'] + test_data['PhysHlth']

# Separate features and target in the training set
X = train_data.drop(columns=['Target', 'Id'])
y = train_data['Target']

In [3]:
# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize the Random Forest model
model = RandomForestClassifier(class_weight='balanced', random_state=42)

In [4]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='balanced_accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}


In [5]:
# Validate the best model
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
print(f'Balanced accuracy on validation set: {balanced_acc}')

Balanced accuracy on validation set: 0.7490973658408622


In [6]:
# Predict on the test set
test_features = test_data.drop(columns=['Id'])
test_predictions = best_model.predict(test_features)

# Prepare the submission file
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Target': test_predictions
})
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully.")

Submission file created successfully.
