<a href="https://colab.research.google.com/github/deeptanshurai/deeptanshu_iitg_analytics/blob/main/ai_planet_iitg_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

# Load the datasets
train_data = pd.read_csv('/content/Train_Data.csv')
test_data = pd.read_csv('/content/Test_Data.csv')

# Map the target variable 'age_group' to 0 (Adult) and 1 (Senior)
train_data['age_group'] = train_data['age_group'].map({'Adult': 0, 'Senior': 1})

# Drop rows where age_group is NaN to ensure clean target data
train_data = train_data.dropna(subset=['age_group'])

# Separate features and target, excluding 'SEQN' as it's just an identifier
train_features = train_data.drop(['SEQN', 'age_group'], axis=1)
train_target = train_data['age_group']

# Drop rows in train_features with any missing values to align with train_target
train_features = train_features.dropna()
train_target = train_target.loc[train_features.index]

test_features = test_data.drop(['SEQN'], axis=1)

# Define categorical and numerical columns
categorical_cols = ['RIAGENDR', 'DIQ010', 'PAQ605']  # Treat PAQ605 as categorical (yes/no response)
numerical_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']

# Replace infinite values with NaN to handle them during imputation
train_features = train_features.replace([np.inf, -np.inf], np.nan)
test_features = test_features.replace([np.inf, -np.inf], np.nan)

# Impute missing values
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='mean')

# Impute categorical features
train_features[categorical_cols] = cat_imputer.fit_transform(train_features[categorical_cols])
test_features[categorical_cols] = cat_imputer.transform(test_features[categorical_cols])

# Impute numerical features
train_features[numerical_cols] = num_imputer.fit_transform(train_features[numerical_cols])
test_features[numerical_cols] = num_imputer.transform(test_features[numerical_cols])

# One-hot encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_features[categorical_cols])

# Transform categorical features
train_encoded = encoder.transform(train_features[categorical_cols])
train_encoded_df = pd.DataFrame(train_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
test_encoded = encoder.transform(test_features[categorical_cols])
test_encoded_df = pd.DataFrame(test_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))


# Drop original categorical columns and concatenate encoded ones
train_features = train_features.drop(categorical_cols, axis=1)
train_features = pd.concat([train_features, train_encoded_df], axis=1)
test_features = test_features.drop(categorical_cols, axis=1)
test_features = pd.concat([test_features, test_encoded_df], axis=1)

# Drop any remaining rows with missing values and align target
train_features = train_features.dropna()
train_target = train_target.loc[train_features.index]

# Scale numerical features (optional for Random Forest, kept for consistency)
scaler = StandardScaler()
train_features[numerical_cols] = scaler.fit_transform(train_features[numerical_cols])
test_features[numerical_cols] = scaler.transform(test_features[numerical_cols])


# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_features, train_target, test_size=0.2, random_state=42)

# Ensure y_train and y_val are integers
y_train = y_train.astype(int)
y_val = y_val.astype(int)

# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest with balanced class weights
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Perform grid search to maximize F1-score
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Evaluate the model on the validation set
y_pred = best_rf_model.predict(X_val)
print("Best Parameters:", grid_search.best_params_)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

# Predict on the test set
test_predictions = best_rf_model.predict(test_features)

# Prepare the submission file
submission = pd.DataFrame({'SEQN': test_data['SEQN'], 'age_group': test_predictions})
submission.to_csv('submission_5.csv', index=False)
print("Submission file 'submission_2.csv' has been generated.")

Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Validation Accuracy: 0.8371428571428572
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90       289
           1       0.54      0.41      0.47        61

    accuracy                           0.84       350
   macro avg       0.71      0.67      0.69       350
weighted avg       0.82      0.84      0.83       350

Submission file 'submission_2.csv' has been generated.
