# 1) Loading the Data

# In [5]:
import pandas as pd

# Read the three competition CSVs (training features, training labels,
# test features) from the current working directory, in that order.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identify numerical and categorical columns.
# 'respondent_id' is excluded from the numeric features: it is carried through
# to the submission file as the row key, so it looks like an identifier rather
# than a predictor (NOTE(review): confirm against the data dictionary).
# Feeding it to the model would only add noise. errors='ignore' keeps this
# safe if the column is absent or not numeric.
numerical_cols = (
    train_features
    .select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('respondent_id', errors='ignore')
)
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Preprocessing for numerical data: fill gaps with the column median, then
# standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode. handle_unknown='ignore' lets categories that appear
# only in the test set pass through transform() without raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps. Columns not listed in either group (such as
# the identifier removed above) are dropped by ColumnTransformer's default
# remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit on the training data only, then apply the same fitted transformation to
# the test data so no test-set statistics leak into preprocessing.
X_train = preprocessor.fit_transform(train_features)
X_test = preprocessor.transform(test_features)

# Pull one target column per vaccine out of the labels table.
# NOTE(review): this relies on train_labels rows being in the same order as
# train_features rows — confirm.
y_train_xyz, y_train_seasonal = (
    train_labels[target] for target in ('xyz_vaccine', 'seasonal_vaccine')
)

from sklearn.ensemble import RandomForestClassifier

# xyz_vaccine: fit a 100-tree random forest (fit() returns the fitted
# estimator, so construction and training are chained), then keep the second
# column of predict_proba — presumably the probability of the positive class.
model_xyz = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train_xyz)
xyz_vaccine_probs = model_xyz.predict_proba(X_test)[:, 1]

# seasonal_vaccine: an independent forest with the same hyper-parameters and
# seed, handled the same way.
model_seasonal = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train_seasonal)
seasonal_vaccine_probs = model_seasonal.predict_proba(X_test)[:, 1]
# Assemble the submission table: the test-set respondent ids followed by the
# predicted probability for each vaccine, in that column order.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=xyz_vaccine_probs,
    seasonal_vaccine=seasonal_vaccine_probs,
)

# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='test_set_labels.csv', index=False)
