# Kenya Clinical Challenge
This notebook orchestrates the entire workflow for the Kenya Clinical Challenge project: data loading, exploration, cleaning, text preprocessing, feature engineering, train/validation splitting, model training, evaluation, and submission.

In [ ]:
# Import necessary libraries
import pandas as pd
from src.data_loading import load_data
from src.data_exploration import explore_data
from src.data_cleaning import clean_data
from src.preprocessing import preprocess_text
from src.feature_engineering import generate_embeddings
from src.data_splitting import split_data
from src.model_training import train_model
from src.model_evaluation import evaluate_model
from src.submission import prepare_submission


## Step 1: Load Data
Load the training and test datasets.

In [ ]:
# Locations of the two CSV files, relative to the notebook's working directory.
train_csv = './data/train.csv'
test_csv = './data/test.csv'

# load_data hands back the training and test frames as a pair.
df_train, df_test = load_data(train_csv, test_csv)

# Last expression of the cell, so the notebook renders the first training rows.
df_train.head()

## Step 2: Explore Data
Analyze the loaded datasets.

In [ ]:
# Run the project's exploration helper on the raw (uncleaned) train and test
# frames. The call is the cell's only expression and its result is not bound
# to a name, so any non-None return value is just rendered as the cell output.
explore_data(df_train, df_test)

## Step 3: Clean Data
Clean the datasets by handling missing values and ensuring consistency.

In [ ]:
# clean_data takes both frames and returns a (train, test) pair, bound here to
# new names; every later step works on these *_cleaned frames.
# NOTE(review): whether clean_data copies its inputs or modifies df_train /
# df_test in place is not visible from this notebook; confirm before reusing
# the raw frames.
df_train_cleaned, df_test_cleaned = clean_data(df_train, df_test)

## Step 4: Preprocess Text
Preprocess the text data in both the training and testing datasets.

In [ ]:
# Apply the same text preprocessing to the 'Prompt' column of each cleaned
# frame, training first and then test, overwriting the column in place.
for frame in (df_train_cleaned, df_test_cleaned):
    frame['Prompt'] = preprocess_text(frame['Prompt'])

## Step 5: Feature Engineering
Generate text embeddings for the clinical prompts.

In [ ]:
# Column that is read, and column that receives the embeddings.
text_column = 'Prompt'
embedding_column = 'Prompt_embeddings'

# Embed the preprocessed prompts of each frame (training first, then test) and
# store the result next to the text it was computed from.
for frame in (df_train_cleaned, df_test_cleaned):
    frame[embedding_column] = generate_embeddings(frame[text_column])

## Step 6: Split Data
Split the training data into training and validation sets.

In [ ]:
# Only the training frame is split; the test frame is left untouched for the
# submission step. split_data is called with no extra arguments, so the split
# ratio and any random seed are whatever src.data_splitting uses by default.
# NOTE(review): confirm the split is seeded if reproducible results are needed.
train_data, val_data = split_data(df_train_cleaned)

## Step 7: Train Model
Train the model on the training split produced in Step 6.

In [ ]:
# train_model receives only the training split; val_data is not passed here
# and is used solely by the evaluation cell.
model = train_model(train_data)

## Step 8: Evaluate Model
Evaluate the trained model on validation data.

In [ ]:
# Evaluate the trained model on the validation split. The result is only
# stored in evaluation_results; because this is an assignment, the notebook
# shows nothing for this cell unless evaluate_model itself prints or plots.
evaluation_results = evaluate_model(model, val_data)

## Step 9: Prepare Submission
Generate predictions for the test dataset and prepare the submission file.

In [ ]:
# Destination of the submission file, relative to the working directory.
submission_path = 'submission.csv'

# Build the submission frame from the trained model and the processed test
# frame (which carries the preprocessed prompts and their embeddings).
submission_df = prepare_submission(model, df_test_cleaned)

# index=False keeps the DataFrame's row index out of the written CSV.
submission_df.to_csv(submission_path, index=False)