## __3.0 Data Extraction__

In [16]:
# Import the necessary libraries

import pandas as pd

In [18]:
class DataExtractor:
    """
    This class handles data importation for the Tanzanian Water Wells project.

    It reads the raw CSV exports, joins the training values with their labels,
    and writes the combined training set and a copy of the test set back to disk.
    """

    def load_data(self, data_dir="data"):
        """
        Loads and combines the training data, and re-saves the test data under a new file name.

        Reads ``training-set-labels.csv``, ``training-set-values.csv`` and
        ``test-set-values.csv`` from ``data_dir``. As a side effect, writes
        ``train_set.csv`` (values joined with labels) and ``test_set.csv``
        (an unchanged copy of the test values) into the same directory,
        overwriting any existing files with those names.

        Args:
            data_dir (str or path-like): Directory that holds the raw CSV files and
                receives the output files. Defaults to "data", which matches the
                previously hard-coded location.

        Returns:
            tuple: A tuple containing the training set (DataFrame) and test set (DataFrame).
        """

        # Load data from CSVs
        df_labels = pd.read_csv(f"{data_dir}/training-set-labels.csv")
        df_values = pd.read_csv(f"{data_dir}/training-set-values.csv")

        # Combine training set values and labels using a left join, so every row of
        # the values file is kept even if it has no matching label.
        df_train = df_values.merge(df_labels, on="id", how="left")
        df_train.to_csv(f"{data_dir}/train_set.csv", index=False)

        # The test data is not transformed; it is only re-saved under a new file name.
        df_test = pd.read_csv(f"{data_dir}/test-set-values.csv")
        df_test.to_csv(f"{data_dir}/test_set.csv", index=False)

        return df_train, df_test



# Instantiate the extractor; the class keeps no state, so no arguments are needed.
extractor = DataExtractor()

# Read the raw CSVs and bind the combined training frame and the test frame.
# load_data() also writes train_set.csv and test_set.csv to disk as a side effect.
train_data, test_data = extractor.load_data()



In [23]:
# Report the (rows, columns) shape of the combined training frame.
print("Training data shape:", train_data.shape)


# Preview the head of the training frame as a left-aligned Markdown table (index column omitted).
print(train_data.head().to_markdown(index=False, numalign='left', stralign='left'))




Training data shape: (59400, 41)
| id    | amount_tsh   | date_recorded   | funder       | gps_height   | installer    | longitude   | latitude   | wpt_name             | num_private   | basin                   | subvillage   | region   | region_code   | district_code   | lga       | ward       | population   | public_meeting   | recorded_by             | scheme_management   | scheme_name                 | permit   | construction_year   | extraction_type   | extraction_type_group   | extraction_type_class   | management   | management_group   | payment        | payment_type   | water_quality   | quality_group   | quantity     | quantity_group   | source               | source_type          | source_class   | waterpoint_type             | waterpoint_type_group   | status_group   |
|:------|:-------------|:----------------|:-------------|:-------------|:-------------|:------------|:-----------|:---------------------|:--------------|:------------------------|:-------------|:---------|:---

In [21]:

# Report the (rows, columns) shape of the test frame.
print("Test data shape:", test_data.shape)


# Preview the head of the test frame as a left-aligned Markdown table (index column omitted).
print(test_data.head().to_markdown(index=False, numalign='left', stralign='left'))

Test data shape: (14850, 40)
| id    | amount_tsh   | date_recorded   | funder                 | gps_height   | installer   | longitude   | latitude   | wpt_name                | num_private   | basin                   | subvillage   | region   | region_code   | district_code   | lga           | ward         | population   | public_meeting   | recorded_by             | scheme_management   | scheme_name    | permit   | construction_year   | extraction_type   | extraction_type_group   | extraction_type_class   | management   | management_group   | payment     | payment_type   | water_quality   | quality_group   | quantity     | quantity_group   | source               | source_type          | source_class   | waterpoint_type    | waterpoint_type_group   |
|:------|:-------------|:----------------|:-----------------------|:-------------|:------------|:------------|:-----------|:------------------------|:--------------|:------------------------|:-------------|:---------|:--------------|:---

## __4.0 Data Overview__ 

In [3]:
class DataOverview(DataExtractor):
    """
    This class inherits from DataExtractor and provides methods for overviewing the
    training and test datasets.
    """

    def show_overview(self, df, dataset_name="Dataset"):
        """
        Prints a comprehensive overview of a given dataset to stdout.

        The report contains, in order: the ``DataFrame.info()`` summary, the first
        and last five rows, descriptive statistics from ``DataFrame.describe()``,
        the number of fully duplicated rows, and the per-column percentages of
        unique values and of missing values (both relative to the row count).
        The DataFrame itself is not modified.

        Args:
            df (DataFrame): The DataFrame containing the data.
            dataset_name (str): A name to identify the dataset in the output. Defaults to "Dataset".

        Returns:
            None
        """
        print(f"\n--- Overview of {dataset_name} ---\n")

        # Basic Info
        # DataFrame.info() writes its report directly to stdout and returns None,
        # so it must not be wrapped in print() (that would emit a stray "None" line).
        print("Info:")
        df.info()

        # First 5 and Last 5 Rows
        print("\nFirst 5 Rows:")
        print(df.head().to_markdown(index=False, numalign='left', stralign='left'))
        print("\nLast 5 Rows:")
        print(df.tail().to_markdown(index=False, numalign='left', stralign='left'))

        # Descriptive Statistics (index kept: it carries the statistic names)
        print("\nDescriptive Statistics:")
        print(df.describe().to_markdown(numalign='left', stralign='left'))

        # Duplicate Rows: counts rows that repeat an earlier row across all columns
        num_duplicates = df.duplicated().sum()
        print(f"\nNumber of Duplicate Rows: {num_duplicates}")

        # Unique Values Percentage: distinct non-null values per column / row count
        unique_pct = (df.nunique() / df.shape[0] * 100).round(2)
        print("\nPercentage of Unique Values per Column:")
        print(unique_pct.to_markdown(numalign='left', stralign='left'))

        # Missing and Null Values Percentage: null cells per column / row count
        missing_pct = (df.isnull().sum() / df.shape[0] * 100).round(2)
        print("\nPercentage of Missing/Null Values per Column:")
        print(missing_pct.to_markdown(numalign='left', stralign='left'))



# Create the DataOverview instance
overview = DataOverview()

# Load the training and test data via the load_data() method inherited from DataExtractor.
# NOTE(review): this repeats the load done earlier with `extractor` — it re-reads the
# CSVs, re-writes the output files, and rebinds train_data / test_data.
train_data, test_data = overview.load_data()


In [13]:
# Show overviews
# overview.show_overview(train_data, "Training Set")

In [12]:
# overview.show_overview(test_data, "Test Set")

## __5.0 Data Cleaning__

In [15]:
class DataCleaning(DataOverview):
    """
    Cleans the training and test datasets, and provides overviews of both the original and cleaned versions.
    """

    def clean_data(self, df, dataset_name="Dataset"):
        """
        Performs cleaning operations on a copy of the DataFrame and saves the results.

        The input frame is left untouched, so the method can be re-run on the same
        raw frame (e.g. when a notebook cell is executed twice) without failing on
        columns that a previous run already removed.

        Steps, in order:
            1. Replace ``date_recorded`` with a ``year_recorded`` column.
            2. Fill missing values in selected categorical columns with that
               column's most frequent value (computed on the frame being cleaned).
            3. Fill missing ``construction_year`` values with 0.
            4. Drop a fixed list of columns.
            5. Write the result to ``data/cleaned_<dataset_name lowercased>_set.csv``
               and print an overview of it.

        Args:
            df (DataFrame): The DataFrame to clean. It is not modified.
            dataset_name (str): The name of the dataset ("Training" or "Test") for saving.

        Returns:
            DataFrame: The cleaned copy of ``df``.
        """

        print(f"\n--- Cleaning {dataset_name} Set ---\n")  # Added for clarity

        # Work on a copy so the caller's frame is never mutated.
        cleaned = df.copy()

        # Extract year from date_recorded and drop original
        cleaned["year_recorded"] = pd.to_datetime(cleaned["date_recorded"]).dt.year
        cleaned = cleaned.drop(columns=["date_recorded"])

        # Impute missing categorical values with the column's most frequent value
        for col in ["funder", "installer", "subvillage", "public_meeting", "scheme_management", "wpt_name", "permit"]:
            most_frequent_value = cleaned[col].mode()[0]
            cleaned[col] = cleaned[col].fillna(most_frequent_value)

        # Fill missing (NaN) values in 'construction_year' with 0.
        # NOTE(review): the previous comment read "Impute zero values"; this call only
        # fills NaN and leaves existing 0 entries as they are — confirm the intent.
        cleaned["construction_year"] = cleaned["construction_year"].fillna(0)

        # Drop columns
        cleaned = cleaned.drop(columns=["num_private", "recorded_by", "scheme_name",
                                        "extraction_type_group", "extraction_type", "payment",
                                        "water_quality", "quantity_group", "source", "source_type",
                                        "waterpoint_type_group", "lga", "ward"])

        # Save cleaned data and show overview
        cleaned.to_csv(f"data/cleaned_{dataset_name.lower()}_set.csv", index=False)
        self.show_overview(cleaned, f"Cleaned {dataset_name} Set")

        return cleaned

# Example Usage
# Create the cleaner and (re)load the raw frames through the inherited load_data();
# this rebinds train_data / test_data to freshly read, uncleaned frames.
cleaner = DataCleaning()
train_data, test_data = cleaner.load_data()

# Show overviews before cleaning (disabled: the printed reports are long)
# cleaner.show_overview(train_data, "Original Training Set")
# cleaner.show_overview(test_data, "Original Test Set")

# Clean the data and show overviews afterwards (disabled here)
# train_data_cleaned = cleaner.clean_data(train_data, "Training")
# test_data_cleaned = cleaner.clean_data(test_data, "Test")


In [14]:
# Overview of cleaned train data
# # train_data_cleaned = cleaner.clean_data(train_data, "Training")

In [11]:

# test_data_cleaned = cleaner.clean_data(test_data, "Test")