# __Data Extraction__

In [3]:
# 1. Extract the data from data/MH-CLD-2022-DS0001-bndl-data-csv_v1.zip (in the data folder); the CSV inside the archive is named mhcld_puf_2022.csv.
# 2. extract only columns required: **Variables:**
""" - **Target Variable:**
  - `SAP` (Substance Abuse Problem)
- **Predictor Variables:**
  - **Primary Mental Health Condition:**
    - Exclude `MH1` values that indicate substance use disorders (value 12).
  - **Demographic Variables:**
    - `AGE` (Client Age)
    - `ETHNIC` (Client Ethnicity)
    - `RACE` (Client Race)
    - `GENDER` (Client Gender)
    - `MARSTAT` (Marital Status)
    - `EDUC` (Client Education)
    - `EMPLOY` (Employment Status)
    - `LIVARAG` (Living Arrangement)
    - `VETERAN` (Veteran Status)
    - `STATEFIP` (State FIPS Code)
"""
# 3. Rename the columns to the more intuitive names indicated above. Ensure they are lowercase, with words separated by a "-".
# 4. create a csv file from the extracted data and name it extracted-data in the data folder. 

import pandas as pd
import zipfile

class DataExtractor:
    """Extract selected columns from a zipped CSV and save them under readable names.

    The extractor reads one CSV member of a zip archive, keeps only the columns
    listed in ``required_columns``, drops rows whose ``MH1`` code is in
    ``excluded_mh1_values``, renames the columns according to ``column_names``
    and writes the result to ``output_file_name``.
    """

    def __init__(self, zip_file_path, csv_file_name, output_file_name, excluded_mh1_values=(12,)):
        """
        Initialize the DataExtractor class with the paths to the zip file, the CSV file name, and the output file name.

        Args:
            zip_file_path (str): The path to the zip file containing the CSV data.
            csv_file_name (str): The name of the CSV file within the zip file.
            output_file_name (str): The name of the output CSV file.
            excluded_mh1_values (iterable): ``MH1`` codes whose rows are dropped
                from the output. Defaults to ``(12,)``, the code that indicates
                substance use disorders. Pass ``()`` to keep every row.
        """
        self.zip_file_path = zip_file_path
        self.csv_file_name = csv_file_name
        self.output_file_name = output_file_name
        # Stored as a tuple so any iterable can be passed and the emptiness
        # check in extract_data() is unambiguous.
        self.excluded_mh1_values = tuple(excluded_mh1_values)
        # Source columns to keep, in the order they appear in the output file.
        self.required_columns = ['SAP', 'MH1', 'AGE', 'ETHNIC', 'RACE', 'GENDER', 'MARSTAT', 'EDUC', 'EMPLOY', 'LIVARAG', 'VETERAN', 'STATEFIP']
        # Source column name -> lowercase, hyphen-separated output name.
        self.column_names = {'SAP': 'target-variable', 'MH1': 'primary-mental-health-condition', 'AGE': 'age', 'ETHNIC': 'ethnicity', 'RACE': 'race', 'GENDER': 'gender', 'MARSTAT': 'marital-status', 'EDUC': 'education', 'EMPLOY': 'employment-status', 'LIVARAG': 'living-arrangement', 'VETERAN': 'veteran-status', 'STATEFIP': 'state-fips-code'}

    def extract_data(self):
        """
        Extract the data from the zip file, select the required columns, drop
        the excluded ``MH1`` rows, and rename the columns.
        Save the modified DataFrame to a CSV file.

        Returns:
            None. The result is written to ``self.output_file_name``.

        Raises:
            KeyError: If a required column is missing from the CSV, or if
                ``csv_file_name`` is not a member of the zip archive.
        """
        wanted = set(self.required_columns)
        with zipfile.ZipFile(self.zip_file_path) as zfile:
            with zfile.open(self.csv_file_name) as f:
                # Parse only the columns we keep instead of the whole file.
                # A callable is used (rather than a list) so that a missing
                # column is reported by the selection below as a KeyError.
                df = pd.read_csv(f, usecols=lambda name: name in wanted)

        # Enforce the documented column order and fail loudly on missing columns.
        df = df[self.required_columns]

        # Drop rows whose primary mental health condition is an excluded code
        # (by default 12, substance use disorders).
        if self.excluded_mh1_values:
            df = df[~df['MH1'].isin(self.excluded_mh1_values)]

        df = df.rename(columns=self.column_names)
        df.to_csv(self.output_file_name, index=False)

# Example usage: build an extractor for the MH-CLD 2022 bundle and write the
# selected, renamed columns to the data folder.
extractor = DataExtractor(
    zip_file_path='data/MH-CLD-2022-DS0001-bndl-data-csv_v1.zip',
    csv_file_name='mhcld_puf_2022.csv',
    output_file_name='data/extracted-data.csv',
)
extractor.extract_data()