<a href="https://colab.research.google.com/github/davidmarip/DS2002-PROJECT/blob/main/DataProject1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
import requests
import json
import sqlite3
import os
#importing libraries
#https://www.geeksforgeeks.org/what-is-etl-extract-transform-load/


In [27]:
def load_input_data(input_file, input_format):
    """Load a CSV or JSON file into a pandas DataFrame.

    Args:
        input_file: Path of the file to read.
        input_format: Either 'csv' or 'json'.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for CSV input, or the
        parsed JSON flattened with ``pd.json_normalize`` for JSON input.

    Raises:
        ValueError: If ``input_format`` is neither 'csv' nor 'json', or if
            the JSON file cannot be decoded.
        FileNotFoundError: If ``input_file`` does not exist.
    """
    try:
        if input_format == 'csv':
            return pd.read_csv(input_file)
        if input_format == 'json':
            with open(input_file, 'r') as f:
                data = json.load(f)
            # The file is already closed here; normalizing needs only the parsed data.
            return pd.json_normalize(data)
    except FileNotFoundError as e:
        # Chain explicitly so the original error stays attached as __cause__.
        raise FileNotFoundError(f"File {input_file} not found: {e}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON: {e}") from e
    raise ValueError("Unsupported input format. Choose 'csv' or 'json'.")

    
#loads data either from CSV or JSON, and converts the JSON data into a pandas DataFrame

In [29]:
def modify_columns(data, keep_columns=None, add_columns=None, remove_columns=None):
    """Return a DataFrame with columns kept, added, and removed as requested.

    The steps run in this order: keep, then add, then remove. The input
    DataFrame is never modified; adding columns is done on a copy.

    Args:
        data: Source pandas DataFrame.
        keep_columns: Optional list of column names to retain (in this order).
        add_columns: Optional mapping of new column name -> value assigned to
            that column.
        remove_columns: Optional list of column names to drop.

    Returns:
        The resulting DataFrame.

    Raises:
        ValueError: If a column named in ``keep_columns`` or
            ``remove_columns`` is not present when that step runs.
    """
    # Check if columns exist before modifying
    if keep_columns:
        missing_keep = [col for col in keep_columns if col not in data.columns]
        if missing_keep:
            raise ValueError(f"Columns to keep not found: {missing_keep}")
        data = data[keep_columns]

    if add_columns:
        # Copy first: assigning directly would either mutate the caller's
        # DataFrame or write into a slice produced by the keep step above.
        data = data.copy()
        for column, value in add_columns.items():
            data[column] = value

    if remove_columns:
        missing_remove = [col for col in remove_columns if col not in data.columns]
        if missing_remove:
            raise ValueError(f"Columns to remove not found: {missing_remove}")
        data = data.drop(columns=remove_columns)

    return data

# to keep, add, or drop specified columns

In [31]:
def convert_and_save(data, output_file, output_format, db_table=None, db_file=None):
    """Write a DataFrame to disk as CSV or JSON, or into a SQLite table.

    Args:
        data: pandas DataFrame to save.
        output_file: Destination path for 'csv' and 'json' output. It is used
            exactly as given and is ignored for SQLite output.
        output_format: 'csv', 'json', or 'sqlite' ('sql' is accepted as an
            alias for 'sqlite').
        db_table: Table name; required for SQLite output. An existing table
            with this name is replaced.
        db_file: SQLite database file; required for SQLite output.

    Raises:
        ValueError: If ``output_format`` is unsupported, or if SQLite output
            is requested without both ``db_file`` and ``db_table``.
    """
    if output_format == 'csv':
        data.to_csv(output_file, index=False)
        print(f"Data saved as CSV to {output_file}")
    elif output_format == 'json':
        data_json = data.to_json(orient='records', indent=4)
        with open(output_file, 'w') as f:
            f.write(data_json)
        print(f"Data saved as JSON to {output_file}")
    elif output_format in ('sqlite', 'sql'):
        if db_file is None or db_table is None:
            raise ValueError("For SQL output, specify both db_file and db_table.")
        conn = sqlite3.connect(db_file)  # Use db_file, not output_file
        try:
            data.to_sql(db_table, conn, if_exists='replace', index=False)
            print(f"Data saved to SQLite table '{db_table}' in database {db_file}")
        finally:
            # Always release the connection, even if the write fails.
            conn.close()
    else:
        raise ValueError("Unsupported output format. Choose 'csv', 'json', or 'sqlite'.")

#convert the data format (CSV, Json, or SQL) to the desired output format

In [33]:
def store_data(data, output_format, output_file, db_table=None, db_file=None):
    """Store ``data`` by handing it to ``convert_and_save``.

    Note that this function takes ``output_format`` before ``output_file``,
    while ``convert_and_save`` takes them the other way round; keyword
    arguments are used below so the two cannot be mixed up.
    """
    convert_and_save(
        data,
        output_file=output_file,
        output_format=output_format,
        db_table=db_table,
        db_file=db_file,
    )

# Function to store the data in a SQLite database or write it to disk (as CSV or JSON)

In [35]:
def view_sql_data(db_file, db_table):
    """Read every row of a SQLite table into a pandas DataFrame.

    Args:
        db_file: Path of the SQLite database file.
        db_table: Name of the table to read. It is interpolated directly into
            the SQL text, so it must come from a trusted source.

    Returns:
        DataFrame holding the result of ``SELECT * FROM <db_table>``.
    """
    conn = sqlite3.connect(db_file)
    try:
        query = f"SELECT * FROM {db_table}"
        return pd.read_sql_query(query, conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

In [37]:
def etl_processor(input_file, input_format='csv', output_format='csv',
                  output_file='output_data', keep_columns=None, add_columns=None, remove_columns=None,
                  db_table=None, db_file=None):
    """Run the full pipeline: load the input, adjust its columns, store the result.

    Loading is delegated to ``load_input_data``, the column changes to
    ``modify_columns``, and the output step to ``store_data``.

    Returns:
        The DataFrame after the column modifications, i.e. the same data that
        was passed to ``store_data``.
    """
    # Extract
    raw = load_input_data(input_file, input_format)

    # Transform
    result = modify_columns(
        raw,
        keep_columns=keep_columns,
        add_columns=add_columns,
        remove_columns=remove_columns,
    )

    # Load
    store_data(
        result,
        output_format=output_format,
        output_file=output_file,
        db_table=db_table,
        db_file=db_file,
    )
    return result

The input CSV file, US_Census_Tract_Area_2010.csv, is read from a local file path. It comes from Charlottesville open data and holds demographic information. It contains 12 records and 353 columns covering multiple demographic attributes.

In [42]:
input_file = 'Downloads/DS2002/US_Census_Tract_Area_2010.csv'  # relative file path; VARIES BY DEVICE LOCAL STORAGE, adjust before running
transformed_US2010census_data = etl_processor(
    input_file=input_file,
    input_format='csv',  # Input file format
    output_format='json',  # Desired output format ('csv', 'json', 'sqlite')
    output_file='transformed_US2010census_data',  # Output file path, used exactly as given (no extension is added)
    keep_columns=['ID','AREA_','Population','White','Black', 'AmIndian', 'Asian', 'Hawaiian', 'Other'],  # Columns to keep
    add_columns=None,
    remove_columns=None,
    db_table='input_file',  # Table name (the literal string 'input_file'); only used for 'sqlite' output
    db_file='database.db'  # SQLite database file; only used for 'sqlite' output
)
df = pd.read_json('transformed_US2010census_data')  # read back the JSON file written above
df.head()
# Example usage for US_Census_Tract_Area_2010 for Charlottesville
# Takes the input CSV file and keeps only the columns ID, AREA_, Population, White, Black,
# AmIndian, Asian, Hawaiian, and Other
# The output is formatted as JSON and saved to the file transformed_US2010census_data

FileNotFoundError: File Downloads/DS2002/US_Census_Tract_Area_2010.csv not found: [Errno 2] No such file or directory: 'Downloads/DS2002/US_Census_Tract_Area_2010.csv'

In [37]:
# Summarize the DataFrame read back from the JSON output; `df` must already be defined by the example cell.
df.info()

NameError: name 'df' is not defined

The output DataFrame is a transformed version of US_Census_Tract_Area_2010.csv with 12 records and only 9 columns: ID, AREA_, Population, White, Black, AmIndian, Asian, Hawaiian, and Other. It narrows the data to more specific demographics and gets rid of columns such as State and County, which had the same value in every row.