<a href="https://colab.research.google.com/github/davidmarip/DS2002-PROJECT/blob/main/DataProject1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import requests
import json
import sqlite3
import os
#importing libraries

In [None]:
def load_input_data(input_file, input_format):
    """Read ``input_file`` into a pandas DataFrame.

    Args:
        input_file: Path of the file to read.
        input_format: Either ``'csv'`` or ``'json'``.

    Returns:
        A DataFrame. CSV input is parsed with ``pd.read_csv``; JSON input is
        parsed with ``json.load`` and flattened with ``pd.json_normalize``.

    Raises:
        ValueError: If ``input_format`` is neither ``'csv'`` nor ``'json'``.
    """
    if input_format == 'csv':
        return pd.read_csv(input_file)

    if input_format == 'json':
        with open(input_file, 'r') as handle:
            records = json.load(handle)
        return pd.json_normalize(records)

    raise ValueError("Unsupported input format. Choose 'csv' or 'json'.")
#loads data either from CSV or JSON, and converts the JSON data into a pandas DataFrame

In [None]:
def modify_columns(data, keep_columns=None, add_columns=None, remove_columns=None):
    """Return a new DataFrame with columns kept, added, and removed.

    The steps run in a fixed order: keep, then add, then remove. The input
    frame is never modified.

    Args:
        data: Source DataFrame.
        keep_columns: Optional list of column labels to retain; all other
            columns are discarded.
        add_columns: Optional mapping of ``{column_name: value}``. Each value
            is assigned to the whole column (existing columns are overwritten).
        remove_columns: Optional list of column labels to drop.

    Returns:
        The transformed DataFrame (always a separate object from ``data``).
    """
    # Work on an explicit copy: assigning columns below would otherwise
    # mutate the caller's frame, or write onto a slice of it and trigger
    # pandas' SettingWithCopyWarning.
    if keep_columns:
        result = data[keep_columns].copy()
    else:
        result = data.copy()

    if add_columns:
        for column, value in add_columns.items():
            result[column] = value

    if remove_columns:
        result = result.drop(columns=remove_columns)

    return result

# to keep, add, or drop specified columns

In [None]:
def convert_and_save(data, output_file, output_format, db_table=None, db_file=None):
    """Write ``data`` to disk as CSV, JSON, or a SQLite table.

    Args:
        data: DataFrame to persist.
        output_file: Destination path for ``'csv'`` and ``'json'`` output.
            Not used for ``'sqlite'`` output.
        output_format: One of ``'csv'``, ``'json'``, or ``'sqlite'``.
        db_table: Table name; required when ``output_format`` is ``'sqlite'``.
            An existing table with this name is replaced.
        db_file: SQLite database path; required when ``output_format`` is
            ``'sqlite'``.

    Raises:
        ValueError: If ``output_format`` is unsupported, or if ``'sqlite'`` is
            requested without both ``db_file`` and ``db_table``.
    """
    if output_format == 'csv':
        data.to_csv(output_file, index=False)
        print(f"Data saved as CSV to {output_file}")
    elif output_format == 'json':
        data_json = data.to_json(orient='records', indent=4)
        with open(output_file, 'w') as f:
            f.write(data_json)
        print(f"Data saved as JSON to {output_file}")
    elif output_format == 'sqlite':
        if db_file is None or db_table is None:
            raise ValueError("For SQL output, specify both db_file and db_table.")
        # Connect to db_file (not output_file): it is the path that is
        # validated above and reported below.
        conn = sqlite3.connect(db_file)
        try:
            data.to_sql(db_table, conn, if_exists='replace', index=False)
        finally:
            # Release the database handle even when the write fails.
            conn.close()
        print(f"Data saved to SQLite table '{db_table}' in database {db_file}")
    else:
        raise ValueError("Unsupported output format. Choose 'csv', 'json', or 'sqlite'.")

# save the data in the desired output format (CSV, JSON, or SQLite)

In [None]:
def store_data(data, output_format, output_file, db_table=None, db_file=None):
    """Persist ``data`` by delegating to ``convert_and_save``.

    Takes ``output_format`` before ``output_file`` and forwards them to
    ``convert_and_save`` in the order that function expects.
    """
    convert_and_save(
        data,
        output_file,
        output_format,
        db_table=db_table,
        db_file=db_file,
    )

# function to store the data in SQLite or write it to disk (as CSV or JSON)

In [None]:
def view_sql_data(db_file, db_table):
    """Load every row of a SQLite table into a DataFrame.

    Args:
        db_file: Path to the SQLite database file.
        db_table: Name of the table to read. It is interpolated into the
            query text as-is, so it must be a trusted, valid SQL identifier.

    Returns:
        A DataFrame holding the result of ``SELECT * FROM <db_table>``.
    """
    conn = sqlite3.connect(db_file)
    try:
        query = f"SELECT * FROM {db_table}"
        return pd.read_sql_query(query, conn)
    finally:
        # Always release the handle, including when the query fails
        # (for example, when the table does not exist).
        conn.close()

In [None]:
def etl_processor(input_file, input_format='csv', output_format='csv',
                  output_file='output_data', keep_columns=None, add_columns=None, remove_columns=None,
                  db_table=None, db_file=None):
    """Run the load -> modify -> store pipeline and return the modified data.

    The input is read with ``load_input_data``, reshaped with
    ``modify_columns``, and written out with ``store_data``.

    Args:
        input_file: Path of the file to read.
        input_format: Format passed to ``load_input_data``.
        output_format: Format passed to ``store_data``.
        output_file: Output path passed to ``store_data``.
        keep_columns: Forwarded to ``modify_columns``.
        add_columns: Forwarded to ``modify_columns``.
        remove_columns: Forwarded to ``modify_columns``.
        db_table: Forwarded to ``store_data``.
        db_file: Forwarded to ``store_data``.

    Returns:
        The DataFrame produced by ``modify_columns``.
    """
    raw = load_input_data(input_file, input_format)
    transformed = modify_columns(
        raw,
        keep_columns=keep_columns,
        add_columns=add_columns,
        remove_columns=remove_columns,
    )
    store_data(transformed, output_format, output_file, db_table=db_table, db_file=db_file)
    return transformed

The input CSV file, US_Census_Tract_Area_2010.csv, is read from a local file. It comes from Charlottesville Open Data and holds demographic information. It contains 12 records and 353 columns covering many demographic attributes.

In [16]:
input_file = '/content/US_Census_Tract_Area_2010.csv'  # path of the input CSV file
transformed_US2010census_data = etl_processor(
    input_file=input_file,
    input_format='csv',  # Input file format ('csv' or 'json')
    output_format='json',  # Desired output format ('csv', 'json', or 'sqlite')
    output_file='transformed_US2010census_data',  # Output path, used as-is (no extension is appended)
    keep_columns=['ID','AREA_','Population','White','Black', 'AmIndian', 'Asian', 'Hawaiian', 'Other'],  # Columns to keep
    add_columns=None,
    remove_columns=None,
    db_table='input_file',  # Table name (the literal string 'input_file'); only used for 'sqlite' output
    db_file='database.db'  # SQLite database path; only used for 'sqlite' output
)
# Read the JSON file that was just written back in, to inspect the result.
df = pd.read_json('transformed_US2010census_data')
df.head()
# Example usage for US_Census_Tract_Area_2010 for Charlottesville:
# takes the input CSV file and keeps only the columns ID, AREA_, Population,
# White, Black, AmIndian, Asian, Hawaiian, and Other;
# the output is written as JSON to the file transformed_US2010census_data

Data saved as JSON to transformed_US2010census_data


Unnamed: 0,ID,AREA_,Population,White,Black,AmIndian,Asian,Hawaiian,Other
0,9011,1.186296,4675,3505,706,3,180,2,130
1,9030,0.474023,3305,1253,1708,21,135,0,80
2,9047,0.374655,4351,2681,355,2,1087,1,55
3,9065,0.639455,3324,1471,1588,7,44,0,122
4,9084,0.370543,5617,3936,1097,12,339,3,59


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          12 non-null     int64  
 1   AREA_       12 non-null     float64
 2   Population  12 non-null     int64  
 3   White       12 non-null     int64  
 4   Black       12 non-null     int64  
 5   AmIndian    12 non-null     int64  
 6   Asian       12 non-null     int64  
 7   Hawaiian    12 non-null     int64  
 8   Other       12 non-null     int64  
dtypes: float64(1), int64(8)
memory usage: 992.0 bytes


The output DataFrame is a transformed version of US_Census_Tract_Area_2010.csv with 12 records and only 9 columns: ID, AREA_, Population, White, Black, AmIndian, Asian, Hawaiian, and Other. It narrows the data to a specific set of demographics and gets rid of columns such as State and County, which had the same value in every row.