In [12]:
pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
import pandas as pd
import json
import sqlite3
import os
from tabulate import tabulate

def load_data(file_path, file_type):
    """Load a JSON or CSV file into memory.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.
    file_type : str
        ``'json'`` or ``'csv'``. Any other value is treated as an error.

    Returns
    -------
    object or None
        Whatever ``json.load`` yields for ``'json'``, a ``pandas.DataFrame``
        for ``'csv'``, or ``None`` when loading fails for any reason
        (unsupported type, missing file, parse error). Failures are printed,
        not raised, so callers must check for ``None``.
    """
    try:
        if file_type == 'json':
            # JSON text is UTF-8 by specification; reading it with the
            # platform's locale encoding can garble or reject non-ASCII data.
            with open(file_path, 'r', encoding='utf-8') as file:
                return json.load(file)
        elif file_type == 'csv':
            return pd.read_csv(file_path)
        else:
            raise ValueError("Unsupported file type")
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data: {e}")
        return None

def json_to_csv(json_data, output_path, record_path='battingSummary'):
    """Flatten nested JSON records and write them to a CSV file.

    Parameters
    ----------
    json_data : dict or list of dict
        Parsed JSON containing the records to flatten.
    output_path : str or path-like
        Destination CSV file; written without the index column.
    record_path : str or list of str, optional
        Key (or key path) inside each JSON object that holds the list of
        records, passed straight to ``pandas.json_normalize``. Defaults to
        ``'battingSummary'``, the key this function previously hard-coded.

    Returns
    -------
    None
        Success or failure is reported with ``print``; exceptions are not
        propagated.
    """
    try:
        data = pd.json_normalize(json_data, record_path)
        data.to_csv(output_path, index=False)
        print(f"Converted JSON to CSV at {output_path}")
    except Exception as e:
        # Best-effort conversion: report and carry on instead of raising.
        print(f"Error converting JSON to CSV: {e}")

def csv_to_json(csv_data, output_path):
    """Write a DataFrame to ``output_path`` as a JSON array of row objects.

    The file is written with ``orient='records'`` and a 4-space indent.
    Nothing is returned; the outcome (success or the caught exception) is
    printed.
    """
    try:
        csv_data.to_json(path_or_buf=output_path, orient="records", indent=4)
        print(f"Converted CSV to JSON at {output_path}")
    except Exception as err:
        print(f"Error converting CSV to JSON: {err}")

def modify_columns(data, add_columns=None, drop_columns=None):
    """Return ``data`` with columns added and/or dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is never modified in place.
    add_columns : dict, optional
        Mapping of column name to the value (scalar or sequence) assigned to
        that column. Additions are applied before drops.
    drop_columns : list, optional
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame when any change was requested; the original object when
        both ``add_columns`` and ``drop_columns`` are empty or ``None``.
    """
    if add_columns:
        # Copy first: assigning columns directly would silently alter the
        # caller's frame, whereas the drop below already returns a new one.
        data = data.copy()
        for col_name, value in add_columns.items():
            data[col_name] = value
    if drop_columns:
        data = data.drop(columns=drop_columns)
    return data

def save_to_db(data, table_name, db_name='etl_data.db'):
    """Write a DataFrame to a SQLite table, replacing any existing table.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to store; the index is not written.
    table_name : str
        Destination table. An existing table of that name is replaced.
    db_name : str, optional
        Path of the SQLite database file (default ``'etl_data.db'``).

    Returns
    -------
    None
        Success or failure is reported with ``print``; exceptions are not
        propagated.
    """
    try:
        conn = sqlite3.connect(db_name)
        try:
            data.to_sql(table_name, conn, if_exists='replace', index=False)
            print(f"Saved data to table '{table_name}' in {db_name}")
        finally:
            # Always release the connection, even when the write fails.
            conn.close()
    except Exception as e:
        print(f"Error saving to database: {e}")

def generate_summary(data):
    """Print a short shape report for a DataFrame and return its dimensions.

    Returns ``(number_of_rows, number_of_columns)`` for a
    ``pandas.DataFrame``. Any other input prints a notice and yields
    ``(0, 0)``.
    """
    # Guard clause: only DataFrames can be summarised.
    if not isinstance(data, pd.DataFrame):
        print("No summary available for this data format.")
        return 0, 0

    row_total, col_total = len(data), len(data.columns)
    report = [
        "Summary:",
        f"Number of records: {row_total}",
        f"Number of columns: {col_total}",
        "Column names:",
    ]
    report.extend(f"- {label}" for label in data.columns)
    print("\n".join(report))
    return row_total, col_total

def display_sqlite_table(db_name, table_name):
    """Print the first five rows of a SQLite table as a grid.

    Parameters
    ----------
    db_name : str
        Path of the SQLite database file.
    table_name : str
        Table to preview.

    Returns
    -------
    None
        The preview (or the caught error) is printed; exceptions are not
        propagated.
    """
    try:
        conn = sqlite3.connect(db_name)
        try:
            # Table names cannot be bound as SQL parameters, so quote the
            # identifier instead (doubling embedded quotes). This keeps names
            # with dashes, spaces or keywords working and blocks injection
            # through the name.
            quoted_table = '"' + str(table_name).replace('"', '""') + '"'
            cursor = conn.cursor()
            cursor.execute(f"SELECT * FROM {quoted_table} LIMIT 5")
            rows = cursor.fetchall()
            # cursor.description holds one tuple per column; item 0 is its name.
            headers = [description[0] for description in cursor.description]
            print(f"\nFirst 5 rows of table '{table_name}':")
            print(tabulate(rows, headers=headers, tablefmt="grid"))
        finally:
            # Always release the connection, even when the query fails.
            conn.close()
    except Exception as e:
        print(f"Error displaying SQLite table: {e}")

def etl_pipeline(input_file, input_type, output_type, modify=True,
                 csv_output='output.csv', json_output='output.json',
                 db_name='etl_data.db'):
    """Load a file, convert or store it, and print before/after summaries.

    Parameters
    ----------
    input_file : str
        Path of the file to ingest.
    input_type : str
        ``'csv'`` or ``'json'``.
    output_type : str
        ``'csv'``, ``'json'`` or ``'sql'``. A combination with no matching
        branch (for example csv to csv) passes the loaded data through
        unchanged.
    modify : bool, optional
        When true and the result is a DataFrame, a ``'NewColumn'`` column
        filled with 0 is added. This happens after the output has been
        written, so the column appears only in the printed summary.
    csv_output, json_output : str, optional
        Destination files for the json-to-csv and csv-to-json conversions.
    db_name : str, optional
        SQLite database file used when ``output_type`` is ``'sql'``.

    Returns
    -------
    None
        All results are printed. The function returns early when the input,
        or the freshly written output, cannot be loaded.
    """
    data = load_data(input_file, input_type)
    if data is None:
        return

    print("\n--- Data File Ingestion Summary ---")
    input_records, input_columns = generate_summary(data)

    if input_type == 'json' and output_type == 'csv':
        json_to_csv(data, csv_output)
        # The converter only prints on failure, so read the result back through
        # load_data and stop cleanly if no readable file is there.
        # NOTE: a stale file from an earlier run would still be picked up here.
        output_data = load_data(csv_output, 'csv')
        if output_data is None:
            return
    elif input_type == 'csv' and output_type == 'json':
        csv_to_json(data, json_output)
        output_data = load_data(json_output, 'json')
        if output_data is None:
            return
    elif output_type == 'sql':
        if input_type == 'json':
            # Flatten the nested records so they fit a single table.
            data = pd.json_normalize(data, 'battingSummary')
        # The table is named after the input file, minus directory and extension.
        table_name = os.path.splitext(os.path.basename(input_file))[0]
        save_to_db(data, table_name, db_name)
        display_sqlite_table(db_name, table_name)
        output_data = data
    else:
        output_data = data

    if modify and isinstance(output_data, pd.DataFrame):
        output_data = modify_columns(output_data, add_columns={'NewColumn': 0})

    print("\n--- Post Processing Summary ---")
    output_records, output_columns = generate_summary(output_data)

    print("\n--- Summary Comparison ---")
    print(f"Input:  {input_records} records, {input_columns} columns")
    print(f"Output: {output_records} records, {output_columns} columns")

def main():
    """Prompt for the input file and the input/output types, then run the pipeline.

    The two type answers are stripped of surrounding whitespace and
    lower-cased, so an answer such as ``"CSV "`` is accepted. The file name
    is passed on exactly as typed.
    """
    input_file = input("Enter the input file name: ")
    # Strip as well as lower-case: a stray space would otherwise make a valid
    # type look unsupported.
    input_type = input("Enter the input file type (csv/json): ").strip().lower()
    output_type = input("Enter the desired output type (csv/json/sql): ").strip().lower()

    etl_pipeline(input_file, input_type, output_type)

# Entry point: start the interactive prompts when this code runs as the main
# module (the captured output below shows it also fires when the cell is run).
if __name__ == "__main__":
    main()


--- Data File Ingestion Summary ---
No summary available for this data format.
Saved data to table 't20_wc_batting_summary' in etl_data.db

First 5 rows of table 't20_wc_batting_summary':
+----------------------+---------------+--------------+------------------------+---------------------------------+--------+---------+------+------+--------+
| match                | teamInnings   |   battingPos | batsmanName            | dismissal                       |   runs |   balls |   4s |   6s |     SR |
| Namibia Vs Sri Lanka | Namibia       |            1 | Michael van Lingen     | c Pramod Madushan b Chameera    |      3 |       6 |    0 |    0 |  50    |
+----------------------+---------------+--------------+------------------------+---------------------------------+--------+---------+------+------+--------+
| Namibia Vs Sri Lanka | Namibia       |            2 | Divan la Cock          | c Shanaka b Pramod Madushan     |      9 |       9 |    1 |    0 | 100    |
+----------------------+--