# Data Wrangling 1.3 Solutions

In [None]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import csv

import pprint

from datetime import datetime as dt

In [None]:
# Open the PostgreSQL connection that my_extract_flat_json uses further down.
# NOTE(review): the credentials are hard-coded; presumably this targets a
# local course/dev database - confirm before reusing this cell elsewhere.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [None]:
# Single module-level cursor; my_extract_flat_json executes its queries on it.
cursor = connection.cursor()

## You try it - read and understand the structure of the following json files:

* temp_sales_no_header.json
* temp_sales_header.json
* temp_sales_big_data.json

* temp_line_items_no_header.json
* temp_line_items_header.json
* temp_line_items_big_data.json

* temp_customers_no_header.json
* temp_customers_header.json
* temp_customers_big_data.json

* temp_products_no_header.json
* temp_products_header.json
* temp_products_big_data.json

* temp_holidays_no_header.json
* temp_holidays_header.json
* temp_holidays_big_data.json


In [None]:
def my_read_flat_json(file_name, limit_lines, limit_json):
    """Print a preview of a flat json file, as raw text and as parsed records.

    Three layouts are recognized:

    * no header - the whole file is a single json array of records
    * header    - the whole file is a single json object whose "data" key
                  holds the array of records
    * big data  - one json object per line (detected when the first
                  non-blank line starts with "{" and ends with "}")

    Args:
        file_name: path of the json file to read.
        limit_lines: maximum number of raw text lines to print; zero or a
            negative value prints every line.
        limit_json: maximum number of parsed records to print; zero or a
            negative value prints every record.

    Returns:
        None. Everything is written to standard output.

    Raises:
        json.JSONDecodeError: if the content is not valid json.
        KeyError: if the file is a single json object without a "data" key.
    """

    print("------------------------------------")
    print("   ", file_name)
    print("------------------------------------")

    # The context manager closes the file even if reading fails.
    with open(file_name, "r", encoding="utf-8") as f:
        data = f.read()

    lines = data.splitlines(False)

    shown_lines = lines[:limit_lines] if limit_lines > 0 else lines
    for line in shown_lines:
        print(line)

    # Report how many lines were really printed, not the requested limit.
    print("\n>>> Printed", len(shown_lines), "lines of", len(lines), "total lines.")

    # Blank lines carry no record; ignoring them keeps a stray empty line in
    # a big data file from breaking the per-line parse.
    content_lines = [line.strip() for line in lines if line.strip()]

    if not content_lines:
        # Empty file: nothing to parse, so there are no records to show.
        json_list = []
    else:
        first = content_lines[0]
        if first.startswith("{") and first.endswith("}"):
            # big data format: every line is its own json document
            json_temp = [json.loads(line) for line in content_lines]
        else:
            json_temp = json.loads(data)

        # A top-level object means a header wraps the records under "data".
        if isinstance(json_temp, dict):
            json_list = json_temp["data"]
        else:
            json_list = json_temp

    shown_records = json_list[:limit_json] if limit_json > 0 else json_list
    for i, j in enumerate(shown_records):
        print("\n>>>JSON Object #", i, "unformatted:\n\n", j)
        print("\n>>>JSON Object #", i, "pretty printed:\n")
        pprint.pprint(j, sort_dicts=False, indent=2)

    print('\n')
    

In [None]:
# Preview every sample file: up to 25 raw lines, then up to 3 parsed records.
my_read_flat_json("temp_sales_no_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_sales_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_sales_big_data.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_no_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_big_data.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_no_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_big_data.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_no_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_big_data.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_no_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_header.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_big_data.json", 25, 3)


## You try it - convert the following flat json files to csv format; read the csv files to verify that they are ready for loading into a database:


* temp_sales_no_header.json
* temp_sales_header.json
* temp_sales_big_data.json

* temp_line_items_no_header.json
* temp_line_items_header.json
* temp_line_items_big_data.json

* temp_customers_no_header.json
* temp_customers_header.json
* temp_customers_big_data.json

* temp_products_no_header.json
* temp_products_header.json
* temp_products_big_data.json

* temp_holidays_no_header.json
* temp_holidays_header.json
* temp_holidays_big_data.json


In [None]:
def my_flat_json_2_csv(file_name):
    """Convert a flat json file to a csv file with a header row.

    The input may be in any of three layouts: a json array of records
    (no header), a json object holding the records under its "data" key
    (header), or one json object per line (big data).

    The output is written next to the input, with the input's extension
    replaced by ".csv" (e.g. "temp_sales_header.json" becomes
    "temp_sales_header.csv"). The column order follows the first record;
    keys that only appear in later records are appended as extra columns,
    and records missing a key get an empty cell for it.

    Args:
        file_name: path of the json file to convert.

    Returns:
        None. If the input holds no records an empty csv file is written.

    Raises:
        json.JSONDecodeError: if the content is not valid json.
        KeyError: if the file is a single json object without a "data" key.
    """
    import os  # local import: only needed to swap the file extension

    with open(file_name, "r", encoding="utf-8") as f:
        data = f.read()

    # Blank lines carry no record; ignoring them keeps a stray empty line in
    # a big data file from breaking the per-line parse.
    lines = [line.strip() for line in data.splitlines(False) if line.strip()]

    if not lines:
        json_list = []
    else:
        first = lines[0]
        if first.startswith("{") and first.endswith("}"):
            # big data format: every line is its own json document
            json_temp = [json.loads(line) for line in lines]
        else:
            json_temp = json.loads(data)

        # A top-level object means a header wraps the records under "data".
        if isinstance(json_temp, dict):
            json_list = json_temp["data"]
        else:
            json_list = json_temp

    # Ordered union of all keys, so a record with an extra key cannot make
    # DictWriter raise half-way through the file.
    field_names = list(dict.fromkeys(key for record in json_list for key in record))

    output_file_name = os.path.splitext(file_name)[0] + ".csv"

    # The input is fully parsed before the output is opened, so a bad input
    # file never leaves a truncated csv behind. newline="" is required by
    # the csv module so it controls the line endings itself.
    with open(output_file_name, "w", newline="", encoding="utf-8") as f:
        if field_names:
            dw = csv.DictWriter(f, field_names)
            dw.writeheader()
            dw.writerows(json_list)

In [None]:
def my_read_csv_file(file_name, limit):
    """Print the first rows of a csv file, followed by a summary line.

    Args:
        file_name: path of the csv file to read.
        limit: maximum number of rows to print (the header row counts as a
            row); zero or a negative value prints no rows.

    Returns:
        None. Each row is printed as a list of strings, then a line
        stating how many rows were printed out of the total.
    """

    printed = 0
    total = 0

    # The context manager closes the file (the handle used to be leaked);
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(file_name, "r", newline="", encoding="utf-8") as csv_file:
        for row in csv.reader(csv_file):
            total += 1
            if total <= limit:
                print(row)
                printed += 1

    # Count what was really printed so a negative limit reports 0, not -1.
    print("\nPrinted ", printed, "lines of ", total, "total lines.")

In [None]:
# Convert every sample json file to a csv file.
my_flat_json_2_csv("temp_sales_no_header.json")

In [None]:
my_flat_json_2_csv("temp_sales_header.json")

In [None]:
my_flat_json_2_csv("temp_sales_big_data.json")

In [None]:
my_flat_json_2_csv("temp_line_items_no_header.json")

In [None]:
my_flat_json_2_csv("temp_line_items_header.json")

In [None]:
my_flat_json_2_csv("temp_line_items_big_data.json")

In [None]:
my_flat_json_2_csv("temp_customers_no_header.json")

In [None]:
my_flat_json_2_csv("temp_customers_header.json")

In [None]:
my_flat_json_2_csv("temp_customers_big_data.json")

In [None]:
my_flat_json_2_csv("temp_products_no_header.json")

In [None]:
my_flat_json_2_csv("temp_products_header.json")

In [None]:
my_flat_json_2_csv("temp_products_big_data.json")

In [None]:
my_flat_json_2_csv("temp_holidays_no_header.json")

In [None]:
my_flat_json_2_csv("temp_holidays_header.json")

In [None]:
my_flat_json_2_csv("temp_holidays_big_data.json")

In [None]:
# Print up to 25 rows of each csv file produced by the conversion cells.
my_read_csv_file("temp_sales_no_header.csv", 25)

In [None]:
my_read_csv_file("temp_sales_header.csv", 25)

In [None]:
my_read_csv_file("temp_sales_big_data.csv", 25)

In [None]:
my_read_csv_file("temp_line_items_no_header.csv", 25)

In [None]:
my_read_csv_file("temp_line_items_header.csv", 25)

In [None]:
my_read_csv_file("temp_line_items_big_data.csv", 25)

In [None]:
my_read_csv_file("temp_customers_no_header.csv", 25)

In [None]:
my_read_csv_file("temp_customers_header.csv", 25)

In [None]:
my_read_csv_file("temp_customers_big_data.csv", 25)

In [None]:
my_read_csv_file("temp_products_no_header.csv", 25)

In [None]:
my_read_csv_file("temp_products_header.csv", 25)

In [None]:
my_read_csv_file("temp_products_big_data.csv", 25)

In [None]:
my_read_csv_file("temp_holidays_no_header.csv", 25)

In [None]:
my_read_csv_file("temp_holidays_header.csv", 25)

In [None]:
my_read_csv_file("temp_holidays_big_data.csv", 25)

## You try it - extract the following database tables into 3 separate flat json files (no header, header, and big data); read each file to verify:

* temp_sales to temp_sales_no_header_2.json, temp_sales_header_2.json, temp_sales_big_data_2.json
* temp_line_items to temp_line_items_no_header_2.json, temp_line_items_header_2.json, temp_line_items_big_data_2.json
* temp_customers to temp_customers_no_header_2.json, temp_customers_header_2.json, temp_customers_big_data_2.json
* temp_products to temp_products_no_header_2.json, temp_products_header_2.json, temp_products_big_data_2.json
* temp_holidays to temp_holidays_no_header_2.json, temp_holidays_header_2.json, temp_holidays_big_data_2.json


In [None]:
def my_extract_flat_json(query, file_name, file_type):
    """Run a query and write its result to a flat json file.

    Only the first column of each returned row is used, so the query is
    expected to produce one json value per row (the queries in this
    notebook use row_to_json for that). The module-level `connection` and
    `cursor` objects are used to run the query.

    Args:
        query: SQL text to execute.
        file_name: path of the json file to write.
        file_type: output layout, 1 = no header (a json array),
            2 = header (a json object with metadata and the records under
            "data"), 3 = big data (one json object per line).

    Returns:
        None.

    Raises:
        ValueError: if file_type is not 1, 2 or 3. This is checked before
            the query runs and before the output file is touched, so an
            invalid value no longer leaves an empty file behind.
    """

    if file_type not in (1, 2, 3):
        raise ValueError(
            "file_type must be 1 (no header), 2 (header) or 3 (big data), got "
            + repr(file_type)
        )

    # Presumably clears a transaction left aborted by an earlier failed
    # statement, so this query can run - confirm against notebook usage.
    connection.rollback()

    cursor.execute(query)

    rows = cursor.fetchall()

    # The query only reads, so end its transaction once the rows are in hand.
    connection.rollback()

    list_of_json = [row[0] for row in rows]

    # The context manager closes the file even if serialization fails.
    with open(file_name, "w", encoding="utf-8") as f:

        # flat json with no headers
        if file_type == 1:
            json.dump(list_of_json, f, indent=2)

        # flat json with a header
        elif file_type == 2:
            template = {"creator": "Acme Gourmet Meals",
                        # year-month-day; the previous "%Y-%d-%m" swapped
                        # day and month
                        "timestamp": dt.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "file_name": file_name,
                        "version": "12.4.7",
                        "legal": "Unauthorized use, duplication, or possession, blah, blah",
                        "data": list_of_json
                       }
            json.dump(template, f, indent=2)

        # flat json big data style
        else:
            f.writelines(json.dumps(j) + "\n" for j in list_of_json)
    

In [None]:
# Extract temp_sales, ordered by store and sale, into all three layouts.
query = """

select row_to_json(a) 
from (select *
      from temp_sales
      order by store_id, sale_id) as a
      
"""

my_extract_flat_json(query, "temp_sales_no_header_2.json", 1)
my_extract_flat_json(query, "temp_sales_header_2.json", 2)
my_extract_flat_json(query, "temp_sales_big_data_2.json", 3)


In [None]:
# Extract temp_line_items, ordered by store, sale and line item, into all three layouts.
query = """

select row_to_json(a) 
from (select *
      from temp_line_items
      order by store_id, sale_id, line_item_id) as a
      
"""

my_extract_flat_json(query, "temp_line_items_no_header_2.json", 1)
my_extract_flat_json(query, "temp_line_items_header_2.json", 2)
my_extract_flat_json(query, "temp_line_items_big_data_2.json", 3)


In [None]:
# Extract temp_customers, ordered by customer, into all three layouts.
query = """

select row_to_json(a) 
from (select *
      from temp_customers
      order by customer_id) as a
      
"""

my_extract_flat_json(query, "temp_customers_no_header_2.json", 1)
my_extract_flat_json(query, "temp_customers_header_2.json", 2)
my_extract_flat_json(query, "temp_customers_big_data_2.json", 3)


In [None]:
# Extract temp_products, ordered by product, into all three layouts.
query = """

select row_to_json(a) 
from (select *
      from temp_products
      order by product_id) as a
      
"""

my_extract_flat_json(query, "temp_products_no_header_2.json", 1)
my_extract_flat_json(query, "temp_products_header_2.json", 2)
my_extract_flat_json(query, "temp_products_big_data_2.json", 3)


In [None]:
# Extract temp_holidays, ordered by holiday date, into all three layouts.
query = """

select row_to_json(a) 
from (select *
      from temp_holidays
      order by holiday_date) as a
      
"""

my_extract_flat_json(query, "temp_holidays_no_header_2.json", 1)
my_extract_flat_json(query, "temp_holidays_header_2.json", 2)
my_extract_flat_json(query, "temp_holidays_big_data_2.json", 3)


In [None]:
# Read back every extracted file: up to 25 raw lines, then up to 3 parsed records.
my_read_flat_json("temp_sales_no_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_sales_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_sales_big_data_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_no_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_line_items_big_data_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_no_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_customers_big_data_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_no_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_products_big_data_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_no_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_header_2.json", 25, 3)

In [None]:
my_read_flat_json("temp_holidays_big_data_2.json", 25, 3)