# Data Wrangling 1.4

In [None]:
import psycopg2

import json

import csv

from datetime import datetime as dt

In [None]:
# Open the notebook-wide PostgreSQL connection used by the cells below.
# NOTE(review): the credentials are hard-coded lab defaults; do not reuse
# this cell as-is outside the course environment.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)

In [None]:
# Shared cursor on the connection above; the extract functions further
# down use this module-level name to run their queries.
cursor = connection.cursor()

# Lab: Reading Nested JSON Files 

In [None]:
def my_recursive_print_json(j, level=-1):
    """Print the structure of a parsed JSON value as an indented outline.

    Dicts print one line per key, lists print one line per index as
    "[i]", and anything else prints as "value: <str(value)>".  Each
    nesting level is indented by four more spaces.

    Parameters:
        j: the parsed JSON value (dict, list, or scalar) to print.
        level: nesting depth of the caller; the default of -1 makes the
            top-level value print at depth 0 with no indentation.
    """

    level += 1

    indent = "    " * level

    # isinstance (rather than an exact type check) so that dict and list
    # subclasses, e.g. collections.OrderedDict, are traversed instead of
    # being dumped as a single opaque value.
    if isinstance(j, dict):
        for key, value in j.items():
            # str() keeps non-string keys from breaking the concatenation
            print(indent + str(key))
            my_recursive_print_json(value, level)

    elif isinstance(j, list):
        for index, item in enumerate(j):
            print(indent + "[" + str(index) + "]")
            my_recursive_print_json(item, level)

    else:
        print(indent + "value:", str(j))
                  


In [None]:
def my_read_nested_json(file_name):
    """Read a JSON file and print its nested structure.

    Parameters:
        file_name: path of the JSON file to read.

    The parsed content is handed to my_recursive_print_json(), which
    prints it as an indented outline.  Nothing is returned.
    """

    # The context manager guarantees the file is closed, even if the
    # content is not valid JSON and json.load() raises.
    with open(file_name, "r") as f:
        j = json.load(f)

    my_recursive_print_json(j)

In [None]:
# Print the nested key / index structure of the stores JSON file.
my_read_nested_json("temp_stores_nested.json")

## You try it - we have two additional nested JSON files that we will extract next week; for now, take a look at these files on the Linux command line and also by using the function my_read_nested_json()

* temp_sales_nested.json
* temp_customers_nested.json

# Lab: Loading Nested JSON Data into SQL Database Tables

In [None]:
def _write_dict_rows_to_csv(csv_file_name, rows, fieldnames=None):
    """Write a list of dicts to a csv file, header row first.

    Parameters:
        csv_file_name: path of the csv file to create or overwrite.
        rows: list of dicts, one per csv row.
        fieldnames: column names in output order; when omitted they are
            taken from the keys of the first row.

    If there are no rows and no fieldnames were given, the columns are
    unknown and an empty file is written.
    """

    if fieldnames is None:
        fieldnames = list(rows[0].keys()) if rows else []

    # newline="" lets the csv module control the line endings itself, as
    # the csv documentation requires for files passed to a writer.
    with open(csv_file_name, "w", newline="") as f:
        if fieldnames:
            dw = csv.DictWriter(f, fieldnames)
            dw.writeheader()
            dw.writerows(rows)


def my_extract_stores_nested_json_to_csv(file_name):
    """Flatten the nested stores JSON file into four csv files.

    Parameters:
        file_name: path of a JSON file with a top-level 'stores' list.
            Each store holds a 'sales' list; each sale holds a
            'customer' dict and a 'line_items' list; each line item
            holds a 'product' dict.

    Files written to the current directory:
        temp_stores_3.csv      one row per store, sorted by store_id
        temp_customers_3.csv   one row per distinct customer, sorted by
                               customer_id
        temp_sales_3.csv       one row per sale, sorted by store_id,
                               sale_id
        temp_line_items_3.csv  one row per line item, sorted by store_id,
                               sale_id, line_item_id

    Products are not extracted.  An input with no stores produces
    header-only files (the customers file is left empty because its
    columns come from the data).
    """

    store_fields = ['store_id', 'street', 'city', 'state', 'latitude', 'longitude']
    sale_fields = ['store_id', 'sale_id', 'customer_id', 'sale_date', 'total_amount']
    line_item_fields = ['store_id', 'sale_id', 'line_item_id', 'product_id', 'quantity']

    with open(file_name, "r") as f:
        j = json.load(f)

    store_json_list = []
    customer_json_list = []
    sale_json_list = []
    line_item_json_list = []

    for store in j['stores']:

        store_id = store['store_id']

        # keep only the flat store columns; the nested 'sales' list is
        # unpacked into its own rows below
        store_json_list.append({field: store[field] for field in store_fields})

        for sale in store['sales']:

            sale_id = sale['sale_id']
            customer = sale['customer']

            sale_json_list.append({
                'store_id': store_id,
                'sale_id': sale_id,
                'customer_id': customer['customer_id'],
                'sale_date': sale['sale_date'],
                'total_amount': sale['total_amount'],
            })

            # the same customer appears once per sale; duplicates are
            # removed after sorting
            customer_json_list.append(customer)

            for line_item in sale['line_items']:

                line_item_json_list.append({
                    'store_id': store_id,
                    'sale_id': sale_id,
                    'line_item_id': line_item['line_item_id'],
                    'product_id': line_item['product']['product_id'],
                    'quantity': line_item['quantity'],
                })

    store_json_list.sort(key=lambda i: i['store_id'])
    customer_json_list.sort(key=lambda i: i['customer_id'])
    sale_json_list.sort(key=lambda i: (i['store_id'], i['sale_id']))
    line_item_json_list.sort(key=lambda i: (i['store_id'], i['sale_id'], i['line_item_id']))

    # dicts are not hashable, so de-duplicate by equality against the
    # customers already kept
    dedup_customer_json_list = []
    for customer in customer_json_list:
        if customer not in dedup_customer_json_list:
            dedup_customer_json_list.append(customer)

    _write_dict_rows_to_csv("temp_stores_3.csv", store_json_list, store_fields)
    _write_dict_rows_to_csv("temp_customers_3.csv", dedup_customer_json_list)
    _write_dict_rows_to_csv("temp_sales_3.csv", sale_json_list, sale_fields)
    _write_dict_rows_to_csv("temp_line_items_3.csv", line_item_json_list, line_item_fields)
    

In [None]:
# Flatten the nested stores JSON into the temp_*_3.csv files
# (stores, customers, sales and line items).
my_extract_stores_nested_json_to_csv("temp_stores_nested.json")

## We now have csv files, which we already know how to load into database tables

In [None]:
def my_read_csv_file(file_name, limit):
    """Print the first `limit` rows of a csv file, then a summary line.

    Parameters:
        file_name: path of the csv file to read.
        limit: maximum number of rows to print; the remaining rows are
            still read so that the total can be reported.

    Each row is printed as the list of strings produced by csv.reader.
    Nothing is returned.
    """

    total = 0

    # newline="" is what the csv module expects for files it reads, so
    # line breaks inside quoted fields are passed through unchanged; the
    # context manager closes the file once all rows are counted.
    with open(file_name, "r", newline="") as csv_file:
        for total, row in enumerate(csv.reader(csv_file), start=1):
            if total <= limit:
                print(row)

    print("\nPrinted ", min(limit, total), "lines of ", total, "total lines.")

In [None]:
# Preview the first 10 rows (header included) of the stores csv.
my_read_csv_file("temp_stores_3.csv", 10)

In [None]:
# Preview the first 10 rows (header included) of the customers csv.
my_read_csv_file("temp_customers_3.csv", 10)

In [None]:
# Preview the first 10 rows (header included) of the sales csv.
my_read_csv_file("temp_sales_3.csv", 10)

In [None]:
# Preview the first 10 rows (header included) of the line items csv.
my_read_csv_file("temp_line_items_3.csv", 10)

## You try it - the above version of my_extract_stores_nested_json_to_csv() does not extract the products; copy and modify the function so that it also extracts the products to the file temp_products_3.csv; remember to remove duplicates; the logic for products will be similar to the logic for customers

# Lab: Extracting Nested JSON Files

In [None]:
def _fetch_json_rows(query, params=None):
    """Run a row_to_json query and return the JSON value of every row.

    Uses the notebook-level `cursor` and `connection`.  Each result row
    has a single column, so the first column of each row is returned.
    """

    cursor.execute(query, params)

    # End the transaction right after executing; the rows are fetched
    # afterwards, exactly as in the rest of this notebook.
    connection.rollback()

    return [row[0] for row in cursor.fetchall()]


def my_extract_stores_nested_json(file_name):
    """Extract the store tables into one nested JSON file.

    Parameters:
        file_name: path of the JSON file to create or overwrite; it is
            also recorded inside the file under "file_name".

    The file holds a few metadata fields plus a "stores" list.  Each
    store (a row of temp_stores) gets a "sales" list (temp_sales); each
    sale embeds its "customer" (temp_customers) in place of customer_id
    and a "line_items" list (temp_line_items).  Key columns already
    implied by the nesting (store_id, sale_id) are removed from the
    nested objects.  Products are not extracted.

    Raises:
        ValueError: if a sale references a customer_id that has no row
            in temp_customers.
    """

    stores_query = """

    select row_to_json(a)
    from (select *
          from temp_stores
          order by store_id
          ) as a

    """

    sales_query = """

    select row_to_json(a)
    from (select *
          from temp_sales
          where store_id = %s
          order by store_id, sale_id) as a

    """

    customer_query = """

    select row_to_json(a)
    from (select *
          from temp_customers
          where customer_id = %s
          ) as a

    """

    line_items_query = """

    select row_to_json(a)
    from (select *
          from temp_line_items
          where store_id = %s and sale_id = %s
          order by store_id, sale_id, line_item_id
          ) as a

    """

    # clear any transaction left open or aborted by an earlier cell
    connection.rollback()

    file_json = {"creator": "Acme Gourmet Meals",
                 # year-month-day; the previous "%Y-%d-%m" format swapped
                 # the day and the month
                 "timestamp": dt.now().strftime("%Y-%m-%d %H:%M:%S"),
                 "file_name": file_name,
                 "version": "12.4.7",
                 "legal": "Unauthorized use, duplication, or possession, blah, blah",
                 "stores": []
                 }

    for store_json in _fetch_json_rows(stores_query):

        store_id = store_json['store_id']

        store_json['sales'] = []

        for sale_json in _fetch_json_rows(sales_query, (store_id,)):

            sale_id = sale_json['sale_id']
            customer_id = sale_json['customer_id']

            customer_rows = _fetch_json_rows(customer_query, (customer_id,))

            if not customer_rows:
                raise ValueError(
                    "sale " + str(sale_id) + " of store " + str(store_id)
                    + " references customer_id " + str(customer_id)
                    + ", which is not in temp_customers"
                )

            # replace the foreign key with the full customer object
            del sale_json['customer_id']
            sale_json['customer'] = customer_rows[0]

            line_item_list_json = []

            for line_item_json in _fetch_json_rows(line_items_query, (store_id, sale_id)):

                # the enclosing store and sale already identify these
                del line_item_json['store_id']
                del line_item_json['sale_id']

                line_item_list_json.append(line_item_json)

            sale_json['line_items'] = line_item_list_json

            del sale_json['store_id']

            store_json['sales'].append(sale_json)

        file_json['stores'].append(store_json)

    # the context manager closes the file even if serialization fails
    with open(file_name, "w") as f:
        json.dump(file_json, f, indent=2)

In [None]:
# Build the nested stores JSON from the temp_* tables and write it to
# temp_stores_nested_2.json.
my_extract_stores_nested_json("temp_stores_nested_2.json")

In [None]:
# Print the structure of the file written above to check the extract.
my_read_nested_json("temp_stores_nested_2.json")

## You try it - the function my_extract_stores_nested_json() does not extract the products into the JSON file; copy the function and modify it to also add products; compare with temp_stores_nested.json, the file we looked at earlier, to see the format