# Data Wrangling 1.2 Solutions

In [None]:
import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [None]:
#
# function to run a select query and return the rows in a pandas dataframe
# pandas can return whole-number columns from postgres as float
# (for example, when the column contains nulls)
# if every value in a float column is a whole number, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query is run on the module-level ``connection``.  Every float64
    column whose non-null values are all whole numbers is converted to the
    nullable pandas ``Int64`` dtype, so nulls are kept as ``<NA>``.

    Args:
        query: SQL text handed to ``pd.read_sql_query``.
        rollback_before_flag: if true, call ``connection.rollback()`` before
            running the query.
        rollback_after_flag: if true, call ``connection.rollback()`` after
            running the query.

    Returns:
        The query result as a pandas dataframe.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        # nulls (NaN) are ignored when deciding; Int64 can store them as <NA>
        values = df[column].dropna().to_numpy()

        # +/-infinity has no integer form, so such a column stays float
        # (math.floor() on infinity would raise OverflowError)
        if not np.isfinite(values).all():
            continue

        # one vectorized comparison instead of a python loop over every value
        if (values == np.floor(values)).all():
            df[column] = df[column].astype("Int64")

    return df
    

In [None]:
# open the database connection shared by every cell in this notebook
# NOTE(review): the credentials are hard-coded here - fine for a lab container,
# but confirm before reusing this outside the lab environment
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [None]:
# cursor used by the cells below to execute the copy statements
cursor = connection.cursor()

## You try it - read and print out some lines from the following csv files: temp_line_items.csv, temp_customers.csv, temp_products.csv, temp_holidays.csv

In [None]:
def my_read_csv_file(file_name, limit):
    """Read a csv file and print only the first ``limit`` rows.

    Each row is printed as the list of strings produced by ``csv.reader``.
    The whole file is still read so that the total number of rows can be
    reported in the summary line printed at the end.

    Args:
        file_name: path of the csv file to read.
        limit: maximum number of rows to print.
    """

    total = 0

    # the with block closes the file even if reading fails;
    # newline="" lets the csv module handle line endings inside quoted fields
    with open(file_name, "r", newline="") as csv_file:

        for total, row in enumerate(csv.reader(csv_file), start=1):
            if total <= limit:
                print(row)

    print("\nPrinted ", min(limit, total), "lines of ", total, "total lines.")

In [None]:
# print up to the first 10 rows of temp_line_items.csv
my_read_csv_file("temp_line_items.csv", limit=10)

In [None]:
# print up to the first 10 rows of temp_customers.csv
my_read_csv_file("temp_customers.csv", limit=10)

In [None]:
# print up to the first 10 rows of temp_products.csv
my_read_csv_file("temp_products.csv", limit=10)

In [None]:
# print up to the first 100 rows of temp_holidays.csv (a higher limit than the other files)
my_read_csv_file("temp_holidays.csv", limit=100)

## You try it - 
* load the file temp_holidays.csv into the table temp_holidays 
* temp_products.csv into table temp_products
* temp_line_items.csv into table temp_line_items
* verify the loads with a query

In [None]:
#
# load the csv files into the database tables in foreign key order
#
# all three copy statements are sent to the server in one execute() call;
# commit() is only reached if execute() does not raise
#
# NOTE(review): copy ... from '<path>' names a file that the postgres server
# reads, so these paths must exist on the database host - confirm the
# directory mapping before changing them
#

# discard any open transaction before starting the load
connection.rollback()

query = """

copy temp_holidays
from '/user/labs/week_06/temp_holidays.csv' delimiter ',' NULL '' csv header;

copy temp_products
from '/user/labs/week_06/temp_products.csv' delimiter ',' NULL '' csv header;

copy temp_line_items
from '/user/labs/week_06/temp_line_items.csv' delimiter ',' NULL '' csv header;


"""

cursor.execute(query)

# make the loaded rows permanent
connection.commit()

In [None]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_holidays;

"""

# verify the load: return every row of temp_holidays as a dataframe
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [None]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_products;

"""

# verify the load: return every row of temp_products as a dataframe
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [None]:
# ask the helper to call connection.rollback() both before and after the query
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from temp_line_items;

"""

# verify the load: return every row of temp_line_items as a dataframe
my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

## You try it - 
* extract the table temp_sales to temp_sales_2.csv
* table temp_line_items to temp_line_items_2.csv
* table temp_customers to temp_customers_2.csv 
* table temp_products to temp_products_2.csv
* table temp_holidays to temp_holidays_2.csv
* verify by reading the csv files

In [None]:
# extract each table to a csv file, one copy ... to statement per table
# each select has an order by, so the rows are written in a fixed order
# NOTE(review): copy ... to '<path>' names a file that the postgres server
# writes, so the directory must be writable on the database host - confirm

# discard any open transaction before starting the extract
connection.rollback()
    
query = """
    
copy (select * 
      from temp_sales 
      order by store_id, sale_id)
to '/user/labs/week_06/temp_sales_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_line_items 
      order by store_id, sale_id, line_item_id)
to '/user/labs/week_06/temp_line_items_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_customers
      order by customer_id)
to '/user/labs/week_06/temp_customers_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_products
      order by product_id)
to '/user/labs/week_06/temp_products_2.csv' delimiter ',' NULL '' csv header;

copy (select * 
      from temp_holidays
      order by holiday_date)
to '/user/labs/week_06/temp_holidays_2.csv' delimiter ',' NULL '' csv header;


"""

# all five copy statements are sent to the server in one execute() call
cursor.execute(query)
    
# end the transaction; only reached if execute() does not raise
connection.commit()


In [None]:
# verify the extract: print up to the first 10 rows of temp_sales_2.csv
my_read_csv_file("temp_sales_2.csv", limit=10)

In [None]:
# verify the extract: print up to the first 10 rows of temp_line_items_2.csv
my_read_csv_file("temp_line_items_2.csv", limit=10)

In [None]:
# verify the extract: print up to the first 10 rows of temp_customers_2.csv
my_read_csv_file("temp_customers_2.csv", limit=10)

In [None]:
# verify the extract: print up to the first 10 rows of temp_products_2.csv
my_read_csv_file("temp_products_2.csv", limit=10)

In [None]:
# verify the extract: print up to the first 10 rows of temp_holidays_2.csv
my_read_csv_file("temp_holidays_2.csv", limit=10)