In [None]:
# Phils Section Start

# ETL Project

In [None]:
# Dependencies

# Standard library: reading, writing to, and unzipping files/folders; timing;
# warning control.
import glob
import os, zipfile
import shutil
import time
import warnings
from pathlib import Path

# Data handling.
import numpy as np
import pandas as pd

# Used for making database connection and listing the tables in it.
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.orm import Session

# Used to abstract classes into tables.
from sqlalchemy.ext.declarative import declarative_base

# Used to declare column types.
from sqlalchemy import Column, Integer, String, Float, ForeignKey

# Used to load pandas dataframe into sql.
import d6tstack.utils

# Database credentials
from config import username, password

# Ignore warnings.
warnings.filterwarnings('ignore')

In [None]:
# Constants

# Input CSV files, relative to the project root.
COMMODITY_TRADE_CSV = os.path.join(".", "Resources", "commodity_trade_statistics_data.csv")
FINANCIAL_AID_CSV = os.path.join(".", "Resources", "june-9-data-csv-1.csv")

# Database name and table names.
DB_NAME = "trade_db"
COMMODITY_TABLE = "commodity"
COMMODITY_CATEGORY_TABLE = "commodity_category"
COMMODITY_CODE_TABLE = "commodity_code"
COUNTRY_TABLE = "country"
# Defined here because the AidCategory class further down uses it as its
# table name, and the only other definition sits in a cell marked "DO NOT RUN".
FINANCIAL_AID_TABLE = "financial_aid"

# Connection URI passed to d6tstack's pd_to_psql when loading dataframes.
cfg_uri_psql = f"postgresql+psycopg2://{username}:{password}@localhost/{DB_NAME}"

## Unzip data files in Resources folder

Running the following cell extracts the zip files in the Resources folder; they contain the csv files needed for this project.

In [None]:
# Running this cell will unzip the data files in the Resources folder.
extension = ".zip"
extracted_dir_name = "."  # relative to the Resources folder, i.e. extract in place

# Need to be in the root directory of this project for this to work.
# The Resources folder is addressed by path instead of os.chdir(), so an error
# part-way through can never leave the kernel in the wrong working directory.
cwd_dir_name = os.getcwd()
print(f"The current working directory is {cwd_dir_name}.")

dir_name = os.path.join(cwd_dir_name, "Resources")
if not os.path.isdir(dir_name):
    raise FileNotFoundError(f"Resources folder not found: {dir_name}")
print(f"Looking for zip files in the following directory: {dir_name}.")

# Directory the archives are extracted into (the Resources folder itself).
unzipped_directory = os.path.normpath(os.path.join(dir_name, extracted_dir_name))

for item in sorted(os.listdir(dir_name)):  # loop through the items in the directory.
    if not item.endswith(extension):  # only ".zip" files are of interest
        continue
    file_name = os.path.join(dir_name, item)  # full path of the archive
    try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file_name) as zip_ref:
            zip_ref.extractall(unzipped_directory)
        print(f"Successfully unzipped {item} into the following folder:{dir_name}.")
    except Exception as error:
        # Stay best-effort (carry on with the next archive) but report which
        # file failed and why, and let KeyboardInterrupt through.
        print(f"Error trying to unzip data file(s): {item}: {error}")
        print(f"Make sure that the files are closed and you have the correct file/folder permissions.")

# The working directory was never changed; print it as confirmation.
print(f"The current working directory is still {os.getcwd()}.")

## Store commodity csv into pandas dataframe

In [None]:
# Convert the commodity CSV to an HDF5 file once. The next cell always loads
# the dataframe from 'commodity_trade.h5', so re-parsing the CSV and rewriting
# the HDF5 file on every run only adds time.
# Delete 'commodity_trade.h5' to force a rebuild from the CSV.
if not os.path.exists('commodity_trade.h5'):
    commodity_trade_df = pd.read_csv(COMMODITY_TRADE_CSV, low_memory=False, encoding="utf-8")

    commodity_trade_df.to_hdf('commodity_trade.h5', key='df', mode='w')

In [None]:
# Load the commodity data back from the HDF5 file written above; the author
# found read_hdf to be faster than read_csv for really large datasets.
commodity_trade_df = pd.read_hdf('commodity_trade.h5', key='df')

commodity_trade_df

## Rename columns for commodity dataframe

In [None]:
# Give four of the source columns more descriptive names; all other columns
# keep their original names.
commodity_trade_renamed_columns = commodity_trade_df.rename(
    {
        "comm_code": "commodity_code",
        "commodity": "commodity_description",
        "flow": "trade_flow",
        "trade_usd": "trade_value_usd",
    },
    axis="columns",
)

commodity_trade_renamed_columns

## Drop null values from commodity dataframe

In [None]:
# Keep only the rows in which every column has a value.
commodity_trade_no_null = commodity_trade_renamed_columns.dropna(axis="index", how="any")

commodity_trade_no_null

## Verify commodity dataframe count

In [None]:
# Non-null count per column; after the dropna above every column should report the same number.
commodity_trade_no_null.count()

## Add auto incrementing id column to commodity dataframe

In [None]:
# Add a 0-based running "id" as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if "id" not in commodity_trade_no_null.columns:
    commodity_trade_no_null.insert(0, "id", range(len(commodity_trade_no_null)))

commodity_trade_no_null

## Create new category dataframe

In [None]:
# Split every "category" value once, at its first underscore. The part before
# the underscore lands in column 0 and the remainder in column 1 of a new
# dataframe.
category_df = commodity_trade_no_null["category"].str.split(pat="_", n=1, expand=True)

# Column 0 is stored on the trade dataframe as "category_id" ...
commodity_trade_no_null["category_id"] = category_df[0]

# ... and the original "category" column is removed in place.
del commodity_trade_no_null["category"]

commodity_trade_no_null

In [None]:
# Name the two split columns, then keep the first row seen for each category_id.
category_df = (
    category_df
    .rename(columns={0: "category_id", 1: "category_name"})
    .drop_duplicates(subset="category_id")
)

category_df

## Create new commodity codes dataframe

In [None]:
# Lookup dataframe of commodity codes: take the code and description columns,
# keep the first row seen for each code, then drop rows with a missing value.
commodity_codes_df = (
    commodity_trade_no_null
    .loc[:, ["commodity_code", "commodity_description"]]
    .drop_duplicates(subset="commodity_code")
    .dropna(how="any")
)

commodity_codes_df

In [None]:
# The description now lives in commodity_codes_df, so remove it (in place)
# from the trade dataframe.
commodity_trade_no_null.drop(columns=["commodity_description"], inplace=True)

commodity_trade_no_null

## Create new countries dataframe

In [None]:
# Lookup dataframe of countries: take the country name column, keep one row
# per distinct name, then drop rows with a missing value. The id column is
# added in the next cell.
countries_df = (
    commodity_trade_no_null
    .loc[:, ["country_or_area"]]
    .drop_duplicates(subset="country_or_area")
    .dropna(how="any")
)

countries_df

### Add auto-incrementing id column to countries dataframe


In [None]:
# Add a 0-based running "id" as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if "id" not in countries_df.columns:
    countries_df.insert(0, "id", range(len(countries_df)))

countries_df

## Merge countries dataframe with commodity dataframe on country name

In [None]:
# Attach each trade row's country id by joining on the country name. Both
# frames carry an "id" column, so the merge suffixes them: id_x is the trade
# row id and id_y is the country id. The country name is dropped afterwards.
countries_commodities_merged = (
    commodity_trade_no_null
    .merge(countries_df, on="country_or_area", how="outer")
    .rename(columns={"id_x": "id", "id_y": "country_id"})
    .drop(columns=["country_or_area"])
)

countries_commodities_merged

## Connect to local database

In [None]:
# Build the connection string for the local PostgreSQL server (port 5432) from the
# imported credentials and create the SQLAlchemy engine used by the cells below.
# NOTE(review): username/password are interpolated as-is; a password containing
# characters such as "@" or "/" would presumably need URL-escaping — confirm.
rds_connection_string = f"{username}:{password}@localhost:5432/{DB_NAME}"
engine = create_engine(f'postgresql://{rds_connection_string}')

## Create classes/schemas that will be associated with tables in the database.

In [None]:
# Shared declarative base: every table class below subclasses Base, and
# Base.metadata is what create_all() uses to create the tables.
Base = declarative_base()

In [None]:
# Create classes and define schemas for different tables
class CommodityCategory(Base):
    """Commodity category lookup table: a string category id and its name."""

    __tablename__ = COMMODITY_CATEGORY_TABLE
    category_id = Column(String(length=255), nullable=False, primary_key=True)
    category_name = Column(String(length=255), nullable=False)
    
class CommodityCode(Base):
    """Commodity code lookup table: a string code and its optional description."""

    __tablename__ = COMMODITY_CODE_TABLE
    commodity_code = Column(String(length=255), nullable=False, primary_key=True)
    commodity_description = Column(String(length=255), nullable=True)
    
class Country(Base):
    """Country lookup table: an integer id and the country or area name."""

    __tablename__ = COUNTRY_TABLE
    id = Column(Integer, nullable=False, primary_key=True)
    country_or_area = Column(String(length=255), nullable=False)

class CommodityTrade(Base):
    """One commodity trade record.

    Links to the commodity code, commodity category, and country lookup
    tables through foreign keys.
    """

    __tablename__ = COMMODITY_TABLE
    id = Column(Integer, primary_key=True, nullable=False)
    year = Column(Integer, nullable=False)
    # Spelled "commodity_code" so the table column matches the dataframe column
    # of the same name and the CommodityCode key it references (the attribute
    # was previously misspelled "comodity_code").
    # create_all() skips tables that already exist, so a table created with
    # the old spelling has to be dropped before this definition takes effect.
    commodity_code = Column(String(255), ForeignKey(f"{COMMODITY_CODE_TABLE}.commodity_code"), nullable=False)
    trade_flow = Column(String(255), nullable=False)
    # NOTE(review): a USD value stored as a string; a numeric type may have
    # been intended — confirm before changing the schema.
    trade_value_usd = Column(String(255), nullable=False)
    weight_kg = Column(Float, nullable=False)
    quantity_name = Column(String(255), nullable=False)
    quantity = Column(Float, nullable=False)
    category_id = Column(String(255), ForeignKey(f"{COMMODITY_CATEGORY_TABLE}.category_id"), nullable=False)
    country_id = Column(Integer, ForeignKey(f"{COUNTRY_TABLE}.id"), nullable=False)

# Create, in the connected database, the table for every class declared on Base
# above; tables that already exist are left untouched.
Base.metadata.create_all(engine)

## Check for tables

In [None]:
# List the tables that now exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API works on both old and new versions.
inspect(engine).get_table_names()

## Use pandas to load commodity dataframe into sql

In [None]:
# Doesnt work yet...
# start_time = time.time()
# d6tstack.utils.pd_to_psql(commodity_trade_no_null, cfg_uri_psql, COMMODITY_TABLE, if_exists='append',sep='\t')
# print("Time to load commodity dataframe into sql:")
# print("--- %s seconds ---" % (time.time() - start_time))

## Use pandas to load commodity category dataframe into sql.

In [None]:
# Bulk-load the category dataframe into its table and report how long it took.
start_time = time.time()
d6tstack.utils.pd_to_psql(
    category_df,
    cfg_uri_psql,
    COMMODITY_CATEGORY_TABLE,
    if_exists='append',
)
elapsed_seconds = time.time() - start_time
print("Time to load category dataframe into sql:")
print("--- %s seconds ---" % elapsed_seconds)

## Use pandas to load commodity codes dataframe into sql

In [None]:
# Doesn't work yet...
# start_time = time.time()
# d6tstack.utils.pd_to_psql(commodity_codes_df, cfg_uri_psql, COMMODITY_CODE_TABLE, if_exists='append', sep='\t')
# print("Time to load codes dataframe into sql:")
# print("--- %s seconds ---" % (time.time() - start_time))

## Use pandas to load country dataframe into sql

In [None]:
# Bulk-load the country dataframe into its table (tab-separated transfer) and
# report how long it took.
start_time = time.time()
d6tstack.utils.pd_to_psql(
    countries_df,
    cfg_uri_psql,
    COUNTRY_TABLE,
    if_exists='append',
    sep='\t',
)
elapsed_seconds = time.time() - start_time
print("Time to load country dataframe into sql:")
print("--- %s seconds ---" % elapsed_seconds)

## Create session object to connect to database

In [None]:
# ORM session bound to the engine; used by the verification queries below.
session = Session(bind=engine)

## Confirm category data has been added by querying the commodity category table

In [None]:
# Print up to ten rows of the category table as a quick check of the load.
category_list = session.query(CommodityCategory).limit(10)
for row in category_list:
    category_id, category_name = row.category_id, row.category_name
    print(f"id: {category_id}, category name: {category_name}")

In [None]:
# Confirm country data has been added by querying the country table:
# print up to ten of its rows.
country_list = session.query(Country).limit(10)
for row in country_list:
    country_id, country_name = row.id, row.country_or_area
    print(f"id: {country_id}, country name: {country_name}")

In [None]:
# Phils Section End

In [None]:
#Connors Section Start

In [None]:
#############################
#       DO NOT RUN ##########
#############################
# Dependencies
import pandas as pd
import numpy as np

# Database credentials
from config import username, password

# Used for making database connection.
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Used to abstract classes into tables.
from sqlalchemy.ext.declarative import declarative_base

# Used to declare column types.
from sqlalchemy import Column, Integer, String, Float, ForeignKey

# Used to load pandas dataframe into sql.
import d6tstack.utils
import time

# Ignore warnings.
import warnings
warnings.filterwarnings('ignore')

# Used for reading, writing to, and zipping files/folders.
from pathlib import Path
import os, zipfile
import shutil
import glob

In [None]:
#############################
#       DO NOT RUN ##########
#############################
# Constants
# Defines the data-file paths, database name, table names, and connection URI,
# plus FINANCIAL_AID_TABLE.
# NOTE(review): the AidCategory class below uses FINANCIAL_AID_TABLE as its table
# name, so skipping this cell as the banner says leaves that name undefined unless
# it is also defined elsewhere. COUNTRY_TABLE, which AidCategory also uses, is not
# defined in this cell.
COMMODITY_TRADE_CSV = os.path.join(".", "Resources", "commodity_trade_statistics_data.csv")
FINANCIAL_AID_CSV =  os.path.join(".", "Resources", "june-9-data-csv-1.csv")
DB_NAME = "trade_db"
COMMODITY_TABLE = "commodity"
COMMODITY_CATEGORY_TABLE = "commodity_category"
COMMODITY_CODE_TABLE = "commodity_code"
cfg_uri_psql = f"postgresql+psycopg2://{username}:{password}@localhost/{DB_NAME}"
FINANCIAL_AID_TABLE = 'financial_aid'

In [None]:
# Running this cell will unzip the data files in the Resources folder.
extension = ".zip"
extracted_dir_name = "."  # relative to the Resources folder, i.e. extract in place

# Need to be in the root directory of this project for this to work.
# The Resources folder is addressed by path instead of os.chdir(), so an error
# part-way through can never leave the kernel in the wrong working directory.
cwd_dir_name = os.getcwd()
print(f"The current working directory is {cwd_dir_name}.")

dir_name = os.path.join(cwd_dir_name, "Resources")
if not os.path.isdir(dir_name):
    raise FileNotFoundError(f"Resources folder not found: {dir_name}")
print(f"Looking for zip files in the following directory: {dir_name}.")

# Directory the archives are extracted into (the Resources folder itself).
unzipped_directory = os.path.normpath(os.path.join(dir_name, extracted_dir_name))

for item in sorted(os.listdir(dir_name)):  # loop through the items in the directory.
    if not item.endswith(extension):  # only ".zip" files are of interest
        continue
    file_name = os.path.join(dir_name, item)  # full path of the archive
    try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file_name) as zip_ref:
            zip_ref.extractall(unzipped_directory)
        print(f"Successfully unzipped {item} into the following folder:{dir_name}.")
    except Exception as error:
        # Stay best-effort (carry on with the next archive) but report which
        # file failed and why, and let KeyboardInterrupt through.
        print(f"Error trying to unzip data file(s): {item}: {error}")
        print(f"Make sure that the files are closed and you have the correct file/folder permissions.")

# The working directory was never changed; print it as confirmation.
print(f"The current working directory is still {os.getcwd()}.")

In [None]:
# Load the financial aid CSV into a dataframe and preview its first rows.
financial_df = pd.read_csv(FINANCIAL_AID_CSV)
financial_df.head()

In [None]:
# Non-null count per column, to see which columns have missing values.
financial_df.count()

In [None]:
# Keep the seven columns used downstream and drop every row that is missing
# a value in any of them, then show the remaining non-null counts.
cleaned_financial_df = (
    financial_df
    .loc[:, ['Donor Country', 'Donor Type', 'Aid Type', 'Receiver', 'Amount', 'Currency', 'USD Amount']]
    .dropna()
)
cleaned_financial_df.count()

In [None]:
# Rename the columns to snake_case identifiers ("USD" keeps its capitals).
rn_cleaned_financial_df = cleaned_financial_df.rename(
    {
        'Donor Country': 'donor_country',
        'Donor Type': 'donor_type',
        'Aid Type': 'aid_type',
        'Receiver': 'receiver',
        'Amount': 'amount',
        'Currency': 'currency',
        'USD Amount': 'USD_amount',
    },
    axis="columns",
)

rn_cleaned_financial_df.head()

In [None]:
# Distinct donor country values, to spot entries that need cleaning.
rn_cleaned_financial_df.donor_country.unique()

In [None]:
# Dropping bad values: remove rows whose donor country is a placeholder.
# Re-running the cell is harmless; the second time nothing matches.
placeholder_donors = ['Not Applicable\r\n', 'Not Applicable', 'Not Known']
is_placeholder = rn_cleaned_financial_df.donor_country.isin(placeholder_donors)
rn_cleaned_financial_df = rn_cleaned_financial_df.drop(rn_cleaned_financial_df[is_placeholder].index)

# Sort by donor country and export, presumably for manual inspection.
# The frame sorted here is rn_cleaned_financial_df because combined_cleaned_df,
# which this cell used to reference, is only created by a later cell, so a
# fresh kernel raised NameError at this point.
sorted_df = rn_cleaned_financial_df.sort_values(by='donor_country')
sorted_df.to_excel('connor.xlsx')

# Kept as the last expression so the notebook displays the remaining donor
# countries (previously the result was computed and discarded).
sorted_df.donor_country.unique()

In [None]:
# Rewrite selected values to other spellings; the result is later merged with
# countries_df on the country name. DataFrame.replace with a flat dict applies
# to matching cell values in every column, not only donor_country.
# NOTE(review): 'Monaco' is mapped to 'Morocco', which are different
# countries — confirm whether this mapping is intended.
combined_cleaned_df = rn_cleaned_financial_df.replace(
    to_replace={
        'HOLY SEE (VATICAN CITY STATE)': 'Italy',
        'Korea, Republic of': 'Rep. of Korea',
        'TAIWAN, PROVINCE OF CHINA': 'China',
        'CANADA': 'Canada',
        'Baharain': 'Bahrain',
        'Czech Republic': 'Czech Rep.',
        'Monaco': 'Morocco',
        'United States': 'USA',
        'Hong Kong': 'China',
    }
)

In [None]:
# Export the country lookup dataframe to Excel, presumably for manual
# comparison with the donor country names.
countries_df.to_excel('phil.xlsx')

# Kept as the last expression so the notebook displays the country names
# (as the first statement of the cell its result was computed and discarded).
countries_df.country_or_area.unique()

In [None]:
# Add a 0-based running "id" as the first column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if "id" not in combined_cleaned_df.columns:
    combined_cleaned_df.insert(0, "id", range(len(combined_cleaned_df)))

In [None]:
# Preview the first five rows of the cleaned aid dataframe.
combined_cleaned_df.head(5)

In [None]:
# Match each aid row to a country by name. The inner join keeps only donors
# whose name appears in countries_df. Both frames carry an "id" column, so the
# result holds them as id_x (aid row) and id_y (country).
combined_w_country_id = combined_cleaned_df.merge(
    countries_df,
    how='inner',
    left_on='donor_country',
    right_on='country_or_area',
)
combined_w_country_id

In [None]:
# The merge above suffixes the two clashing "id" columns: id_x is the aid row
# id from combined_cleaned_df and id_y is the country id from countries_df.
# No plain "id" column is left, so renaming "id" did nothing and "country_id"
# was never created. Rename id_y instead, and drop id_x because the next cell
# inserts a fresh "id". errors='ignore' keeps the cell safe to re-run.
combined_w_country_id = (
    combined_w_country_id
    .rename(columns={'id_y': 'country_id'})
    .drop(columns=['id_x'], errors='ignore')
)
# The two name columns below are not declared on the AidCategory class and are
# left in place for now.
#del combined_w_country_id['donor_country']
#del combined_w_country_id['country_or_area']

In [None]:

# Add a 0-based running "id" as the first column of the merged aid dataframe.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to run more than once.
if "id" not in combined_w_country_id.columns:
    combined_w_country_id.insert(0, "id", range(len(combined_w_country_id)))
combined_w_country_id

In [None]:
#AidCategory.__table__.drop()

class AidCategory(Base):
    """Financial aid record, linked to the donor's row in the country table."""

    __tablename__ = FINANCIAL_AID_TABLE
    id = Column(Integer, primary_key=True)
    donor_type = Column(String(length=255))
    aid_type = Column(String(length=255))
    receiver = Column(String(length=255))
    amount = Column(Integer)
    currency = Column(String(length=255))
    USD_amount = Column(Integer)
    country_id = Column(Integer, ForeignKey(f"{COUNTRY_TABLE}.id"), nullable=False)
    
# Create any table declared on Base that is not in the database yet; tables
# that already exist are left untouched.
Base.metadata.create_all(engine)

In [None]:
# List the tables that now exist in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API works on both old and new versions.
inspect(engine).get_table_names()

In [None]:
#start_time = time.time()
#d6tstack.utils.pd_to_psql(combined_w_country_id, cfg_uri_psql, FINANCIAL_AID_TABLE, if_exists='append', sep='\t')
#print("Time to load aid dataframe into sql:")
#print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
#Connors Section End