In [1]:
# import packages
import pandas as pd
import os

In [2]:
# set your directory here
# The repository root can be supplied through the DA_DATA_REPO environment
# variable so the notebook runs on other machines without editing this cell;
# when the variable is not set, the original hard-coded location is used.
os.chdir(
    os.environ.get(
        "DA_DATA_REPO", "/Users/vigadam/Dropbox/work/data_book/da_data_repo/"
    )
)

# input and output folders, relative to the working directory set above
data_in = "asia-industry/raw/"
data_out = "asia-industry/clean/"

In [3]:
# usa imports
# Load columns A:B of the "FRED Graph" sheet, skipping its first ten rows.
df = pd.read_excel(
    data_in + "usa-imports.xls",
    sheet_name="FRED Graph",
    usecols="A:B",
    skiprows=range(10),
)
df.columns = ["date", "usa_imp_sa"]

# Derive the calendar year and month from the date column.
usa_dates = pd.DatetimeIndex(df["date"])
df["year"] = usa_dates.year
df["month"] = usa_dates.month

# Build a monthly period from year/month (day pinned to 1 for the conversion).
usa_month_start = pd.to_datetime(df[["year", "month"]].assign(day=1))
df["time"] = usa_month_start.dt.to_period("M")

# Keep only the exported columns, in output order, and write the clean file.
df = df[["time", "year", "month", "usa_imp_sa"]]
df.to_csv(data_out + "usa-imports.csv", index=False)

In [4]:
# asia monthly industrial production plus some others (exchange rate)
worldbank_columns = ["Series", "Country", "Country Code", "Time", "Value"]
df = pd.read_csv(
    data_in + "worldbank-monthly-asia-2019_long.csv",
    delimiter=",",
    quotechar='"',
    encoding="utf-8",
)
df = df[worldbank_columns]

# Normalise the headers: trim whitespace, lower-case, and remove inner spaces
# (so "Country Code" becomes "countrycode").
df.columns = [header.strip().lower().replace(" ", "") for header in df.columns]

In [5]:
# create time
# Split each label on "M"; a label without an "M" gets a missing month part,
# and those rows are dropped. The remaining labels are expected to match the
# "%YM%m" format used below.
df[["year", "month"]] = df["time"].str.split(expand=True, pat="M")
# Explicit copy so the column assignments below act on an independent frame.
df = df.dropna(axis=0, subset=["month"]).copy()

# Parse the label once with an explicit format and take both calendar parts
# from that single result, instead of re-parsing the split year string with an
# inferred format.
parsed_time = pd.to_datetime(df["time"], format="%YM%m")
df["year"] = parsed_time.dt.year
df["month"] = parsed_time.dt.month

# filter by year (strictly after 1990, i.e. 1991 onwards)
df = df.loc[df["year"] > 1990].copy()

In [6]:
# rename variables
# Map the raw series labels (including their trailing commas) to short names.
# Only values in the "series" column are replaced; labels not listed here are
# left as they are.
# NOTE(review): "exchnage_rate_vs_usd" looks like a misspelling of "exchange".
# It is kept unchanged because the pivot in the next cell turns these values
# into column names of the exported files.
series_labels = {
    "Industrial Production, constant US$,,,": "ind_prod_const",
    "CPI Price, seas. adj.,,,": "cpi_sa",
    "Exchange rate, new LCU per USD extended backward, period average,,": "exchnage_rate_vs_usd",
    "Industrial Production, constant US$, seas. adj.,,": "ind_prod_const_sa",
    "Nominal Effecive Exchange Rate,,,,": "exchange_rate_neer",
    "Real Effective Exchange Rate,,,,": "exchange_rate_reer",
    "CPI Price, % y-o-y, not seas. adj.,,": "cpi_yoy_nsa",
}
# Reassign rather than mutate in place: the incoming frame is the result of a
# row filter, and a non-mutating replace keeps this cell safe to re-run.
df = df.replace({"series": series_labels})

In [7]:
# tidy data: have variables as columns (long to wide dataset)
# One row per country/year/month/countrycode, one column per series label.
# Duplicate entries for the same cell are averaged (pivot_table's default
# aggregation, spelled out here).
id_columns = ["country", "year", "month", "countrycode"]
wide = df.pivot_table(
    values="value",
    index=id_columns,
    columns="series",
    aggfunc="mean",
)
# Turn the identifier index levels back into ordinary columns.
df = wide.reset_index()

In [8]:
# set order
# Monthly period built from year/month (day pinned to 1 for the conversion).
asia_month_start = pd.to_datetime(df[["year", "month"]].assign(day=1))
df["time"] = asia_month_start.dt.to_period("M")

# Sort rows chronologically and renumber the index.
df = df.sort_values(by="time")
df = df.reset_index(drop=True)

# Identifier columns first, then every remaining column in its current order.
order = ["time", "year", "month", "country", "countrycode"]
remaining = [name for name in df.columns if name not in order]
df = df.reindex(columns=order + remaining)

df.to_csv(data_out + "asia-indprod_tidy.csv", index=False)

In [9]:
# merge the two datasets
# Both clean files are read back with every column as text, so the join keys
# are compared as strings.
# Note: despite its name, df_right (the Asia table) is the LEFT operand of the
# merge, so its columns come first in the result.
join_keys = ["time", "year", "month"]
df_right = pd.read_csv(
    data_out + "asia-indprod_tidy.csv",
    delimiter=",",
    dtype="unicode",
)
df_left = pd.read_csv(
    data_out + "usa-imports.csv",
    delimiter=",",
    dtype="unicode",
)

# Inner join: only months present in both tables are kept.
merged = df_right.merge(df_left, how="inner", on=join_keys)

merged.to_csv(data_out + "asia-industry_tidy.csv", index=False)