In [1]:
import pandas as pd
from pandas import IndexSlice as idx

In [2]:
# Load the three raw CSV inputs; the leading columns of each file become the
# (Multi)Index of the resulting frame.

# Trade by country and NACE class (per the file name).
df_nace = pd.read_csv(
    "../data/tulli/international_trade_country_NACE_breakdown.csv",
    index_col=[0, 1, 2, 3],
)

# Trade by region and economy sector (per the file name). The first line of
# the file is skipped and the content is decoded as Latin-1.
df_reg = pd.read_csv(
    "../data/tulli/tulli_international_trade_region_economy-sector_breakdown.csv",
    index_col=[0, 1, 2],
    skiprows=1,
    encoding="iso-8859-1",
)

# Employment by region and industry (per the file name), decoded as Latin-1.
df_empl = pd.read_csv(
    "../data/Regional_economic_and_environmental_data/Employment_MKregion_breakdown_more_industries.csv",
    index_col=[0, 1, 2, 3],
    encoding="iso-8859-1",
)

In [3]:
# Show the distinct labels found in index level 2 of the employment table,
# in order of first appearance.
df_empl.index.unique(level=2).to_list()

['Total',
 'A Agriculture, forestry and fishing (01-03)',
 'B Mining and quarrying (05-09)',
 '10-12 Food industry etc.',
 '13-15 Textile, clothing and leather industry',
 '16 Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials',
 '17, 18 Paper industry; Printing',
 '19-22 Chemical industry',
 '23 Manufacture of other non-metallic mineral products',
 '24-25 Manufacture of basic metals and fabricated metal products, except machinery and equipment',
 '26, 27 Manufacture of electrical and electronic products',
 '28 Manufacture of machinery and equipment n.e.c.',
 '29, 30 Manufacture of transport equipment',
 '31-33 Manufacture of furniture, other manufacturing; repair and installation of machinery and equipment',
 'D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)',
 'F Construction (41-43)',
 'G Wholesale and retail trade; repair of motor vehicles and motorc

In [4]:
# Express every employment figure as a share of the matching "Total" row.
#
# 1. Collect the rows whose industry (index level 2) is "Total" and sum them
#    per (Area, Industry, Transaction).
# 2. Drop the Industry level from those totals so they align with every
#    industry row on the remaining shared index levels.
# 3. Divide, discard the "Total" rows themselves, and sum per (Area, Industry).
total_rows = df_empl.loc[idx[:, :, "Total", :], :]
totals = (
    total_rows
    .groupby(["Area", "Industry", "Transaction"])
    .sum()
    .droplevel(1)
)

df_empl_scaled = (
    df_empl
    .div(totals, axis=0)
    .drop("Total", level=2)
    .groupby(["Area", "Industry"])
    .sum()
)
df_empl_scaled

Unnamed: 0_level_0,Unnamed: 1_level_0,2015 Original series,2016 Original series,2017 Original series,2018 Original series,2019 Original series,2020 Original series
Area,Industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MK01 Uusimaa,10-12 Food industry etc.,0.011569,0.011296,0.011143,0.010508,0.010011,0.010742
MK01 Uusimaa,"13-15 Textile, clothing and leather industry",0.001805,0.001681,0.001750,0.001690,0.001660,0.001651
MK01 Uusimaa,"16 Manufacture of wood and of products of wood and cork, except furniture; manufacture of articles of straw and plaiting materials",0.001544,0.001579,0.001545,0.001540,0.001779,0.002021
MK01 Uusimaa,"17, 18 Paper industry; Printing",0.007650,0.006484,0.006362,0.006090,0.005696,0.006100
MK01 Uusimaa,19-22 Chemical industry,0.013405,0.012967,0.012913,0.012686,0.012445,0.012852
...,...,...,...,...,...,...,...
WHOLE COUNTRY,O Public administration and defence; compulsory social security (84),0.065298,0.064531,0.064174,0.063859,0.063867,0.066517
WHOLE COUNTRY,P Education (85),0.064466,0.062954,0.062417,0.061574,0.061848,0.063467
WHOLE COUNTRY,Q Human health and social work activities (86-88),0.159442,0.160855,0.161566,0.162446,0.165501,0.168979
WHOLE COUNTRY,"R, S Other service activities (90-96)",0.048498,0.048635,0.048482,0.047713,0.048461,0.047114


In [5]:
# df_reg carries 21 distinct region labels while df_empl has 19 (author's
# count). Strip the two non-regional labels from df_reg first.
df_reg = df_reg.drop("21 Total", level=1)
# Keep the "20 Unknown" rows aside before removing them; might be useful later.
df_reg_unknown = df_reg.loc[idx[:, ["20 Unknown"], :], :]
df_reg = df_reg.drop("20 Unknown", level=1)

# Relabel the df_reg regions with the df_empl region names.
# NOTE(review): the pairing is purely positional (first 19 labels of each
# list, in order of appearance) — confirm both tables list the regions in the
# same order and that "WHOLE COUNTRY" is not among the first 19 df_empl labels.
regs_reg = df_reg.index.get_level_values(1).unique().to_list()
regs_empl = df_empl_scaled.index.get_level_values(0).unique().to_list()
mapping = {regs_reg[pos]: regs_empl[pos] for pos in range(19)}

# Rebuild the index tuple by tuple, swapping in the mapped region name and
# assigning explicit level names.
df_reg.index = pd.MultiIndex.from_tuples(
    [(period, mapping[region], direction) for period, region, direction in df_reg.index],
    names=["Time", "Area", "Direction"],
)

In [6]:
# Keep only the December rows (YYYY12) of the regional trade data for the
# years 2015 through 2020.
idx = pd.IndexSlice  # same object as the IndexSlice alias imported at the top
december_periods = list(range(201512, 202112, 100))  # 201512, 201612, ..., 202012
df_reg_year_aligned = df_reg.loc[idx[december_periods, :, :], :]

In [7]:
# Move the time dimension to the index: reshape the wide per-year table into
# a single "Employment" column indexed by (Time, Area, Industry).
#
# The Time keys are derived from the column labels instead of a hard-coded
# list, so the number and order of the periods always match the columns that
# are actually present. Each label is expected to start with a four-digit year
# (e.g. "2015 Original series"); it becomes the December period code of that
# year (201512) so it lines up with the December rows of the trade data.
# A label that does not start with an integer raises ValueError instead of
# being silently mislabelled.
time_keys = [int(str(label).split()[0]) * 100 + 12 for label in df_empl_scaled.columns]

# Row-major flattening walks each (Area, Industry) row across its year
# columns, so the index tuples are generated in the same nesting order.
long_index = pd.MultiIndex.from_tuples(
    [
        (area, industry, period)
        for area, industry in df_empl_scaled.index
        for period in time_keys
    ],
    names=["Area", "Industry", "Time"],
)
df_empl_scaled_new = pd.DataFrame(
    df_empl_scaled.to_numpy().flatten(),
    index=long_index,
    columns=["Employment"],
)
df_empl_scaled_new = df_empl_scaled_new.reorder_levels(["Time", "Area", "Industry"]).sort_index()

In [8]:
# Weight the regional trade value by each industry's employment share.
# The inner join keeps only index combinations present in both tables; the
# result holds a single "Trade value" column.
value_col = "Cum. statistical value (euro) from the beginning of the year"
comb = df_empl_scaled_new.join(df_reg_year_aligned[value_col], how="inner")
comb = (comb["Employment"] * comb[value_col]).to_frame("Trade value")

In [72]:
# Write the employment-weighted trade table and the employment share table
# to disk.
# NOTE(review): `comb` is reassigned further down for the GVA variant, so this
# cell must run before that one to save the employment-based table.
for frame, target in (
    (comb, "../data/combined/trade_by_region_industry_direction_breakdown(employment).csv"),
    (df_empl_scaled_new, "../data/region/employment_by_industry_distribution_per_region.csv"),
):
    frame.to_csv(target)

In [193]:
# Load the income/production table (Latin-1 encoded); its first four columns
# form the MultiIndex.
df_income = pd.read_csv(
    "../data/Regional_economic_and_environmental_data/Income_production_by_area.csv",
    encoding="iso-8859-1",
    index_col=[0, 1, 2, 3],
)

In [194]:
# Industry labels (index level 2) to remove from the income table before
# further processing.
ind_to_drop = [
    '01 Crop and animal production, hunting and related service activities',
    '02, 03 Forestry; Fishing',
    'B-F Secondary production (05-43)',
    '05-09, 13-15, 19-23, 31-39 Other industry',
    'G-T Services (45-98)',
    '681, 68209, 683, M, N Real estate activities; Professional, scientific and technical activities;  Office administrative and other',
    '16-18 Manufacture of wood products, paper and paper products; printing and reproduction of recorded media',
    '24-30 Metal industry',
]
df_income = df_income.drop(ind_to_drop, level=2)

In [195]:
# Restrict the income table to areas whose label starts with "MK", plus the
# "WHOLE COUNTRY" aggregate.
area_labels = df_income.index.get_level_values(0).unique().to_list()
regs = [label for label in area_labels if label.startswith("MK")] + ["WHOLE COUNTRY"]
df_income = df_income.loc[idx[regs, :, :, :]]

In [196]:
# Keep only the gross-value-added rows, relabel the year columns as YYYY12
# integer codes named "Time", discard 2021 and stack the years into the index.
df_income = df_income.loc[idx[:, 'B1GPH Gross value added at basic prices', :, :]]
# NOTE(review): droplevel(2) is positional; which level it removes depends on
# the index shape returned by the .loc selection above — confirm.
df_income = df_income.droplevel(2)

# Assumes the table has exactly seven year columns, 2015-2021 in order —
# TODO confirm. A different column count makes the assignment raise.
year_end_codes = list(range(201512, 202212, 100))  # 201512, ..., 202112
df_income.columns = pd.Index(year_end_codes, name="Time")
df_income = df_income.drop(columns=202112)

# One float64 column (named 0) with Time as the innermost index level.
df_income_ser = df_income.stack().to_frame().astype("float64")

In [197]:
# Get total values, drop them and normalise GVAs.
# The order matters: the "Total" rows (index level 1) must be extracted
# before they are removed from df_income_ser.
# NOTE(review): presumably the .loc selection returns the totals without the
# level that held "Total", so the division below aligns on the remaining
# shared index levels — confirm.
totals = df_income_ser.loc[idx[:, "Total", :]]
# Remove the "Total" rows in place so only individual industries remain.
df_income_ser.drop("Total", level=1, inplace=True)
# Divide each remaining value by the matching total.
df_income_ser_scaled = df_income_ser.div(totals, axis=0)

In [200]:
# Weight the regional trade value by each industry's GVA share.
# The inner join keeps only index combinations present in both tables; the
# share column is named 0 and the result holds a single "Trade value" column.
trade_col = "Cum. statistical value (euro) from the beginning of the year"
joined = df_income_ser_scaled.join(df_reg_year_aligned[trade_col], how="inner")
comb = joined[0].mul(joined[trade_col]).to_frame(name="Trade value")

In [201]:
# Write the GVA-weighted trade table to disk.
comb.to_csv(
    "../data/combined/trade_by_region_industry_direction_breakdown(GVA).csv"
)