In [321]:
import pandas as pd
import numpy as np

In [322]:
def display_df_info(dataframe):
    """Print the shape of ``dataframe`` and then render its first rows.

    Uses the ``display`` callable that the notebook environment makes
    available; the preview is whatever ``dataframe.head()`` returns.
    """
    shape = dataframe.shape
    print(f"Shape is: {shape}")
    preview = dataframe.head()
    display(preview)

def get_filtered_data(path_csv_file, element_type, new_value_name):
    """Load the yearly values of one element, per country, from a CSV file.

    The CSV is expected to provide the columns ``country_code``, ``year``,
    ``element`` and ``value``.

    Parameters
    ----------
    path_csv_file : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    element_type : str
        Only rows whose ``element`` column equals this value are kept.
    new_value_name : str
        Name given to the ``value`` column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``country_code``, ``year`` and ``new_value_name``. Rows keep
        the index labels they had in the CSV. ``year`` is the calendar year
        extracted from the parsed ``year`` column; values that cannot be
        parsed as dates end up missing (``errors='coerce'``).
    """
    raw = pd.read_csv(path_csv_file)

    # Drop rows without a country code and the 'OWID_WRL' aggregate code.
    is_country = raw['country_code'].notna() & (raw['country_code'] != 'OWID_WRL')

    # Work on an explicit copy: assigning columns on a filtered slice is the
    # chained-assignment pattern that triggers SettingWithCopyWarning.
    countries = raw.loc[is_country].copy()

    # Dates are parsed before the element filter, on the same rows as before,
    # so parsing results and the resulting dtype of `year` are unchanged.
    countries['year'] = pd.to_datetime(countries['year'], errors='coerce').dt.year

    selected = countries.loc[countries['element'] == element_type]
    selected = selected.rename(columns={'value': new_value_name})

    return selected[['country_code', 'year', new_value_name]]

def fill_na_with_mean(dataframe, column_name):
    """Fill the missing values of one column with that column's mean.

    The column is replaced on ``dataframe`` itself (the argument is modified),
    and the same object is returned for convenience.
    """
    column = dataframe[column_name]
    dataframe[column_name] = column.fillna(column.mean())
    return dataframe

# `UN_paper_pulp_import_export.csv`

In [323]:
# Path of the cleaned UN paper-pulp CSV; the get_filtered_data calls in the
# cells that follow read it through `file`.
file = 'wood-pulp-business/data/cleaned/UN_paper_pulp_import_export.csv'

## `Productions`

In [324]:
# 'Production' rows of the current `file`, value column renamed for the merge.
paper_pulp_production_data = get_filtered_data(
    path_csv_file=file,
    element_type='Production',
    new_value_name='paper_pulp_prod_tonnes',
)
display_df_info(dataframe=paper_pulp_production_data)

Shape is: (4346, 3)


Unnamed: 0,country_code,year,paper_pulp_prod_tonnes
402,ALB,2020,0
403,ALB,2019,0
404,ALB,2018,0
405,ALB,2017,0
406,ALB,2016,0


## `Exports`

In [325]:
# 'Export Quantity' rows of the current `file`, value column renamed for the merge.
paper_pulp_export_data = get_filtered_data(
    path_csv_file=file,
    element_type='Export Quantity',
    new_value_name='paper_pulp_export_tonnes',
)
display_df_info(dataframe=paper_pulp_export_data)

Shape is: (4663, 3)


Unnamed: 0,country_code,year,paper_pulp_export_tonnes
56,AFG,2020,22
57,AFG,2019,22
58,AFG,2018,22
59,AFG,2017,19
60,AFG,2016,19


## `Imports`

In [326]:
# 'Import Quantity' rows of the current `file`, value column renamed for the merge.
paper_pulp_import_data = get_filtered_data(
    path_csv_file=file,
    element_type='Import Quantity',
    new_value_name='paper_pulp_import_tonnes',
)
display_df_info(dataframe=paper_pulp_import_data)

Shape is: (6358, 3)


Unnamed: 0,country_code,year,paper_pulp_import_tonnes
0,AFG,2020,233
1,AFG,2019,646
2,AFG,2018,140
3,AFG,2017,45
4,AFG,2016,291


# `UN_wood_pulp_import_export.csv`

In [327]:
# Path of the cleaned UN wood-pulp CSV; the get_filtered_data calls in the
# cells that follow read it through `file`.
# NOTE(review): this rebinds `file`, which the paper-pulp cells also read, so
# re-running those cells after this one would load this CSV instead.
file = 'wood-pulp-business/data/cleaned/UN_wood_pulp_import_export.csv'

## `Productions`

In [328]:
# 'Production' rows of the current `file`, value column renamed for the merge.
wood_pulp_production_data = get_filtered_data(
    path_csv_file=file,
    element_type='Production',
    new_value_name='wood_pulp_production_tonnes',
)
display_df_info(dataframe=wood_pulp_production_data)

Shape is: (3645, 3)


Unnamed: 0,country_code,year,wood_pulp_production_tonnes
408,ALB,2020,0
409,ALB,2019,0
410,ALB,2018,0
411,ALB,2017,0
412,ALB,2016,0


## `Exports`

In [329]:
# 'Export Quantity' rows of the current `file`, value column renamed for the merge.
wood_pulp_export_data = get_filtered_data(
    path_csv_file=file,
    element_type='Export Quantity',
    new_value_name='wood_pulp_export_tonnes',
)
display_df_info(dataframe=wood_pulp_export_data)

Shape is: (4555, 3)


Unnamed: 0,country_code,year,wood_pulp_export_tonnes
56,AFG,2020,38
57,AFG,2019,38
58,AFG,2018,38
59,AFG,2017,35
60,AFG,2016,35


## `Imports`

In [330]:
# 'Import Quantity' rows of the current `file`, value column renamed for the merge.
wood_pulp_import_data = get_filtered_data(
    path_csv_file=file,
    element_type='Import Quantity',
    new_value_name='wood_pulp_import_tonnes',
)
display_df_info(dataframe=wood_pulp_import_data)

Shape is: (6303, 3)


Unnamed: 0,country_code,year,wood_pulp_import_tonnes
0,AFG,2020,245
1,AFG,2019,622
2,AFG,2018,127
3,AFG,2017,32
4,AFG,2016,167


# Merge the DataFrames on `country_code` and `year`

In [331]:
# Outer-join the six per-element frames on (country_code, year): a key present
# in any one frame is kept, and values absent from a frame come out as NaN.
data = (
    paper_pulp_production_data
    .merge(paper_pulp_export_data, on=['country_code', 'year'], how='outer')
    .merge(paper_pulp_import_data, on=['country_code', 'year'], how='outer')
    .merge(wood_pulp_production_data, on=['country_code', 'year'], how='outer')
    .merge(wood_pulp_export_data, on=['country_code', 'year'], how='outer')
    .merge(wood_pulp_import_data, on=['country_code', 'year'], how='outer')
)

display_df_info(data)

Shape is: (11322, 8)


Unnamed: 0,country_code,year,paper_pulp_prod_tonnes,paper_pulp_export_tonnes,paper_pulp_import_tonnes,wood_pulp_production_tonnes,wood_pulp_export_tonnes,wood_pulp_import_tonnes
0,ALB,2020,0.0,0.0,1794.0,0.0,0.0,29.0
1,ALB,2019,0.0,0.0,23.0,0.0,0.0,23.0
2,ALB,2018,0.0,0.0,1.0,0.0,0.0,1.0
3,ALB,2017,0.0,0.0,0.0,0.0,0.0,0.0
4,ALB,2016,0.0,10.0,4436.0,0.0,25.0,4465.0


# Merge paper prices from `producer_paper_price_evolution.csv`

In [332]:
# Load the paper price series and name its two columns.
paper_price_data = pd.read_csv('wood-pulp-business/data/cleaned/producer_paper_price_evolution.csv')
paper_price_data.columns = ['date', 'paper_price']

# Keep only the calendar year of each observation; dates that cannot be
# parsed are coerced to missing instead of raising.
paper_price_data = (
    paper_price_data
    .assign(year=lambda frame: pd.to_datetime(frame['date'], errors='coerce').dt.year)
    .drop(columns='date')
)

display_df_info(paper_price_data)

Shape is: (912, 2)


Unnamed: 0,paper_price,year
0,19.7,1947
1,20.1,1947
2,20.1,1947
3,20.5,1947
4,20.7,1947


In [333]:
# Collapse the series to one row per year holding the mean price of that year.
paper_price_data = (
    paper_price_data
    .groupby('year')['paper_price']
    .agg('mean')
    .reset_index()
)
display_df_info(paper_price_data)

Shape is: (76, 2)


Unnamed: 0,year,paper_price
0,1947,20.916667
1,1948,23.025
2,1949,23.05
3,1950,23.8
4,1951,26.908333


In [334]:
# Attach the yearly paper price to every row of that year.
# NOTE(review): how='outer' also keeps price years that have no country rows;
# those rows come out without a country_code. Confirm this is intended
# (how='left' would drop them).
data = data.merge(paper_price_data, on='year', how='outer')
display_df_info(data)

Shape is: (11338, 9)


Unnamed: 0,country_code,year,paper_pulp_prod_tonnes,paper_pulp_export_tonnes,paper_pulp_import_tonnes,wood_pulp_production_tonnes,wood_pulp_export_tonnes,wood_pulp_import_tonnes,paper_price
0,ALB,2020,0.0,0.0,1794.0,0.0,0.0,29.0,190.983333
1,DZA,2020,2000.0,24.0,102614.0,,24.0,102617.0,190.983333
2,AGO,2020,0.0,706.0,17642.0,0.0,706.0,17643.0,190.983333
3,ARG,2020,733000.0,193689.0,238613.0,620000.0,193670.0,238536.0,190.983333
4,ARM,2020,0.0,0.0,43.0,0.0,0.0,33.0,190.983333


# Merge wood pulp prices from `producer_wood_pulp_price_evolution.csv`

In [335]:
# Load the wood pulp price series and name its two columns.
wood_pulp_price_data = pd.read_csv('wood-pulp-business/data/cleaned/producer_wood_pulp_price_evolution.csv')
wood_pulp_price_data.columns = ['date', 'wood_pulp_price']

# Keep only the calendar year of each observation; dates that cannot be
# parsed are coerced to missing instead of raising.
wood_pulp_price_data = (
    wood_pulp_price_data
    .assign(year=lambda frame: pd.to_datetime(frame['date'], errors='coerce').dt.year)
    .drop(columns='date')
)

display_df_info(wood_pulp_price_data)

Shape is: (1164, 2)


Unnamed: 0,wood_pulp_price,year
0,10.3,1926
1,10.3,1926
2,10.4,1926
3,10.4,1926
4,10.4,1926


In [336]:
# Collapse the series to one row per year holding the mean price of that year.
wood_pulp_price_data = (
    wood_pulp_price_data
    .groupby('year')['wood_pulp_price']
    .agg('mean')
    .reset_index()
)
display_df_info(wood_pulp_price_data)

Shape is: (97, 2)


Unnamed: 0,year,wood_pulp_price
0,1926,10.1
1,1927,9.316667
2,1928,8.966667
3,1929,8.925
4,1930,8.741667


In [337]:
# Attach the yearly wood pulp price to every row of that year.
# NOTE(review): how='outer' also keeps price years that have no country rows;
# those rows come out without a country_code. Confirm this is intended
# (how='left' would drop them).
data = data.merge(wood_pulp_price_data, on='year', how='outer')
display_df_info(data)

Shape is: (11359, 10)


Unnamed: 0,country_code,year,paper_pulp_prod_tonnes,paper_pulp_export_tonnes,paper_pulp_import_tonnes,wood_pulp_production_tonnes,wood_pulp_export_tonnes,wood_pulp_import_tonnes,paper_price,wood_pulp_price
0,ALB,2020,0.0,0.0,1794.0,0.0,0.0,29.0,190.983333,141.641667
1,DZA,2020,2000.0,24.0,102614.0,,24.0,102617.0,190.983333,141.641667
2,AGO,2020,0.0,706.0,17642.0,0.0,706.0,17643.0,190.983333,141.641667
3,ARG,2020,733000.0,193689.0,238613.0,620000.0,193670.0,238536.0,190.983333,141.641667
4,ARM,2020,0.0,0.0,43.0,0.0,0.0,33.0,190.983333,141.641667


# `Work on the model`

In [338]:
# Save the merged frame to 'data.csv' in the current working directory.
# This runs before the NaN filling in the next cell, so the file keeps the
# missing values. The row index is written too (no index=False is passed).
data.to_csv('data.csv')

In [339]:
# Columns whose missing values are replaced by the column-wide mean.
# 'wood_pulp_price' is not part of this list.
cols = [
    'paper_pulp_prod_tonnes',
    'paper_pulp_export_tonnes',
    'paper_pulp_import_tonnes',
    'wood_pulp_production_tonnes',
    'wood_pulp_export_tonnes',
    'wood_pulp_import_tonnes',
    'paper_price',
]

for col in cols:
    data = fill_na_with_mean(data, col)

In [340]:
# Count the missing values left in each column after the fill step.
# NOTE(review): the saved output of this cell still reports NaNs in columns
# the previous cell fills; it may be stale. Re-run from a fresh kernel to confirm.
data.isna().sum()

country_code                     37
year                              0
paper_pulp_prod_tonnes            0
paper_pulp_export_tonnes       2313
paper_pulp_import_tonnes        656
wood_pulp_production_tonnes    3380
wood_pulp_export_tonnes        2421
wood_pulp_import_tonnes         711
paper_price                      21
wood_pulp_price                   0
dtype: int64