In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output
from lightgbm import LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# `Functions`

In [None]:
def display_df_info(dataframe):
    """Print the shape of ``dataframe`` and render its first rows.

    Relies on the ``display`` function that IPython/Jupyter makes available
    in notebook cells.
    """
    print(f"Shape is: {dataframe.shape}")
    first_rows = dataframe.head()
    display(first_rows)


def get_filtered_data(path_csv_file, element_type, new_value_name):
    """Load one indicator from a CSV as a (country_code, year, value) table.

    Parameters
    ----------
    path_csv_file : str or path-like
        CSV file with at least the columns ``country_code``, ``year``,
        ``element`` and ``value``.
    element_type : str
        Only rows whose ``element`` equals this value are kept.
    new_value_name : str
        Name given to the ``value`` column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``country_code``, ``year`` and ``new_value_name``. Rows with a
        missing ``country_code`` or with the code ``'OWID_WRL'`` are excluded.
    """
    raw = pd.read_csv(path_csv_file)

    # Work on an explicit copy so the assignments below never write into a
    # view of `raw` (avoids pandas' SettingWithCopy ambiguity).
    is_country = raw['country_code'].notna() & (raw['country_code'] != 'OWID_WRL')
    countries = raw.loc[is_country].copy()

    year_values = countries['year']
    if not pd.api.types.is_numeric_dtype(year_values):
        # Date-like text: parse it and keep the calendar year; values that
        # cannot be parsed become missing (errors='coerce').
        year_values = pd.to_datetime(year_values, errors='coerce').dt.year
    # Numeric values are kept as they are: pd.to_datetime would read a number
    # such as 1990 as nanoseconds since the epoch and turn every year into 1970.
    countries['year'] = year_values

    selected = countries.loc[countries['element'] == element_type]
    selected = selected.rename(columns={'value': new_value_name})

    return selected[['country_code', 'year', new_value_name]]


def fill_na_with_mean(dataframe, column_name):
    """Replace the missing values of one column with that column's mean.

    ``dataframe`` is modified in place; the same object is returned so the
    call can be used in an assignment.
    """
    column = dataframe[column_name]
    dataframe[column_name] = column.fillna(column.mean())
    return dataframe


def display_heatmap(dataframe):
    """Plot the pairwise column correlations of ``dataframe`` as a heatmap.

    Only the cells below the diagonal are drawn; the diagonal and the upper
    triangle are masked out.
    """
    correlations = dataframe.corr()

    # True marks a hidden cell: np.triu keeps the diagonal and everything above it.
    hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))

    figure, axes = plt.subplots(figsize=(11, 9))
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    # vmax=.3 caps the colour scale at 0.3; center=0 puts the palette midpoint at zero.
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        cmap=palette,
        vmax=.3,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=axes,
    )

In [None]:
def compare_models(models, dataframe, label, test_size=0.2, random_state=None):
    """Fit several regressors on one shared train/test split and compare them.

    Every candidate is trained and scored on the same split, so differences in
    the metrics come from the models and not from the rows each one happened
    to receive.

    Parameters
    ----------
    models : iterable
        scikit-learn compatible regressors. Each one is cloned before fitting,
        so the instances passed in stay unfitted and the models returned by
        separate calls never share state.
    dataframe : pandas.DataFrame
        Feature columns plus the target column ``label``.
    label : str
        Name of the target column.
    test_size : float, default 0.2
        Share of the rows held out for evaluation.
    random_state : int or None, default None
        Seed for the split. ``None`` draws a new random split on every call.

    Returns
    -------
    dict
        ``'summary'``: DataFrame with one row per model, in input order, and
        the columns ``Algorithm``, ``R2``, ``MSE``, ``RMSE``, ``MAE``.
        ``'models'``: list of ``{'model': fitted estimator,
        'min_max_scaler': fitted MinMaxScaler}`` in the same order.
    """
    y = dataframe[label]
    X = dataframe.drop(label, axis=1)

    # train_test_split shuffles by default, so no separate reshuffle is needed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The scaler learns its min/max from the training rows only and is then
    # applied unchanged to the held-out rows.
    min_max_scaler = MinMaxScaler().fit(X_train)
    X_train_normalized = pd.DataFrame(min_max_scaler.transform(X_train))
    X_test_normalized = pd.DataFrame(min_max_scaler.transform(X_test))

    fitted_models_list = []

    r2_list = []
    mse_list = []
    rmse_list = []
    mae_list = []

    for model in models:

        print(f"====== {model} ======")

        # Fit a fresh copy: the caller's estimator is left untouched.
        fitted_model = clone(model)
        fitted_model.fit(X_train_normalized, y_train)

        # Score on the held-out rows
        y_pred = fitted_model.predict(X_test_normalized)

        r2 = r2_score(y_test, y_pred)
        print("R2:", r2)

        mse = mean_squared_error(y_test, y_pred)
        print("MSE:", mse)

        rmse = np.sqrt(mse)
        print("RMSE:", rmse)

        mae = mean_absolute_error(y_test, y_pred)
        print("MAE:", mae)

        fitted_models_list.append({
            'model': fitted_model,
            'min_max_scaler': min_max_scaler
        })

        r2_list.append(r2)
        mse_list.append(mse)
        rmse_list.append(rmse)
        mae_list.append(mae)

    summary = pd.DataFrame({
        'Algorithm': [item['model'] for item in fitted_models_list],
        'R2': r2_list,
        'MSE': mse_list,
        'RMSE': rmse_list,
        'MAE': mae_list
    })

    return {
        'summary': summary,
        'models': fitted_models_list
    }

In [None]:
def display_ordered_rmse(dataframe_summary):
    """Clear the cell output, then show the summary sorted by ascending RMSE."""
    clear_output()
    ranked = dataframe_summary.sort_values(by='RMSE')
    display(ranked)

# `UN_paper_pulp_import_export.csv`

In [None]:
# Path of the paper pulp CSV read by the extraction cells that follow.
file = 'wood-pulp-business/data/cleaned/UN_paper_pulp_import_export.csv'

## `Productions`

In [None]:
# Keep the 'Production' rows, name their values paper_pulp_prod_tonnes,
# write the table to CSV and preview it.
paper_pulp_production_data = get_filtered_data(file, 'Production', 'paper_pulp_prod_tonnes')
paper_pulp_production_data.to_csv('data/paper_pulp_production_data.csv', index=False)
display_df_info(paper_pulp_production_data)

## `Exports`

In [None]:
# Keep the 'Export Quantity' rows, name their values paper_pulp_export_tonnes,
# write the table to CSV and preview it.
paper_pulp_export_data = get_filtered_data(file, 'Export Quantity', 'paper_pulp_export_tonnes')
paper_pulp_export_data.to_csv('data/paper_pulp_export_data.csv', index=False)
display_df_info(paper_pulp_export_data)

## `Imports`

In [None]:
# Keep the 'Import Quantity' rows, name their values paper_pulp_import_tonnes,
# write the table to CSV and preview it.
paper_pulp_import_data = get_filtered_data(file, 'Import Quantity', 'paper_pulp_import_tonnes')
paper_pulp_import_data.to_csv('data/paper_pulp_import_data.csv', index=False)
display_df_info(paper_pulp_import_data)

# `UN_wood_pulp_import_export.csv`

In [None]:
# Path of the wood pulp CSV; from here on `file` no longer points at the paper pulp CSV.
file = 'wood-pulp-business/data/cleaned/UN_wood_pulp_import_export.csv'

## `Productions`

In [None]:
# Keep the 'Production' rows, name their values wood_pulp_production_tonnes,
# write the table to CSV and preview it.
wood_pulp_production_data = get_filtered_data(file, 'Production', 'wood_pulp_production_tonnes')
wood_pulp_production_data.to_csv('data/wood_pulp_production_data.csv', index=False)
display_df_info(wood_pulp_production_data)

## `Exports`

In [None]:
# Keep the 'Export Quantity' rows, name their values wood_pulp_export_tonnes,
# write the table to CSV and preview it.
wood_pulp_export_data = get_filtered_data(file, 'Export Quantity', 'wood_pulp_export_tonnes')
wood_pulp_export_data.to_csv('data/wood_pulp_export_data.csv', index=False)
display_df_info(wood_pulp_export_data)

## `Imports`

In [None]:
# Keep the 'Import Quantity' rows, name their values wood_pulp_import_tonnes,
# write the table to CSV and preview it.
wood_pulp_import_data = get_filtered_data(file, 'Import Quantity', 'wood_pulp_import_tonnes')
wood_pulp_import_data.to_csv('data/wood_pulp_import_data.csv', index=False)
display_df_info(wood_pulp_import_data)

# `Merge Dataframes`

In [None]:
# Outer-join the six indicator tables on (country_code, year), one after the
# other, so a country/year present in any table is kept in the result.
merge_keys = ['country_code', 'year']

data = paper_pulp_production_data
for indicator_frame in (
    paper_pulp_export_data,
    paper_pulp_import_data,
    wood_pulp_production_data,
    wood_pulp_export_data,
    wood_pulp_import_data,
):
    data = pd.merge(data, indicator_frame, on=merge_keys, how='outer')

display_df_info(data)

# `Merge Paper Prices from producer_paper_price_evolution.csv`

In [None]:
# Load the paper price series; its two columns are renamed to date / paper_price.
paper_price_data = pd.read_csv('wood-pulp-business/data/cleaned/producer_paper_price_evolution.csv')
paper_price_data.columns = ['date', 'paper_price']

# Only the calendar year is kept; dates that cannot be parsed give a missing year.
paper_price_dates = pd.to_datetime(paper_price_data['date'], errors='coerce')
paper_price_data['year'] = paper_price_dates.dt.year
paper_price_data = paper_price_data.drop(columns='date')

display_df_info(paper_price_data)

In [None]:
# Collapse the series to one row per year holding the mean paper price.
paper_price_data = paper_price_data.groupby('year', as_index=False)['paper_price'].mean()
display_df_info(paper_price_data)

In [None]:
# Attach the yearly paper price to every country row of the same year.
# NOTE(review): how='outer' also keeps years that exist only in the price
# table, as rows without country data — confirm this is wanted rather than how='left'.
data = data.merge(paper_price_data, on=['year'], how='outer')
display_df_info(data)

# `Merge Wood Pulp Prices from producer_wood_pulp_price_evolution.csv`

In [None]:
# Load the wood pulp price series; its two columns are renamed to date / wood_pulp_price.
wood_pulp_price_data = pd.read_csv('wood-pulp-business/data/cleaned/producer_wood_pulp_price_evolution.csv')
wood_pulp_price_data.columns = ['date', 'wood_pulp_price']

# Only the calendar year is kept; dates that cannot be parsed give a missing year.
wood_pulp_price_dates = pd.to_datetime(wood_pulp_price_data['date'], errors='coerce')
wood_pulp_price_data['year'] = wood_pulp_price_dates.dt.year
wood_pulp_price_data = wood_pulp_price_data.drop(columns='date')

display_df_info(wood_pulp_price_data)

In [None]:
# Collapse the series to one row per year holding the mean wood pulp price.
wood_pulp_price_data = wood_pulp_price_data.groupby('year', as_index=False)['wood_pulp_price'].mean()
display_df_info(wood_pulp_price_data)

In [None]:
# Attach the yearly wood pulp price to every row of the same year.
# NOTE(review): how='outer' also keeps years that exist only in the price
# table, as rows without country data — confirm this is wanted rather than how='left'.
data = data.merge(wood_pulp_price_data, on=['year'], how='outer')
display_df_info(data)

# `Work on the model`

In [None]:
# Save the merged table as it is at this point, before missing values are filled.
data.to_csv('data/data.csv', index=False)

In [None]:
# Columns whose missing values are replaced by the mean of the whole column.
# NOTE(review): paper_price, used later as a prediction target, is imputed here
# while wood_pulp_price, the other target, is not — confirm this is intended.
cols = ['paper_pulp_prod_tonnes', 'paper_pulp_export_tonnes', 'paper_pulp_import_tonnes', 'wood_pulp_production_tonnes', 'wood_pulp_export_tonnes', 'wood_pulp_import_tonnes', 'paper_price']

for col in cols:
    data = fill_na_with_mean(data, col)

In [None]:
# Count the missing values left in each column.
data.isna().sum()

In [None]:
# Remove the country identifier before the correlation and modelling steps.
# Rebinding `data` and tolerating an already-removed column keeps this cell
# safe to run more than once (an in-place drop raises KeyError on a re-run).
data = data.drop(columns='country_code', errors='ignore')

In [None]:
# Preview the table that will be used for modelling.
display_df_info(data)

In [None]:
# Save the table after imputation and removal of country_code.
data.to_csv('data/data_cleaned.csv', index=False)

## `Check correlations`

In [None]:
# Correlation heatmap over every remaining column, both price columns included.
display_heatmap(data)

## `Test algorithms`

In [None]:
# Candidate regressors, otherwise left at their library defaults.
# The estimators that accept a seed for their own randomness share one value,
# so fitting them twice on the same rows builds the same model.
RANDOM_STATE = 42

models_to_test = [
    LinearRegression(n_jobs=-1),
    Lasso(),
    Ridge(),
    ElasticNet(),
    XGBRegressor(random_state=RANDOM_STATE),
    LGBMRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    KNeighborsRegressor(n_jobs=-1),
    MLPRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
]

In [None]:
# Benchmark every candidate on paper_price; wood_pulp_price is removed so it is
# not used as a feature. The table is then shown sorted by RMSE.
results_paper_price = compare_models(models=models_to_test, dataframe=data.drop('wood_pulp_price', axis=1), label='paper_price')
display_ordered_rmse(results_paper_price['summary'])

In [None]:
# Save the candidate with the lowest test RMSE together with its scaler.
# Original note: LinearRegression(n_jobs=-1) seems the best.
# The entry is picked from the metric rather than from a fixed list position,
# so the saved model matches the table even when the ranking differs between runs.
best_paper_price_index = results_paper_price['summary']['RMSE'].idxmin()
paper_price_model_with_scaler = results_paper_price['models'][best_paper_price_index]

# The handle is not called `file`: earlier cells keep the CSV path in `file`.
with open('data/models/paper_price_model_with_scaler.pkl', 'wb') as model_file:
    pickle.dump(paper_price_model_with_scaler, model_file)

In [None]:
# Benchmark every candidate on wood_pulp_price; paper_price is removed so it is
# not used as a feature. The table is then shown sorted by RMSE.
results_wood_pulp_price = compare_models(models=models_to_test, dataframe=data.drop('paper_price', axis=1), label='wood_pulp_price')
display_ordered_rmse(results_wood_pulp_price['summary'])

In [None]:
# Save the candidate with the lowest test RMSE together with its scaler.
# Original note: KNeighborsRegressor(n_jobs=-1) seems the best.
# The entry is picked from the metric rather than from a fixed list position,
# so the saved model matches the table even when the ranking differs between runs.
best_wood_pulp_price_index = results_wood_pulp_price['summary']['RMSE'].idxmin()
wood_pulp_price_model_with_scaler = results_wood_pulp_price['models'][best_wood_pulp_price_index]

# The handle is not called `file`: earlier cells keep the CSV path in `file`.
with open('data/models/wood_pulp_price_model_with_scaler.pkl', 'wb') as model_file:
    pickle.dump(wood_pulp_price_model_with_scaler, model_file)