In [1]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them all and
    returns ``None``.
    """
    return None


# Swap the stdlib entry point for the no-op above so that every later call
# to ``warnings.warn(...)`` in this kernel is silently discarded.
warnings.warn = warn

In [2]:
#import required libraries
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

## Model: Decision Tree

In [4]:
# Function to calculate MAPE - Mean Absolute Percentage Error
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the Mean Absolute Percentage Error (MAPE), in percent.

    The percentage error of a sample is ``|y_true - y_pred| / |y_true|``,
    which is undefined when ``y_true`` is zero. Such samples are excluded
    from the average instead of turning the whole result into ``inf``/``nan``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_true - y_pred| / |y_true|)`` over the samples whose
        true value is non-zero, or NaN when no such sample exists
        (including empty input).
    """
    # Broadcast first so the boolean mask below can index both arrays.
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    )

    # Zero targets would cause a division by zero; leave them out.
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    kept_true = y_true[nonzero]
    kept_pred = y_pred[nonzero]
    return np.mean(np.abs((kept_true - kept_pred) / kept_true)) * 100

# Function to load data, clean it, and apply a decision tree
def load_and_apply_decision_tree(file_path, country_name, test_size=0.2,
                                 random_state=42, param_grid=None, cv=5,
                                 scoring='neg_mean_squared_error'):
    """Tune a decision-tree regressor on one CSV and print its test metrics.

    The CSV is read with pandas, the optional 'COUNTRY' column is dropped,
    and the remaining columns (minus 'ID' and 'TARGET') are used as features
    to predict 'TARGET'. The data is split into train and test sets, a
    ``GridSearchCV`` over ``DecisionTreeRegressor`` is fitted on the train
    set, and the best estimator is scored on the test set.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    country_name : str
        Label used as a prefix in the printed report.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the tree.
    param_grid : dict or None, default None
        Grid searched by ``GridSearchCV``. ``None`` selects the default grid
        over ``max_depth``, ``min_samples_split`` and ``min_samples_leaf``.
    cv : int, default 5
        Number of cross-validation folds for the grid search.
    scoring : str, default 'neg_mean_squared_error'
        Scoring passed to ``GridSearchCV``.

    Returns
    -------
    None
        Results are printed: best parameters, then MSE and MAE on the test set.

    Raises
    ------
    ValueError
        If the CSV lacks an 'ID' or a 'TARGET' column.
    """
    data = pd.read_csv(file_path)

    # 'COUNTRY' is categorical and not used as a feature; drop it if present.
    data = data.drop(columns='COUNTRY', errors='ignore')

    # Both bookkeeping columns must exist before features/target are split.
    if any(required not in data.columns for required in ('ID', 'TARGET')):
        raise ValueError("The dataframe does not contain 'ID' and/or 'TARGET' columns.")

    # Features are everything except the identifier and the target.
    X = data.drop(['ID', 'TARGET'], axis=1)
    y = data['TARGET']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build the default grid per call so no mutable default is shared.
    if param_grid is None:
        param_grid = {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 10, 20],
            'min_samples_leaf': [1, 5, 10]
        }

    tree_reg = DecisionTreeRegressor(random_state=random_state)
    grid_search = GridSearchCV(tree_reg, param_grid, cv=cv, scoring=scoring)

    # Hyper-parameters are tuned on the training split only.
    grid_search.fit(X_train, y_train)

    print(f'{country_name} - Best parameters: {grid_search.best_params_}')

    # Score the best estimator on the held-out test split.
    best_tree_reg = grid_search.best_estimator_
    predictions = best_tree_reg.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    # NOTE(review): MAPE was commented out in the original and is still not
    # reported; presumably TARGET can be zero, which makes a percentage
    # error undefined. TODO confirm before re-enabling.
    print(f'{country_name} - MSE: {mse}')
    print(f'{country_name} - MAE: {mae}')

# Apply the decision tree model to the datasets
# One (CSV path, report label) pair per dataset, evaluated in this order.
for csv_path, country_label in (
    ('X_train_DE.csv', 'Germany'),
    ('X_train_FR.csv', 'France'),
):
    load_and_apply_decision_tree(csv_path, country_label)


Germany - Best parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Germany - MSE: 1.4868843256264321
Germany - MAE: 0.7677330995277721
France - Best parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
France - MSE: 1.0878491892743125
France - MAE: 0.652670261328456
