In [1]:
#import any library dependencies
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [23]:
#read in the data
dataset = pd.read_csv("country_pred_delta_forest.csv")
df = dataset.drop(columns=["countryname","Area","forestgroup","gdpgroup","datagroup","popgroup","Unnamed: 0"])
df = df[df["Country.Code"] != "NRU"]
df = df[df["Country.Code"] != "QAT"]
df = df[df['Country.Code'] != 'SMR']
#create a lag by country
df["delta_forest_lag_1"] = df.groupby(["Country.Code"])["delta_forest"].shift(1)

In [24]:
#for_naive.head()
#countries["USA"]
df.head()

Unnamed: 0,Country.Code,yr,pct_forest,gdp,pop,forest_area,x_6796_722511,x_6796_723011,x_6796_723111,x_6796_724311,...,x_6716_5110,x_6717_5110,IncomeGroup.x,lag_gdp,lag_pop,lag_forest,delta_forest,delta_gdp,delta_pop,delta_forest_lag_1
0,ABW,1991,2.333333,872138700.0,64622.0,4.2,0.0,0.0,0.0,0.0,...,0.0,0.42,High income,764887100.0,62149.0,4.2,0.0,0.122975,0.038269,
1,ABW,1992,2.333333,958463200.0,68235.0,4.2,0.0,0.0,0.0,0.0,...,0.0,0.42,High income,872138700.0,64622.0,4.2,0.0,0.090066,0.052949,0.0
2,ABW,1993,2.333333,1082980000.0,72504.0,4.2,0.0,0.0,0.0,0.0,...,0.0,0.42,High income,958463200.0,68235.0,4.2,0.0,0.114976,0.05888,0.0
3,ABW,1994,2.333333,1245688000.0,76700.0,4.2,0.0,0.0,0.0,0.0,...,0.0,0.42,High income,1082980000.0,72504.0,4.2,0.0,0.130617,0.054707,0.0
4,ABW,1995,2.333333,1320475000.0,80324.0,4.2,0.0,0.0,0.0,0.0,...,0.0,0.42,High income,1245688000.0,76700.0,4.2,0.0,0.056636,0.045117,0.0


In [25]:
def persistance_model(x):
    return x

In [26]:
#split dataset by country
def to_country(data, cntry_ix):
    countries = dict()
    #get unique country codes
    country = np.unique(data[ : , cntry_ix])
    #group by country
    for c in country:
        select = data[:, cntry_ix] == c
        countries[c] = data[select, :]
    return countries

In [27]:
values = df.values
countries = to_country(values,0)
print('Total countries: %d' % len(countries))

Total countries: 197


In [28]:
#split into train and test sets
def split_train_test(countries, row_in_country):
    train, test = list(), list()
    #First 21 years for train
    cut_point = 2011
    #list out countries
    for x, rows in countries.items():
        #split by position
        train_rows = rows[rows[:,row_in_country] <= cut_point, :]
        test_rows = rows[rows[:,row_in_country]> cut_point, :]
        if len(train_rows) == 0 or len(test_rows) == 0:
            print("Dropping country=%s: train=%s, test=%s" % (x,train_rows.shape,test_rows.shape))
            continue
        #sort with country id, position, year, targets
        indices = [0,1,53,56]
        train.append(train_rows[: ,indices])
        test.append(test_rows[:, indices])
    return train, test

train, test = split_train_test(countries,1)

In [29]:
test

[array([['ABW', 2012, 0.0, 0.0],
        ['ABW', 2013, 0.0, 0.0],
        ['ABW', 2014, 0.0, 0.0],
        ['ABW', 2015, 0.0, 0.0],
        ['ABW', 2016, 0.0, 0.0]], dtype=object),
 array([['AFG', 2012, 0.0, 0.0],
        ['AFG', 2013, 0.0, 0.0],
        ['AFG', 2014, 0.0, 0.0],
        ['AFG', 2015, 0.0, 0.0],
        ['AFG', 2016, 0.0, 0.0]], dtype=object),
 array([['AGO', 2012, -0.00214322389334072, -0.00213864030833264],
        ['AGO', 2013, -0.0021477597967171, -0.00214322389334072],
        ['AGO', 2014, -0.00215245011397563, -0.0021477597967171],
        ['AGO', 2015, -0.00215709314937085, -0.00215245011397563],
        ['AGO', 2016, -0.00216175625898772, -0.00215709314937085]],
       dtype=object),
 array([['ALB', 2012, -0.00123972980526705, -0.00123811605779376],
        ['ALB', 2013, -0.0012412686430013, -0.00123972980526705],
        ['ALB', 2014, -0.0012427321935037099, -0.0012412686430013],
        ['ALB', 2015, -0.0012443577092106, -0.0012427321935037099],
        ['ALB

In [30]:
test_y = list()
predictions = list()
for x in test:
    for row in x:
        true = row[3]
        yhat = persistance_model(row[2])
        predictions.append(yhat)
        test_y.append(true)
test_mse = mean_squared_error(test_y,predictions)
test_mae = mean_absolute_error(test_y,predictions)
test_rmse = np.sqrt(test_mse)

In [32]:
print("Test MSE: %f" % test_mse)
print("Test MAE: %f" % test_mae)
print("Test RMSE: %f" % test_rmse)

Test MSE: 0.000002
Test MAE: 0.000233
Test RMSE: 0.001340


In [None]:
# convert the test dataset in chunks to [chunk][variable][time] format
def prepare_test_forecasts(test_chunks):
    predictions = list()
    # enumerate chunks to forecast
    for rows in test_chunks:
    # enumerate targets for chunk
    chunk_predictions = list()
    for j in range(3, rows.shape[1]):
        yhat = rows[:, j]
        chunk_predictions.append(yhat)
        chunk_predictions = array(chunk_predictions)
        predictions.append(chunk_predictions)
    return np.array(predictions)

# calculate the error between an actual and predicted value
def calculate_error(actual, predicted):
    # give the full actual value if predicted is nan
    if isnan(predicted):
        return abs(actual)
    # calculate abs difference
    return abs(actual - predicted)