# Machine Learning

The objective of this notebook is to predict the next year's GDP value. This is achieved by using all the indicators available in the dataset, together with the data of the countries that belong to the same cluster as the selected one.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import seaborn as sb
import pandas as pd
import os
import ipywidgets as widgets
import warnings
warnings.filterwarnings("ignore")
from turtle import color
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
 
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')
 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from Project.Utils.visualize import  get_zone
from sklearn.neighbors import KNeighborsRegressor
from Project.Utils.visualize import normalize_by_country


Set all the needed variables.

In [2]:
# Input locations, resolved relative to the current working directory.
# Both directory paths keep a trailing separator so that plain string
# concatenation (path + 'file.csv') keeps working.
output_path = os.path.join(os.getcwd(), 'Output', '')
cluster_path = os.path.join(output_path, 'Cluster', '')

# Indicator table indexed by (Country, Year, Region).
df_gold = pd.read_csv(
    os.path.join(output_path, 'GoldDataframe.csv'),
    index_col=['Country', 'Year', 'Region'],
)
# Table with (at least) the 'Country' and 'Cluster' columns read by get_cluster_countries.
df_cluster = pd.read_csv(os.path.join(cluster_path, 'All indicators.csv'))

# Sorted list of the distinct country names; feeds the dropdown widget.
country_list = list(np.sort(df_gold.index.get_level_values('Country').unique()))

Get a dataframe with the data of all the countries that are in the same cluster as the selected one. It uses the dataframes generated in the cluster notebook. The dataframe is normalized before it is returned.

In [3]:
def get_cluster_countries(country, df_gold, df_cluster):
    """Collect the normalized data of every country sharing ``country``'s cluster.

    Parameters
    ----------
    country : str
        Country whose cluster is looked up in ``df_cluster``.
    df_gold : pandas.DataFrame
        Indicator table whose index has a 'Country' level.
    df_cluster : pandas.DataFrame
        Table with a 'Country' and a 'Cluster' column.

    Returns
    -------
    (pandas.DataFrame, list)
        The cluster's rows, sorted on index level 1, normalized with
        ``normalize_by_country``, stripped of every column containing a NaN
        and re-indexed 0..n-1; and the list of countries in the cluster as
        listed in ``df_cluster``.

    Raises
    ------
    ValueError
        If ``country`` does not map to exactly one row of ``df_cluster``.
    """
    cluster_ids = df_cluster.loc[df_cluster['Country'] == country, 'Cluster']
    if len(cluster_ids) != 1:
        # Series.item() would raise a ValueError here as well, but with a
        # message that does not say which country is the problem.
        raise ValueError(
            "Expected exactly one cluster entry for country {0!r}, found {1}".format(
                country, len(cluster_ids)
            )
        )
    country_cluster_target = cluster_ids.item()
    country_cluster_list = df_cluster.loc[
        df_cluster['Cluster'] == country_cluster_target, 'Country'
    ].tolist()

    in_cluster = df_gold.index.get_level_values('Country').isin(country_cluster_list)
    # Level 1 is 'Year' for the frame loaded by this notebook - TODO confirm for other callers.
    # machine_algorithm relies on this year-major ordering.
    df = df_gold.loc[in_cluster].sort_index(level=1)
    df = normalize_by_country(df)
    # Keep only indicators available for every row, then discard the
    # (Country, Year, Region) index so rows are addressed by position.
    df = df.dropna(axis=1).reset_index(drop=True)

    return df, country_cluster_list

Algorithm that builds the model used to predict the GDP value. It uses ElasticNetCV, which suits our case well and lets us tune these parameters:
- max_iter: number of iterations
- tol: is the tolerance for the stopping criteria.

The score variable holds the coefficient of determination (R²) of the model on the held-out test split; the closer it is to 1, the better the fit.

In [4]:
def machine_algorithm(df, country_cluster_list):
    """Fit an ElasticNetCV model that maps indicator rows to a later GDP value.

    The rows of ``df`` are paired by position: features come from every row
    except the last ``len(country_cluster_list)``, and the target is the 'GDP'
    column of every row except the first ``len(country_cluster_list)``.
    Presumably this pairs each country-year with the same country's GDP one
    year later, which only holds if ``df`` is ordered by year with one row per
    listed country in each year - TODO confirm against the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'GDP' column plus the indicator columns.
    country_cluster_list : list
        Countries of the cluster; only its length is used, as the row offset.

    Returns
    -------
    (model, float, pandas.DataFrame)
        The fitted estimator, its ``score`` on the 30 % test split, and the
        feature frame (all columns except 'GDP') the model was built from.

    Raises
    ------
    ValueError
        If the offset is zero or leaves no rows to train on.
    """
    shift = len(country_cluster_list)
    if shift == 0 or len(df) <= shift:
        # df.iloc[:-0] would silently yield an empty frame; fail with a clear message instead.
        raise ValueError(
            "Need more than {0} rows to build the shifted training set, got {1}".format(
                shift, len(df)
            )
        )

    # 1-D target: passing a single-column DataFrame makes sklearn flatten it
    # behind a DataConversionWarning.
    target = df['GDP'].iloc[shift:]
    feature_df = df.drop(columns='GDP').iloc[:-shift]

    # train_test_split pairs rows by position, so the differing index labels
    # of features and target do not matter.
    X_train, X_test, y_train, y_test = train_test_split(
        feature_df, target, test_size=0.3, random_state=0
    )

    alphas = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]
    elastic_cv = ElasticNetCV(alphas=alphas, cv=5, max_iter=100000, tol=0.001)
    model = elastic_cv.fit(X_train, y_train)
    score = model.score(X_test, y_test)

    return model, score, feature_df

First, normalize the target country's dataframe and keep an unnormalized copy. Only the last available year's indicators are fed to the model, so the variable prediction holds the estimate for the following year, 2021.

After all the procedures, plot the data and analyze the prediction.

In [5]:
def plot_machine(model, score, feature_df, country, df_gold):
    """Plot a country's GDP history together with the model's next-year estimate.

    Parameters
    ----------
    model : fitted estimator
        Object exposing ``predict``; expected to have been trained on
        ``feature_df``'s columns.
    score : float
        Model score; ``1 - score`` is used as the relative half-width of the
        shaded band around the prediction segment.
    feature_df : pandas.DataFrame
        Training feature frame; only its columns are used, to select and
        order the model inputs.
    country : str
        Country to plot.
    df_gold : pandas.DataFrame
        Indicator table with 'Country' and 'Year' index levels and a 'GDP'
        column.

    Raises
    ------
    ValueError
        If ``df_gold`` has no rows for ``country``.

    Notes
    -----
    The last row of the country's data is treated as its most recent year,
    and the prediction is drawn one year after it.
    """
    is_country = df_gold.index.get_level_values('Country') == country
    df_country_original = df_gold.loc[is_country].copy()
    if df_country_original.empty:
        raise ValueError("No data available for country {0!r}".format(country))
    df_country = normalize_by_country(df_country_original.copy())

    # Feed the model exactly the columns it was trained on, in training
    # order, using only the most recent row as input.
    pred_df = df_country.loc[:, feature_df.columns].tail(1)
    prediction = model.predict(pred_df)
    # Rescale to original units; assumes normalize_by_country divides GDP by
    # the country's maximum - TODO confirm against the helper.
    next_year_predict = prediction[-1] * df_country_original['GDP'].max()

    years = df_country_original.index.get_level_values('Year').values
    last_year = int(years[-1])
    next_year = last_year + 1

    # .iloc: plain [-1] on a label-indexed Series is deprecated positional fallback.
    last_gdp = df_country_original['GDP'].iloc[-1]
    aux_line = np.array([last_gdp, next_year_predict])
    std_1 = 1 - score

    fig, ax = plt.subplots()
    ax.plot(next_year, next_year_predict, 'ro', label='Prediction')  # Dot
    ax.fill_between(
        np.array([last_year, next_year]),
        aux_line * (1 - std_1),
        aux_line * (1 + std_1),
        color='r',
        alpha=0.2,
    )  # Band around the prediction segment
    # No fmt string: 'b-' contradicted the explicit color and linestyle.
    ax.plot([last_year, next_year], aux_line, color='red', linestyle='--')  # Dashed connector
    ax.plot(years, df_country_original['GDP'], color='blue', label='Original')  # Original GDP

    ax.legend()
    ax.set_title('Following year GDP prediction')
    plt.show()


Widget to call all the above methods.

In [6]:
def table_machine(zone):
    """Run the full pipeline for ``zone``: gather its cluster data, fit the model, plot.

    Reads the notebook-level ``df_gold`` and ``df_cluster`` frames.
    """
    cluster_df, cluster_members = get_cluster_countries(zone, df_gold, df_cluster)
    # machine_algorithm returns (model, score, feature_df), which are exactly
    # the leading positional arguments of plot_machine.
    fitted = machine_algorithm(cluster_df, cluster_members)
    plot_machine(*fitted, zone, df_gold)


# Country selector. 'Afghanistan' stays the default when present; otherwise
# fall back to the first option, since Dropdown rejects a value that is not
# among its options.
zone_drop_machine = widgets.Dropdown(
    options=country_list,
    value='Afghanistan' if 'Afghanistan' in country_list else next(iter(country_list), None),
    description='Zone:',
)

# The trailing semicolon keeps the returned function's repr out of the cell output.
widgets.interact(table_machine, zone=zone_drop_machine);

interactive(children=(Dropdown(description='Zone:', options=('Afghanistan', 'Albania', 'Algeria', 'Angola', 'A…

<function __main__.table_machine(zone)>