# Analysis of Brazilian Development Indicators

In [84]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Importing database
original_df = pd.read_csv('Data.csv')

# Transposing database for setting features as columns
df_transpose = original_df.T

# Selecting rows and columns that are missing to many entries
rows_to_drop = ['Country Name', 'Country Code', 'Series Code']
columns_to_drop = [4,14,17,19,25,28,30,41,55,56,57,58,59]

# Dropping selected rows and columns
df_dropped_rows = df_transpose.drop(rows_to_drop)
df_dropped_columns_and_rows = df_dropped_rows.drop(columns_to_drop,axis=1)

# Substituting the remaining missing entries by NaN
df_smaller = df_dropped_columns_and_rows.replace('..', np.nan)

# Dropping header for imputation and analysis
df_headless = df_smaller.drop('Series Name')

# Imputing NaN using the mean strategy
imp_mean = SimpleImputer(strategy='mean')
df = pd.DataFrame(imp_mean.fit_transform(df_headless))

# Checking the most important quantities
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,...,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,201425000.0,0.880462,8515770.0,23.778737,6.33,1736959000000.0,8446.666667,2829747000000.0,13795.0,3.35,...,13.144457,78.055551,21.488646,109.252277,483210300000.0,42.410489,38457.727273,2867352000.0,60657630000.0,626417300.0
std,19080900.0,0.367124,0.0,2.156543,5.719035,629006500000.0,2884.807645,778278500000.0,3068.93378,0.497722,...,1.836142,3.723009,5.903429,15.064965,147182300000.0,17.330506,28150.646959,1137732000.0,26277120000.0,352199800.0
min,150706400.0,0.459584,8515770.0,18.031099,1.9,389692200000.0,2590.0,970854100000.0,6440.0,2.3,...,9.001535,72.222233,11.599412,66.442953,120311900000.0,22.638332,-1761.0,573000000.0,989000000.0,174380000.0
25%,203025200.0,0.740875,8515770.0,23.778737,3.85,1691209000000.0,7907.5,2937705000000.0,14135.0,3.175,...,12.270721,75.555556,18.363595,108.776068,483603700000.0,31.87553,16896.25,2686055000.0,44275880000.0,273542500.0
50%,207682300.0,0.802512,8515770.0,24.4461,5.3,1824244000000.0,8780.0,3074636000000.0,14570.0,3.325,...,13.144457,78.055551,19.991748,112.346801,543327300000.0,41.0863,36882.863636,2818348000.0,69029950000.0,619473700.0
75%,212136200.0,0.860463,8515770.0,24.996036,5.9325,1985862000000.0,9455.0,3134553000000.0,15332.5,3.525,...,14.355218,78.819439,22.977458,115.770594,557125400000.0,50.358831,59643.5,3301771000.0,75948950000.0,936410000.0
max,215313500.0,1.809856,8515770.0,25.507625,24.0,2571102000000.0,12750.0,3716656000000.0,17260.0,4.5,...,15.999958,84.444433,32.622387,125.759743,606484600000.0,86.182161,82589.0,4969231000.0,91502100000.0,1155730000.0


In [85]:
# Setting the target for prediction
y = df.loc[:,32] # target is GDP Growth (annual %)

In [86]:
# Defining the correlation matrix
df_corr = df.corr()

# Transforming the correlation matrix into an upper triangular matrix
upper_df_corr = df_corr.where(np.triu(np.ones(df_corr.shape), k=1).astype(bool))

# Selecting columns that present at least one entry of correlation greater than 0.95
high_corr_columns = [column for column in upper_df_corr.columns if any(upper_df_corr[column] > 0.95)]

# Defining the features to be used based on the previously selected columns
X = df.loc[:,high_corr_columns]

# Checking features to be used
X.head()

Unnamed: 0,6,7,8,10,14,20,21,22,24,26,28,35,36,39,40,41
0,2590.0,970854100000.0,6440.0,65.985,63.2,5888980.0,0.735736,2.87473,464989100000.0,2736.970678,33.271883,2.363841,0.000443,11.599412,66.442953,120311900000.0
1,3910.0,1536382000000.0,8740.0,69.737,34.7,5510886.0,0.991168,2.227694,655448200000.0,5.606065,23.006619,1.730726,13.184557,17.356368,100.0,242511700000.0
2,12750.0,3090348000000.0,15320.0,73.918,16.7,5069631.6,1.190697,1.204556,2472819000000.0,7.504565,21.22481,1.329446,134.392933,19.606204,125.759743,483734800000.0
3,12080.0,3127765000000.0,15370.0,74.306,16.3,5054239.8,1.148596,1.189409,2456044000000.0,7.84671,20.472463,1.330244,137.977627,18.797955,121.483706,556919600000.0
4,10160.0,2958026000000.0,14420.0,74.332,15.9,5038848.0,1.128069,1.170641,1802212000000.0,7.566175,19.360044,1.365517,125.647707,20.377292,107.347442,543397400000.0


In [104]:
# Splitting dataframe for training and validation of the model
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Applying Random Forest model
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)

# Checking the model's MAE
y_prediction = forest_model.predict(val_X)
print("The model's mean absolute error is:")
print(mean_absolute_error(val_y, y_prediction))

# Checking model's predictions
prediction = forest_model.predict(X)
difference = y-prediction
print("The predictions are:")
print(prediction)

# Checking prediction's standard deviation
print("The standard deviation associated to the predicted values is:")
print(np.std(prediction))

# Checking mean difference between true and predicted values
print("The mean difference between the actual values and predicted values is:")
print(difference.mean())


The model's mean absolute error is:
0.6538899120975662
The predictions are:
[28.18969306 28.15879452 28.19917786 27.8097134  28.52904271 29.28317788
 28.50493314 28.67445285 28.36574661 26.41237762 27.56616526 27.64957419]
The standard deviation associated to the predicted values is:
0.6824228691778765
The mean difference between the actual values and predicted values is:
0.09674765638843397


# Conclusions

In this work, a Random Forest regression was applied to a dataset of Brazilian development indicators through the years 1990 to 2022 in order to predict Brazil's GDP. The result was a model of 0.65 mean average error, what implies the model was successful in its predictions. Note, however, that in order for this model to be capable of predicting future values for GDP, it would need to be fed with data from other development indicators of the year it is intended to predict.