In [None]:
# %load_ext pycodestyle_magic
# %%pycodestyle
# https://stackoverflow.com/a/54278757

import pandas as pd
import numpy as np

# column labels that the rest of the notebook refers to by name
weo_subject_code = 'WEO Subject Code'
estimates_after = 'Estimates Start After'
iso_col = 'ISO'
country_name = 'Country'

# identifying columns first, then one column per year 1980-2019,
# then the marker column telling where estimates start
usecols = [
    iso_col,
    weo_subject_code,
    country_name,
    *(str(year) for year in range(1980, 2020)),
    estimates_after,
]

# the file is parsed as delimited text (read_table) despite its .xls extension
df = pd.read_table(
    'WEOOct2020all.xls',
    # nrows=10,
    encoding='UTF-16-LE',
    usecols=usecols,
)


In [None]:
''' select top 10 GDP per capita countries '''

# filtering gdp per capita
gdppc = 'NGDPRPPPPC'


def gdp_per_capita_common_dollar(frame):
    """Return a boolean mask of the rows whose WEO subject code is ``gdppc``."""
    return frame[weo_subject_code] == gdppc


# the year to rank on and the year it is compared against
year_col = '2019'
year_col_before = str(int(year_col) - 1)
gdp_increase_col = f'GDP Increase from {year_col_before} to {year_col}'

# Build an independent frame (country index, the two year columns) instead of
# assigning into a slice of `df`, which triggered SettingWithCopy behaviour.
# Values arrive as strings with thousands separators, so the commas are
# stripped before casting; sorting requires numeric values.
# https://stackoverflow.com/a/57064872
gdp_per_capita_df = (
    df.loc[gdp_per_capita_common_dollar(df),
           [country_name, year_col, year_col_before]]
    .set_index(country_name)
    .replace(',', '', regex=True)
    .astype(float)
)

# Row-by-row difference, vectorized. An earlier attempt subtracted the year
# column from itself, which is why it showed zeros everywhere; routing rows
# through a mixed str/float numpy array is what lost the numeric dtype.
year_over_year = gdp_per_capita_df[year_col] - gdp_per_capita_df[year_col_before]

delta_df = (
    gdp_per_capita_df
    .assign(**{gdp_increase_col: year_over_year})
    # countries missing either year cannot be ranked
    .dropna()
    # sorting on gdp increase col, largest increase first
    .sort_values(gdp_increase_col, ascending=False)
)

# select top 10
delta_df.head(10)

# select first row of column
# https://stackoverflow.com/a/25254087
#gdp_per_capita_df.iloc[0, gdp_per_capita_df.columns.get_loc(year_col)]

# https://stackoverflow.com/a/64307654
#df[gdp_per_capita_common_dollar].values
# for i, row in enumerate(df[gdp_per_capita_common_dollar].values):
#     if row[2] < 1:
#         print(i,row)

# df_only_iso =  df[gdp_per_capita_common_dollar][iso_col]
# df_only_iso


In [None]:

# OECD country names in upper case; kept as a dict whose values are all ''
oecd_countries_all_caps = dict.fromkeys(
    (
        'AUSTRIA',
        'AUSTRALIA',
        'BELGIUM',
        'CANADA',
        'CHILE',
        'COLOMBIA',
        'CZECH REPUBLIC',
        'DENMARK',
        'ESTONIA',
        'FINLAND',
        'FRANCE',
        'GERMANY',
        'GREECE',
        'HUNGARY',
        'ICELAND',
        'IRELAND',
        'ISRAEL',
        'ITALY',
        'JAPAN',
        'KOREA',
        'LATVIA',
        'LITHUANIA',
        'LUXEMBOURG',
        'MEXICO',
        'NETHERLANDS',
        'NEW ZEALAND',
        'NORWAY',
        'POLAND',
        'PORTUGAL',
        'SLOVAK REPUBLIC',
        'SLOVENIA',
        'SPAIN',
        'SWEDEN',
        'SWITZERLAND',
        'TURKEY',
        'UNITED KINGDOM',
        'UNITED STATES',
    ),
    '',
)

# same names in title case ('UNITED KINGDOM' -> 'United Kingdom'),
# again mapped to empty strings
oecd_countries = {name.title(): '' for name in oecd_countries_all_caps}
# oecd_countries


In [None]:
import matplotlib.pyplot as plt

'''
plot of OECD countries population
'''

population_key = 'LP'

# year columns of the 2010-2019 decade (column labels are strings)
decade = [str(year) for year in range(2010, 2020)]

# only a limited subset of countries is plotted, as the population change
# is better visible than with all OECD countries on one axis
plotted_countries = ['Lithuania', 'Latvia', 'Iceland']

# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
population_df = (
    # rows holding the population series
    df.loc[df[weo_subject_code] == population_key]
    # index by country so rows can be picked by name
    .set_index(country_name)
    # OECD rows and decade columns; the dict is turned into a list of labels
    # because pandas >= 2.0 raises TypeError for dict indexers
    .loc[list(oecd_countries), decade]
    # transpose into a plottable time series: one row per year
    .T
    # values are read as text and need casting to a numeric type
    .astype(float)
    .loc[:, plotted_countries]
)

# draw on an explicit axes; a bare plt.figure() followed by DataFrame.plot()
# leaves an empty extra figure behind
fig, ax = plt.subplots()
population_df.plot(ax=ax)


In [None]:
'''save all countries GDP in different PNG files'''
import os

# Gross domestic product, current prices
# Values are based upon GDP in national currency converted to U.S. dollars
gdp_key = 'NGDPD'

# one GDP row per country, indexed by country name
gdp_df = df.loc[df[weo_subject_code] == gdp_key].set_index(country_name)

# keep only the period columns; the bookkeeping columns are not part of
# the time series
non_period_cols = (iso_col, weo_subject_code, estimates_after)
selected_cols = [col for col in gdp_df.columns if col not in non_period_cols]
gdp_df = gdp_df[selected_cols]

# https://stackoverflow.com/a/49896522
# column by column: strip the thousands separator, then cast to float
gdp_df = gdp_df.apply(
    lambda column: column.str.replace(',', '', regex=False).astype(float),
    axis=0,
)

# prepare folder for pics (no error when it already exists)
figure_folder_name = 'figures'
os.makedirs(figure_folder_name, exist_ok=True)

# remove [0:1] to save all
for country in gdp_df.index[0:1]:
    # a dedicated figure per country, so nothing is drawn onto whatever
    # axes happened to be current
    fig, ax = plt.subplots()
    gdp_df.loc[country].plot(ax=ax)
    ax.set_title(country)
    # os.path.join instead of a hard-coded backslash: the old f-string used
    # the invalid escape '\{' and only produced a sub-folder path on Windows
    fig.savefig(
        os.path.join(figure_folder_name, f'{country}.png'),
        format='png',
        transparent=False,
    )
    # closes the plot, as no need to display, while saving
    # https://stackoverflow.com/a/15713545
    plt.close(fig)



In [None]:
''' find lowest common denominator for year 2015 '''
# WEO codes of the rows that have a value for 2015, grouped by code.
# Grouping by the scalar label (not a one-element list) keeps the group keys
# plain codes; newer pandas turns keys of list groupers into 1-tuples.
common_denominator_df = (
    df[[weo_subject_code, '2015']]
    # removes all na values
    .dropna()
    # selecting only WEO code and grouping to display
    [[weo_subject_code]]
    .groupby(weo_subject_code)
)

# common_denominator_df.apply(print)
# https://stackoverflow.com/a/36951842
# the group keys are the distinct WEO codes
lowest_common_denom = list(common_denominator_df.groups)

# uncomment for display
# lowest_common_denom


In [None]:
''' K-Means clustering '''
from sklearn.cluster import KMeans

# still requires GDP_key
volume_of_exported_goods_key = 'TXG_RPCH'

# fixed seed: KMeans initialisation is random, so without it the clusters
# differ between kernel restarts
kmeans = KMeans(n_clusters=5, random_state=0)

In [None]:
''' GDP per capita prediction '''

from sklearn.linear_model import LinearRegression
# from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump, load


def _to_float(value):
    """Convert one table cell to float; strings may carry ',' thousands separators."""
    if isinstance(value, str):
        return float(value.replace(',', ''))
    return float(value)


def country_select(frame):
    """Return a boolean mask of the rows that belong to Germany."""
    return frame[country_name] == 'Germany'


# select non GDP related weo keys
gdp_weo_key = 'GDP'

# use regular expr with bitmask to filter out all GDP related fields
# https://stackoverflow.com/a/17097777
is_gdp_related = (
    df[weo_subject_code].str.contains(gdp_weo_key, na=False)
    | df[weo_subject_code].str.contains('PPPC', na=False)
)
input_features = df[~is_gdp_related].fillna(0.0)

#drop_clumns = [country_name,iso_col, weo_subject_code, estimates_after]
drop_clumns = [country_name, iso_col, estimates_after]

# one country only; after the transpose every row is a year and every column
# a WEO series, plus one extra row that holds the WEO codes themselves
input_features = input_features.loc[country_select]
#input_features = input_features.set_index(country_name)
input_features = input_features.drop(columns=drop_clumns)
input_features = input_features.T

# save feature column codes
feature_weo_codes = tuple(input_features.loc[weo_subject_code])

# building feature data: one list of floats per year.
# Each cell is converted on its own value; the previous loop converted
# row[1] for every string cell, copying the first column across the row.
cleand_features = [
    [_to_float(cell) for cell in year_row]
    for year_row in input_features
    .drop(index=weo_subject_code)
    .itertuples(index=False, name=None)
]

# cleand_features
print(feature_weo_codes)

drop_columns_inlcuding_weo = [country_name, iso_col, estimates_after, weo_subject_code]

# target: GDP per capita of the same country, one value per year
gdp_data = df.loc[gdp_per_capita_common_dollar].dropna()
gdp_data = gdp_data.loc[country_select].drop(columns=drop_columns_inlcuding_weo)
# gdp_data = gdp_data.apply(lambda df: df.str.replace(',','').astype(float), axis=0)
cleand_result = [_to_float(year_row[1]) for year_row in gdp_data.T.itertuples()]

#cleand_result

# features and targets must line up year by year before they are split
assert len(cleand_features) == len(cleand_result), \
    'feature rows and target values cover a different number of years'

# the last 10 years are held out; everything before is used for training
training_features = cleand_features[:-10]
training_gdp = cleand_result[:-10]

training_features_test = cleand_features[-10:]
training_gdp_test = cleand_result[-10:]

# training
# https://machinelearningmastery.com/make-predictions-scikit-learn/
# https://machinelearningmastery.com/calculate-feature-importance-with-python/

model = LinearRegression()
model.fit(training_features, training_gdp)

# get importance
importance = model.coef_
# summarize feature importance (only coefficients above 100 are listed)
for i, v in enumerate(importance):
    if v > 100:
        print('Feature: %0d, Score: %.5f, weo_key: %s' % (i, v, feature_weo_codes[i]))
# plot feature importance
plt.bar(range(len(importance)), importance)
plt.show()

# prediction
# predict = training_features_test
# result = training_gdp_test

# result_predicted = model.predict(predict)

# # https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
# # The coefficients
# print('Coefficients: \n', model.coef_)
# # The mean squared error
# print('Mean squared error: %.2f'
#       % mean_squared_error(result, result_predicted))
# # The coefficient of determination: 1 is perfect prediction
# print('Coefficient of determination: %.2f'
#       % r2_score(result, result_predicted))

# # print(predict,result,result_predicted[0])

# # https://scikit-learn.org/stable/modules/model_persistence.html
# file_name = 'filename.joblib'
# dump(model, file_name)
# model2 = load(file_name)

# result_predicted2 = model2.predict(predict)
# print('Coefficients: \n', model2.coef_
#       % mean_squared_error(result, result_predicted2))
# print('Coefficient of determination: %.2f'
#       % r2_score(result, result_predicted2))



In [None]:
# scratch check: a plain decimal string parses with float()
float('11142.727')
# feature scores recorded from an earlier run
'''Feature: 0, Score: 1.42880
Feature: 1, Score: 1280.82544
Feature: 2, Score: 9573.63457
Feature: 3, Score: -415.47826
Feature: 4, Score: -178.41465
Feature: 5, Score: 249.30285
Feature: 6, Score: -241.76799
Feature: 7, Score: 132.76052
Feature: 8, Score: -118.34678
Feature: 9, Score: 229.52691
Feature: 10, Score: -74.05452
Feature: 11, Score: 31.97825'''

# feature_weo_codes is a plain sequence of codes and has no .groupby();
# wrap it in a Series so each feature index is shown next to its WEO code
pd.Series(list(feature_weo_codes), name=weo_subject_code)


In [None]:
# import io
# t="""a,b,c
# 0,1,2
# 3,4,5"""
# pd.read_csv(io.StringIO(t), header=0, usecols=['a'])


# plt.close('all')

# ts = pd.Series(np.random.randn(1000),
#    index=pd.date_range('1/1/2000', periods=1000))

#ts
# ts = ts.cumsum()
# ts.plot()

# plot_test_df = pd.DataFrame(np.random.randn(1000, 4),
#    index=ts.index, columns=list('ABCD'))

#plot_test_df


#plot_test_df = plot_test_df.cumsum()

# plt.figure()

# plot_test_df.plot()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
# synthetic regression problem: 100 samples, 2 features, fixed seed
X, y = make_regression(n_samples=100, n_features=2, noise=0.1, random_state=1)
# fit() returns the estimator itself, so creation and fitting are chained
model = LinearRegression().fit(X, y)
# keep the fitted estimator as the cell output
model

In [None]:
# a single new sample with two feature values
Xnew = [[-1.07296862, -0.52817175]]
# predict() returns one value per input row
ynew = model.predict(Xnew)
# report the sample together with its prediction
sample, predicted = Xnew[0], ynew[0]
print("X=%s, Predicted=%s" % (sample, predicted))

In [None]:
# inspect which type make_regression returned for the targets `y`
type(y)