# HIPOTESE: Como a economia afeta o setor publico e privado de maneiras diferentes

## Importação das livrarias

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Lendo dataset

In [4]:
df_economy = pd.read_csv('data/BRICS Development Indicators/Economy_Data.csv', sep=';')
df_private_sector = pd.read_csv('data/BRICS Development Indicators/PrivateSector_Data.csv', sep=';')
df_public_sector = pd.read_csv('data/BRICS Development Indicators/PublicSector_Indicators.csv', sep=';')

## Seleção de 'Features'

In [10]:
df_economy.head()

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value
0,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,Brazil,BRA,1970.0,
1,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,China,CHN,1970.0,
2,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,India,IND,1970.0,
3,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,Russian Federation,RUS,1970.0,
4,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,South Africa,ZAF,1970.0,


In [42]:
df_economy = df_economy.dropna()

In [82]:
# pip install category_encoders

In [46]:
from category_encoders.ordinal import OrdinalEncoder

In [48]:
oencoder = OrdinalEncoder(
mapping = [{
    'col': 'CountryName',
    'mapping': {'Brazil': 1, 'China': 2, 'India': 3, 'Russian Federation': 4, 'South Africa': 5}
    }])

In [49]:
df_economy.loc[:, 'Country Encoded'] = oencoder.fit_transform(df_economy['CountryName'])['CountryName'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [52]:
df_economy.head()

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,Year,Value,Country Encoded
5,Adjusted net national income (constant 2010 US$),NY.ADJ.NNTY.KD,Brazil,BRA,1970.0,391897400000.0,1
7,Adjusted net national income (constant 2010 US$),NY.ADJ.NNTY.KD,India,IND,1970.0,191533500000.0,3
10,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,Brazil,BRA,1970.0,37860210000.0,1
11,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,China,CHN,1970.0,85255610000.0,2
12,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,India,IND,1970.0,57767200000.0,3


In [59]:
df_econ_series = df_economy.drop(['CountryCode', 'SeriesCode', 'CountryName'], axis=1).set_index('SeriesName').dropna(axis=0).T
df_econ_series

SeriesName,Adjusted net national income (constant 2010 US$),Adjusted net national income (constant 2010 US$).1,Adjusted net national income (current US$),Adjusted net national income (current US$).1,Adjusted net national income (current US$).2,Adjusted net national income per capita (constant 2010 US$),Adjusted net national income per capita (constant 2010 US$).1,Adjusted net national income per capita (current US$),Adjusted net national income per capita (current US$).1,Adjusted net national income per capita (current US$).2,...,"Travel services (% of service imports, BoP)","Travel services (% of service imports, BoP).1","Travel services (% of service imports, BoP).2","Travel services (% of service imports, BoP).3","Travel services (% of service imports, BoP).4","Use of IMF credit (DOD, current US$)","Use of IMF credit (DOD, current US$).1","Use of IMF credit (DOD, current US$).2","Use of IMF credit (DOD, current US$).3","Use of IMF credit (DOD, current US$).4"
Year,1970.0,1970.0,1970.0,1970.0,1970.0,1970.0,1970.0,1970.0,1970.0,1970.0,...,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0,2019.0
Value,391897400000.0,191533500000.0,37860210000.0,85255610000.0,57767200000.0,4120.323413,344.987343,398.053948,104.184348,104.049464,...,25.359912,50.369229,17.554619,36.517225,20.041388,3992325000.0,9665492000.0,5501237000.0,7843114000.0,2468918000.0
Country Encoded,1.0,3.0,1.0,2.0,3.0,1.0,3.0,1.0,2.0,3.0,...,1.0,2.0,3.0,4.0,5.0,1.0,2.0,3.0,4.0,5.0


### Particionando

In [30]:
from sklearn.model_selection import train_test_split

In [60]:
df_train, df_test = train_test_split(df_econ_series, test_size=0.3)

In [61]:
x_train = df_train
x_test = df_test

### Treshold

In [62]:
from sklearn.feature_selection import VarianceThreshold

In [63]:
vt = VarianceThreshold(threshold=0)

In [64]:
vt.fit(x_train)

VarianceThreshold(threshold=0)

In [70]:
df_variance = pd.DataFrame({'Feature' : df_economy['SeriesName'],
                          'Variance' : vt.variances_}).sort_values('Variance', ascending=True)

In [113]:
df_variance['Feature'].values.tolist()

['Adjusted savings: education expenditure (% of GNI)',
 'Adjusted savings: education expenditure (% of GNI)',
 'Agriculture, forestry, and fishing, value added (annual % growth)',
 'Agriculture, forestry, and fishing, value added (annual % growth)',
 'PPP conversion factor, private consumption (LCU per international $)',
 'Adjusted savings: carbon dioxide damage (% of GNI)',
 'Foreign direct investment, net inflows (% of GDP)',
 'Industry (including construction), value added (annual % growth)',
 'Adjusted savings: education expenditure (% of GNI)',
 'Adjusted savings: carbon dioxide damage (% of GNI)',
 'Adjusted savings: energy depletion (% of GNI)',
 'Total debt service (% of GNI)',
 'Adjusted savings: net national savings (% of GNI)',
 'Adjusted savings: education expenditure (% of GNI)',
 'Agriculture, forestry, and fishing, value added (annual % growth)',
 'Households and NPISHs Final consumption expenditure (annual % growth)',
 'Adjusted net national income per capita (annual % 

### Filters

Aplicaremos dois filtros, 'ANOVA' e 'mutual information'

In [None]:
df_public_sector['Value'].dropna()

In [108]:
df_economy['CountryCode'].values.tolist()

['BRA',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'BRA',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'RUS',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'RUS',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'RUS',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'RUS',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'ZAF',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'ZAF',
 'ZAF',
 'IND',
 'ZAF',
 'BRA',
 'IND',
 'ZAF',
 'ZAF',
 'ZAF',
 'ZAF',
 'BRA',
 'IND',
 'BRA',
 'IND',
 'BRA',
 'CHN',
 'IND',
 'ZAF',
 'BRA',
 'IND',
 'ZAF',
 'BRA',
 'CHN',
 'IND',


In [79]:
y_train, y_test = train_test_split(df_public_sector['Value'].dropna(), test_size=0.3)

In [77]:
from sklearn.feature_selection import f_regression

In [80]:
f, p = f_regression(x_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [2, 18786]