# Dados do IDH

In [1]:
import numpy as np
import pandas as pd

!pip install pycountry
import pycountry



Could not install packages due to an EnvironmentError: [Errno 2] No such file or directory: 'd:\\anaconda\\envs\\icd\\lib\\site-packages\\sphinxcontrib_qthelp-1.0.2.dist-info\\METADATA'

You are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## Leitura dos dados

- **Fonte**:
    - https://www.kaggle.com/ruslankl/european-union-lgbt-survey-2012 (LGBT-Survey)
    - http://hdr.undp.org/en/indicators/137506 (Human Development Reports – Índice de Desenvolvimento Humano, PNUD)
    

- **Obs.:** Convertemos os dados da segunda fonte para o formato csv (vide última célula deste notebook). Tivemos que remover um país que estava dando erro na leitura (isso não influenciará a nossa análise).

In [2]:
# Inputs for the HDI section: the survey's daily-life table and the HDI table
# that was converted to CSV beforehand.
daily_life_csv = 'Data/european-union-lgbt-survey-2012/LGBT_Survey_DailyLife.csv'
hdi_csv = 'Data/cleaned-survey-data-2012/clean_hdi.csv'

df_DailyLife = pd.read_csv(daily_life_csv)
df_HDI = pd.read_csv(hdi_csv)

# The first column holds each country's rank in 2017.
df_HDI.head()

Unnamed: 0,(2017),Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,168,Afghanistan,..,..,..,..,..,..,..,..,...,0.437,0.453,0.463,0.471,0.482,0.487,0.491,0.493,0.494,0.498
1,68,Albania,0.645,0.626,0.610,0.613,0.619,0.632,0.641,0.641,...,0.724,0.729,0.741,0.752,0.767,0.771,0.773,0.776,0.782,0.785
2,85,Algeria,0.577,0.581,0.587,0.591,0.595,0.600,0.608,0.617,...,0.709,0.719,0.729,0.736,0.74,0.745,0.747,0.749,0.753,0.754
3,35,Andorra,..,..,..,..,..,..,..,..,...,0.831,0.83,0.828,0.827,0.849,0.85,0.853,0.854,0.856,0.858
4,147,Angola,..,..,..,..,..,..,..,..,...,0.502,0.522,0.52,0.535,0.543,0.554,0.564,0.572,0.577,0.581


## Limpeza dos dados

In [3]:
# Keep only the country name and its HDI value for 2012.
df_HDI = df_HDI[['Country', '2012']]

# Restrict the table to the countries that appear in the survey. The survey
# column is called `CountryCode`, but it is matched against country names here.
# A dedicated variable name is used on purpose: the global `countries` is the
# name -> code dictionary read by `clean_data`, and overwriting it with an
# array would break that function if this cell were re-run later.
# NOTE(review): `isin` requires identical spellings in both sources; a country
# spelled differently in the HDI table is dropped silently - confirm that every
# surveyed country survives this filter.
surveyed_countries = df_DailyLife['CountryCode'].unique()
df_HDI = df_HDI.loc[df_HDI['Country'].isin(surveyed_countries)]

# The raw table marks missing values with '..', so a year column may have been
# read as text. Coerce to numbers so the ordering below is numeric.
df_HDI = df_HDI.assign(**{'2012': pd.to_numeric(df_HDI['2012'], errors='coerce')})

# Sort ascending by HDI and renumber the rows from zero.
df_HDI = df_HDI.sort_values(by='2012').reset_index(drop=True)

df_HDI.head()

Unnamed: 0,Country,2012
0,Bulgaria,0.786
1,Romania,0.795
2,Croatia,0.816
3,Latvia,0.824
4,Portugal,0.829


## Salvando em um novo CSV

In [4]:
# Write the filtered 2012 HDI table to disk without the row index.
hdi_eu_csv = 'Data/cleaned-survey-data-2012/HDI_EU-2012.csv'
df_HDI.to_csv(hdi_eu_csv, index=False)

- Aqui temos o código que fizemos para realizar as conversões explicadas na observação do começo deste notebook.
- Deixamos comentado justamente por termos mexido manualmente depois devido àquele erro.

In [5]:
# Reference only - this cell is intentionally disabled.
# One-off conversion: copies 'human_development_index.csv' line by line into
# 'fix_hdi.csv' while deleting every double-quote character.
# import csv
# import string

# input_file = open('human_development_index.csv', 'r')
# output_file = open('fix_hdi.csv', 'w')
# specials = '"'

# for line in input_file:
#     new_line = str.replace(line,specials,'')
#     output_file.write(new_line)
    
# input_file.close()
# output_file.close()

# Dados da pesquisa

In [6]:
def _read_survey_table(topic):
    """Read ``LGBT_Survey_<topic>.csv`` from the raw survey folder."""
    return pd.read_csv('Data/european-union-lgbt-survey-2012/LGBT_Survey_' + topic + '.csv')


# One DataFrame per survey file. Note that the variable
# `df_ViolenceAndHarrassment` is spelled differently from the file it reads.
df_DailyLife = _read_survey_table('DailyLife')
df_RightsAwareness = _read_survey_table('RightsAwareness')
df_Discrimination = _read_survey_table('Discrimination')
df_ViolenceAndHarrassment = _read_survey_table('ViolenceAndHarassment')
df_SubsetSize = _read_survey_table('SubsetSize')
df_TransgenderSpecificQuestions = _read_survey_table('TransgenderSpecificQuestions')

## Limpeza dos dados

- Limpeza retirada do site: https://www.kaggle.com/ruslankl/lgbt-survey-analysis
- A limpeza se dará da seguinte forma:
    1. Renomeamos a coluna CountryCode para CountryName
    2. Adicionamos o ID do país (coluna CountryID)
    3. Removemos os espaços desnecessários em notes
    4. Nas linhas em que notes é [1], substituímos percentage por NaN (not a number) e, em seguida, removemos a coluna notes dos dados
    5. Convertemos os valores de porcentagem para float

In [7]:
# Lookup table built from pycountry: country name -> two-letter (alpha_2) code.
# A comprehension keeps the iteration variable out of the notebook namespace.
countries = {entry.name: entry.alpha_2 for entry in pycountry.countries}

def clean_data(df):
    """Return a cleaned copy of one survey table; the input frame is not modified.

    Steps:
      1. Rename the ``CountryCode`` column to ``CountryName``.
      2. Add a ``CountryID`` column looked up in the module-level ``countries``
         dictionary; names missing from it get ``'Unknown code'``.
      3. Where ``notes`` is ``[1]`` (ignoring surrounding whitespace), set
         ``percentage`` to NaN, then drop the ``notes`` column.
      4. Convert ``percentage`` to float.

    A frame that has already been cleaned (no ``notes`` column) is accepted,
    so re-running the cleaning cell does not fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey table with a ``CountryCode`` (or ``CountryName``) column and a
        ``percentage`` column, optionally with ``notes``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``notes`` and with ``CountryName``, ``CountryID``
        and a float ``percentage`` column.
    """
    # `rename` returns a new frame, so every change below stays off the input.
    df = df.rename(columns={'CountryCode': 'CountryName'})

    df['CountryID'] = [countries.get(name, 'Unknown code') for name in df['CountryName']]
    # Explicit override - presumably the lookup has no entry under this exact
    # name; verify against pycountry.
    df.loc[df['CountryName'] == 'Czech Republic', 'CountryID'] = 'CZ'

    if 'notes' in df.columns:
        # `astype(str)` keeps the comparison safe when the column holds NaN.
        flagged = df['notes'].astype(str).str.strip() == '[1]'
        # `np.nan` rather than `np.NaN`: the latter alias was removed in NumPy 2.0.
        df.loc[flagged, 'percentage'] = np.nan
        df = df.drop(columns='notes')

    df['percentage'] = df['percentage'].astype('float')
    return df

In [8]:
# Run the shared cleaning step on each table that goes through it.
# df_SubsetSize is not among them and stays exactly as loaded.
# NOTE(review): every line overwrites its own input, so re-running this cell
# feeds already-cleaned frames back into clean_data.
df_DailyLife = clean_data(df_DailyLife)
df_RightsAwareness = clean_data(df_RightsAwareness)
df_Discrimination = clean_data(df_Discrimination)
df_ViolenceAndHarrassment = clean_data(df_ViolenceAndHarrassment)
df_TransgenderSpecificQuestions = clean_data(df_TransgenderSpecificQuestions)

# Preview the cleaned daily-life table.
df_DailyLife.head()

Unnamed: 0,CountryName,subset,question_code,question_label,answer,percentage,CountryID
0,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very widespread,8.0,AT
1,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly widespread,34.0,AT
2,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Fairly rare,45.0,AT
3,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Very rare,9.0,AT
4,Austria,Lesbian,b1_a,"In your opinion, how widespread is offensive l...",Don`t know,4.0,AT


## Salvando em um novo CSV

In [9]:
def save_as_csv(df, filename, output_dir='Data/cleaned-survey-data-2012/'):
    """Write ``df`` to ``<output_dir>/<filename>`` as CSV without the row index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    filename : str
        Name of the CSV file, e.g. ``'DailyLife.csv'``.
    output_dir : str, optional
        Destination folder. Defaults to the folder that was previously
        hard-coded, so existing two-argument calls write to the same place.
        The folder is created when it does not exist yet.
    """
    import os  # local import so the function does not depend on another cell

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, filename), index=False)
    
# Each table paired with the file name it is written under, in write order.
# df_SubsetSize is saved as loaded; the others are the cleaned versions.
_tables_to_save = [
    (df_DailyLife, 'DailyLife.csv'),
    (df_Discrimination, 'Discrimination.csv'),
    (df_RightsAwareness, 'RightsAwareness.csv'),
    (df_SubsetSize, 'SubsetSize.csv'),
    (df_TransgenderSpecificQuestions, 'TransgenderSpecificQuestions.csv'),
    (df_ViolenceAndHarrassment, 'ViolenceAndHarrassment.csv'),
]

for _table, _csv_name in _tables_to_save:
    save_as_csv(_table, _csv_name)