# Limpando dados da tabela 'paises'
Padronizando os dados da tabela 'paises' e adicionando as coordenadas geográficas para a realização do heatmap.

## Importar bibliotecas

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpds
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
from geopy.geocoders import Nominatim
import folium
from folium import plugins
from folium.plugins import MarkerCluster

import warnings
warnings.filterwarnings('ignore')

## Abrir arquivo csv e avaliar dados

In [4]:
data_paises = pd.read_csv('rank_paises_6oct_2020.csv')

In [5]:
data_paises

Unnamed: 0,rank,sigla,pais,resolvidos,estudantes
0,105,SG,Singapore,322,18
1,106,BV,Bouvet Island,321,18
2,107,SB,Solomon Islands,291,1
3,108,GW,Guinea-bissau,289,5
4,109,BY,Belarus,280,15
...,...,...,...,...,...
236,237,NC,New Caledonia,0,0
237,238,MP,Northern Mariana Islands,0,0
238,239,RW,Rwanda,0,0
239,240,LC,Saint Lucia,0,0


## Usar rank como index e ordenar

In [6]:
data_paises.set_index('rank', inplace=True)
data_paises

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,SG,Singapore,322,18
106,BV,Bouvet Island,321,18
107,SB,Solomon Islands,291,1
108,GW,Guinea-bissau,289,5
109,BY,Belarus,280,15
...,...,...,...,...
237,NC,New Caledonia,0,0
238,MP,Northern Mariana Islands,0,0
239,RW,Rwanda,0,0
240,LC,Saint Lucia,0,0


In [7]:
data_paises.sort_index(inplace=True)
data_paises.head()

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,BR,Brazil,4299388,113824
2,BD,Bangladesh,1233339,32571
3,EG,Egypt,178149,6648
4,ID,Indonesia,51537,1610
5,MX,Mexico,48732,1812


## Criar atributo com sigla geo

In [8]:
def get_continent(col):
    """Resolve a country name to a (country code, continent code) pair.

    Parameters
    ----------
    col : str
        Country name to look up (the notebook passes values of the
        ``pais`` column).

    Returns
    -------
    tuple
        ``(country_code, continent_code)``. An element is the string
        ``'Unknown'`` when its lookup fails; if the country lookup fails,
        both elements are ``'Unknown'``.
    """
    unknown = 'Unknown'
    try:
        cn_a2_code = country_name_to_country_alpha2(col)
    except Exception:
        # Best effort: an unrecognised name is reported as 'Unknown' instead of
        # aborting the whole column. `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        # Without a country code there is nothing to look a continent up with.
        return (unknown, unknown)
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except Exception:
        # The country code was resolved but has no continent mapping.
        cn_continent = unknown
    return (cn_a2_code, cn_continent)

In [9]:
data_paises['code geo'] = [get_continent(row) for row in data_paises['pais']]


In [10]:
data_paises.head(30)

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BR,Brazil,4299388,113824,"(BR, SA)"
2,BD,Bangladesh,1233339,32571,"(BD, AS)"
3,EG,Egypt,178149,6648,"(EG, AF)"
4,ID,Indonesia,51537,1610,"(ID, AS)"
5,MX,Mexico,48732,1812,"(MX, NA)"
6,AR,Argentina,45329,1326,"(AR, SA)"
7,IN,India,42911,2160,"(IN, AS)"
8,CO,Colombia,37275,1298,"(CO, SA)"
9,KG,Kyrgyzstan,29795,355,"(KG, AS)"
10,US,United States,26701,1068,"(US, NA)"


## Tratar Unknown values

In [11]:
data_paises.loc[data_paises['code geo'] ==  ('Unknown', 'Unknown')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,PS,Palestinian Territories,7585,282,"(Unknown, Unknown)"
36,GB,United Kingdom (Great Britain),2459,58,"(Unknown, Unknown)"
69,VA,Vatican City (Holy See),700,15,"(Unknown, Unknown)"
73,HR,Croatia (hrvatska),633,38,"(Unknown, Unknown)"
97,DZ,Algeria (El Djazaïr),345,18,"(Unknown, Unknown)"
100,SM,San Marino (Republic of),342,1,"(Unknown, Unknown)"
108,GW,Guinea-bissau,289,5,"(Unknown, Unknown)"
133,CS,Serbia and Montenegro,173,11,"(Unknown, Unknown)"
146,TD,Chad (T'Chad),132,3,"(Unknown, Unknown)"
159,HM,Heard Island and Mcdonald Islands,96,1,"(Unknown, Unknown)"


In [12]:
data_paises.drop([236, 229, 221], inplace=True)

In [13]:
# Manual corrections for country names that get_continent could not resolve.
# Keys are the names as they appear in the 'pais' column; values are the
# replacement names used for a second lookup.
# Names with no entry here become NaN when this dict is applied with
# Series.map, and those rows are then removed by the dropna() on `unk`.
nomes = {
    'Palestinian Territories': 'Palestine',
    'United Kingdom (Great Britain)': 'United Kingdom',
    # Vatican City is relabelled as Italy; its counts are later added to Italy's row.
    'Vatican City (Holy See)': 'Italy',
    'Croatia (hrvatska)' : 'Croatia',
    'Algeria (El Djazaïr)': 'Algeria',
    'San Marino (Republic of)': 'San Marino',
    # The combined 'Serbia and Montenegro' entry is relabelled as Serbia only.
    'Serbia and Montenegro': 'Serbia',
    'Chad (T\'Chad)': 'Chad',
    'Faeroe Islands': 'Faroe Islands',
    'Congo, Republic Of' : 'Congo',
    'RÉunion':'Réunion'
}

In [14]:
# Take an independent copy of the unresolved rows: later cells assign to
# columns of `unk`, and writing into a selection of `data_paises` is ambiguous
# (pandas SettingWithCopyWarning, hidden here by the global warnings filter).
unk = data_paises.loc[data_paises['code geo'] == ('Unknown', 'Unknown')].copy()

In [15]:
unk

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,PS,Palestinian Territories,7585,282,"(Unknown, Unknown)"
36,GB,United Kingdom (Great Britain),2459,58,"(Unknown, Unknown)"
69,VA,Vatican City (Holy See),700,15,"(Unknown, Unknown)"
73,HR,Croatia (hrvatska),633,38,"(Unknown, Unknown)"
97,DZ,Algeria (El Djazaïr),345,18,"(Unknown, Unknown)"
100,SM,San Marino (Republic of),342,1,"(Unknown, Unknown)"
108,GW,Guinea-bissau,289,5,"(Unknown, Unknown)"
133,CS,Serbia and Montenegro,173,11,"(Unknown, Unknown)"
146,TD,Chad (T'Chad),132,3,"(Unknown, Unknown)"
159,HM,Heard Island and Mcdonald Islands,96,1,"(Unknown, Unknown)"


In [16]:
unk['pais'] = unk['pais'].map(nomes)

In [17]:
unk

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,PS,Palestine,7585,282,"(Unknown, Unknown)"
36,GB,United Kingdom,2459,58,"(Unknown, Unknown)"
69,VA,Italy,700,15,"(Unknown, Unknown)"
73,HR,Croatia,633,38,"(Unknown, Unknown)"
97,DZ,Algeria,345,18,"(Unknown, Unknown)"
100,SM,San Marino,342,1,"(Unknown, Unknown)"
108,GW,,289,5,"(Unknown, Unknown)"
133,CS,Serbia,173,11,"(Unknown, Unknown)"
146,TD,Chad,132,3,"(Unknown, Unknown)"
159,HM,,96,1,"(Unknown, Unknown)"


## Remover dados antigos da tabela data_paises e dados com erro da tabela unk

In [18]:
# Remove, by rank label, the rows whose country lookup failed; the ones that
# could be corrected are re-added later from `unk` via pd.concat.
# NOTE(review): the labels are hard-coded. The first ten match the unresolved
# rows shown in the output above; confirm the remaining ones
# (167, 169, 175, 201, 232, 215) against `unk.index`.
data_paises.drop([17, 36, 69, 73, 97, 100, 108, 133, 146, 159, 167, 169, 175, 201, 232, 215], inplace=True)

In [19]:
unk = unk.dropna()

In [20]:
unk

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,PS,Palestine,7585,282,"(Unknown, Unknown)"
36,GB,United Kingdom,2459,58,"(Unknown, Unknown)"
69,VA,Italy,700,15,"(Unknown, Unknown)"
73,HR,Croatia,633,38,"(Unknown, Unknown)"
97,DZ,Algeria,345,18,"(Unknown, Unknown)"
100,SM,San Marino,342,1,"(Unknown, Unknown)"
133,CS,Serbia,173,11,"(Unknown, Unknown)"
146,TD,Chad,132,3,"(Unknown, Unknown)"
167,FO,Faroe Islands,81,1,"(Unknown, Unknown)"
169,CG,Congo,75,2,"(Unknown, Unknown)"


In [21]:
data_paises.loc[data_paises['code geo'] ==  ('Unknown', 'Unknown')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


## Adicionar code geo aos países da tabela unk

In [22]:
unk['code geo'] = [get_continent(row) for row in unk['pais']]

In [23]:
unk

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17,PS,Palestine,7585,282,"(PS, AS)"
36,GB,United Kingdom,2459,58,"(GB, EU)"
69,VA,Italy,700,15,"(IT, EU)"
73,HR,Croatia,633,38,"(HR, EU)"
97,DZ,Algeria,345,18,"(DZ, AF)"
100,SM,San Marino,342,1,"(SM, EU)"
133,CS,Serbia,173,11,"(RS, EU)"
146,TD,Chad,132,3,"(TD, AF)"
167,FO,Faroe Islands,81,1,"(FO, EU)"
169,CG,Congo,75,2,"(CG, AF)"


## Unir as tabelas

In [24]:
unir = [data_paises, unk]

In [25]:
data_paises_gcode = pd.concat(unir)

In [26]:
data_paises_gcode

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BR,Brazil,4299388,113824,"(BR, SA)"
2,BD,Bangladesh,1233339,32571,"(BD, AS)"
3,EG,Egypt,178149,6648,"(EG, AF)"
4,ID,Indonesia,51537,1610,"(ID, AS)"
5,MX,Mexico,48732,1812,"(MX, NA)"
...,...,...,...,...,...
133,CS,Serbia,173,11,"(RS, EU)"
146,TD,Chad,132,3,"(TD, AF)"
167,FO,Faroe Islands,81,1,"(FO, EU)"
169,CG,Congo,75,2,"(CG, AF)"


## Remover valores duplicados

In [27]:
data_paises_gcode.loc[data_paises_gcode['code geo'].duplicated() == True]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36,GB,United Kingdom,2459,58,"(GB, EU)"
69,VA,Italy,700,15,"(IT, EU)"


In [28]:
data_paises_gcode.loc[data_paises_gcode['code geo'] ==   ('GB', 'EU')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
66,UK,Great Britain,765,23,"(GB, EU)"
36,GB,United Kingdom,2459,58,"(GB, EU)"


In [29]:
data_paises_gcode.loc[data_paises_gcode['code geo'] ==   ('IT', 'EU')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58,IT,Italy,869,36,"(IT, EU)"
69,VA,Italy,700,15,"(IT, EU)"


In [30]:
italia_novo = 15+ 36
italia_novo

51

In [31]:
# Fold the Vatican City row (rank 69, relabelled 'Italy') into Italy (rank 58).
# Values are the sums of the two rows shown above:
#   resolvidos: 869 + 700 = 1569
#   estudantes:  36 +  15 = 51
data_paises_gcode.at[58, 'resolvidos'] = 1569
data_paises_gcode.at[58, 'estudantes'] = 51

In [32]:
data_paises_gcode.drop([69], inplace=True)

In [33]:
data_paises_gcode.loc[data_paises_gcode['code geo'] ==   ('IT', 'EU')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58,IT,Italy,1569,51,"(IT, EU)"


In [34]:
# Fold the 'Great Britain' row (rank 66) into 'United Kingdom' (rank 36).
# Values are the sums of the two rows shown above:
#   resolvidos: 2459 + 765 = 3224
#   estudantes:   58 +  23 = 81
data_paises_gcode.at[36, 'resolvidos'] = 3224
data_paises_gcode.at[36, 'estudantes'] = 81
# The merged row takes the sigla 'UK' from the rank-66 row (it was 'GB').
data_paises_gcode.at[36, 'sigla'] = 'UK'

In [35]:
data_paises_gcode.drop([66], inplace=True)

In [36]:
data_paises_gcode.loc[data_paises_gcode['code geo'] ==   ('GB', 'EU')]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
36,UK,United Kingdom,3224,81,"(GB, EU)"


In [37]:
data_paises_gcode.loc[data_paises_gcode['code geo'].duplicated() == True]

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [38]:
# Assign through a single indexer. The chained form df['pais'][51] = ... writes
# to an intermediate Series and is not guaranteed to update the DataFrame.
data_paises_gcode.at[51, 'pais'] = 'United States Virgin Islands'

## Sort()

In [39]:
data_paises_gcode.sort_index(inplace=True)

## Primeira etapa de tratamento finalizada

In [40]:
data_paises_gcode

Unnamed: 0_level_0,sigla,pais,resolvidos,estudantes,code geo
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BR,Brazil,4299388,113824,"(BR, SA)"
2,BD,Bangladesh,1233339,32571,"(BD, AS)"
3,EG,Egypt,178149,6648,"(EG, AF)"
4,ID,Indonesia,51537,1610,"(ID, AS)"
5,MX,Mexico,48732,1812,"(MX, NA)"
...,...,...,...,...,...
237,NC,New Caledonia,0,0,"(NC, OC)"
238,MP,Northern Mariana Islands,0,0,"(MP, OC)"
239,RW,Rwanda,0,0,"(RW, AF)"
240,LC,Saint Lucia,0,0,"(LC, NA)"


##  Coletando Longitude e Latitude

In [41]:
# NOTE(review): Nominatim's usage policy asks for a user agent that identifies
# the application; 'test' is a placeholder — confirm and replace if needed.
geolocator = Nominatim(user_agent='test')
def geolocate(country):
    """Look up coordinates for a country name through the Nominatim geocoder.

    Parameters
    ----------
    country : str
        Query string passed to ``geolocator.geocode``.

    Returns
    -------
    tuple or float
        ``(latitude, longitude)`` of the location returned by the geocoder,
        or ``np.nan`` when the query has no match or the request fails.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Best effort: a failed request is reported as a missing value instead
        # of aborting the whole column. `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return np.nan
    if loc is None:
        # geocode() returns None when nothing matches the query.
        return np.nan
    return (loc.latitude, loc.longitude)

# Separando país de continente

In [42]:
data_paises_gcode['Country'] = [x[0] for x in data_paises_gcode['code geo']]

In [43]:
data_paises_gcode['Continent'] = [x[1] for x in data_paises_gcode['code geo']]

In [44]:
data_paises_gcode.drop(['code geo', 'sigla'], axis=1, inplace=True)

In [45]:
data_paises_gcode

Unnamed: 0_level_0,pais,resolvidos,estudantes,Country,Continent
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Brazil,4299388,113824,BR,SA
2,Bangladesh,1233339,32571,BD,AS
3,Egypt,178149,6648,EG,AF
4,Indonesia,51537,1610,ID,AS
5,Mexico,48732,1812,MX,
...,...,...,...,...,...
237,New Caledonia,0,0,NC,OC
238,Northern Mariana Islands,0,0,MP,OC
239,Rwanda,0,0,RW,AF
240,Saint Lucia,0,0,LC,


# Aplicando método geolocate para coletar a lat/long

In [46]:
# Geocode every country name. Each geolocate() call issues one
# geolocator.geocode() lookup, so this cell performs one lookup per row.
# Entries are (latitude, longitude) tuples, or np.nan where the lookup failed.
data_paises_gcode['Geolocate'] = [geolocate(row) for row in data_paises_gcode['pais']]

In [47]:
data_paises_gcode

Unnamed: 0_level_0,pais,resolvidos,estudantes,Country,Continent,Geolocate
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Brazil,4299388,113824,BR,SA,"(-10.3333333, -53.2)"
2,Bangladesh,1233339,32571,BD,AS,"(24.4768783, 90.2932426)"
3,Egypt,178149,6648,EG,AF,"(26.2540493, 29.2675469)"
4,Indonesia,51537,1610,ID,AS,"(-2.4833826, 117.8902853)"
5,Mexico,48732,1812,MX,,"(19.4326296, -99.1331785)"
...,...,...,...,...,...,...
237,New Caledonia,0,0,NC,OC,"(-20.454288599999998, 164.55660583077983)"
238,Northern Mariana Islands,0,0,MP,OC,"(14.149020499999999, 145.21345248318923)"
239,Rwanda,0,0,RW,AF,"(-1.9646631, 30.0644358)"
240,Saint Lucia,0,0,LC,,"(13.8250489, -60.975036)"


# Separando lat/long

In [48]:
# geolocate() yields np.nan (a float, not a tuple) for names it could not
# resolve. Indexing such a value would raise TypeError, so those rows get NaN
# for both coordinates instead.
data_paises_gcode['Latitude'] = [x[0] if isinstance(x, tuple) else np.nan for x in data_paises_gcode['Geolocate']]
data_paises_gcode['Longitude'] = [x[1] if isinstance(x, tuple) else np.nan for x in data_paises_gcode['Geolocate']]

In [49]:
data_paises_gcode.drop(['Geolocate'], axis=1, inplace=True)

In [50]:
data_paises_gcode.head()

Unnamed: 0_level_0,pais,resolvidos,estudantes,Country,Continent,Latitude,Longitude
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Brazil,4299388,113824,BR,SA,-10.333333,-53.2
2,Bangladesh,1233339,32571,BD,AS,24.476878,90.293243
3,Egypt,178149,6648,EG,AF,26.254049,29.267547
4,Indonesia,51537,1610,ID,AS,-2.483383,117.890285
5,Mexico,48732,1812,MX,,19.43263,-99.133178


In [51]:
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)

## Gerando mapa CircleMarker - Distribuição de Países

In [52]:
# Add one circle marker per country to the cluster layer.
# The needed columns are iterated in lockstep, which avoids building a full
# row Series with .iloc[i] for every single field.
radius = 20
for lat, long, country_name, students in zip(data_paises_gcode['Latitude'],
                                             data_paises_gcode['Longitude'],
                                             data_paises_gcode['pais'],
                                             data_paises_gcode['estudantes']):
    if pd.isna(lat) or pd.isna(long):
        # A marker cannot be placed without coordinates; skip rows that were
        # not geocoded.
        continue
    # Same popup text as before, with the embedded whitespace written
    # explicitly (it collapses when rendered as HTML).
    popup_text = "-Pais : {} </br>\n        \n                    </br>-Alunos : {}".format(country_name, students)
    folium.CircleMarker(location=[lat, long], radius=radius, popup=popup_text, fill=True).add_to(marker_cluster)

In [53]:
world_map

## Heatmap - Distribuição de Países

In [54]:
# Keep only rows that have coordinates: NaN points are not valid heat map input.
stationArr = data_paises_gcode[['Latitude', 'Longitude']].dropna()
# Pass plain [lat, lng] pairs, and attach the layer with add_child
# (add_children is the deprecated spelling of the same call).
# Each country contributes one unweighted point.
world_map.add_child(plugins.HeatMap(stationArr.values.tolist(), radius=15))

## Versão por quantidade de alunos (em teste)

In [55]:
#world_map_aluno= folium.Map(tiles="cartodbpositron")
#marker_cluster_aluno = MarkerCluster().add_to(world_map_aluno)

In [57]:
#for i in range(1, len(data_paises_gcode)):
#    for x in range(data_paises_gcode['estudantes'][i]):
#        lat = data_paises_gcode.iloc[i]['Latitude']
#        long = data_paises_gcode.iloc[i]['Longitude']
#        radius=20
#        popup_text = "-Pais : {}"
#        popup_text = popup_text.format(data_paises_gcode.iloc[i]['pais']
#                                   )
#        folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster_aluno)

In [None]:
#world_map_aluno

In [None]:
#stationArr_aluno = data_paises_gcode[['Latitude', 'Longitude']]
#world_map_aluno.add_children(plugins.HeatMap(stationArr_aluno, radius=15))