# Limpieza de datos

Limpieza de data set para Visualización de la información.

In [1]:
import pandas as pd
import geopandas as gpd
import re
import numpy as np
from warnings import WarningMessage

WarningMessage = False

In [2]:
# Dataset de Kaggle
data1 = pd.read_csv('data/15_to_30_victim_count.csv')
data2 = pd.read_csv('data/5_to_14_victim_count.csv')
data3 = pd.read_csv('data/Highest_victim_count.csv')
data4 = pd.read_csv('data/Lessthan_5_victim_count.csv')

#Mapa en geojson
mapa = gpd.read_file('https://raw.githubusercontent.com/simonepri/geo-maps/master/previews/countries-coastline.geo.json')

data = pd.concat([data1,data2,data3,data4], ignore_index=True)
data = data.rename(columns={"Proven victims":"proven_victims","Possible victims":"possible_victims" })

# Dicccionario para ISO A3 code para países
country_dict = pd.read_csv('https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv',
                           encoding = 'UTF-8')

## Limpieza de país

Genero pais 1, 2 y 3 para campos con más de un país. Se eliminan los que tienen más de 3 países para simplicidad del análisis

In [3]:
# Generar pais 1-2-3
pais = data["Country"].str.split("\r\n", n = 2, expand = True)

data['pais_1'] = pais[0]
data['pais_2'] = pais[1]
data['pais_3'] = pais[2]

cols = ['pais_1','pais_2','pais_3']

for col in cols:
    data[col] = (data[col].str.replace('United States of America','United States ').replace(' \(suspected\)',"", regex=True)
    .replace('Soviet Union','Russia').replace("United States Mexico","United States").replace(' \(claimed\)',"", regex=True).replace(" \(alleged\)'',", regex=True).replace("East Germany","Germany")
    .replace("West Germany","Germany", regex=True).replace('Austria-Hungary',"Austria").replace("^East Germany","Germany",  regex=True).replace('German Empire','Germany')
    .replace('Allied-occupied Germany',"Germany").replace('Kingdom of Romania',"Romania")
    )


# Limpieza específica de pais 2 y 3
data.pais_2 = (data.pais_2.replace(' \(alleged\)',"", regex=True).replace("\r\n.*","", regex=True).replace('\n.*',"",regex=True)
)
data.pais_3 = (data.pais_3.replace(' \(alleged\)',"", regex=True).replace("\r\n.*","", regex=True).replace('\n.*',"",regex=True)
)


## Limpieza campo año

Limpio años y divido en año de inicio y año de termino.

In [4]:
# limpio datos 
data['Years active'] = (data['Years active'].replace('late ',"", regex=True).replace("s","", regex=True).replace(' and earlier',"", regex=True)
.replace("\?","", regex=True).replace('c.',"", regex=True).replace('30 June ',"", regex=True).replace('preent',"2020", regex=True)
.replace('23 July ',"", regex=True)
)

# divido en año inicio y termino

años  = data["Years active"].str.split(" to ", expand = True)
data['año_inicio'] = pd.to_numeric(años[0],"ignore")
data['año_termino'] = pd.to_numeric(años[1], "ignore")

In [5]:
for i in range(0,len(data.año_termino)):
    if pd.isna(data.año_termino[i]):
        data.año_termino[i] = data.año_inicio[i]

data['año_termino'] = data['año_termino'].map(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.año_termino[i] = data.año_inicio[i]


## Limpieza campo "víctimas probadas" y "posibles víctimas"

Limpieza del campo `proven_victims` y `possible_victims` . Algunos campos tienen rangos, "+", "~", etc. 

* Víctimas probadas

In [6]:
data.proven_victims = (data.proven_victims.replace(".*–","", regex=True).replace("\\+","", regex=True)
)
data.proven_victims = pd.to_numeric(data.proven_victims)

* Vícitimas posibles

In [7]:
data.possible_victims = (data.possible_victims.replace(".*–","", regex=True).replace("\\+","", regex=True)
.replace("Unknown","0").replace("\\?","", regex = True).replace("-","", regex = True)
.replace("\\~","", regex=True)
)
data.possible_victims = pd.to_numeric(data.possible_victims)

# reemplazo valores nulos o 0
for i in range(0,len(data.possible_victims)):
    if pd.isna(data.possible_victims[i]) or data.possible_victims[i]== 0:
        data.possible_victims[i] = data.proven_victims[i]

data['possible_victims'] = data['possible_victims'].map(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.possible_victims[i] = data.proven_victims[i]


## Limpieza de Country para match con diccionario de países

In [8]:
country_dict = country_dict[['name','alpha-3']]
country_dict = country_dict.rename(columns={'alpha-3':'A3'})
country_dict.A3 = country_dict.A3.str.replace('"','')
country_dict.head()

Unnamed: 0,name,A3
0,Afghanistan,AFG
1,Åland Islands,ALA
2,Albania,ALB
3,Algeria,DZA
4,American Samoa,ASM


In [12]:
country_dict = (country_dict.replace('Iran \\(Islamic Republic of\\)','Iran', regex=True).replace("Russian Federation","Russia")
.replace("United States of America","United States").replace("United Kingdom of Great Britain and Northern Ireland","United Kingdom")
.replace("Venezuela \\(Bolivarian Republic of\\)","Venezuela", regex=True).replace("Korea, Republic of","South Korea")
.replace("Iraq","Iraq", regex=True)
)

In [17]:
paises = pd.merge(data.pais_3,country_dict, left_on= 'pais_3' , right_on='name', how='left')
paises = paises[pd.isna(paises['name' ] )]
paises.pais_3.value_counts()

Czechoslovakia    1
 Iran             1
Name: pais_3, dtype: int64

Perderemos 3 países que no tienen homologación y Swaziland no encontré geojson con Alpha-3.  
De país 2 perdemos Iraq, es un bug o problema de Encoding.  
De país 3 perdemos Iran, es un bug o problema de Encoding.

In [19]:
data_clean = data[['Name','pais_1','pais_2','pais_3','año_inicio','año_termino', 'proven_victims', 'proven_victims', 'Notes']]
data_clean.describe()

Unnamed: 0,año_inicio,año_termino,proven_victims,proven_victims.1
count,305.0,305.0,305.0,305.0
mean,1978.134426,1985.491803,16.537705,16.537705
std,26.488297,25.515103,16.184566,16.184566
min,1880.0,1906.0,3.0,3.0
25%,1972.0,1979.0,8.0,8.0
50%,1984.0,1992.0,12.0,12.0
75%,1996.0,2003.0,18.0,18.0
max,2016.0,2020.0,138.0,138.0


## Añadir visitas de wikipedia

https://stackoverflow.com/questions/66709281/get-page-views-of-a-wikipedia-page-using-an-api