# Laboratorio - Joins con Pandas

Las operaciones de tipo merge aparecen frecuentemente cuando debemos combinar datos de diversas fuentes. 

En este ejemplo vamos a ver datos de población y área por estado en EE. UU. y vamos a crear un ranking de los estados por su densidad de población total en el año 2010.


In [101]:
# Build one DataFrame per input file: state-population.csv, state-areas.csv
# and state-abbrevs.csv (read in that order).
import pandas as pd

statepopulation, stateareas, stateabbrevs = map(
    pd.read_csv,
    (
        '../Data/state-population.csv',
        '../Data/state-areas.csv',
        '../Data/state-abbrevs.csv',
    ),
)

In [102]:
from IPython.display import display

# Show the first row of each table to get a feel for its columns.
for frame in (statepopulation, stateareas, stateabbrevs):
    display(frame.head(1))

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0


Unnamed: 0,state,area (sq. mi)
0,Alabama,52423


Unnamed: 0,state,abbreviation
0,Alabama,AL


In [103]:
# Join the population table with the abbreviations table so that every row
# carries the full state name. A left join keeps every population row, even
# when its 'state/region' code has no matching abbreviation.
popu = statepopulation.merge(
    stateabbrevs,
    how='left',
    left_on='state/region',
    right_on='abbreviation',
)
popu.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [104]:
# Drop the two abbreviation columns: after the join they duplicate the
# information already held in 'state'.
redundant_columns = ['state/region', 'abbreviation']
po = popu.drop(columns=redundant_columns)
po.head()

Unnamed: 0,ages,year,population,state
0,under18,2012,1117489.0,Alabama
1,total,2012,4817528.0,Alabama
2,under18,2010,1130966.0,Alabama
3,total,2010,4785570.0,Alabama
4,under18,2011,1125763.0,Alabama


In [105]:
# Check for null values: build the boolean null mask and preview its first rows.
null_mask = po.isnull()
null_mask.head()

Unnamed: 0,ages,year,population,state
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [106]:
# Investigate the null values: do the missing data follow a pattern?
# Keep only the rows that have a null in at least one column.
has_null = po.isnull().any(axis=1)
po[has_null].head()

Unnamed: 0,ages,year,population,state
2448,under18,1990,,
2449,total,1990,,
2450,total,1991,,
2451,under18,1991,,
2452,total,1993,,


In [107]:
# What happens with Puerto Rico? The goal is to fix the table so the state
# name is present even when the abbreviation lookup fails.
# Redo the left join and list the distinct 'state/region' codes found on
# rows that contain at least one null value.
tmp = statepopulation.merge(
    stateabbrevs,
    how='left',
    left_on='state/region',
    right_on='abbreviation',
)
incomplete_rows = tmp.isnull().any(axis=1)
tmp.loc[incomplete_rows, 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

In [108]:
# Add the missing Puerto Rico entry to the abbreviations table so that the
# 'PR' rows of the population table can be matched to a state name.
pr = pd.DataFrame({'state': ['Puerto Rico'], 'abbreviation': ['PR']})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
# ignore_index=True renumbers the rows so the new entry gets the next index.
newstateabbrevs = pd.concat([stateabbrevs, pr], ignore_index=True)
newstateabbrevs.tail()

Unnamed: 0,abbreviation,state
47,WA,Washington
48,WV,West Virginia
49,WI,Wisconsin
50,WY,Wyoming
51,PR,Puerto Rico


In [109]:
# Rebuild the population table using the extended abbreviations table, then
# drop the two abbreviation columns, which are redundant once 'state' exists.
# Rows with missing values are deliberately kept at this stage.
popu = (
    statepopulation
    .merge(
        newstateabbrevs,
        how='left',
        left_on='state/region',
        right_on='abbreviation',
    )
    .drop(columns=['state/region', 'abbreviation'])
)
popu

Unnamed: 0,ages,year,population,state
0,under18,2012,1117489.0,Alabama
1,total,2012,4817528.0,Alabama
2,under18,2010,1130966.0,Alabama
3,total,2010,4785570.0,Alabama
4,under18,2011,1125763.0,Alabama
5,total,2011,4801627.0,Alabama
6,total,2009,4757938.0,Alabama
7,under18,2009,1134192.0,Alabama
8,under18,2013,1111481.0,Alabama
9,total,2013,4833722.0,Alabama


In [115]:
# Next step: join the population/abbreviation table with the areas table.
# First inspect the Puerto Rico rows of the population table and the layout
# of the areas table.
is_puerto_rico = popu['state'] == 'Puerto Rico'
display(popu.loc[is_puerto_rico])
display(stateareas.head(1))

Unnamed: 0,ages,year,population,state
2448,under18,1990,,Puerto Rico
2449,total,1990,,Puerto Rico
2450,total,1991,,Puerto Rico
2451,under18,1991,,Puerto Rico
2452,total,1993,,Puerto Rico
2453,under18,1993,,Puerto Rico
2454,under18,1992,,Puerto Rico
2455,total,1992,,Puerto Rico
2456,under18,1994,,Puerto Rico
2457,total,1994,,Puerto Rico


Unnamed: 0,state,area (sq. mi)
0,Alabama,52423


In [111]:
# Attach the area of each state to the population rows.
# The join key is spelled out: without `on`, merge joins on every column the
# two frames happen to share, so an extra shared column would silently
# change the key. A left join keeps every population row.
popu_areas = pd.merge(popu, stateareas, on='state', how='left')
display(stateareas.head())
display(popu_areas.head())

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


Unnamed: 0,ages,year,population,state,area (sq. mi)
0,under18,2012,1117489.0,Alabama,52423.0
1,total,2012,4817528.0,Alabama,52423.0
2,under18,2010,1130966.0,Alabama,52423.0
3,total,2010,4785570.0,Alabama,52423.0
4,under18,2011,1125763.0,Alabama,52423.0


In [112]:
# Show every row of the joined table that has a null in at least one column.
rows_missing_data = popu_areas.isnull().any(axis=1)
popu_areas[rows_missing_data]

Unnamed: 0,ages,year,population,state,area (sq. mi)
2448,under18,1990,,Puerto Rico,3515.0
2449,total,1990,,Puerto Rico,3515.0
2450,total,1991,,Puerto Rico,3515.0
2451,under18,1991,,Puerto Rico,3515.0
2452,total,1993,,Puerto Rico,3515.0
2453,under18,1993,,Puerto Rico,3515.0
2454,under18,1992,,Puerto Rico,3515.0
2455,total,1992,,Puerto Rico,3515.0
2456,under18,1994,,Puerto Rico,3515.0
2457,total,1994,,Puerto Rico,3515.0


In [116]:
# Identificar los datos faltantes
# Falta el dato de población de Puerto Rico de 1990 a 1999, y el nombre de estado para la región 'USA'.

In [119]:
# Discard the incomplete data, where appropriate, with dropna: every row
# holding at least one null value is removed.
popu_areas = popu_areas.dropna(how='any')
# Look at a random sample of five of the remaining rows.
popu_areas.sample(n=5)
# Optional check of the Puerto Rico rows that survive:
# display(popu_areas.loc[popu_areas['state'] == 'Puerto Rico'])

Unnamed: 0,ages,year,population,state,area (sq. mi)
1275,under18,1999,231133.0,Montana,147046.0
2395,under18,2010,1336094.0,Wisconsin,65503.0
1044,total,2007,6431559.0,Massachusetts,10555.0
1886,under18,1998,241760.0,Rhode Island,1545.0
1617,under18,2006,2166393.0,North Carolina,53821.0


In [129]:
# Filter the table down to year 2010 and the relevant population type
# (the 'total' age group).
is_2010 = popu_areas['year'] == 2010
is_total = popu_areas['ages'] == 'total'
popu_areas_2010 = popu_areas[is_2010 & is_total]
popu_areas_2010

Unnamed: 0,ages,year,population,state,area (sq. mi)
3,total,2010,4785570.0,Alabama,52423.0
91,total,2010,713868.0,Alaska,656425.0
101,total,2010,6408790.0,Arizona,114006.0
189,total,2010,2922280.0,Arkansas,53182.0
197,total,2010,37333601.0,California,163707.0
283,total,2010,5048196.0,Colorado,104100.0
293,total,2010,3579210.0,Connecticut,5544.0
379,total,2010,899711.0,Delaware,1954.0
389,total,2010,605125.0,District of Columbia,68.0
475,total,2010,18846054.0,Florida,65758.0


In [148]:
# Calcular la densidad de población y generar el ranking
def calc_pop_density(row):
    """Return a copy of *row* extended with a ``density`` entry.

    Written for row-wise use through ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Record supporting item access and ``.copy()`` (for example a
            pandas Series) that provides the ``'population'`` and
            ``'area (sq. mi)'`` entries.

    Returns:
        A copy of ``row`` whose ``'density'`` entry is the population
        divided by the area. The object passed in is left untouched.
    """
    # Work on a copy: pandas documents that functions handed to ``apply``
    # must not mutate the object they receive.
    result = row.copy()
    result['density'] = row['population'] / row['area (sq. mi)']
    return result

%matplotlib inline
# Compute the population density (population / area) and build the ranking.
# The division is done on whole columns rather than row by row through
# apply(axis=1): same values, without a Python-level call per row.
popu_areas_2010 = (
    popu_areas_2010
    .assign(
        density=popu_areas_2010['population'] / popu_areas_2010['area (sq. mi)']
    )
    # Densest states first.
    .sort_values('density', ascending=False)
)
popu_areas_2010
# Optional horizontal bar chart of the ranking:
# popu_areas_2010[['state', 'density']].plot(kind='barh', figsize=(10, 20))

Unnamed: 0,ages,year,population,state,area (sq. mi),density
389,total,2010,605125.0,District of Columbia,68.0,8898.897059
2490,total,2010,3721208.0,Puerto Rico,3515.0,1058.665149
1445,total,2010,8802707.0,New Jersey,8722.0,1009.253268
1914,total,2010,1052669.0,Rhode Island,1545.0,681.339159
293,total,2010,3579210.0,Connecticut,5544.0,645.600649
1050,total,2010,6563263.0,Massachusetts,10555.0,621.815538
965,total,2010,5787193.0,Maryland,12407.0,466.445797
379,total,2010,899711.0,Delaware,1954.0,460.445752
1541,total,2010,19398228.0,New York,54475.0,356.094135
475,total,2010,18846054.0,Florida,65758.0,286.597129
