In [1]:
# importacion general de librerias y de visualizacion (matplotlib y seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random as rd
%matplotlib inline

plt.style.use('default') # make the matplotlib plots look a bit nicer
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # set the grid style used by seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in displayed outputs

import warnings
# Blanket filter: every warning (including deprecation notices) is hidden
# for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
from scipy.stats.mstats import winsorize

## Carga de datos

Se cargan las columnas con formatos adecuados según el tipo de datos.  
Se usa building_id como índice para train_values, train_labels y test_values  

In [3]:
# Train values
# Load the per-building (house/building) feature table.
# Only the header is read first, so that dtypes can be assigned by column position.
df = pd.read_csv('datos/train_values.csv', nrows=0)
columnas = df.columns.to_list()
datatypes = {
    columnas[0]: np.int32,
    **{column: np.int16 for column in columnas[1:8] + columnas[27:28]},
    **{column: 'category' for column in columnas[8:15] + columnas[26:27]},
    # Builtin `bool` rather than `np.bool`: that alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    **{column: bool for column in columnas[15:26] + columnas[28:39]},
}
train_values = pd.read_csv('datos/train_values.csv', dtype=datatypes)
train_values = train_values.set_index("building_id")
# From here on `columnas` lists the feature columns only (the index is excluded).
columnas = train_values.columns.to_list()

In [4]:
# Train labels, indexed by building_id (provides the damage_grade column joined below).
train_labels = pd.read_csv(
    'datos/train_labels.csv',
    index_col='building_id',
)

In [5]:
# Test features, indexed by building_id; dtypes are left to pandas' inference here.
test_values = pd.read_csv(
    'datos/test_values.csv',
    index_col='building_id',
)

## Mean encoding geo_level_1_id

In [6]:
# One row per training building: its geo_level_1_id, its label (joined on the
# building_id index) and a boolean flag that is True when damage_grade == 3.
df = (
    train_values[['geo_level_1_id']]
    .join(train_labels, on='building_id', how='inner')
    .assign(count_high_damage=lambda joined: joined['damage_grade'] == 3)
)
df

Unnamed: 0_level_0,geo_level_1_id,damage_grade,count_high_damage
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
802906,6,3,True
28830,8,2,False
94947,21,3,True
590882,22,2,False
201944,11,3,True
...,...,...,...
688636,25,2,False
669485,17,3,True
602512,17,3,True
151409,26,2,False


In [7]:
# Fraction of grade-3 buildings per geo_level_1_id (mean of the boolean flag),
# kept as a one-column DataFrame so it can be joined onto other frames.
mean_damage = df.groupby('geo_level_1_id')[['count_high_damage']].mean()
mean_damage

Unnamed: 0_level_0,count_high_damage
geo_level_1_id,Unnamed: 1_level_1
0,0.15
1,0.11
2,0.25
3,0.36
4,0.2
5,0.09
6,0.25
7,0.35
8,0.52
9,0.17


In [8]:
# Independent (deep) copy of the test features, so test_values itself is never modified.
df2 = test_values.copy(deep=True)

In [9]:
# Display the copied test feature frame.
df2

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,t,r,n,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310028,4,605,3623,3,70,20,6,t,r,q,...,1,0,0,0,0,0,0,0,0,0
663567,10,1407,11907,3,25,6,7,n,r,n,...,0,0,0,0,0,0,0,0,0,0
1049160,22,1136,7712,1,50,3,3,t,r,n,...,0,0,0,0,0,0,0,0,0,0
442785,6,1041,912,2,5,9,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the encoding on the test set: attach to each row the high-damage rate
# computed on the training data for its geo_level_1_id, then show 20 rows.
# random_state pins the sample so a Restart & Run All displays the same rows.
# NOTE(review): how='inner' drops test rows whose geo_level_1_id does not appear
# in mean_damage -- confirm this is intended if the join is reused to build features.
(
    df2[['geo_level_1_id']]
    .join(mean_damage, on='geo_level_1_id', how='inner')
    .sample(20, random_state=42)
)

Unnamed: 0_level_0,geo_level_1_id,count_high_damage
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1
27369,4,0.2
897669,20,0.12
178409,4,0.2
509005,21,0.58
830693,17,0.81
682779,6,0.25
137776,26,0.09
217577,11,0.38
163491,21,0.58
131509,22,0.13


In [11]:
# Row-aligned group means: each remaining column is replaced by the mean of the
# row's geo_level_1_id group (display only, nothing is assigned).
df.groupby(by='geo_level_1_id').transform(func='mean')

Unnamed: 0_level_0,damage_grade,count_high_damage
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1
802906,2.16,0.25
28830,2.49,0.52
94947,2.56,0.58
590882,2.00,0.13
201944,2.34,0.38
...,...,...
688636,2.05,0.14
669485,2.79,0.81
602512,2.79,0.81
151409,1.73,0.09


In [12]:
# Mean-encode geo_level_1_id on the training frame: every row receives the
# high-damage rate of its own geo_level_1_id group.
df['mean_high_damage'] = df.groupby('geo_level_1_id')['count_high_damage'].transform('mean')

In [13]:
# Display the training frame, now including the mean_high_damage column.
df

Unnamed: 0_level_0,geo_level_1_id,damage_grade,count_high_damage,mean_high_damage
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
802906,6,3,True,0.25
28830,8,2,False,0.52
94947,21,3,True,0.58
590882,22,2,False,0.13
201944,11,3,True,0.38
...,...,...,...,...
688636,25,2,False,0.14
669485,17,3,True,0.81
602512,17,3,True,0.81
151409,26,2,False,0.09
