# Modelación Supervisada

## Importar Librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score

## Importar Tabal de Datos Limpia y Reducida

In [2]:
data = pd.read_csv('real_estate.csv') 

## Estructura de Datos

In [3]:
data.head()

Unnamed: 0,d_created_on,c_lat,c_lon,c_price,c_surface_covered_in_m2,v_estado,v_anio_2015,v_anio_2016,v_mes_2,v_mes_3,...,descr_comedor,descr_id,descr_nocnok,descr_recamaras,descr_sala,descr_sistema,v_region_norte,v_region_sur,v_tamanio_grande,v_tamanio_mediano
0,2014-01-02,19.497295,-96.87925,750000.0,90.0,Veracruz de Ignacio de la Llave,0,0,0,0,...,1,0,0,2,1,0,0,0,0,1
1,2014-01-02,19.200068,-96.170279,140000.0,174.5,Veracruz de Ignacio de la Llave,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,2014-01-02,19.497295,-96.87925,440000.0,60.0,Veracruz de Ignacio de la Llave,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
3,2014-01-02,19.497295,-96.87925,2950000.0,177.0,Veracruz de Ignacio de la Llave,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
4,2014-01-02,19.497295,-96.87925,750000.0,70.0,Veracruz de Ignacio de la Llave,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1


## Enriquecimiento de Tabla

In [4]:
estados = list(data['v_estado'].value_counts().index.sort_values())

In [5]:
# datos de superficie por inegi
superficie = [5616,
71450,
73909,
57507,
73311,
247455,
151562,
5627,
1495,
123317,
22351,
30608,
63596,
20813,
78588,
58599,
4879,
27857,
64156,
93757,
34306,
11699,
44705,
61137,
58200,
179355,
24731,
80249,
4016,
71826,
39524,
75284]
# diccionario de superficie
sup_estado = dict(zip(estados, superficie))

In [6]:
# Población
poblacion = [1312544,
3315766,
712029,
899931,
5217908,
3556574,
2954915,
711235,
8918653,
1754754,
16187608,
5853677,
3533251,
2858359,
7844830,
4584471,
1903811,
1181050,
5119504,
3967889,
6168883,
2038372,
1501562,
2717820,
2966321,
2850330,
2395272,
3441698,
1272847,
8112505,
2097175,
1579209]
# diccionario de población
pob_estado = dict(zip(estados, poblacion))

In [7]:
# tasa hipotecaria promedio por año datos Banxico
tasa_hipotecaria = {2013: 0.119145454545455, 2014: 0.107691666666667,
                   2015: 0.103133333333333, 2016: 0.102191666666667}

In [8]:
## nuevas variables
# población
data['c_poblacion'] = data['v_estado'].apply(lambda x: pob_estado[x])
# superficie
data['c_superficie'] = data['v_estado'].apply(lambda x: sup_estado[x])
# densidad poblacional
data['c_densidad'] = data['c_poblacion'] / data['c_superficie']
# tasa hipotecaria
data['c_tasa'] = pd.to_datetime(data['d_created_on']).apply(lambda x: tasa_hipotecaria[x.year])
# dummy estados
data = pd.get_dummies(data, columns = ['v_estado'], drop_first = True)

In [9]:
data.columns

Index(['d_created_on', 'c_lat', 'c_lon', 'c_price', 'c_surface_covered_in_m2',
       'v_anio_2015', 'v_anio_2016', 'v_mes_2', 'v_mes_3', 'v_mes_4',
       'v_mes_5', 'v_mes_6', 'v_mes_7', 'v_mes_8', 'v_mes_9', 'v_mes_10',
       'v_mes_11', 'v_mes_12', 'v_dia_semama_Jueves', 'v_dia_semama_Lunes',
       'v_dia_semama_Martes', 'v_dia_semama_Miercoles', 'v_dia_semama_Sábado',
       'v_dia_semama_Viernes', 'v_operation_sell', 'v_property_type_house',
       'v_property_type_store', 'v_conteo_geo', 'title_casa', 'title_centro',
       'title_departamento', 'title_fracc', 'title_local', 'title_lomas',
       'title_renta', 'title_residencial', 'title_san', 'title_venta',
       'descr_baao', 'descr_br', 'descr_casa', 'descr_cocina', 'descr_comedor',
       'descr_id', 'descr_nocnok', 'descr_recamaras', 'descr_sala',
       'descr_sistema', 'v_region_norte', 'v_region_sur', 'v_tamanio_grande',
       'v_tamanio_mediano', 'c_poblacion', 'c_superficie', 'c_densidad',
       'c_tasa', 'v_esta

In [10]:
columnas = ['c_lat', 'c_lon', 'c_surface_covered_in_m2',
            'c_poblacion', 'c_superficie', 'c_densidad','c_tasa', 'v_conteo_geo',
       'v_anio_2015', 'v_anio_2016', 'v_mes_2', 'v_mes_3', 'v_mes_4',
       'v_mes_5', 'v_mes_6', 'v_mes_7', 'v_mes_8', 'v_mes_9', 'v_mes_10',
       'v_mes_11', 'v_mes_12', 'v_dia_semama_Jueves', 'v_dia_semama_Lunes',
       'v_dia_semama_Martes', 'v_dia_semama_Miercoles', 'v_dia_semama_Sábado',
       'v_dia_semama_Viernes', 'v_operation_sell', 'v_property_type_house',
       'v_property_type_store', 'title_casa', 'title_centro',
       'title_departamento', 'title_fracc', 'title_local', 'title_lomas',
       'title_renta', 'title_residencial', 'title_san', 'title_venta',
       'descr_baao', 'descr_br', 'descr_casa', 'descr_cocina', 'descr_comedor',
       'descr_id', 'descr_nocnok', 'descr_recamaras', 'descr_sala',
       'descr_sistema', 'v_region_norte', 'v_region_sur', 'v_tamanio_grande',
       'v_tamanio_mediano', 'v_estado_Baja California', 'v_estado_Baja California Sur',
       'v_estado_Campeche', 'v_estado_Chiapas', 'v_estado_Chihuahua',
       'v_estado_Coahuila de Zaragoza', 'v_estado_Colima',
       'v_estado_Distrito Federal', 'v_estado_Durango',
       'v_estado_Estado de MÃ©xico', 'v_estado_Guanajuato',
       'v_estado_Guerrero', 'v_estado_Hidalgo', 'v_estado_Jalisco',
       'v_estado_Morelos', 'v_estado_Nayarit', 'v_estado_Nuevo LeÃ³n',
       'v_estado_Oaxaca', 'v_estado_Puebla', 'v_estado_QuerÃ©taro',
       'v_estado_Quintana Roo', 'v_estado_San Luis PotosÃ­',
       'v_estado_Sinaloa', 'v_estado_Sonora', 'v_estado_Tabasco',
       'v_estado_Tamaulipas', 'v_estado_Tlaxcala',
       'v_estado_Veracruz de Ignacio de la Llave', 'v_estado_YucatÃ¡n',
       'v_estado_Zacatecas']

In [11]:
# información de los datos
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125684 entries, 0 to 125683
Data columns (total 86 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   d_created_on                              125684 non-null  object 
 1   c_lat                                     125684 non-null  float64
 2   c_lon                                     125684 non-null  float64
 3   c_price                                   125684 non-null  float64
 4   c_surface_covered_in_m2                   125684 non-null  float64
 5   v_anio_2015                               125684 non-null  int64  
 6   v_anio_2016                               125684 non-null  int64  
 7   v_mes_2                                   125684 non-null  int64  
 8   v_mes_3                                   125684 non-null  int64  
 9   v_mes_4                                   125684 non-null  int64  
 10  v_mes_5             

## División Cojunto de Datos

In [12]:
X = data.loc[:, columnas]
y = data.loc[:, 'c_price']

## y reshape
y = np.array(y).reshape(-1, 1)

In [13]:
## train test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
X_train.reset_index(drop = True, inplace = True)
X_test.reset_index(drop = True, inplace = True)

In [15]:
X_train.head()

Unnamed: 0,c_lat,c_lon,c_surface_covered_in_m2,c_poblacion,c_superficie,c_densidad,c_tasa,v_conteo_geo,v_anio_2015,v_anio_2016,...,v_estado_Quintana Roo,v_estado_San Luis PotosÃ­,v_estado_Sinaloa,v_estado_Sonora,v_estado_Tabasco,v_estado_Tamaulipas,v_estado_Tlaxcala,v_estado_Veracruz de Ignacio de la Llave,v_estado_YucatÃ¡n,v_estado_Zacatecas
0,20.666897,-103.372797,78.0,7844830,78588,99.822237,0.102192,1.0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,18.957352,-99.260544,317.0,4584471,58599,78.234629,0.102192,973.0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,25.706009,-100.227507,250.0,1181050,27857,42.396884,0.102192,1.0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,25.600464,-103.421458,180.0,1272847,4016,316.943974,0.103133,1.0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,18.957352,-99.260544,380.0,4584471,58599,78.234629,0.102192,973.0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train.columns

Index(['c_lat', 'c_lon', 'c_surface_covered_in_m2', 'c_poblacion',
       'c_superficie', 'c_densidad', 'c_tasa', 'v_conteo_geo', 'v_anio_2015',
       'v_anio_2016', 'v_mes_2', 'v_mes_3', 'v_mes_4', 'v_mes_5', 'v_mes_6',
       'v_mes_7', 'v_mes_8', 'v_mes_9', 'v_mes_10', 'v_mes_11', 'v_mes_12',
       'v_dia_semama_Jueves', 'v_dia_semama_Lunes', 'v_dia_semama_Martes',
       'v_dia_semama_Miercoles', 'v_dia_semama_Sábado', 'v_dia_semama_Viernes',
       'v_operation_sell', 'v_property_type_house', 'v_property_type_store',
       'title_casa', 'title_centro', 'title_departamento', 'title_fracc',
       'title_local', 'title_lomas', 'title_renta', 'title_residencial',
       'title_san', 'title_venta', 'descr_baao', 'descr_br', 'descr_casa',
       'descr_cocina', 'descr_comedor', 'descr_id', 'descr_nocnok',
       'descr_recamaras', 'descr_sala', 'descr_sistema', 'v_region_norte',
       'v_region_sur', 'v_tamanio_grande', 'v_tamanio_mediano',
       'v_estado_Baja California', 'v_est

## Standar Scaler

In [17]:
X_train_feat_scal = X_train.iloc[:, 0:8]
X_test_feat_scal = X_test.iloc[:, 0:8]
col_names = X_train.iloc[:, 0:8].columns

In [18]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
# X scaler
X_train_ss = ss.fit_transform(X_train_feat_scal)
X_test_ss = ss.transform(X_test_feat_scal)

In [19]:
## unión de conjunto de datos
X_train_ss = pd.DataFrame(X_train_ss, columns = col_names).join(X_train.iloc[:, 8:])
X_test_ss = pd.DataFrame(X_test_ss, columns = col_names).join(X_test.iloc[:, 8:])

In [20]:
from sklearn.feature_selection import SelectKBest
kb = SelectKBest(k = 53)
kb.fit(X_train_ss, y_train)

"""from sklearn.decomposition import PCA
pca = PCA(15)
pca.fit(X_train_ss, y_train)"""

  return f(**kwargs)
  f = msb / msw


'from sklearn.decomposition import PCA\npca = PCA(15)\npca.fit(X_train_ss, y_train)'

In [21]:
X_train.columns[kb.get_support()]

Index(['c_lat', 'c_lon', 'c_surface_covered_in_m2', 'c_poblacion',
       'c_superficie', 'c_densidad', 'v_conteo_geo', 'v_anio_2015',
       'v_anio_2016', 'v_mes_10', 'v_dia_semama_Sábado', 'v_operation_sell',
       'v_property_type_house', 'v_property_type_store', 'title_casa',
       'title_centro', 'title_departamento', 'title_local', 'title_lomas',
       'title_renta', 'title_residencial', 'title_san', 'title_venta',
       'descr_baao', 'descr_br', 'descr_casa', 'descr_cocina', 'descr_comedor',
       'descr_id', 'descr_nocnok', 'descr_recamaras', 'descr_sala',
       'descr_sistema', 'v_region_norte', 'v_tamanio_grande',
       'v_tamanio_mediano', 'v_estado_Baja California',
       'v_estado_Baja California Sur', 'v_estado_Chihuahua',
       'v_estado_Coahuila de Zaragoza', 'v_estado_Colima',
       'v_estado_Distrito Federal', 'v_estado_Durango',
       'v_estado_Estado de MÃ©xico', 'v_estado_Hidalgo', 'v_estado_Jalisco',
       'v_estado_Nayarit', 'v_estado_Nuevo LeÃ³n', '

In [22]:
X_train_ss = X_train_ss[X_train.columns[kb.get_support()]]
X_test_ss = X_test_ss[X_train.columns[kb.get_support()]]

"""X_train_ss = pca.transform(X_train_ss)
X_test_ss = pca.transform(X_test_ss)"""

'X_train_ss = pca.transform(X_train_ss)\nX_test_ss = pca.transform(X_test_ss)'

## Modelación

### Métricas

In [23]:
def metricas(y_true, y_pred):
    print('R2 Score:' + str(r2_score(y_true, y_pred)))
    print('MAE:' + str(mean_absolute_error(y_true, y_pred)))

## LightGBM

In [24]:
import lightgbm as lgb

In [25]:
params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': 0.1,
    'num_iterations': 500,
    'num_threads': 8,      
    'max_depth': 10,
    'num_leaves': 15,
    'feature_fraction': .7,
    'bagging_fraction': .6,
    'bagging_freq': 17}

In [26]:
y_train_lgb = pd.Series(y_train.reshape(y_train.shape[0]))

In [27]:
data_lgb = lgb.Dataset(X_train_ss, y_train_lgb)
model_gbm = lgb.train(params, data_lgb)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1310
[LightGBM] [Info] Number of data points in the train set: 100547, number of used features: 50
[LightGBM] [Info] Start training from score 1917108.835012


In [28]:
y_pred_train = model_gbm.predict(X_train_ss)
y_pred_test = model_gbm.predict(X_test_ss)

In [29]:
print('Train Score')
metricas(y_train, y_pred_train)
print('')
print('Test Score')
metricas(y_test, y_pred_test)

Train Score
R2 Score:0.8317968733971104
MAE:486858.25827392004

Test Score
R2 Score:0.8103012975662316
MAE:509605.02009181


In [30]:
bb

NameError: name 'bb' is not defined

## Clasificación

In [None]:
from os import listdir
from PIL import Image

china = []

for i in listdir('.\data'):
    file = '.\\data\\' + i
    china.append(np.array(Image.open(file).resize(size = (28,28)).getdata()))

In [None]:
china = np.array(china)/255

In [None]:
y_china = pd.read_csv('chinese_mnist.csv')['value']

In [None]:
tranform_label = dict(zip(y_china.unique(), np.arange(0, 15)))
y_china_trans = y_china.apply(lambda x: tranform_label[x])

In [None]:
from sklearn.model_selection import train_test_split

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(china, y_china_trans, test_size = 0.2)

### Train lgb

In [None]:
params_c = {'task': 'train',
           'boosting_type': 'gbdt',
           'objective': 'multiclass',
           'num_iterations': 100,
           'learning_rate': 0.01,
           'num_threads': 8,
           'max_depth': 20,
           'num_leaves': 15,
           'feature_fraction': 1,
           'num_class': 15,
           'metrics': 'multi_logloss'}

In [None]:
import lightgbm as lgb
china_data = lgb.Dataset(X_train_c, y_train_c)

In [None]:
model_c = lgb.train(params_c, china_data)

In [None]:
y_train_pred_c = np.argmax(model_c.predict(X_train_c), axis = 1)
y_test_pred_c = np.argmax(model_c.predict(X_test_c), axis = 1)

print(accuracy_score(y_train_c, y_train_pred_c))
print(accuracy_score(y_test_c, y_test_pred_c))

## SVR

In [31]:
from sklearn.svm import SVR
svr = SVR('linear', degree = 2, C = 1.0, max_iter = 1000)



In [33]:
svr.fit(X_train_ss, y_train)

  return f(**kwargs)


SVR(degree=2, kernel='linear', max_iter=1000)

In [34]:
y_pred_train_svr = svr.predict(X_train_ss)
y_pred_test_svr = svr.predict(X_test_ss)

In [35]:
print('Train Score')
metricas(y_train, y_pred_train_svr)
print('')
print('Test Score')
metricas(y_test, y_pred_test_svr)

Train Score
R2 Score:-1.5511121328760544
MAE:2846081.168105085

Test Score
R2 Score:-1.6113832807653292
MAE:2852279.5351990107
