In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from unidecode import unidecode

%matplotlib notebook

In [2]:
df = pd.read_csv('../data/preprocessed/dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28620 entries, 0 to 28619
Data columns (total 46 columns):
acepta_permuta           913 non-null float64
altura_permitida         310 non-null float64
ambientes                2880 non-null float64
ambientes_extra          2880 non-null float64
ano_de_construccion      16116 non-null float64
aptos_por_piso           6362 non-null float64
banos                    25044 non-null float64
banos_extra              25044 non-null float64
barrio                   26229 non-null object
cantidad_de_pisos        6512 non-null float64
casco                    21 non-null float64
descripcion              28388 non-null object
direccion                9195 non-null object
disposicion              18954 non-null object
distancia_al_mar         5164 non-null float64
dormitorios              23034 non-null float64
dormitorios_extra        23034 non-null float64
estado                   25504 non-null object
extra                    23097 non-null objec

# Usefull methods to explore features
- df.description()
- df.shape
- df.count()
- df.sum()
- df.mean()
- df.std()
- df.corr()

- `df['feature'].unique()`
- `df.nunique()`
- `df['feature'].isna()`



In [3]:
df.sample(3).transpose()

Unnamed: 0,14457,3880,14735
acepta_permuta,,,
altura_permitida,,,
ambientes,,,
ambientes_extra,,,
ano_de_construccion,1959,2020,2016
aptos_por_piso,,,
banos,1,1,1
banos_extra,0,0,0
barrio,Buceo,Puerto Buceo,Pocitos
cantidad_de_pisos,,,


## Select feature for analysis
Check dataset [documentation](https://github.com/creyesp/houses-project/blob/add-binder-configs/data/dataset_description.md) to choice the most interesting feature to answer our questions.

In [4]:
columns_to_analyze = [
    'ano_de_construccion', 'aptos_por_piso', 'banos', 'banos_extra', 'cantidad_de_pisos',
    'descripcion', 'disposicion', 'distancia_al_mar', 'dormitorios', 'dormitorios_extra',
    'estado', 'extra', 'garajes', 'garajes_extra', 'gastos_comunes', 'gastos_comunes_moneda',
    'tipo_de_publicacion', 'm2_de_la_terraza', 'm2_del_terreno', 'm2_edificados', 'oficina', 'penthouse',
    'piso', 'plantas', 'plantas_extra', 'precio', 'precio_moneda', 'sobre', 'tipo_propiedad',
    'vista_al_mar', 'vivienda_social', 'barrio', 
]

In [14]:
df_obj = df[columns_to_analyze].select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

print('Numerical columns: {}\n'.format(df_num.columns.tolist()))
print('Caterorial columns: {}'.format(df_obj.columns.tolist()))

Numerical columns: ['acepta_permuta', 'altura_permitida', 'ambientes', 'ambientes_extra', 'ano_de_construccion', 'aptos_por_piso', 'banos', 'banos_extra', 'cantidad_de_pisos', 'casco', 'distancia_al_mar', 'dormitorios', 'dormitorios_extra', 'financia', 'garajes', 'garajes_extra', 'gastos_comunes', 'gastos_comunes_moneda', 'hectareas', 'huespedes', 'longitud_frente', 'm2_de_la_terraza', 'm2_del_terreno', 'm2_edificados', 'oficina', 'penthouse', 'piso', 'plantas', 'plantas_extra', 'precio', 'precio_moneda', 'vista_al_mar', 'vivienda_social']

Caterorial columns: ['descripcion', 'disposicion', 'estado', 'extra', 'tipo_de_publicacion', 'sobre', 'tipo_propiedad', 'barrio']


In [7]:
df_obj = (df_obj.fillna('')
          .apply(lambda x: x.str.lower().apply(lambda y: unidecode(y)))
         )

df_raw = pd.concat([df_num, df_obj], axis=1)

In [None]:
mask = (
    df_raw['tipo_propiedad'].isin(['apartamentos', 'casas'])
    & (df_raw['oficina'] != 1)
    & (df_raw['penthouse'] != 1) 
    & (df_raw['penthouse'] != 1) 
)
df_raw = df_raw[mask]
df_raw.columns

In [None]:
columns = ['ano_de_construccion', 'banos', 'disposicion', 'dormitorios',
           'estado', 'garajes', 'gastos_comunes',
           'm2_de_la_terraza', 'm2_del_terreno', 'm2_edificados',
           'price', 'tipo_propiedad', 'vivienda_social', 'zona']

In [None]:
df_selected = df_raw[columns].reset_index(drop=True)
fill_zero_col = ['m2_de_la_terraza', 'vivienda_social', 'gastos_comunes', 'garajes']
df_selected.loc[:, fill_zero_col] = df_selected.loc[:, fill_zero_col].fillna(0)
mask_m2_terreno = df_selected['m2_del_terreno'].isna()
df_selected.loc[mask_m2_terreno, 'm2_del_terreno'] = df_selected.loc[mask_m2_terreno, 'm2_edificados']
(
    df_selected
    .isna()
    .sum()
)

In [None]:
df_selected.dropna(inplace=True)

In [None]:
df_num_selected = df_selected.select_dtypes(exclude='object')
df_obj_selected = df_selected.select_dtypes(include='object')

In [None]:
df_obj_selected.sample(5)

In [None]:
zone_codes = pd.get_dummies(df_obj_selected['zona'])
property_type = pd.get_dummies(df_obj_selected['tipo_propiedad'])
property_state = pd.get_dummies(df_obj_selected['estado'])

In [None]:
zone_codes.columns = ['ZN_{}'.format(k.replace(' ', '_')) for k in zone_codes.columns]
property_state.columns = ['ST_{}'.format(k.replace(' ', '_')) for k in property_state.columns]


In [None]:
features_selected = pd.concat([df_num_selected, zone_codes, property_type, property_state], axis=1)
features_selected.to_csv('../data/ready/dataset_houses.csv', index=False)

In [None]:
price =  features_selected['price']
features = features_selected.drop(columns=['price'])



# Modeling

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# scaler =  RobustScaler(quantile_range=(25, 75))
scaler = QuantileTransformer(output_distribution='normal') 
# scaler = PowerTransformer(method='box-cox')  # Non Linear | Strictly positive data
# scaler = PowerTransformer(method='yeo-johnson')  # Non Linear 



In [None]:
train_features, test_features = train_test_split(features, test_size=0.2, random_state=14)
train_price, test_price = train_test_split(price, test_size=0.2, random_state=14)

scaler.fit(train_features)
sca_train = scaler.transform(train_features)
sca_test = scaler.transform(train_features)

In [None]:
pca = PCA(n_components=.95, )
pca.fit(sca_train)
arr_train = pca.transform(sca_train)
arr_test = pca.transform(sca_test)

In [None]:
pca.explained_variance_ratio_

In [None]:
sns.pairplot(pd.DataFrame(arr[:, 0:4]))

In [None]:
# color = features.loc[features.index, 'zona'].astype('category').cat.codes
color = train_features.iloc[:, 1]
f, ax = plt.subplots(2,2)
ax[0, 0].scatter(arr[:,0], arr[:,1], c=color)
ax[0, 1].scatter(arr[:,1], arr[:,2], c=color)
ax[1, 0].scatter(arr[:,0], arr[:,2], c=color)
ax[1, 1].scatter(arr[:,3], arr[:,5], c=color)

In [None]:
lda = LinearDiscriminantAnalysis(n_components=3).fit_transform(scaler.fit_transform(train_features),
                                                              train_price)

In [None]:
color = train_features.iloc[:, 1]

f, ax = plt.subplots(2, 2)
ax[0, 0].scatter(lda[:, 0], lda[:, 1], c=color)
ax[0, 1].scatter(lda[:, 0], lda[:, 2], c=color)
ax[1, 0].scatter(lda[:, 1], lda[:, 2], c=color)


In [None]:
def plot_predict_result(test_price, predict_price):
    f, ax = plt.subplots(2)
    ax[0].scatter(test_price, predict_price)
    ax[0].plot([0, 1600000], [0, 1600000], c='r')
    ax[1].hist(test_price - predict_price, bins=100)
    return f, ax

def get_scores(test_train, test_predict):
    mse = mean_squared_error(test_train, test_predict)
    mea = mean_absolute_error(test_train, test_predict)
    r2 = r2_score(test_train, test_predict)
    return 'mse={}, mea={}, r2={}'.format(mse, mea, r2)

# Linear Model

In [None]:
model = LinearRegression(fit_intercept=True)
model.fit(arr_train, train_price)
#model.coef_, model.intercept_, model.rank_

In [None]:
predict_price = model.predict(arr_test)
get_scores(test_price, predict_price)

In [None]:
f, ax = plot_predict_result(test_price, predict_price)
ax[0].set(ylim=[-10000000000, 10000000000])

# Regression Tree

In [None]:
cls_tree = DecisionTreeRegressor()
cls_tree.fit(train_features, train_price)


In [None]:
predic_price_tree = cls_tree.predict(test_features)
get_scores(predic_price_tree, predict_price)

In [None]:
plot_predict_result(test_price, predic_price_tree)

In [None]:
f, ax = plt.subplots()
(pd
 .Series(cls_tree.feature_importances_, index=train_features.columns)
 .sort_values()[-10:]
 .plot(ax=ax, kind='barh')
)

In [None]:
df_raw[df_raw['gastos_comunes']>1000000]
#df_selected['garajes'].unique()

In [None]:
df_raw.loc[18045, 'url']

In [None]:
df_selected.describe()

In [None]:
#df_selected['m2_del_terreno'] > 1
#df_selected['gastos_comunes'] < 1e5
#df_selected['m2_edificados'] > 20

(
    df_selected.query('m2_del_terreno > 2')
    .query('gastos_comunes < 1e5')
    .query('m2_edificados > 20')
    #.pipe(lambda x: x.describe().transpose())
    .to_csv('../data/ready/num_dataset_houses.csv', index=False)
)
