In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import numpy
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr


%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [None]:
def null_values(df):
    
    sum_null = df.isnull().sum()
    total = df.isnull().count()
    percent_nullvalues = 100* sum_null / total 
    df_null = pd.DataFrame()
    df_null['Total'] = total
    df_null['Null_Count'] = sum_null
    df_null['Percent'] = round(percent_nullvalues,2)
    df_null = df_null.sort_values(by='Null_Count',ascending = False)
    df_null = df_null[df_null.Null_Count > 0]
    
    return(df_null)

def normailize_df(refDf, train):
    df = refDf.copy()
    df['año'] = df['fecha'].dt.year
    df['antiguedad'] = df['antiguedad'] + (2016 - df['año']) # Se normaliza la antiguedad.
    
    nulltotales = df[df['metrostotales'].isnull()]
    nullcubiertos = df[df['metroscubiertos'].isnull()]
    notnullapart = df[(~ df['metrostotales'].isnull()) & (df['metrostotales'] < df['metroscubiertos'])]
    notnullapart2 = df[(~ df['metrostotales'].isnull()) & (df['metrostotales'] > df['metroscubiertos'])]
    notnullapart3 = df[(~ df['metrostotales'].isnull()) & (df['metrostotales'] == df['metroscubiertos'])]
 
    df['habitable'] = False
    tipodepropiedades = df.tipodepropiedad.cat.categories.to_list()
    for tipodepropiedad in tipodepropiedades: 

        idsNullMetrosTotales = (df.tipodepropiedad == tipodepropiedad) & (df.metrostotales.isnull())
        idsNullMetrosCubiertos = (df.tipodepropiedad == tipodepropiedad) & (df.metroscubiertos.isnull())
        banos = df[df.tipodepropiedad == tipodepropiedad].banos.mode(dropna=False);
        habitaciones = df[df.tipodepropiedad == tipodepropiedad].habitaciones.mode(dropna=False);

        ## Verificamos si la cantidad de registros con metros cubiertos nulos es mayor a 2/5 de los totales. Si es asi 
        ## los consideramos propiedades no habitables. Y los tratamos de manera diferente
        if(len(nullcubiertos[nullcubiertos.tipodepropiedad == tipodepropiedad]) >= 2/5* len(df[df.tipodepropiedad == tipodepropiedad])):
            df.metrostotales.fillna(0, inplace=True)
            df.metroscubiertos.fillna(0, inplace=True)
        else:
            df[idsNullMetrosTotales]['metrostotales'] =  df[idsNullMetrosTotales]['metroscubiertos']
            df[idsNullMetrosCubiertos]['metroscubiertos'] =  df[idsNullMetrosCubiertos]['metrostotales']

        #Si la moda del tipo de propiedad de banos y habitaciones son ambas distintas de nan entonces la propiedad es habitable.

        df.loc[(df.tipodepropiedad == tipodepropiedad), 'habitable'] = not(numpy.isnan(banos[0]) and numpy.isnan(habitaciones[0]))
    
    df['metros'] = df['metrostotales'] + df['metroscubiertos']
    if(train):
        def is_outlier(group):
            Q1 = group.quantile(0.25)
            Q3 = group.quantile(0.75)
            IQR = Q3 - Q1
            precio_min = Q1 - 1.5 * IQR
            precio_max = Q3 + 1.5 * IQR
            return ~group.between(precio_min, precio_max)
        df['precio_mt2'] = df['precio'] / df['metros']
        print()
        
        df = df[~df.groupby('tipodepropiedad')['precio_mt2'].apply(is_outlier).fillna(False)]
        print('Despues de filtrar: ', df.shape)
    return df

train = pd.read_csv('../train.csv',
        dtype={'gimnasio': int,
                'usosmultiples': int,
                'escuelascercanas': int,
                'piscina': int,
                'centroscomercialescercanos': int,
                'tipodepropiedad': 'category',
                'provincia': 'category',
                'ciudad': 'category'
            },
        parse_dates=['fecha'])
test = pd.read_csv('../test.csv',
        dtype={'gimnasio': int,
                'usosmultiples': int,
                'escuelascercanas': int,
                'piscina': int,
                'centroscomercialescercanos': int,
                'tipodepropiedad': 'category',
                'provincia': 'category',
                'ciudad': 'category'
            },
        parse_dates=['fecha'])
# train = normailize_df(train, True)
# test = normailize_df(test, False)
train.drop(columns=["direccion", "titulo", 'descripcion', 'lat', 'lng', 'fecha', 'idzona'], inplace=True)
test.drop(columns=["direccion", 'titulo', 'descripcion', 'lat', 'lng', 'fecha', 'idzona'], inplace=True)
df_all = train.append(test)
null_values(df_all)
df_all.dtypes

In [None]:
# for col in ('metrostotales', 'metroscubiertos'):
#     df_all[col] = df_all[col].fillna(0)
    
# for col in ('garages', 'banos', 'antiguedad', 'habitaciones'):
#     tipodepropiedades = df_all.tipodepropiedad.cat.categories.to_list()
#     for tipodepropiedad in tipodepropiedades: 
#         df_all[df_all.tipodepropiedad == tipodepropiedad][col].fillna(df_all[df_all.tipodepropiedad == tipodepropiedad][col].mode(dropna=False))
    
# for col in ('ciudad', 'provincia', 'tipodepropiedad'):
#     df_all[col] = df_all[col].fillna('None')

# # Total area is the most important in terms of prices.    
# df_all['metros'] = df_all['metrostotales'] + df_all['metroscubiertos']
# df_all.drop(columns=["metrostotales", 'metroscubiertos'], inplace=True)

In [None]:
train.dtypes

In [None]:
all_data =  pd.concat((train.loc[:,'tipodepropiedad':'centroscomercialescercanos'], test.loc[:,'tipodepropiedad':'centroscomercialescercanos']))

In [None]:
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price":train["precio_mt2"], "log(price + 1)":np.log1p(train["precio_mt2"])})
prices.hist()

In [None]:
#log transform the target:
train["precio_mt2"] = np.log1p(train["precio_mt2"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[(all_data.dtypes == "float64") | (all_data.dtypes == "int64")].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])


In [None]:
all_data = pd.get_dummies(all_data)


In [None]:
all_data.dtypes


In [None]:
# filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(all_data[:train.shape[0]], train.precio_mt2, test_size=0.25, random_state=1)


In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [None]:
model_ridge = Ridge()


In [None]:
alphas = [0.05, 0.1, 0.3, 1, 3, 5]
cv_ridge = [rmse_cv(Ridge(alpha = alpha)).mean() 
            for alpha in alphas]

In [None]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.plot(title = "Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, y_train)

In [None]:
rmse_cv(model_lasso).mean()


In [None]:
cv_ridge.min()


In [None]:
coef = pd.Series(model_lasso.coef_, index = X_train.columns)


In [None]:
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

In [None]:
imp_coef = pd.concat([coef.sort_values().head(10),
                     coef.sort_values().tail(10)])

In [None]:
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model")

In [31]:
model = Ridge(alpha = 5).fit(X_train, y_train)
ridge_preds = model.predict(X_test)
X_train2, X_test2, y_train2, y_test2 = train_test_split(all_data[:train.shape[0]], train.precio_mt2, test_size=0.15, random_state=1)
ridge_preds2 = model.predict(X_test2)

In [49]:
# Métrica de evaluación
def RMSLE(actual, pred):
    return (np.mean((np.log(actual + 1) - np.log(pred + 1)) ** 2)) **.5

def RMSE(actual, pred): 
    return np.sqrt(mean_squared_error(actual, pred))

errordf = pd.DataFrame(y_test)
errordf['pred'] = ridge_preds
errordf['error'] = ((errordf.precio_mt2 - errordf.pred) * 2) * .5
errordf[abs(errordf.error) > RMSE(ridge_preds, y_test)]


# (

# from sklearn.metrics import mean_squared_error
# print(RMSE(ridge_preds, y_test))

# print(RMSLE(ridge_preds2, y_test2))

Unnamed: 0,precio_mt2,pred,error
150885,7.517935,8.022703,-0.504768
108192,7.753348,8.242022,-0.488674
85452,8.504817,9.032765,-0.527948
41421,10.230667,9.815183,0.415484
119345,9.063073,8.262962,0.800111
...,...,...,...
93602,7.742037,8.295034,-0.552997
17978,8.079248,8.758708,-0.679459
70338,9.919375,9.400922,0.518452
72048,9.076923,8.606418,0.470505


In [None]:
ridge_final_preds = Ridge(alpha = 5).fit(all_data[:train.shape[0]], train.precio_mt2).predict(all_data[train.shape[0]:])

In [1]:
final = test[['id']]
final.loc[:, 'target'] = test.metros * np.expm1(ridge_final_preds)
final.set_index('id').to_csv('../prediction.csv')

NameError: name 'test' is not defined

In [None]:
h = FeatureHasher(n_features=2)
D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
f = h.transform(D)