In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import numpy
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from collections import OrderedDict
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

# %config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
# %matplotlib inline

In [2]:
# Columns that every CSV is read with an explicit int dtype.
flag_dtypes = {
    'gimnasio': int,
    'usosmultiples': int,
    'escuelascercanas': int,
    'piscina': int,
    'centroscomercialescercanos': int,
}
# The raw training file additionally loads its location/type columns as pandas
# categoricals; the next cell reads the property-type categories from it.
raw_dtypes = dict(
    flag_dtypes,
    tipodepropiedad='category',
    provincia='category',
    ciudad='category',
)

train = pd.read_csv('../normalized_train.csv', dtype=flag_dtypes, parse_dates=['fecha'])
test = pd.read_csv('../normalized_test.csv', dtype=flag_dtypes, parse_dates=['fecha'])
train_raw = pd.read_csv('../train.csv', dtype=raw_dtypes, parse_dates=['fecha'])

# Index by listing id and discard rows without a title: everything below
# derives its features from `titulo`.
test = test.set_index('id').dropna(subset=['titulo'])
train = train.set_index('id').dropna(subset=['titulo'])

# DataFrame.append is deprecated (removed in pandas 2.0); pd.concat stacks the
# two frames row-wise in the same order.
df_all = pd.concat([train, test])
train.shape

(230872, 42)

In [3]:
# Lower-cased property-type labels, read from the categories of the raw
# `tipodepropiedad` column. countplease() discards title tokens equal to one
# of these labels.
tipodepopiedades = list(map(str.lower, train_raw.tipodepropiedad.cat.categories))
tipodepopiedades

['apartamento',
 'bodega comercial',
 'casa',
 'casa en condominio',
 'casa uso de suelo',
 'departamento compartido',
 'duplex',
 'edificio',
 'huerta',
 'inmuebles productivos urbanos',
 'local comercial',
 'local en centro comercial',
 'lote',
 'nave industrial',
 'oficina comercial',
 'otros',
 'quinta vacacional',
 'rancho',
 'terreno',
 'terreno comercial',
 'terreno industrial',
 'villa',
 'hospedaje',
 'garage']

In [4]:
import nltk
import re
from unicodedata import normalize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Spanish stopwords as a set: countplease() tests every token of every title
# against it, and set membership is a hash lookup rather than a scan of the
# NLTK list.
# NOTE: this rebinds the imported `stopwords` name, so the
# `from nltk.corpus import stopwords` import earlier in this cell has to run
# again before this statement can be re-executed.
stopwords = set(stopwords.words('spanish'))


from collections import Counter
# Global word-frequency table, filled in by countplease().
counter = Counter()
# Tokens are maximal runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')
# Translation table mapping the accented lower-case vowels (and ü) to their
# plain counterparts.
a,b = 'áéíóúü','aeiouu'
trans = str.maketrans(a,b)
def countplease(x):
    """Add the informative words of one listing title to the global ``counter``.

    The title is split with the module-level ``tokenizer``. Tokens that are
    stopwords, purely numeric, or equal to one of the labels in
    ``tipodepopiedades`` are dropped; the remaining tokens are passed through
    ``trans`` (accent removal) and counted.

    Parameters
    ----------
    x : str or missing value
        Title text. Any non-string value (float NaN, None, ...) is ignored.

    Returns
    -------
    None
        The function acts only through its side effect on ``counter``.
    """
    # Missing titles arrive as float NaN or None; neither can be tokenized.
    if not isinstance(x, str):
        return

    # Filtering is done on the raw token, before accents are stripped.
    kept = [
        word for word in tokenizer.tokenize(x)
        if not (word in stopwords or word.isnumeric() or word in tipodepopiedades)
    ]
    counter.update(word.translate(trans) for word in kept)
    

# Run every title (train and test together) through countplease() to fill
# `counter`, then show the 20 most frequent words.
for titulo in df_all['titulo']:
    countplease(titulo)
counter.most_common(20)


[('venta', 131068),
 ('departamento', 47327),
 ('san', 20304),
 ('excelente', 15410),
 ('lomas', 13817),
 ('col', 13270),
 ('residencial', 12433),
 ('hermosa', 11714),
 ('condominio', 10737),
 ('valle', 9846),
 ('recamaras', 8825),
 ('oportunidad', 8598),
 ('santa', 8511),
 ('fracc', 8037),
 ('fraccionamiento', 7495),
 ('nueva', 7384),
 ('casas', 7347),
 ('remate', 7086),
 ('bonita', 6782),
 ('cerca', 6379)]

In [5]:
# Working frames for the title-word experiments: the target (`logprecio`) plus
# the numeric `metros` column for training, and `metros` alone for test.
# .copy() makes them independent of `train` / `test`, so the columns that
# buildDataframe() adds later no longer raise SettingWithCopyWarning.
soloprecio = train[['logprecio', 'metros']].copy()
# Alias, not a copy: columns added to `soloprecio` are visible through
# `solopreciotrain` as well.
solopreciotrain = soloprecio
solotest = test[['metros']].copy()
def buildDataframe(appearences, counter):
    """Rebuild the boolean "title contains word" columns of the global frames.

    Every word whose count is strictly greater than ``appearences`` becomes a
    column of ``soloprecio`` (computed from ``train['titulo']``) and of
    ``solotest`` (computed from ``test['titulo']``). Both frames are modified
    in place, so aliases of them see the new columns too.

    Word columns left over from an earlier call are removed first, which makes
    the result depend only on the arguments of this call.

    Parameters
    ----------
    appearences : int
        Exclusive lower bound on a word's count.
    counter : mapping or iterable of (word, count) pairs
        Word counts; anything ``dict()`` accepts.

    Returns
    -------
    int
        Number of word columns built. The same number is printed.
    """
    # Base columns that must survive. A word spelled like one of them would
    # otherwise replace the target or the numeric feature with booleans.
    protected = ('logprecio', 'metros')

    words = [
        word for word, count in dict(counter).items()
        if int(count) > appearences and word not in protected
    ]
    print(len(words))

    for frame, titles in ((soloprecio, train['titulo']), (solotest, test['titulo'])):
        # Everything that is not a base column was produced by a previous call.
        stale = [col for col in frame.columns if col not in protected]
        frame.drop(columns=stale, inplace=True)
        for word in words:
            # Plain substring match (the word is not a regex pattern);
            # na=False keeps the column strictly boolean.
            frame[word] = titles.str.contains(word, regex=False, na=False)

    return len(words)


In [6]:
# Baseline network: one hidden layer of 10 tanh units, trained with Adam.
# `(10)` is just the integer 10, so the one-element tuple is spelled out, and
# the seed is fixed so that weight initialisation is reproducible.
clf = MLPRegressor(hidden_layer_sizes=(10,), activation='tanh', solver='adam', random_state=123)

In [10]:
from sklearn.model_selection import cross_val_score
# Number of word columns built by each evaluation call, in call order.
columns = []
def rmse_cv(model, appearence):
    """Cross-validated RMSE of ``model`` on the title-word features.

    Rebuilds the word columns for the given count threshold (recording how
    many were built in ``columns``), sets aside 20% of the rows, and scores the
    model with 2-fold cross-validation on the remaining 80%.

    Returns the array of per-fold RMSE values.
    """
    n_words = buildDataframe(appearence, counter)
    columns.append(n_words)

    feature_names = [name for name in solopreciotrain.columns
                     if name != 'id' and name != 'logprecio']
    features = solopreciotrain[feature_names]
    target = solopreciotrain.logprecio

    # Only the training part of the split is used here; the hold-out part is
    # discarded.
    X_train, _, y_train, _ = train_test_split(features, target, test_size=0.2, random_state=123)
    print(target.isnull().sum())

    neg_mse = cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=2)
    return np.sqrt(-neg_mse)

def xgBoost_rmse(model, appearence):
    """Fit ``model`` on the title-word features and rank the features it used.

    Rebuilds the word columns for the given count threshold (recording how
    many were built in ``columns``), trains on 80% of the rows, then prints the
    number of missing targets, the RMSE on the held-out 20%, and the ranking.

    Returns an ``OrderedDict`` mapping feature name to the booster's f-score,
    ordered from highest to lowest score.
    """
    n_words = buildDataframe(appearence, counter)
    columns.append(n_words)

    feature_names = [name for name in solopreciotrain.columns
                     if name != 'id' and name != 'logprecio']
    X_train, X_test, y_train, y_test = train_test_split(
        solopreciotrain[feature_names],
        solopreciotrain.logprecio,
        test_size=0.2,
        random_state=123,
    )
    print(solopreciotrain.logprecio.isnull().sum())
    model.fit(X_train, y_train)

    holdout_mse = mean_squared_error(y_test, model.predict(X_test))
    print(np.sqrt(holdout_mse))

    fscores = model.get_booster().get_fscore()
    ranking = OrderedDict(sorted(fscores.items(), key=lambda pair: pair[1], reverse=True))
    print(ranking)
    return ranking

# Word-count thresholds to sweep: 2000 down to 200 in steps of 200.
appearences = [200 * step for step in range(10, 0, -1)]
# cv_neuron = [rmse_cv(clf, appearence).mean() for appearence in appearences]

In [11]:
# Gradient-boosted trees on the title-word features.
# `n_jobs` / `random_state` are the scikit-learn-style parameter names;
# `nthread` and `seed` are deprecated aliases in the XGBoost sklearn wrapper.
model = xgb.XGBRegressor(
    learning_rate=0.1,
    n_estimators=700,
    max_depth=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=6,
    scale_pos_weight=1,
    random_state=27,
)
# Only words counted more than 800 times across all titles become features.
important_words = xgBoost_rmse(model, 800)

263


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0


  if getattr(data, 'base', None) is not None and \


0.49543795910048527
OrderedDict([('metros', 4175), ('departamento', 810), ('venta', 512), ('d', 281), ('c', 272), ('san', 247), ('depto', 246), ('col', 236), ('i', 210), ('av', 192), ('valle', 175), ('remate', 168), ('l', 165), ('fracc', 161), ('fe', 158), ('m2', 152), ('lomas', 150), ('excelente', 148), ('condominio', 142), ('residencial', 140), ('rio', 134), ('n', 132), ('hermosa', 126), ('local', 125), ('pre', 123), ('min', 121), ('rec', 121), ('centro', 112), ('loma', 111), ('mar', 110), ('norte', 107), ('real', 105), ('torre', 104), ('departamentos', 102), ('santa', 101), ('comercial', 99), ('nuevo', 98), ('cerca', 97), ('gran', 95), ('zona', 94), ('casas', 93), ('residencia', 90), ('ciudad', 89), ('bosque', 86), ('oportunidad', 84), ('juriquilla', 78), ('vista', 78), ('pedregal', 78), ('oficina', 77), ('nueva', 76), ('sol', 75), ('vendo', 72), ('sola', 71), ('calle', 70), ('sur', 69), ('planta', 66), ('privada', 65), ('quinta', 64), ('monte', 62), ('unidad', 62), ('miguel', 61), 

In [None]:
# clffinal = MLPRegressor(activation='tanh', solver='adam')
# buildDataframe(15000)
# every_column_except_y= [col for col in solopreciotrain.columns if col not in ['id', 'logprecio']]
# X_train, X_test, y_train, y_test = train_test_split(solopreciotrain[every_column_except_y], solopreciotrain.logprecio, test_size=0.2, random_state=123)

In [None]:
# cv_neuron = pd.Series(cv_neuron, index = columns)
# cv_neuron.plot(title = "Estimación solo con palabras del titulo")
# plt.xlabel("Columnas")
# plt.ylabel("rmse")
# cv_neuron

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(solopreciotrain[every_column_except_y], solopreciotrain.logprecio, test_size=0.2, random_state=123)
# np.sqrt(mean_squared_error(y_test, clf.predict(y_train)))

In [None]:
clffinal = MLPRegressor(activation='tanh', solver='adam')
# buildDataframe() requires the word counts as its second argument; calling it
# with the threshold alone raises TypeError.
buildDataframe(15000, counter)
every_column_except_y = [col for col in solopreciotrain.columns if col not in ['id', 'logprecio']]
X_train, X_test, y_train, y_test = train_test_split(
    solopreciotrain[every_column_except_y],
    solopreciotrain.logprecio,
    test_size=0.2,
    random_state=123,
)

In [None]:
# clffinal.fit(X_train, y_train)

In [None]:
# np.sqrt(mean_squared_error(y_test, clffinal.predict(X_test)))

In [None]:
# `important_words` is keyed by model feature, so besides title words it also
# contains the numeric 'metros' column. Passing that key on would make
# buildDataframe() overwrite the real `metros` values with a
# "title contains 'metros'" flag, so it is filtered out here.
top_features = Counter(important_words).most_common(100)
buildDataframe(50, [(word, score) for word, score in top_features if word != 'metros'])

In [None]:
from sklearn.feature_extraction import FeatureHasher

# Width of the hashed representation of the title words.
cantidad_features = 20

h = FeatureHasher(n_features=cantidad_features, input_type='string')
# Word-indicator columns only: the target and the numeric feature are not hashed.
every_column_except_y = [col for col in solopreciotrain.columns if col not in ['id', 'logprecio', 'metros']]


def _words_per_row(frame, word_columns):
    """Return, for each row of ``frame``, the list of ``word_columns`` flagged True.

    The lists are built from a boolean mask instead of writing the word into
    the indicator columns, which avoids assigning strings to boolean columns.
    Words appear in the order of ``word_columns``.
    """
    labels = np.array(word_columns, dtype=object)
    present = frame[word_columns].eq(True).to_numpy()
    return [labels[row].tolist() for row in present]


# Hash each row's word list into `cantidad_features` columns, indexed by
# listing id like `train` / `test`.
names = [f'fh{el + 1}' for el in range(cantidad_features)]
train_hashtrick = pd.DataFrame(
    h.transform(_words_per_row(solopreciotrain, every_column_except_y)).toarray(),
    columns=names,
    index=train.index,
)
test_hashtrick = pd.DataFrame(
    h.transform(_words_per_row(solotest, every_column_except_y)).toarray(),
    columns=names,
    index=test.index,
)
# Preview only; the full frame is too large to be a useful cell output.
test_hashtrick.head()

In [None]:
# Expected: (number of test rows that have a title, cantidad_features).
test_hashtrick.shape

In [None]:
# Re-read the complete normalized test set and attach the hashed title
# features by listing id.
finaltest = (
    pd.read_csv('../normalized_test.csv')
    .set_index('id')
    .join(test_hashtrick, how='left')
)

# Ids absent from test_hashtrick get NaN from the left join; store 0 instead.
finaltest.loc[:, names] = finaltest[names].fillna(0)
finaltest.to_csv('../normalized2_test.csv')

In [None]:
# Re-read the complete normalized training set and attach the hashed title
# features by listing id (DataFrame.join defaults to a left join).
finaltrain = (
    pd.read_csv('../normalized_train.csv')
    .set_index('id')
    .join(train_hashtrick)
)

# Ids absent from train_hashtrick get NaN from the join; store 0 instead.
finaltrain.loc[:, names] = finaltrain[names].fillna(0)
finaltrain.to_csv('../normalized2_train.csv')

In [None]:
# Display the `precio` column of the augmented training frame.
finaltrain.precio

In [None]:
# Row and column count of the augmented training frame.
finaltrain.shape