In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor
import csv

### Primer acercamiento a Machine Learning (KNN)

In [2]:
# Load the test set (it has no price column) and the training set, parsing
# `created_on` as a datetime column in both frames.
dfTest = pd.read_csv('../data/TEST_SET/properati_dataset_testing_noprice.csv', parse_dates=['created_on'], infer_datetime_format=True)
dfTrain = pd.read_csv('../data/output_2/output_2.csv', parse_dates=['created_on'], infer_datetime_format=True)

In [3]:
# Inspect dtypes and non-null counts of the raw test set.
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 17 columns):
id                         14166 non-null int64
created_on                 14166 non-null datetime64[ns]
property_type              14166 non-null object
operation                  14166 non-null object
place_name                 14166 non-null object
place_with_parent_names    14166 non-null object
country_name               14166 non-null object
state_name                 14166 non-null object
lat-lon                    10487 non-null object
lat                        10487 non-null float64
lon                        10487 non-null float64
surface_total_in_m2        11853 non-null float64
surface_covered_in_m2      13005 non-null float64
floor                      1368 non-null float64
rooms                      7500 non-null float64
expenses                   2543 non-null object
description                14166 non-null object
dtypes: datetime64[ns](1), float64(6), int64(1),

In [4]:
# Inspect dtypes and non-null counts of the raw training set.
dfTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462286 entries, 0 to 1462285
Data columns (total 18 columns):
id                            1462286 non-null object
created_on                    1462286 non-null datetime64[ns]
property_type                 1462286 non-null object
place_name                    1462286 non-null object
state_name                    1462286 non-null object
lat                           1122304 non-null float64
lon                           1122293 non-null float64
price                         1462286 non-null float64
currency                      1457400 non-null object
price_aprox_local_currency    1462286 non-null float64
price_aprox_usd               1462286 non-null float64
surface_total_in_m2           1079918 non-null float64
surface_covered_in_m2         233333 non-null float64
price_usd_per_m2              1016393 non-null float64
price_per_m2                  226511 non-null float64
floor                         289280 non-null float64
rooms   

___
### Función para corregir el Test Set:
> + Debe traducir al inglés **property_type**
> + Eliminar la columna operation ya que supongo que son todos venta (por más que haya 1 alquiler)
> + Corregir con la descripción el único registro de Buenos Aires Interior
> + Debe mantener el orden inicial del set ya que se chequean los resultados por índices de array.

> #### Features restantes:
+ id
+ year_created, month_created, day_created (derivados de created_on)
+ property_type
+ place_name
+ state_name
+ lat
+ lon
+ surface_total_in_m2
+ rooms

In [5]:
def _translate(row) :
    
    english = ['apartment', 'PH', 'house', 'store']
    spanish = ['departamento', 'ph', 'casa', 'local']
    
    if row.property_type in spanish :
        return english[spanish.index(row.property_type)]
    
    return row.property_type



def correctTestSet(testSet) :
    testSet['order'] = pd.Series(xrange(len(dfTest)))
    testSet.property_type = testSet.apply(_translate, axis=1)
    testSet.iat()[1, 4] = 'Palermo'
    testSet.iat()[1, 7] = 'Capital Federal'
    testSet.iat()[1, 14] = 2
    testSet['year_created'] = pd.Series(testSet.created_on.dt.year)
    testSet['month_created'] = pd.Series(testSet.created_on.dt.month)
    testSet['day_created'] = pd.Series(testSet.created_on.dt.day)
    return testSet.sort_values(by='order').loc[:, ('id', 'year_created', 'month_created', 'day_created', 
                                                   'property_type', 'place_name', 'state_name', 'lat', 'lon', 
                                                   'surface_total_in_m2', 'rooms') ]
    
    

In [6]:
# Show the last column (the description) of the first listing whose state_name
# contains 'Inter'. `iat` is an indexer property, so it is subscripted directly
# rather than called.
dfTest.loc[dfTest.state_name.str.contains('Inter')].iat[0, len(dfTest.columns) - 1]

'El departamento cuenta con un living-comedor amplio con ventanales amplios que dan una excelente vista a Ciudad y Rio. Cocina apartada con doble ingreso y salida a balc\xc3\xb3n. Posee dos dormitorios amplios con excelentes vistas y su principal con vestidor y ba\xc3\xb1o en Suite.Palermo UNO es una de las torres m\xc3\xa1s altas de Palermo, con una incre\xc3\xadble vista a la ciudad, desde la Rural hasta el Rio. El edificio tiene un gimnasio en el piso 32, muy completo con aire acondicionado, con una vista inmejorable. Adem\xc3\xa1s tiene Laundry, SUM, SPA, jacuzzi con vista incre\xc3\xadble, sauna, una pileta olimpica y un espacio que juegos infantiles. Hay dos entradas al complejo lo que lo hace mas c\xc3\xb3modo al acceso, con seguridad las 24 hs.Valor: U$S 410.000Con Cochera. EasyBroker ID: EB-AO9021'

In [7]:
# Show the raw row for listing id 3633, before correctTestSet is applied.
dfTest.loc[dfTest.id == 3633]

Unnamed: 0,id,created_on,property_type,operation,place_name,place_with_parent_names,country_name,state_name,lat-lon,lat,lon,surface_total_in_m2,surface_covered_in_m2,floor,rooms,expenses,description
1,3633,2017-08-25,departamento,venta,Buenos Aires Interior,|Argentina|Buenos Aires Interior|,Argentina,Buenos Aires Interior,,,,0.0,,,,,El departamento cuenta con un living-comedor a...


In [8]:
# Apply the test-set corrections. correctTestSet reads `created_on` and returns a
# frame without that column, so this cell cannot be re-run on the already
# corrected dfTest without reloading the data first.
dfTest = correctTestSet(dfTest)

In [9]:
# Show listing id 3633 again to check its corrected values.
dfTest.loc[dfTest.id == 3633]

Unnamed: 0,id,year_created,month_created,day_created,property_type,place_name,state_name,lat,lon,surface_total_in_m2,rooms
1,3633,2017,8,25,apartment,Palermo,Capital Federal,,,0.0,2.0


### TestSet Corregido:

In [10]:
# Preview the first rows of the corrected test set.
dfTest.head()

Unnamed: 0,id,year_created,month_created,day_created,property_type,place_name,state_name,lat,lon,surface_total_in_m2,rooms
0,3632,2017,8,24,apartment,Puerto Madero,Capital Federal,-34.610988,-58.363464,0.0,
1,3633,2017,8,25,apartment,Palermo,Capital Federal,,,0.0,2.0
2,2263404,2017,8,1,apartment,Palermo Soho,Capital Federal,-34.589363,-58.41288,53.0,
3,2263405,2017,8,1,apartment,Chacarita,Capital Federal,,,39.0,
4,2263406,2017,8,1,apartment,Chacarita,Capital Federal,,,51.0,


___

### Corrigiendo TrainSet:
> + Valores muy bajos o extremadamente altos en **surface_total_in_m2** y **price_usd_per_m2**.
> + Filtrar latitudes y longitudes muy por fuera de Bs. As.
> + Filtrar precios irrealmente bajos y altos.
> + Eliminar columna currency.

In [11]:
def _pointInsideBsAs(lat, lon) :
    NO, NE, SE, SO = (-34.2560, -60.1354), (-34.2560, -57.0902), (-36.0219, -57.0902), (-36.0219, -60.1354)    
    return not ( (lat > NO[0]) or (lon < NO[1]) or (lon > NE[1]) or (lat < SE[0]) )

def pointInsideBsAs(latSeries, lonSeries):
    """Apply ``_pointInsideBsAs`` pairwise to two coordinate sequences.

    Returns a boolean NumPy array with one entry per (lat, lon) pair, suitable
    for use as a row mask.
    """
    flags = [_pointInsideBsAs(lat, lon) for lat, lon in zip(latSeries, lonSeries)]
    return np.array(flags, dtype='bool')

def correctTrainSet(trainSet):
    """Filter implausible training rows and return the model feature columns.

    A row is kept when all of the following hold:
      * ``pointInsideBsAs`` accepts its (lat, lon) pair;
      * ``price_aprox_usd`` is strictly between 11 and 20,000,000;
      * ``surface_total_in_m2`` is strictly between 11 and 15,000 (rows with a
        missing surface fail this comparison and are dropped).

    ``created_on`` is then split into ``year_created`` / ``month_created`` /
    ``day_created``.

    Parameters
    ----------
    trainSet : pandas.DataFrame
        Raw training frame with a datetime ``created_on`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the selected columns; the original index labels of the
        surviving rows are preserved. The input frame is not modified.
    """
    price_ok = (trainSet.price_aprox_usd > 11) & (trainSet.price_aprox_usd < 20000000)
    surface_ok = (trainSet.surface_total_in_m2 > 11) & (trainSet.surface_total_in_m2 < 15000)
    inside = pointInsideBsAs(trainSet.lat, trainSet.lon)

    # Explicit copy: new columns are added below, and assigning onto a filtered
    # slice is what triggers pandas' SettingWithCopyWarning.
    trainSet = trainSet.loc[price_ok & surface_ok & inside].copy()

    # Disabled filters, kept for reference:
    #trainSet = trainSet.loc[ (trainSet.surface_covered_in_m2 > 11) & (trainSet.surface_covered_in_m2 < 15000) ]
    #trainSet = trainSet.loc[ trainSet.rooms < 11 ]
    #trainSet = trainSet.loc[ trainSet.expenses < 3500 ]

    created = trainSet['created_on'].dt
    trainSet['year_created'] = created.year
    trainSet['month_created'] = created.month
    trainSet['day_created'] = created.day

    feature_columns = ['id', 'year_created', 'month_created', 'day_created',
                       'property_type', 'place_name', 'state_name', 'lat', 'lon',
                       'surface_total_in_m2',
                       'rooms', 'price_aprox_usd']
    return trainSet.loc[:, feature_columns]

In [12]:
# Apply the training-set filters. correctTrainSet reads `created_on` and returns
# a frame without that column, so this cell cannot be re-run on the already
# corrected dfTrain without reloading the data first.
dfTrain = correctTrainSet(dfTrain)

In [13]:
# Inspect dtypes and non-null counts of the filtered training set.
dfTrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1004817 entries, 0 to 1462285
Data columns (total 12 columns):
id                     1004817 non-null object
year_created           1004817 non-null int64
month_created          1004817 non-null int64
day_created            1004817 non-null int64
property_type          1004817 non-null object
place_name             1004817 non-null object
state_name             1004817 non-null object
lat                    743711 non-null float64
lon                    743700 non-null float64
surface_total_in_m2    1004817 non-null float64
rooms                  594687 non-null float64
price_aprox_usd        1004817 non-null float64
dtypes: float64(5), int64(3), object(4)
memory usage: 99.7+ MB


In [14]:
# Summary statistics of the numeric columns of the filtered training set.
dfTrain.describe()

Unnamed: 0,year_created,month_created,day_created,lat,lon,surface_total_in_m2,rooms,price_aprox_usd
count,1004817.0,1004817.0,1004817.0,743711.0,743700.0,1004817.0,594687.0,1004817.0
mean,2014.215,6.24809,17.61971,-34.589288,-58.499438,214.2412,2.64141,228773.3
std,1.191571,3.215552,9.016406,0.094543,0.141215,428.242,1.36438,314787.4
min,2012.0,1.0,1.0,-36.018143,-60.033478,12.0,0.0,3032.98
25%,2013.0,3.0,10.0,-34.62998,-58.562516,51.0,2.0,92000.0
50%,2014.0,6.0,19.0,-34.595669,-58.475028,87.0,2.0,143522.4
75%,2015.0,9.0,25.0,-34.552376,-58.419656,211.0,3.0,250000.0
max,2017.0,12.0,31.0,-34.257713,-57.517536,14820.0,40.0,19802480.0


___
### Corriendo el primer KNN

In [15]:
# Drop the listing id and rename the target column to `price_usd`.
dfTrain = (
    dfTrain
    .drop('id', axis=1)
    .rename(columns={'price_aprox_usd': 'price_usd'})
)

In [16]:
# Preview the training frame after dropping `id` and renaming the target.
dfTrain.head()

Unnamed: 0,year_created,month_created,day_created,property_type,place_name,state_name,lat,lon,surface_total_in_m2,rooms,price_usd
0,2013,8,28,apartment,Caseros,Bs.As. G.B.A. Zona Oeste,-34.600116,-58.565334,62.0,3.0,130650.48
1,2013,8,28,apartment,Villa Crespo,Capital Federal,-34.597274,-58.439479,44.0,2.0,78500.0
2,2013,8,28,house,General Rodríguez,Bs.As. G.B.A. Zona Oeste,,,150.0,,243081.79
3,2013,8,28,apartment,Las Cañitas,Capital Federal,-34.567401,-58.434775,250.0,4.0,710000.0
4,2013,8,28,apartment,Flores,Capital Federal,-34.626409,-58.459324,40.0,1.0,110714.41


In [17]:
# Preview the test frame for a side-by-side comparison of its columns.
dfTest.head()

Unnamed: 0,id,year_created,month_created,day_created,property_type,place_name,state_name,lat,lon,surface_total_in_m2,rooms
0,3632,2017,8,24,apartment,Puerto Madero,Capital Federal,-34.610988,-58.363464,0.0,
1,3633,2017,8,25,apartment,Palermo,Capital Federal,,,0.0,2.0
2,2263404,2017,8,1,apartment,Palermo Soho,Capital Federal,-34.589363,-58.41288,53.0,
3,2263405,2017,8,1,apartment,Chacarita,Capital Federal,,,39.0,
4,2263406,2017,8,1,apartment,Chacarita,Capital Federal,,,51.0,


In [18]:
# property_type, place_name and state_name have to be turned into numeric codes.
# Each encoder is fitted on the union of the train and test values, so both
# frames share the same code for a given category. The fitted encoders are kept
# in `label_encoders` (keyed by column name) so codes can be mapped back later.
label_encoders = {}

for column in ('property_type', 'place_name', 'state_name'):
    categories = set(dfTrain[column].unique()) | set(dfTest[column].unique())

    le = preprocessing.LabelEncoder()
    le.fit(np.array(list(categories), dtype='object'))

    dfTrain[column] = le.transform(dfTrain[column])
    dfTest[column] = le.transform(dfTest[column])
    label_encoders[column] = le

In [19]:
# Fill the missing values (NaNs) column by column. `rooms` takes the most
# frequent value of its column; `lat`, `lon` and the test-set
# `surface_total_in_m2` take the column mean. Every imputer is fitted on the same
# frame it fills, so train and test statistics are computed independently.
# NOTE(review): preprocessing.Imputer was removed in scikit-learn 0.22 in favour
# of sklearn.impute.SimpleImputer; confirm the installed version before upgrading.
imputation_plan = [
    (dfTrain, 'rooms', 'most_frequent'),
    (dfTrain, 'lat', 'mean'),
    (dfTrain, 'lon', 'mean'),
    (dfTest, 'rooms', 'most_frequent'),
    (dfTest, 'lat', 'mean'),
    (dfTest, 'lon', 'mean'),
    (dfTest, 'surface_total_in_m2', 'mean'),
]

for frame, column, strategy in imputation_plan:
    imp = preprocessing.Imputer(missing_values='NaN', strategy=strategy)
    imp.fit(frame[[column]])
    frame[column] = imp.transform(frame[[column]]).ravel()

In [20]:
# Check that the training frame is now fully numeric with no missing values.
dfTrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1004817 entries, 0 to 1462285
Data columns (total 11 columns):
year_created           1004817 non-null int64
month_created          1004817 non-null int64
day_created            1004817 non-null int64
property_type          1004817 non-null int64
place_name             1004817 non-null int64
state_name             1004817 non-null int64
lat                    1004817 non-null float64
lon                    1004817 non-null float64
surface_total_in_m2    1004817 non-null float64
rooms                  1004817 non-null float64
price_usd              1004817 non-null float64
dtypes: float64(5), int64(6)
memory usage: 92.0 MB


In [21]:
# Check that the test frame is now fully numeric with no missing values.
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14166 entries, 0 to 14165
Data columns (total 11 columns):
id                     14166 non-null int64
year_created           14166 non-null int64
month_created          14166 non-null int64
day_created            14166 non-null int64
property_type          14166 non-null int64
place_name             14166 non-null int64
state_name             14166 non-null int64
lat                    14166 non-null float64
lon                    14166 non-null float64
surface_total_in_m2    14166 non-null float64
rooms                  14166 non-null float64
dtypes: float64(4), int64(7)
memory usage: 1.3 MB


In [22]:
# Build the training features and target, and split the test ids from the test
# features.
target = dfTrain['price_usd']
train = dfTrain.drop('price_usd', axis=1)

testIds = dfTest['id']
testVals = dfTest.drop('id', axis=1)

In [23]:
# Distance-weighted 30-nearest-neighbours regressor using a KD-tree and Euclidean
# distance, running on all available cores.
# NOTE(review): the features are passed in unscaled, so columns with a large
# numeric range presumably dominate the distance; consider scaling and confirm.
knn = KNeighborsRegressor(
    n_neighbors=30,
    weights='distance',
    algorithm='kd_tree',
    leaf_size=30,
    metric='euclidean',
    n_jobs=-1,
)

knn.fit(train, target)

KNeighborsRegressor(algorithm='kd_tree', leaf_size=30, metric='euclidean',
          metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
          weights='distance')

In [24]:
# Predict a price for every row of the test features.
predictions = knn.predict(testVals)

In [25]:
# Write the predictions to a CSV file: a header row followed by one
# (id, price_usd) row per test listing.
with open('../data/predictions/1st_acercamiento.csv', 'w') as outfile:
    out_csv = csv.writer(outfile)
    out_csv.writerow(["id", "price_usd"])

    for idTest, prediction in zip(testIds, predictions):
        out_csv.writerow([idTest, prediction])