In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import label_binarize

def read_df (filename):
    df = pd.read_csv(filename)
    df['metro_dist'] = df['metro_dist'].replace(np.nan, 30) 
    df = df.replace(np.nan, -999)
    df['year'] = pd.to_numeric(df['date'].str.split('-').str[0], errors='coerce')
    df['month'] = pd.to_numeric(df['date'].str.split('-').str[1], errors='coerce')
    df['date_num'] = df.apply(lambda r: (r['year'] - 2011) * 12 + r['month'], axis=1)
    
    df['balcon'] = df['balcon'].astype(str)
    df = pd.get_dummies(df, columns=['balcon'])
    
    df['g_lift'] = df['g_lift'].astype(str)
    df = pd.get_dummies(df, columns=['g_lift'])
    
    df['build_tech'] = df['build_tech'].astype(str)
    df = pd.get_dummies(df, columns=['build_tech'])
    
    bins = np.linspace(0, 671, 67)
    df['street_id'] = np.digitize(df['street_id'], bins=bins)
    df['street_id'] = df['street_id'].astype(str)
    df = pd.get_dummies(df, columns=['street_id'])

    return df


train = read_df('Train.csv')
test = read_df('Test.csv')


In [21]:
trash_columns = ['date', 'year', 'id']
target = 'price'
features = list(set(train.columns) - set(trash_columns + [target]))
features

['street_id_21',
 'street_id_54',
 'street_id_7',
 'kw11',
 'street_id_26',
 'street_id_38',
 'kw12',
 'metro_dist',
 'street_id_58',
 'street_id_20',
 'street_id_4',
 'kw9',
 'area',
 'street_id_43',
 'build_tech_0.0',
 'street_id_53',
 'street_id_60',
 'street_id_32',
 'balcon_2',
 'street_id_5',
 'street_id_14',
 'street_id_30',
 'kw1',
 'street_id_48',
 'street_id_64',
 'build_tech_2.0',
 'g_lift_0.0',
 'street_id_39',
 'floor',
 'street_id_37',
 'street_id_51',
 'g_lift_1.0',
 'street_id_63',
 'street_id_61',
 'street_id_2',
 'kw6',
 'g_lift_-999.0',
 'street_id_67',
 'kw7',
 'balcon_1',
 'street_id_46',
 'street_id_23',
 'street_id_66',
 'street_id_49',
 'street_id_27',
 'balcon_0',
 'build_tech_1.0',
 'street_id_44',
 'kw3',
 'street_id_19',
 'rooms',
 'street_id_10',
 'street_id_3',
 'street_id_55',
 'kw4',
 'street_id_16',
 'street_id_45',
 'street_id_47',
 'street_id_29',
 'street_id_15',
 'street_id_18',
 'street_id_25',
 'street_id_56',
 'street_id_41',
 'street_id_1',
 'st

In [22]:
from sklearn.model_selection import train_test_split

y = train[target].values
X = train[features].values
Xt = test[features].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

X_train_selected = X_train
X_test_selected = X_test

#select = SelectFromModel (RandomForestRegressor())
#select.fit(X_train, y_train)
#X_train_selected = select.transform(X_train)
#X_test_selected = select.transform(X_test)

In [24]:
#import matplotlib.pyplot as plt
#mask = select.get_support()
#d = {}
#for i in range (mask.size):
#    d[features[i]] = mask[i]
#    if 'street_id' not in features[i] and mask[i]:
#        print (features[i])

In [25]:
from sklearn.metrics import mean_absolute_error

pipe = make_pipeline(StandardScaler (), Ridge())
pipe.fit(X_train_selected, y_train)
preds = pipe.predict(X_test_selected)
print ('error = ', mean_absolute_error (y_test, preds))
print (pipe.named_steps['ridge'].coef_)

#pipe.fit(X, y)
#
#preds = pipe.predict(Xt)
#test['price'] = preds
#test[['id', 'price']].to_csv('sub.csv', index=False)

error =  1656012.8622471476
[ 2.02444036e+04 -1.61376703e+03  5.85006950e+04  6.27738667e+04
 -2.53459175e+04 -6.73349541e+03 -2.49698426e+04 -4.36914106e+05
 -4.42600924e+04  1.30538533e+04 -6.73895337e+04  6.02973920e+03
  3.55627861e+06 -1.24050668e+05 -1.46938237e+05 -1.62789919e+05
 -4.53642761e+04  6.54389617e+04  9.47171969e+04  1.28699830e+05
 -1.52444139e+04 -6.10252166e+04  3.78059398e+04  6.46917275e+04
  1.54740176e+05  7.88760252e+04 -2.50408230e+03 -6.55854554e+04
  2.45338811e+05 -2.43306142e+04  8.10990011e+02 -1.63912779e+03
 -6.02954747e+04 -9.82444081e+04  1.66832179e+05 -4.52874415e+03
  4.32983291e+03  7.43581171e+04  1.11050256e+04  7.75198277e+04
 -1.60307240e+04  1.45576101e+05 -3.77372349e+04  5.36459953e+04
  6.10788809e+04 -1.09602826e+05  1.24821003e+05 -7.52801882e+04
 -1.94029374e+04 -3.77353608e+04 -1.48643823e+06  1.34656367e+04
 -1.18224437e+05  1.35968132e+05 -2.22788044e+04 -8.84247229e+03
  7.22422906e+04  6.48713820e+03 -7.19741639e+04  5.07136236e+

In [7]:
Xt_selected = select.transform(Xt)
Xt_selected = select.transform(Xt)

pipe.fit(X, y)
preds = pipe.predict(Xt_selected)
test['price'] = preds
test[['id', 'price']].to_csv('sub.csv', index=False)

ValueError: operands could not be broadcast together with shapes (100000,62) (702,) (100000,62) 