In [21]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mplt
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error


from sklearn.linear_model import LogisticRegression

In [153]:
df = pd.read_csv("../data/sherbank-housing/train.csv")
df.shape

(30471, 292)

In [170]:
cols = ["id","life_sq","full_sq","build_year","num_room","timestamp","price_doc"]
test_cols = ["id","life_sq","full_sq","build_year","num_room","timestamp"]
prediction_cols = ["life_sq","build_year","num_room","transaction_year"]
def columnExtractor(colnames,df):
    df_features = df[colnames]
    return df_features

df_features = columnExtractor(cols,df)

## Improving Features

### Add missing Values
- [Pandas link](http://pandas.pydata.org/pandas-docs/stable/missing_data.html)
- [Sklearn Imputer with pipeline](http://scikit-learn.org/stable/auto_examples/missing_values.html)
- [Checking nans SO](http://stackoverflow.com/questions/34779961/scikit-learn-error-in-fitting-model-input-contains-nan-infinity-or-a-value)
- [Replacing missing values fillna pandas](http://stackoverflow.com/questions/29177498/python-pandas-replace-nan-in-one-column-with-value-from-corresponding-row-of-sec)

In [156]:
def imputer(df_features):
    df_features["life_sq"].fillna(df_features["full_sq"]-15,inplace=True)
    df_features["num_room"].fillna(df_features["life_sq"]/14,inplace=True)
    df_features["num_room"].fillna(df_features["life_sq"]/14,inplace=True)
    df_features = df_features.fillna(df_features.median())
    return df_features

df_features = imputer(df_features)






# df_features.mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


### Outlier Removal

In [167]:
def oulierRemover(df_features):
    df_features[np.abs(df_features.life_sq-df_features.life_sq.mean())<=700]
    return df_features

## Below outlier remover only works on training set as it removes extreme values of the predicted variable
def oulierRemover_train(df_features):
    df_features[df_features.price_doc-df_features.price_doc.mean()<=(5*df_features.price_doc.std())]
    return df_features

df_features = oulierRemover(df_features)
df_features = oulierRemover_train(df_features)
# df_features[np.abs(df_features.life_sq-df_features.life_sq.mean())<=(6*df_features.life_sq.std())]

# df_features[np.abs(df_features.price_doc-df_features.price_doc.mean())<=(5*df_features.price_doc.std())]



### Adding new Features

In [158]:
def featureAdder(df_features):
    df_features['date_column'] = pd.to_datetime(df_features['timestamp'])
    df_features["transaction_year"] = df_features['date_column'].map(lambda x: x.year)
    return df_features

df_features = featureAdder(df_features)

### Feature Transformations

In [159]:
# df_features["life_sq"] = np.log(df_features["life_sq"])

## Model Build and Test

### Split train test

In [160]:
X = df_features[prediction_cols].values
y = df_features["price_doc"].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

np.isnan(X).any()

False

### Build model and generate predictions on split test 

In [161]:


# Create linear regression object
regr = linear_model.LinearRegression(copy_X=True,n_jobs=2,normalize=True)

# Train the model using the training sets
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)



In [162]:
eps = 1e-7
np.isnan(y_test).any()
np.any(np.absolute(y_pred) < eps)
np.log(y_test)



array([ 15.20805534,  15.53675385,  14.50865774, ...,  15.8949521 ,
        15.93176607,  13.81551056])

### Errors RMSE and RMSLE for train data

In [163]:
def findRMSLE(y_true,y_pred):
    return mean_squared_error(np.log(y_true), np.log(y_pred))

print('RMSLE: %.3f' % (findRMSLE(y_test, y_pred)))
print("RMSE: %.3f" % mean_squared_error(y_test, y_pred)**0.5)

RMSLE: 0.347
RMSE: 4111452.168


## Using Model on Test Set to generate prediction csv

In [171]:
df_test = pd.read_csv("../data/sherbank-housing/test.csv")
df_test.shape

(7662, 291)

### Extracting features and applying same transformations that were done on training data

In [172]:
df_features_test = columnExtractor(test_cols,df_test)
df_features_test = imputer(df_features_test)
df_features_test = oulierRemover(df_features_test)
df_features_test = featureAdder(df_features_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [173]:
df_features_test.head()

Unnamed: 0,id,life_sq,full_sq,build_year,num_room,timestamp,date_column,transaction_year
0,30474,20.7,39.0,1998.0,1,2015-07-01,2015-07-01,2015
1,30475,64.2,79.2,0.0,3,2015-07-01,2015-07-01,2015
2,30476,25.1,40.5,1960.0,2,2015-07-01,2015-07-01,2015
3,30477,36.0,62.8,2016.0,2,2015-07-01,2015-07-01,2015
4,30478,40.0,40.0,0.0,1,2015-07-01,2015-07-01,2015


In [174]:
X_t = df_features_test[prediction_cols].values
y_t = regr.predict(X_t)

### Generate Predictions

In [175]:
ids =np.array(df_features_test["id"]).astype(int)
my_solution = pd.DataFrame(y_t, ids, columns = ["price_doc"])
print(my_solution.head())
print(my_solution.shape)


          price_doc
30474  6.707694e+06
30475  1.003415e+07
30476  8.271053e+06
30477  8.333934e+06
30478  6.818592e+06
(7662, 1)


### Save to CSV

In [178]:
my_solution.to_csv("../data/sherbank-housing/results/my_solution_one.csv", index_label = ["id"])