In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# Input data files are available in the "../input/" directory.
# Print the directory listing so the names of the available data files are visible in the output.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the house-sales CSV into a DataFrame.
house = pd.read_csv('../input/kc_house_data.csv')
# Keep the 'grade' column in its own variable: it is the target (y) of the classifiers below.
grade = house['grade']

In [None]:
# Preview the first rows of the raw data.
house.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
house.info()

# Features
Here I have generated new features and transformed existing ones: __date__ has been transformed because object (string) columns can't be used to train a model, so I decided to convert it to a datetime type and then extract the year, month and day; __yr_renovated__ has been changed so that, if the house was renovated, the renovation year is subtracted from 2018, and if it was not, the value is left at 0; __yr_built__ has been changed to hold the age of the building instead of the year of construction; for the __zipcode__ column I have created dummy columns to see whether that increases the score.


In [None]:
# Year against which renovation and construction years are converted to ages.
REFERENCE_YEAR = 2018

# Split the sale date into numeric parts; the raw 'date' column itself is
# dropped at the end of this cell, so the parsed dates are not stored in it.
sale_date = pd.to_datetime(house['date'])
house['year'] = sale_date.dt.year
house['month'] = sale_date.dt.month
house['day'] = sale_date.dt.day

# Years since the last renovation; a value of 0 (no renovation year given) stays 0.
renovation_year = house['yr_renovated']
house['yr_renovated'] = (REFERENCE_YEAR - renovation_year).where(renovation_year != 0, 0)

# Age of the building in years instead of its construction year.
house['yr_built'] = (REFERENCE_YEAR - house['yr_built']).abs()

# Price per square foot for each area measure. These columns are derived from
# the price, so they are only usable for the grade classification task and
# must not be used as inputs when predicting the price itself.
for area_col in ('sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement'):
    house['price/' + area_col] = house['price'] / house[area_col]
# A basement area of 0 makes the ratio infinite; store 0 for those rows instead.
house['price/sqft_basement'] = house['price/sqft_basement'].replace(np.inf, 0)

# Neither the raw date string nor the record id is used as a model feature.
house = house.drop(columns=['date', 'id'])

# Feature Analysis

In [None]:
# Price against living area, with one colour per property condition.
# FacetGrid creates its own figure, so no separate plt.figure() call is made
# (it would only add an empty figure to the output). The figure size is
# controlled through `height` (formerly called `size`) and `aspect`.
g = sns.FacetGrid(data=house, hue='condition', height=5, aspect=2)
g.map(plt.scatter, "sqft_living", "price")
g.add_legend()  # without a legend the condition colours cannot be interpreted
plt.show()

In [None]:
# Annotated heat map of the pairwise correlations between all columns of
# `house`, used to see how the target relates to the other features.
corr_matrix = house.corr()
f, ax = plt.subplots(figsize=(20, 15))
ax.set_title('Correlation Matrix', fontsize=25)
sns.heatmap(corr_matrix, ax=ax, annot=True, square=True, vmax=1.0,
            cmap="RdBu_r", linewidths=0.25, linecolor='k')

In [None]:
# Remove the target column from the feature frame (X must not contain y).
house = house.drop(columns=['grade'])

In [None]:
# Names of the features considered for the grade classification task.
# NOTE(review): the split below passes the whole `house` frame rather than
# this selection - confirm whether the list was meant to be applied.
columns_grade_prediction = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15',
    'sqft_lot15', 'year', 'month', 'day', 'price/sqft_living',
    'price/sqft_lot', 'price/sqft_above', 'price/sqft_basement',
]

# Hold out 20% of the rows as the test set for the grade task; the fixed
# seed keeps the split identical between runs.
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = train_test_split(
    house, grade, test_size=0.2, shuffle=True, random_state=49)

# Report the (rows, columns) shape of features and target for each part.
print("Logistic regression datasets")
for X_part, y_part in ((X_train_logistic, y_train_logistic),
                       (X_test_logistic, y_test_logistic)):
    print(X_part.shape, y_part.shape)

In [None]:
# Multinomial logistic regression fitted on the current training split.
logreg = LogisticRegression(C=1, solver='newton-cg', multi_class='multinomial')
logreg.fit(X_train_logistic, y_train_logistic)

# Per-class precision / recall / F1 on the held-out rows.
predictions_logistic = logreg.predict(X_test_logistic)
print(classification_report(y_test_logistic, predictions_logistic))

# Overall accuracy on both parts, to compare test and train performance.
test_accuracy_logistic = metrics.accuracy_score(y_test_logistic, predictions_logistic)
train_accuracy_logistic = metrics.accuracy_score(
    y_train_logistic, logreg.predict(X_train_logistic))
print('Accuracy for test is', test_accuracy_logistic)
print('Accuracy for train is', train_accuracy_logistic)

In [None]:
# Accuracy scores for several values of the regularisation parameter C, with
# the number of features held constant (25). The numbers are hard-coded here
# (presumably copied from earlier runs of the model) so they can be plotted.
# One row per run: (C, train accuracy, test accuracy).
_c_sweep = [
    (1e-2, 0.6267206477732794, 0.6243349525792274),
    (1,    0.6287449392712551, 0.6266481609993061),
    (2,    0.627646038172354,  0.6259541984732825),
    (10,   0.6270676691729323, 0.6238723108952117),
    (15,   0.6271255060728745, 0.6245662734212353),
    (50,   0.6253325621746675, 0.6236409900532038),
    (100,  0.6281087333718912, 0.6264168401572981),
]
# Unpack the rows into the three parallel lists used by the plotting cell.
C, train_score, test_score = (list(column) for column in zip(*_c_sweep))

In [None]:
# Train and test accuracy as a function of the regularisation parameter C.
# The font size is set before the figure is created so that it applies to
# every text element of the plot.
plt.rcParams['font.size'] = 12
plt.figure(figsize=(12, 5))
plt.plot(C, train_score, label='Train')
plt.plot(C, test_score, label='Test')
plt.xlim([-1, 101])
plt.ylim([0.622, 0.63])
# The curves show accuracy scores, so the title says accuracy rather than error.
plt.title('Train and Test Accuracy')
plt.xlabel('C values')
plt.ylabel('Accuracy Score')
plt.legend()  # needed to tell the two curves apart
plt.grid(True)

Accuracy is the percentage of correctly classified objects. From the graph above it is easy to see that the model with C = 1 scored the best. In the following code I will look at logistic regression fitted on scaled features.

# Logistic regression with scaled features

In [None]:
# Split first and fit the scaler on the training rows only: fitting it on the
# whole data set would let the mean/variance of the test rows leak into
# training. The split arguments match the unscaled experiment.
X_train_raw, X_test_raw, y_train_logistic, y_test_logistic = train_test_split(
    house, grade, test_size = 0.2, shuffle = True, random_state=49)
sc = StandardScaler()
X_train_logistic = sc.fit_transform(X_train_raw)
X_test_logistic = sc.transform(X_test_raw)
# Whole feature frame scaled with the training statistics, kept under its
# original name for any code that refers to it.
house_scaled = sc.transform(house)
print("Logistic regression datasets")
print (X_train_logistic.shape, y_train_logistic.shape)
print (X_test_logistic.shape, y_test_logistic.shape)
# 'sag' is a stochastic solver, so the seed is fixed to make the fit repeatable.
logreg = LogisticRegression(multi_class='multinomial',  solver='sag', C = 1, random_state=49)
logreg.fit(X_train_logistic, y_train_logistic)
predictions_logistic = logreg.predict(X_test_logistic)
# Summary of the predictions made by the classifier
print(classification_report(y_test_logistic , predictions_logistic))
# Accuracy score
print('Accuracy for test is', metrics.accuracy_score(y_test_logistic, predictions_logistic))
print('Accuracy for train is', metrics.accuracy_score(y_train_logistic, logreg.predict(X_train_logistic)))

# Decision Tree Classifier

In [None]:
# Decision tree on the current train/test split (whatever X_train_logistic /
# X_test_logistic hold when this cell runs).
# The tree builder permutes features at each split, so the seed is fixed to
# obtain the same tree - and the same scores - on every run.
dt = DecisionTreeClassifier(max_depth = 8, criterion='entropy', random_state=49)
dt.fit(X_train_logistic, y_train_logistic)
predictions_dt = dt.predict(X_test_logistic)
# Summary of the predictions made by the classifier
print(classification_report(y_test_logistic, predictions_dt))
# Accuracy score
print('Accuracy for test is', metrics.accuracy_score(y_test_logistic, predictions_dt))
print('Accuracy for train is', metrics.accuracy_score(y_train_logistic, dt.predict(X_train_logistic)))