In [0]:
# Needed Libraries
import numpy as np
import pandas as pd
import seaborn as sns

# Data visualization
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
%matplotlib inline
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Importing Machine Learning related packages
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
# Load the NYREIT data set from its CSV file into a DataFrame.
# The file's unnamed first column is read in as the 'Unnamed: 0' column.
df = pd.read_csv("NYREIT Data.csv")
# Preview the first five rows
df.head()

Unnamed: 0,tx_price,beds,baths,sqft,year_built,lot_size,property_type,exterior_walls,roof,basement,restaurants,groceries,nightlife,cafes,shopping,arts_entertainment,beauty_spas,active_life,median_age,married,college_grad,property_tax,insurance,median_school,num_schools,tx_year
0,295850,1,1,584,2013,0,Apartment / Condo / Townhouse,Wood Siding,,,107,9,30,19,89,6,47,58,33,65,84,234,81,9.0,3,2013
1,216500,1,1,612,1965,0,Apartment / Condo / Townhouse,Brick,Composition Shingle,1.0,105,15,6,13,87,2,26,14,39,73,69,169,51,3.0,3,2006
2,279900,1,1,615,1963,0,Apartment / Condo / Townhouse,Wood Siding,,,183,13,31,30,101,10,74,62,28,15,86,216,74,8.0,3,2012
3,379900,1,1,618,2000,33541,Apartment / Condo / Townhouse,Wood Siding,,,198,9,38,25,127,11,72,83,36,25,91,265,92,9.0,3,2005
4,340000,1,1,634,1992,0,Apartment / Condo / Townhouse,Brick,,,149,7,22,20,83,10,50,73,37,20,75,88,30,9.0,3,2002


In [6]:
# (rows, columns) of the full data set -- no train/test split has been made yet
df.shape

(1883, 26)

In [0]:
# Column names, non-null counts, dtypes and memory usage of the data set
df.info()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

In [0]:
# Data type of every column in the data set
df.dtypes

In [0]:
# Distribution of the target: histogram/KDE of tx_price with a fitted normal
# curve, followed by a normal probability (Q-Q) plot in a second figure.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11. It is kept here
# because of its `fit=` overlay -- plan a migration to histplot plus a manually
# drawn stats.norm.pdf curve.
plt.subplots(figsize=(12, 9))
sns.distplot(df['tx_price'], fit=stats.norm)

# Parameters (mean, standard deviation) of the normal distribution fitted to tx_price
mu, sigma = stats.norm.fit(df['tx_price'])

# Raw string: "\m" and "\s" are not valid escape sequences, so a plain literal
# triggers an invalid-escape warning on modern Python. The text is unchanged.
legend_label = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)
plt.legend([legend_label], loc='best')
plt.ylabel('Frequency')

# Probability plot of tx_price against the normal distribution, in its own figure
fig = plt.figure()
stats.probplot(df['tx_price'], plot=plt)
plt.show()

In [15]:
# Data Cleaning
# Names of the columns that contain at least one missing value
df.columns[df.isna().any()]

Index(['exterior_walls', 'roof', 'basement'], dtype='object')

In [16]:
# Percentage of missing values per column, keeping only the columns that
# actually have gaps, largest share first
Isnull = df.isnull().sum() / len(df) * 100
Isnull = Isnull[Isnull > 0].sort_values(ascending=False)
Isnull

roof              18.799788
basement          12.002124
exterior_walls    11.842804
dtype: float64

In [19]:
# Correlation
# Keep only the numeric columns; these are the ones the correlation matrix is built from
df_corr = df.select_dtypes(include="number")
df_corr.shape

(1883, 23)

In [0]:
# Correlation heat map of all numeric columns (pairwise correlations, annotated)
corr = df_corr.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, annot=True, cmap="viridis", ax=heatmap_ax)

In [0]:
# Features whose absolute correlation with tx_price exceeds 0.4.
# `corr` is the correlation matrix of the numeric columns computed earlier.
# The absolute value is taken BEFORE the comparison so that strongly negative
# correlations are selected as well.
top_feature = corr.index[corr['tx_price'].abs() > 0.4]

# Heat map of the correlations among the selected features only
plt.subplots(figsize=(12, 8))
top_corr = df[top_feature].corr()
sns.heatmap(top_corr, cmap="viridis", annot=True)
plt.show()

In [0]:
# Feature importance: rank every numeric feature by its correlation with tx_price
print("Find most important features relative to target")
# Restrict to numeric columns explicitly: df still holds string columns here and
# DataFrame.corr() no longer drops them silently on pandas >= 2.0 (it raises).
corr = df.select_dtypes(include=[np.number]).corr()
# Rows ordered from the most positively to the most negatively correlated feature
corr = corr.sort_values('tx_price', ascending=False)
corr['tx_price']

In [0]:
# Missing roof / exterior-wall entries become the literal category 'None'
columns_with_gaps = ['roof', 'exterior_walls']
df[columns_with_gaps] = df[columns_with_gaps].fillna('None')

In [0]:
# Missing basement entries are filled with the constant 0 (not with the median)
df['basement'] = df['basement'].fillna(0)

In [0]:
# Visual check for remaining nulls: heat map of the boolean is-null mask
null_fig, null_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.isnull(), ax=null_ax)

In [0]:
# Re-inspect the column dtypes after the missing values were filled
df.dtypes

In [0]:
# Columns that the following cell converts to integer codes with LabelEncoder.
# NOTE(review): 'basement' and 'median_school' already look numeric in the
# head() preview -- confirm they really need to be label-encoded.
cols = ('property_type', 'exterior_walls', 'roof', 'basement', 'median_school')

In [0]:
from sklearn.preprocessing import LabelEncoder

# Replace each column listed in `cols` by its integer label codes.
# A fresh encoder is fitted per column, so codes are independent across columns.
for col in cols:
    lbl = LabelEncoder()
    df[col] = lbl.fit_transform(list(df[col].values))

In [0]:
# Machine Learning

# Machine Learning using the Linear Regression
# Split data into training and test sets
data = df
y = data.tx_price
# Besides the target, drop 'Unnamed: 0': it is the row index that was saved into
# the CSV, not a property attribute, so it must not be used as a predictor.
# errors='ignore' keeps the cell working if that column is not present.
X = data.drop(columns=['tx_price', 'Unnamed: 0'], errors='ignore')
# 80/20 split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fitting Linear Regression to the Training set
# (LinearRegression is already imported in the notebook's import cell)
regressor = LinearRegression()
clf_reg = regressor.fit(X_train, y_train)  # fit() returns the estimator itself

In [0]:
# Predict the target for the held-out test features with the fitted linear model
y_pred = clf_reg.predict(X_test)

In [54]:
# Evaluating the Test set results
# Note: for scikit-learn regressors .score() is the coefficient of determination
# (R^2), so the value printed under "Accuracy" is R^2 * 100, not a classification accuracy.
test_mse = mean_squared_error(y_test, y_pred)
test_r2_pct = regressor.score(X_test, y_test) * 100
print("MSE -->  ", test_mse)
print("Accuracy --> ", test_r2_pct)

MSE -->   12614771264.33042
Accuracy -->  45.129983448686204


In [0]:
# Machine Learning using the Random Forest Algorithm
# Split data into training and test sets
data = df
y = data.tx_price
# Besides the target, drop 'Unnamed: 0': it is the row index that was saved into
# the CSV, not a property attribute, so it must not be used as a predictor.
# errors='ignore' keeps the cell working if that column is not present.
X = data.drop(columns=['tx_price', 'Unnamed: 0'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fitting Random Forest Regression to the Training set
# (RandomForestRegressor is already imported in the notebook's import cell).
# random_state is fixed so the bootstrapped forest -- and therefore the reported
# metrics -- are reproducible from run to run.
regressor = RandomForestRegressor(random_state=123)
clf_rf = regressor.fit(X_train, y_train)  # fit() returns the estimator itself

# Predicting the Test set results
y_pred = clf_rf.predict(X_test)

In [56]:
# Evaluating the Test set results
# Note: for scikit-learn regressors .score() is the coefficient of determination
# (R^2), so the value printed under "Accuracy" is R^2 * 100, not a classification accuracy.
test_mse = mean_squared_error(y_test, y_pred)
test_r2_pct = regressor.score(X_test, y_test) * 100
print("MSE -->  ", test_mse)
print("Accuracy --> ", test_r2_pct)

MSE -->   4625108447.84865
Accuracy -->  79.88233224627183


In [0]:
# Machine Learning using the Gradient boosting regressor Algorithm
# Split data into training and test sets
data = df
y = data.tx_price
# Besides the target, drop 'Unnamed: 0': it is the row index that was saved into
# the CSV, not a property attribute, so it must not be used as a predictor.
# errors='ignore' keeps the cell working if that column is not present.
X = data.drop(columns=['tx_price', 'Unnamed: 0'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fitting gradient boosting regressor model to the Training set
# (GradientBoostingRegressor is already imported in the notebook's import cell).
# random_state is fixed so the fitted model and its metrics are reproducible.
regressor = GradientBoostingRegressor(random_state=123)
# Stored under its own name instead of reusing `clf_rf`, which holds the random forest
clf_gbr = regressor.fit(X_train, y_train)  # fit() returns the estimator itself

# Predicting the Test set results
y_pred = clf_gbr.predict(X_test)

In [58]:
# Evaluating the Test set results
# Note: for scikit-learn regressors .score() is the coefficient of determination
# (R^2), so the value printed under "Accuracy" is R^2 * 100, not a classification accuracy.
test_mse = mean_squared_error(y_test, y_pred)
test_r2_pct = regressor.score(X_test, y_test) * 100
print("MSE -->  ", test_mse)
print("Accuracy --> ", test_r2_pct)

MSE -->   4480752099.796018
Accuracy -->  80.51023385787951


In [0]:
# Machine Learning using the Decision tree regressor Algorithm
# Split data into training and test sets
data = df
y = data.tx_price
# Besides the target, drop 'Unnamed: 0': it is the row index that was saved into
# the CSV, not a property attribute, so it must not be used as a predictor.
# errors='ignore' keeps the cell working if that column is not present.
X = data.drop(columns=['tx_price', 'Unnamed: 0'], errors='ignore')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fitting Decision tree regressor model to the Training set
# (DecisionTreeRegressor is already imported in the notebook's import cell).
# random_state is fixed so the fitted tree and its metrics are reproducible.
regressor = DecisionTreeRegressor(random_state=123)
# Stored under its own name instead of reusing `clf_rf`, which holds the random forest
clf_dt = regressor.fit(X_train, y_train)  # fit() returns the estimator itself

# Predicting the Test set results
y_pred = clf_dt.predict(X_test)

In [60]:
# Evaluating the Test set results
# Note: for scikit-learn regressors .score() is the coefficient of determination
# (R^2), so the value printed under "Accuracy" is R^2 * 100, not a classification accuracy.
test_mse = mean_squared_error(y_test, y_pred)
test_r2_pct = regressor.score(X_test, y_test) * 100
print("MSE -->  ", test_mse)
print("Accuracy --> ", test_r2_pct)

MSE -->   9532927671.877985
Accuracy -->  58.534967604409715
