In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer

from scipy.stats import boxcox
import sklearn.preprocessing as sklearn
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import (StandardScaler, MinMaxScaler)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from scipy.stats import iqr
from scipy.stats import scoreatpercentile as pct
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
import statsmodels.api as sm

In [2]:
# Load the cleaned project data from the Excel workbook
# (the path is relative to the notebook's working directory).
df = pd.read_excel("Data/midterm_project_cleaned.xlsx")

In [3]:
# (rows, columns) of the loaded frame
df.shape

(21596, 25)

In [4]:
# How many rows fall into each zipcode, most frequent first
df["zipcode"].value_counts()

98103    601
98038    589
98115    583
98052    574
98117    553
        ... 
98102    104
98010    100
98024     80
98148     57
98039     50
Name: zipcode, Length: 70, dtype: int64

In [5]:
# "Unnamed: 0" is presumably an index column that was saved with the workbook;
# it is removed before any analysis.
# errors="ignore" keeps the cell safe to re-run: once the column is gone,
# a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [13]:
# Mean sale price per zipcode, returned as a two-column frame (zipcode, price)
price_by_zipcode = (
    df.groupby("zipcode")["price"]
    .mean()
    .reset_index()
)

In [14]:
# Display the per-zipcode mean price table
price_by_zipcode

Unnamed: 0,zipcode,price
0,98001,2.811949e+05
1,98002,2.342840e+05
2,98003,2.941113e+05
3,98004,1.356524e+06
4,98005,8.102897e+05
...,...,...
65,98177,6.764194e+05
66,98178,3.106128e+05
67,98188,2.890783e+05
68,98198,3.028967e+05


In [15]:
# Check what kind of object the grouped result is
type(price_by_zipcode)

pandas.core.frame.DataFrame

In [None]:
# Number of rows in the per-zipcode table (one row per zipcode group)
len(price_by_zipcode)

In [None]:
# sqft_above is highly correlated with sqft_living, so sqft_above is dropped
# and sqft_living is kept.
## sqft_living: living area of the home; area that can be heated or cooled
## sqft_above: area above the basement; living area minus attic
# errors="ignore" makes the cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns=["sqft_above"], errors="ignore")

In [None]:
# Recode the four decade categories as the integers 1-4
decade_dict = {
    "Category A": 1,
    "Category B": 2,
    "Category C": 3,
    "Category D": 4,
}
df = df.replace(to_replace={"decade": decade_dict})

In [None]:
# Summary of columns, dtypes and non-null counts
df.info()

In [None]:
# Looking for highly correlated columns

# Correlation is only defined for numeric data. Selecting the numeric/bool
# columns explicitly keeps this cell working on pandas >= 2.0, where
# DataFrame.corr() no longer silently skips non-numeric columns and raises
# instead.
correlation_matrix = df.select_dtypes(include=["number", "bool"]).corr()
correlation_matrix

In [None]:
# Correlation heatmap, drawn explicitly on its own axes instead of relying on
# whichever axes happen to be "current".
fig5, ax = plt.subplots(figsize=(12, 12))
ax.set_title("Heatmap")
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    linewidth=1,
    annot=True,
    fmt=".2f",
    ax=ax,
)

# Distribution of every square-footage column, one panel per column.
sqft_columns = ["sqft_living", "sqft_lot", "sqft_basement", "sqft_living15", "sqft_lot15"]
fig, axe = plt.subplots(2, 3, figsize=(14, 12))
panels = axe.ravel()
for panel, column in zip(panels, sqft_columns):
    sns.histplot(data=df, x=column, ax=panel)

# The 2x3 grid has more panels than there are columns; hide the unused ones
# so no empty axes are rendered.
for panel in panels[len(sqft_columns):]:
    panel.set_visible(False)

In [None]:
# X/y split: price is the target, all remaining columns are features
y = df["price"]
X = df.drop(columns="price")

In [None]:
# Train/test split: 30 % of the rows are held out, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
)

In [None]:
# reset_index has to be *called* and its result kept: the bare attribute access
# `X_train.reset_index` only looks the method up and changes nothing.
# drop=True discards the shuffled original row labels instead of adding them
# as a new column. Features and target are reset together so their rows stay
# aligned position by position.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Column names available in the training features
X_train.columns

In [None]:
# 3. Splitting into X_train_num, X_train_cat, X_test_num, X_test_cat

# Each column list is written once and reused for train and test, so the two
# splits can never end up with different columns.
# sqft_lot15 belongs to the numerical group: step 5.d further down normalizes
# it from the scaled numerical frame, which fails with a KeyError if the
# column is not selected here.
numerical_columns = ["sqft_living", "sqft_basement", "sqft_lot", "sqft_living15", "sqft_lot15"]
categorical_columns = [
    "bedrooms", "bathrooms", "floors", "lat", "long", "yr_renovated",
    "waterfront", "view", "condition", "grade", "decade", "geo1", "geo2",
]

X_train_num = X_train[numerical_columns]
X_train_cat = X_train[categorical_columns]
X_test_num = X_test[numerical_columns]
X_test_cat = X_test[categorical_columns]

In [None]:
# 4. Standardizing X_train_num, X_test_num (StandardScaler: zero mean, unit variance)

# Fitting on X_train_num only; the test set is transformed with the training statistics
scaler = StandardScaler()
scaler.fit(X_train_num)

# Transforming X_train_num and X_test_num.
# The frames are built from plain arrays, so they already carry a fresh 0..n-1 index.
X_train_num_scaled = pd.DataFrame(scaler.transform(X_train_num), columns=X_train_num.columns)
X_test_num_scaled = pd.DataFrame(scaler.transform(X_test_num), columns=X_test_num.columns)


def _normalize_column(column_transformer, column):
    """Fit a transformer on one training column and apply it to train and test.

    Parameters
    ----------
    column_transformer : scikit-learn transformer
        Unfitted object exposing ``fit`` and ``transform``.
    column : str
        Column of ``X_train_num_scaled`` / ``X_test_num_scaled`` to normalize.

    Returns
    -------
    The fitted transformer. Both scaled frames are updated in place and a
    histogram of the transformed training column is drawn on a new figure.
    """
    # scikit-learn expects 2-D input, hence the reshape to a single column.
    train_values = X_train_num_scaled[column].to_numpy().reshape(-1, 1)
    test_values = X_test_num_scaled[column].to_numpy().reshape(-1, 1)

    # Fit on the training column only, then apply to both splits.
    column_transformer.fit(train_values)
    X_train_num_scaled[column] = column_transformer.transform(train_values).ravel()
    X_test_num_scaled[column] = column_transformer.transform(test_values).ravel()

    # Plotting the training distribution after normalization
    _, hist_ax = plt.subplots()
    sns.histplot(data=X_train_num_scaled, x=column, ax=hist_ax)
    return column_transformer


# 5.a: power transformer for sqft_living
# 5.b-5.d: quantile transformer (normal output) for sqft_lot, sqft_living15, sqft_lot15
# QuantileTransformer may subsample the data when estimating quantiles, so a
# fixed random_state (same seed as the train/test split) keeps runs repeatable.
column_transformers = {
    "sqft_living": PowerTransformer(),
    "sqft_lot": QuantileTransformer(output_distribution="normal", random_state=123),
    "sqft_living15": QuantileTransformer(output_distribution="normal", random_state=123),
    "sqft_lot15": QuantileTransformer(output_distribution="normal", random_state=123),
}

# One fitted transformer per column, kept under the column's own name so each
# can be looked up again later.
fitted_transformers = {}
for column, transformer in column_transformers.items():
    # Only columns that are actually part of the numerical frame can be
    # normalized; report anything missing instead of failing with a KeyError.
    if column not in X_train_num_scaled.columns:
        print(f"Skipping {column}: not present in the numerical feature set")
        continue
    fitted_transformers[column] = _normalize_column(transformer, column)

# Kept for backward compatibility with the original variable name; it now
# really refers to the transformer fitted on sqft_living.
sqft_living_transformer = fitted_transformers.get("sqft_living")

In [None]:
# 6. Splitting categorical columns into nominal and ordinal ones

nominal_columns = ["geo1", "geo2"]

# Everything treated as ordinal.
# NOTE(review): "long" is selected into X_train_cat / X_test_cat but is not in
# this list, so it is left out here — confirm that is intended.
ordinal_columns = [
    "bedrooms", "bathrooms", "floors", "waterfront", "lat",
    "yr_renovated", "view", "condition", "grade", "decade",
]

X_train_cat_nom, X_test_cat_nom = X_train_cat[nominal_columns], X_test_cat[nominal_columns]
X_train_cat_ord, X_test_cat_ord = X_train_cat[ordinal_columns], X_test_cat[ordinal_columns]


In [None]:
# 6.a One-hot encoding nominal columns

# The training frame defines the dummy columns; drop_first removes one level
# per feature, which becomes the all-zeros baseline.
X_train_cat_nom_encoded = pd.get_dummies(X_train_cat_nom, drop_first=True)

# get_dummies derives its columns from the categories present in the frame it
# is given, so encoding the test split independently can produce different
# columns (or drop a different baseline level) than the training split.
# The test split is therefore encoded in full and then aligned to the training
# columns: levels missing from the test data become all-zero columns, and
# levels the training data never produced are discarded.
X_test_cat_nom_encoded = pd.get_dummies(X_test_cat_nom).reindex(
    columns=X_train_cat_nom_encoded.columns,
    fill_value=0,
)

X_test_cat_nom_encoded.describe()

In [None]:
# 6.b Resetting indexes before concatenating
# Each frame gets a fresh 0..n-1 index so the column-wise concat lines rows up
# by position.

X_train_cat_nom_encoded = X_train_cat_nom_encoded.reset_index(drop=True)
X_test_cat_nom_encoded = X_test_cat_nom_encoded.reset_index(drop=True)
X_train_cat_ord = X_train_cat_ord.reset_index(drop=True)
X_test_cat_ord = X_test_cat_ord.reset_index(drop=True)

In [None]:
# First rows of the ordinal test features
X_test_cat_ord.head()

In [None]:
# 6.c Placing the encoded nominal columns and the ordinal columns side by side

train_cat_parts = [X_train_cat_nom_encoded, X_train_cat_ord]
test_cat_parts = [X_test_cat_nom_encoded, X_test_cat_ord]

X_train_cat_ready = pd.concat(train_cat_parts, axis="columns")
X_test_cat_ready = pd.concat(test_cat_parts, axis="columns")

In [None]:
# Building the final train and test feature frames

# Give every input frame a fresh 0..n-1 index (in place) before joining columns
for frame in (X_train_num_scaled, X_train_cat, X_test_num_scaled, X_test_cat):
    frame.reset_index(drop=True, inplace=True)

# Numerical columns first, prepared categorical columns after them
X_train_ready = pd.concat(
    [X_train_num_scaled, X_train_cat_ready],
    axis="columns",
)
X_test_ready = pd.concat(
    [X_test_num_scaled, X_test_cat_ready],
    axis="columns",
)

In [None]:
# Fit a linear regression on the prepared training features
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train_ready, y_train)
model

In [None]:
# Score of the fitted model on the held-out test set
model.score(X=X_test_ready, y=y_test)

In [None]:
# Scoring the model on the training data as well, so the result can be
# compared with the test-set score

model.score(X=X_train_ready, y=y_train)

In [None]:
# Rank the features by the absolute size of their regression coefficient
coefficient_magnitudes = np.abs(model.coef_)
features_importances = pd.DataFrame(
    {
        'Attribute': X_train_ready.columns,
        'Importance': coefficient_magnitudes,
    }
).sort_values(by='Importance', ascending=False)

# Bar chart, largest coefficient magnitude first
plt.bar(
    x=features_importances['Attribute'],
    height=features_importances['Importance'],
    color='#087E8B',
)
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Predicted prices for the held-out test features
y_pred = model.predict(X=X_test_ready)

In [None]:
# R² of the test-set predictions
R2 = r2_score(y_true=y_test, y_pred=y_pred)
R2

In [None]:
# Adjusted R²: 1 - (1 - R²) * (n - 1) / (n - p - 1),
# with n test observations and p feature columns
n_observations = len(y_test)
n_features = X_test_ready.shape[1]
Adj_R2 = 1 - (1 - R2) * (n_observations - 1) / (n_observations - n_features - 1)
Adj_R2