## 線性回歸預測房價 (有用factor analysis)

In [1]:
import numpy as np
import pandas as pd

In [2]:
housedatas=pd.read_csv("housedataset/kc_house_data.csv", encoding="utf-8")
housedatas.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [3]:
len(housedatas)

21613

In [4]:
mostfeatures=['price','bedrooms', 'bathrooms', 'sqft_living', 'floors', 'condition', 'grade',
              'waterfront', 'view','yr_built','lat','long']
housedata=housedatas[mostfeatures]
housedata.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,condition,grade,waterfront,view,yr_built,lat,long
0,221900.0,3,1.0,1180,1.0,3,7,0,0,1955,47.5112,-122.257
1,538000.0,3,2.25,2570,2.0,3,7,0,0,1951,47.721,-122.319
2,180000.0,2,1.0,770,1.0,3,6,0,0,1933,47.7379,-122.233


In [5]:
housedata=housedata.astype('float')
housedata.head(3)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,condition,grade,waterfront,view,yr_built,lat,long
0,221900.0,3.0,1.0,1180.0,1.0,3.0,7.0,0.0,0.0,1955.0,47.5112,-122.257
1,538000.0,3.0,2.25,2570.0,2.0,3.0,7.0,0.0,0.0,1951.0,47.721,-122.319
2,180000.0,2.0,1.0,770.0,1.0,3.0,6.0,0.0,0.0,1933.0,47.7379,-122.233


In [6]:
housedata['label']='middle'
housedata.loc[housedata['price']>=600000, 'label'] = 'high'
housedata.loc[housedata['price']<=350000, 'label'] = 'low'

In [7]:
housedata_1=housedata[housedata['label'].str.contains('high')]
housedata_2=housedata[housedata['label'].str.contains('middle')]
housedata_3=housedata[housedata['label'].str.contains('low')]
print(len(housedata),len(housedata_1),len(housedata_2),len(housedata_3))

21613 6366 8450 6797


In [8]:
housedata2=housedata.drop(columns=['price','label'])

In [9]:
from sklearn import model_selection
X = np.array(housedata2)
Y = np.array(housedata['label'])
validation_size = 0.20
seed = 20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
print(len(X_train),len(X_validation))

17290 4323


In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [11]:
from sklearn.svm import SVC

SVC_model = SVC(kernel='linear')
SVC_model.fit(X_train,Y_train)
predictions = SVC_model.predict(X_validation)

print(SVC_model.score(X_train,Y_train))
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.6924233661075766
0.7103863058061531
[[ 960   19  299]
 [  22 1094  240]
 [ 277  395 1017]]
              precision    recall  f1-score   support

        high       0.76      0.75      0.76      1278
         low       0.73      0.81      0.76      1356
      middle       0.65      0.60      0.63      1689

    accuracy                           0.71      4323
   macro avg       0.71      0.72      0.72      4323
weighted avg       0.71      0.71      0.71      4323



In [12]:
from sklearn.decomposition import FactorAnalysis

transformer = FactorAnalysis(n_components=10, random_state=20)
house_transformed = transformer.fit_transform(housedata2)
house_transformed.shape

(21613, 10)

In [13]:
X = house_transformed
Y = np.array(housedata['label'])
validation_size = 0.20
seed = 20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [14]:
SVC_model = SVC(kernel='linear')
SVC_model.fit(X_train,Y_train)
predictions = SVC_model.predict(X_validation)

print(SVC_model.score(X_train,Y_train))
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.7497975708502024
0.7508674531575295
[[ 913    9  356]
 [   4 1073  279]
 [ 198  231 1260]]
              precision    recall  f1-score   support

        high       0.82      0.71      0.76      1278
         low       0.82      0.79      0.80      1356
      middle       0.66      0.75      0.70      1689

    accuracy                           0.75      4323
   macro avg       0.77      0.75      0.76      4323
weighted avg       0.76      0.75      0.75      4323



In [15]:
housedata3=housedata.drop(columns=['price','waterfront','view','label'])
housedata3.head(10)

Unnamed: 0,bedrooms,bathrooms,sqft_living,floors,condition,grade,yr_built,lat,long
0,3.0,1.0,1180.0,1.0,3.0,7.0,1955.0,47.5112,-122.257
1,3.0,2.25,2570.0,2.0,3.0,7.0,1951.0,47.721,-122.319
2,2.0,1.0,770.0,1.0,3.0,6.0,1933.0,47.7379,-122.233
3,4.0,3.0,1960.0,1.0,5.0,7.0,1965.0,47.5208,-122.393
4,3.0,2.0,1680.0,1.0,3.0,8.0,1987.0,47.6168,-122.045
5,4.0,4.5,5420.0,1.0,3.0,11.0,2001.0,47.6561,-122.005
6,3.0,2.25,1715.0,2.0,3.0,7.0,1995.0,47.3097,-122.327
7,3.0,1.5,1060.0,1.0,3.0,7.0,1963.0,47.4095,-122.315
8,3.0,1.0,1780.0,1.0,3.0,7.0,1960.0,47.5123,-122.337
9,3.0,2.5,1890.0,2.0,3.0,7.0,2003.0,47.3684,-122.031


In [16]:
X = np.array(housedata3)
Y = np.array(housedata['label'])
validation_size = 0.20
seed = 20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [17]:
SVC_model = SVC(kernel='linear')
SVC_model.fit(X_train,Y_train)
predictions = SVC_model.predict(X_validation)

print(SVC_model.score(X_train,Y_train))
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))

0.6950260266049739
0.710617626648161
[[ 970   20  288]
 [  23 1076  257]
 [ 298  365 1026]]
