In [2]:
# 載入套件
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# 讀取訓練與測試資料
data_path = 'C:/Users/francis/Machine_Learning/francis/Documents/GitHub/4th-ML100Days/data/'
house_train = pd.read_csv(data_path + 'house_train.csv.gz')
house_test = pd.read_csv(data_path + 'house_test.csv.gz')

In [3]:
# 重組資料成為訓練 / 預測用格式
Y_train = np.log1p(house_train['SalePrice'])
ids = house_test['Id']
df_train = house_train.drop(['Id','SalePrice'],axis = 1)
df_test = house_test.drop(['Id'],axis = 1)
df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
# 檢查欄位缺值數量 (去掉.head()可以顯示全部)
df.isnull().sum().sort_values(ascending = False).head(10)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageQual       159
GarageYrBlt      159
GarageFinish     159
dtype: int64

In [5]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
num_features = []
for dtypes,features in zip(df.dtypes,df.columns):
    if dtypes == 'int64' or dtypes == 'float64':
        num_features.append(features)

print(f'{len(num_features)} Numeric features  : {num_features}\n')

36 Numeric features  : ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']



In [6]:
#削弱文字型欄位,只剩數值型欄位
df_num = df[num_features]
train_num = Y_train.shape[0]
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [7]:
#空值補-1,做線性回歸
df_m1 = df_num.fillna(-1)
X_train = df_m1[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv =5).mean())

0.8466400643386492


In [8]:
#空值補0,做線性回歸
df_0 = df_num.fillna(0)
X_train = df_0[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.846611815586883


In [9]:
#空值填補平均值
df_num.mean()
df_mn = df_num.fillna(df.mean())
X_train = df_mn[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.8442642432201339


In [10]:
#空值填補-1,搭配最大最小化
df_m1 = df.fillna(-1)
df_temp = MinMaxScaler().fit_transform(df_m1)
X_train = df_temp[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.8454595084023964


In [11]:
#搭配標準化
df_temp = StandardScaler().fit_transform(df_m1)
X_train = df_temp[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv =5).mean())

0.8467539186532764


In [29]:
#作業1
df_m1 = df_num.fillna(-1)
X_train  = df_m1[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.8466400643386492


In [30]:
df_mn = df_num.fillna(df_num.mean())
X_train = df_mn[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.8442642432201339


In [35]:
df_0 = df_num.fillna(0)
X_train = df_0[:train_num]
estimator = LinearRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.846611815586883


In [50]:
#作業2
saleprice = house_train.groupby('LotShape')
print(saleprice.size())

LotShape
IR1    484
IR2     41
IR3     10
Reg    925
dtype: int64


In [54]:
#最大值最小值
LS = house_train['LotShape']
df_m1 = df_num.fillna(-1)
df_temp = MinMaxScaler().fit_transform(df_m1)
X_train = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,LS,cv = 5).mean())

0.6828728965767322




In [59]:
#標準差
df_m1 = df_num.fillna(-1)
df_temp = StandardScaler().fit_transform(df_m1)
X_train = df_temp[:train_num]
Y_train_LS = house_train['LotShape']
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train_LS,cv =5).mean())

0.69928313667349




In [61]:
#原本的值
df_m1 = df_num.fillna(-1)
df_temp = df_m1
X_train = df_temp[:train_num]
Y_train_LS = house_train['LotShape']
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train_LS,cv = 5).mean())



0.6999727756133762




In [72]:
import pandas as pd 
import numpy as np 
import copy
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

data_path = 'C:/Users/francis/Machine_Learning/francis/Documents/GitHub/4th-ML100Days/data/'

titanic_train = pd.read_csv(data_path + 'titanic_train.csv')
titanic_test = pd.read_csv(data_path + 'titanic_test.csv')

Y_train = titanic_train['Survived']
ids = titanic_test['PassengerId']
titanic_train = titanic_train.drop(['PassengerId','Survived'],axis = 1)
titanic_test = titanic_test.drop(['PassengerId'],axis = 1)
titanic = pd.concat([titanic_train,titanic_test])

print(Y_train)

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [86]:
#只取 int64, float64 兩種數值型欄位, 存於 num_features 中
titanic_num_features = []
for columns,dtypes in zip(titanic.columns,titanic.dtypes):
    if dtypes == 'int64' or dtypes == 'float64':
        titanic_num_features.append(columns)
        
print(f'{len(titanic_num_features)} Numeric Features : {titanic_num_features}\n')

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']



In [88]:
df.isnull().sum().sort_values(ascending = False)

PoolQC         2909
MiscFeature    2814
Alley          2721
Fence          2348
FireplaceQu    1420
               ... 
1stFlrSF          0
CentralAir        0
Heating           0
Foundation        0
MSSubClass        0
Length: 79, dtype: int64

In [92]:
titanic_num = titanic[titanic_num_features]
train_num = Y_train.shape[0]
print(len(Y_train))
print(train_num)

891
891


In [95]:
titanic_m1 = titanic_num.fillna(-1)
X_train = titanic_m1[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.6960299128976762




In [98]:
titanic_0 = titanic_num.fillna(0)
X_train = titanic_0[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.6971535084032942




In [99]:
titanic_mn = titanic_num.fillna(titanic_num.mean())
X_train = titanic_mn[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.6981761033723469




In [105]:
titanic_temp = MinMaxScaler().fit_transform(titanic_mn)
X_train = titanic_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.6993501991462476




In [106]:
titanic_temp = StandardScaler().fit_transform(titanic_mn)
X_train = titanic_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator,X_train,Y_train,cv = 5).mean())

0.6959413955734954




In [None]:
#MinMaxScaler效果最好