In [None]:
!pip install feature_engine

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

from feature_engine.imputation import MeanMedianImputer

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

数据基本情况检查

In [None]:
# Read the train and test CSV files, then preview the first rows of the training frame.
train, test = (
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv'),
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv'),
)
train.head()

In [None]:
# Percentage of missing values per column, for train and test side by side.
train_null_pct = train.isna().sum() / len(train) * 100
test_null_pct = test.isna().sum() / len(test) * 100
null_table = pd.DataFrame(
    data=[train_null_pct, test_null_pct],
    index=["Train Null (%)", "Test Null (%)"],
).T
null_table.style.background_gradient(cmap='summer_r')

In [None]:
# Remove the same five columns from both splits
# (presumably chosen for their high missing rate — confirm against the null-percentage table).
sparse_cols = ['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']
train_data = train.drop(columns=sparse_cols)
test_data = test.drop(columns=sparse_cols)

In [None]:
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
# Distribution of the raw target.
# histplot replaces the deprecated distplot; stat="density" with kde=True gives
# the same normalised histogram plus KDE overlay, drawn on the explicit axes.
sns.histplot(train['SalePrice'], color="b", kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency")
ax.set(xlabel="SalePrice")
ax.set(title="SalePrice Distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Skewness and kurtosis of the raw target
sale_price = train['SalePrice']
print(f"Skewness: {sale_price.skew():f}")
print(f"Kurtosis: {sale_price.kurt():f}")

In [None]:
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
# Distribution of the log-transformed target.
# histplot replaces the deprecated distplot; stat="density" with kde=True gives
# the same normalised histogram plus KDE overlay, drawn on the explicit axes.
sns.histplot(np.log(train['SalePrice']), color="b", kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency")
ax.set(xlabel="log SalePrice")
ax.set(title="log SalePrice Distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Skewness and kurtosis of the log-transformed target
log_sale_price = np.log(train['SalePrice'])
print(f"Skewness: {log_sale_price.skew():f}")
print(f"Kurtosis: {log_sale_price.kurt():f}")

**所以我们考虑对lnY回归**

In [None]:
# Split the feature columns (everything except the target) by dtype.
# Note: a dtype-based split is imperfect — not every column that lands in `num`
# is truly numeric in meaning.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
feature_cols = [col for col in train_data.columns if col != 'SalePrice']
num = [col for col in feature_cols if train_data[col].dtype in numeric_dtypes]
cat = [col for col in feature_cols if train_data[col].dtype not in numeric_dtypes]

In [None]:
# Manual correction of the num/cat split: these columns have a numeric dtype but
# are treated as categorical. There may be other columns that were overlooked.
change = ['MSSubClass','OverallQual']
for col in change:
    num.remove(col)
    # insert(-1, ...) places the column just before the current last element of `cat`
    cat.insert(-1, col)

**各变量分布情况**

In [None]:
def num_dist(data, var):
    """Show a histogram (with KDE) and a boxplot of one column side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column to plot.
    var : str
        Name of the column in ``data``.
    """
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Left panel: histogram with a KDE curve
    sns.histplot(data=data, x=var, kde=True, ax=hist_ax)
    hist_ax.set_title(f"{var} Distribution Histogram")

    # Right panel: boxplot of the same column
    sns.boxplot(data=data, x=var, ax=box_ax)
    box_ax.set_title(f"{var} Distribution Boxplot")

    plt.show()
# Plot the distribution of every numeric feature, read from the raw `train` frame.
for var in num:
    num_dist(train, var)

In [None]:
def cat_dist(data, var):
    """Show a pie chart and an annotated count plot of one categorical column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column to plot.
    var : str
        Name of the column in ``data``.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # Both panels are drawn from the `data` argument, so the function does not
    # depend on any global frame. One explode offset is generated per slice.
    counts = data[var].value_counts()
    counts.plot(kind="pie", explode=[0.05] * len(counts), autopct='%1.1f%%', ax=ax[0], shadow=True)
    ax[0].set_title(f"{var} Pie Chart")
    ax[0].set_ylabel('')

    count_ax = sns.countplot(x=var, data=data, ax=ax[1])
    # Write each bar's height just above the bar
    for bar in count_ax.patches:
        count_ax.annotate(format(bar.get_height()),
            (bar.get_x() + bar.get_width() / 2,
            bar.get_height()), ha='center', va='center',
            size=11, xytext=(0, 8),
            textcoords='offset points')
    ax[1].set_title(f"{var} Bar Chart")
    plt.show()

# Plot the category frequencies of every categorical feature, passing the raw `train` frame.
for c in cat:
    cat_dist(train,c)


**缺失值补齐**

https://www.kaggle.com/code/dansbecker/handling-missing-values

连续型：

1. 缺失值占比很小（0%~10%）的变量：用平均值或中位数填补（可使用 sklearn.impute.SimpleImputer）

2. 缺失值占比中等（10%~50%）的变量：填补方法待定（TODO）

3. 缺失值占比超过 50% 的变量：直接删除

分类:

缺失值不多，直接删除含缺失值的行

In [None]:
# Numeric features: every missing value is filled with the column median.
# The imputer is fitted on the training split only and then applied to both splits.
median_imputer = MeanMedianImputer(imputation_method="median")
median_imputer.fit(train_data[num])
train_data[num] = median_imputer.transform(train_data[num])
test_data[num] = median_imputer.transform(test_data[num])

In [None]:
# Stack train and test so the processing below is applied to both.
# 'lable' is appended as the last column and records each row's origin: 1.0 = train, 0.0 = test.
train_data.insert(len(train_data.columns), 'lable', np.ones(len(train_data)))
test_data.insert(len(test_data.columns), 'lable', np.zeros(len(test_data)))
data = pd.concat([train_data, test_data])

In [None]:
# Categorical features: drop every row that is missing a value in any categorical column.
null_counts = data.isna().sum()
missing = null_counts[null_counts != 0].index.values
# Keep only the columns with missing values that are categorical
cat_missing = [col for col in missing if col in cat]
data.dropna(subset=cat_missing, axis=0, inplace=True)

In [None]:
# Drop variables.
# TODO: someone should write down the reason for each one, backed by the plots and data analysis.
drop_ = [
    'Id', 'Neighborhood', 'Condition1', 'Condition2', 'Exterior1st', 'Exterior2nd',
    'BsmtQual', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
    'Heating', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'MoSold', 'YrSold',
]
data.drop(columns=drop_, inplace=True)

In [None]:
# Update the num/cat lists so they no longer mention the dropped columns.
# Slice assignment filters each list in place without removing items from a
# list while it is being iterated.
cat[:] = [c for c in cat if c not in drop_]
num[:] = [n for n in num if n not in drop_]
# Rows were concatenated and some were dropped, so rebuild a clean 0..n-1 index.
# drop=True discards the old index instead of keeping it as an extra 'index' column.
data = data.reset_index(drop=True)

**特征工程** 

1. YearBuilt 分箱

In [None]:
def box_yearbuild(col):
    """Bin construction years into two 0/1 indicator columns.

    Parameters
    ----------
    col : pandas.Series or sequence
        Construction years.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``col`` (0..n-1 for a plain sequence), with integer columns
        ``YearBuilt_Before1950`` (1 when year <= 1950) and
        ``YearBuilt_After1980`` (1 when year > 1980). Years in between, and
        missing years, get 0 in both columns.
    """
    years = pd.Series(col)
    # Vectorised comparisons do not depend on the index labels, so any index works
    # (an element-wise `col[i]` lookup needs labels 0..n-1).
    return pd.DataFrame(
        {
            'YearBuilt_Before1950': (years <= 1950).astype('int64'),
            'YearBuilt_After1980': (years > 1980).astype('int64'),
        },
        index=years.index,
    )
# Append the two YearBuilt indicator columns and remove the raw year column.
yearbuilt_flags = box_yearbuild(data['YearBuilt'])
data = pd.concat([data, yearbuilt_flags], axis=1).drop(columns='YearBuilt')

2. YearRemodAdd 分箱

In [None]:
def box_yearremod(col):
    """Flag remodel years up to and including 1990 with a 0/1 indicator.

    Parameters
    ----------
    col : pandas.Series or sequence
        Remodel years.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``col`` (0..n-1 for a plain sequence), with one integer
        column ``YearRemodel_before1990`` that is 1 when year <= 1990 and 0
        otherwise (missing years get 0).
    """
    years = pd.Series(col)
    # Vectorised comparison does not depend on the index labels, so any index works
    # (an element-wise `col[i]` lookup needs labels 0..n-1).
    return pd.DataFrame(
        {'YearRemodel_before1990': (years <= 1990).astype('int64')},
        index=years.index,
    )
# Append the YearRemodAdd indicator column and remove the raw year column.
yearremod_flag = box_yearremod(data['YearRemodAdd'])
data = pd.concat([data, yearremod_flag], axis=1).drop(columns='YearRemodAdd')

3.  比例构造

    BsmtUnfSF/TotalBsmtSF 未完成的地下室占比

    LowQualFinSF/GrLivArea 

In [None]:
def ratio(data, num, den, new_name, zero_div_fill=0.0):
    """Replace two columns of ``data`` with their ratio, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. ``new_name`` is appended as the last column and the
        ``num`` and ``den`` columns are dropped.
    num : str
        Name of the numerator column.
    den : str
        Name of the denominator column.
    new_name : str
        Name of the ratio column to create.
    zero_div_fill : float, default 0.0
        Value used for rows whose denominator is exactly 0, where the division
        would otherwise yield NaN (0/0) or inf (x/0).

    Returns
    -------
    None
    """
    denominator = data[den]
    # Guard against division by zero; rows with a missing denominator stay NaN.
    values = (data[num] / denominator).where(denominator != 0, zero_div_fill)
    data.insert(data.shape[1], new_name, values)
    data.drop([num, den], axis=1, inplace=True)
# Share of the basement that is unfinished (BsmtUnfSF / TotalBsmtSF); both source columns are dropped.
ratio(data,'BsmtUnfSF','TotalBsmtSF','UnfinishedBsm_ratio')
# LowQualFinSF as a share of GrLivArea; both source columns are dropped.
ratio(data,'LowQualFinSF','GrLivArea','LowQuality_ratio')

4. 卫生间数量加权合并

In [None]:
def Bath_combine(data):
    """Collapse the four bathroom columns of ``data`` into one weighted total, in place.

    ``Bath_total`` is appended as the last column and computed as
    ``0.5 * (0.6 * HalfBath + 0.4 * BsmtHalfBath) + 0.6 * FullBath + 0.4 * BsmtFullBath``.
    The four source columns are then dropped. Returns None.
    """
    # Weighted half-bath count (above ground 0.6, basement 0.4)
    half_baths = 0.6 * data['HalfBath'] + 0.4 * data['BsmtHalfBath']
    # A half bath counts for half of a full bath
    bath_total = 0.5 * half_baths + 0.6 * data['FullBath'] + 0.4 * data['BsmtFullBath']
    data.insert(len(data.columns), 'Bath_total', bath_total)
    data.drop(columns=['HalfBath', 'BsmtHalfBath', 'FullBath', 'BsmtFullBath'], inplace=True)
# Replace the four bathroom columns of the combined frame with 'Bath_total'.
Bath_combine(data)

5. 门廊合并

In [None]:
def Porch_combine(data):
    """Sum every column whose name contains 'Porch' into a single column, in place.

    The matching columns are dropped and their row-wise sum is appended as the
    last column, named ``Porch``. Missing values are treated as 0 by the sum.
    If no column matches, ``Porch`` is all zeros. Returns None.
    """
    porch_cols = [name for name in data.columns if 'Porch' in name]
    # Row-wise sum. A Python list accumulator with `+=` would be extended by each
    # Series rather than added element-wise, producing the wrong length.
    porch_total = data[porch_cols].sum(axis=1)
    data.drop(columns=porch_cols, inplace=True)
    data.insert(data.shape[1], 'Porch', porch_total)
# Collapse the porch columns of the combined frame into a single 'Porch' column.
Porch_combine(data)

6. 比较 GarageQual 与 GarageCond，保留两者中较低的评级

In [None]:
def compare(data):
    """Merge GarageQual and GarageCond into one 'Garagelevel' column, in place.

    Both ratings are mapped to ranks (Ex=6 ... NA=1). For each row the rating
    with the lower rank is kept; on a tie GarageQual is kept. When either rating
    has no rank (a value outside the mapping, or missing), GarageCond is kept.
    ``Garagelevel`` is appended as the last column and the two source columns
    are dropped. Returns None.
    """
    dic = {'Ex':6, 'Gd':5, 'TA':4, 'Fa':3, 'Po':2, 'NA':1}
    # Map each column once (not once per row) and compare whole columns, which
    # also removes the dependence on index labels being 0..n-1.
    qual_rank = data['GarageQual'].map(dic)
    cond_rank = data['GarageCond'].map(dic)
    garage_level = data['GarageQual'].where(qual_rank <= cond_rank, data['GarageCond'])
    data.insert(data.shape[1], 'Garagelevel', garage_level)
    data.drop(['GarageQual','GarageCond'], axis = 1, inplace = True)
# Merge GarageQual / GarageCond of the combined frame into 'Garagelevel'.
compare(data)

In [None]:
# Dummy variable (one-hot)
# GarageQual / GarageCond were merged into Garagelevel by compare(), so they are
# no longer categorical features. Filtering does not raise if they are already absent.
cat = [c for c in cat if c not in ('GarageQual', 'GarageCond')]
# MSSubClass and OverallQual stay as numeric codes instead of being expanded.
dummy = [c for c in cat if c not in ('MSSubClass', 'OverallQual')]
# Garagelevel holds rating strings but is not listed in `cat`; encode it here so
# that no text column reaches the regression.
if 'Garagelevel' in data.columns and 'Garagelevel' not in dummy:
    dummy.append('Garagelevel')
data = data.join(pd.get_dummies(data[dummy]), how = 'outer')
data.drop(dummy,axis = 1, inplace = True)
# Feature engineering removed or merged some numeric columns; keep only the ones
# that still exist so later steps can index the frame by `num`.
num = [c for c in num if c in data.columns]
# Split the engineered frame back into train / test using the 'lable' marker
# (1 = train, 0 = test) so the model is fitted on the processed features.
train_data = data[data['lable']==1]
test_data = data[data['lable']==0]

In [None]:
# Separate the target from the predictors.
# NOTE(review): train_data must be fully numeric by this point for the regression below — confirm.
y_train = train_data['SalePrice']
x_train = train_data.drop(columns='SalePrice')

**粗略回归**

In [None]:
# Standardize numerical features
stdSc = StandardScaler()
x_train.loc[:, num] = stdSc.fit_transform(x_train.loc[:, num])

In [None]:
# Ordinary least squares on the log-transformed target
log_y_train = np.log(y_train)
lr = LinearRegression()
lr.fit(x_train, log_y_train)

# In-sample predictions (log scale); no validation split is used here
y_train_pred = lr.predict(x_train)

# Predicted vs. actual values; the red segment is the y = x reference line
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.scatter(y_train_pred, log_y_train, c="blue", marker="s", label="Training data")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
plt.show()

In [None]:
#finally succeed
#change1