## 准备工作

### 导入必要模块

In [None]:
import hyper
import time, random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Tuple, Union, Literal
from scipy.stats import skew
from sklearn.preprocessing import RobustScaler, PowerTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error	
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LinearRegression
from hyper import printLog, boot, getTime

### 初始化数据

In [None]:
# Load the raw train/test tables; the "Id" column is removed from both.
train_data = pd.read_csv("data/train.csv")
train_data = train_data.drop(columns=["Id"])
test_data = pd.read_csv("data/test.csv")
test_data = test_data.drop(columns=["Id"])

# Stack train on top of test under a fresh 0..n-1 index so that the
# preprocessing below is applied to both tables at once.
all_data = pd.concat(objs=[train_data, test_data], ignore_index=True)

printLog("Shape:", all_data.shape)
all_data.info()

### 杂项

In [None]:
# Seed the random number generators so that repeated runs are reproducible.
random.seed(hyper.RANDOM_STATE)
# pandas' `.sample()` and scikit-learn estimators created without an explicit
# `random_state` draw from NumPy's global generator, which `random.seed` does
# not touch, so it has to be seeded separately.
np.random.seed(hyper.RANDOM_STATE)
sns.set_theme()
# NOTE(review): `boot` comes from the project's `hyper` module; what it sets up
# is not visible from this file.
boot()

## 特征分析与处理

### 相关系数分析

In [None]:
# Pairwise Pearson correlation between the numeric columns of the training table.
corr_coef = train_data.corr(method="pearson", numeric_only=True)

# The heatmap is optional and controlled by a switch in `hyper`.
if hyper.DISPLAY_CORR_FIGURE:
	plt.figure(figsize=(10, 10))
	sns.heatmap(data=corr_coef, square=True, cmap="rocket_r")
	plt.show()

### 数据类型更改

In [None]:
# Re-type the columns listed in hyper.DTYPE_CONVERT_LIST as strings
# (presumably so they are later treated as categories rather than numbers).
# NOTE(review): depending on the pandas version, `astype(str)` can turn missing
# values into the literal string "nan", which the null handling below would no
# longer see - confirm these columns contain no nulls.
dtype_convert_columns = hyper.DTYPE_CONVERT_LIST
all_data[dtype_convert_columns] = all_data[dtype_convert_columns].astype(str)

### 缺失数据处理

#### 封装填充函数

In [None]:
def replaceNull(arr: pd.Series,
				typ: str,
				msg: Union[int, str, None] = None) -> pd.Series:
	"""Return a copy of ``arr`` whose missing values have been filled in.

	Args:
		arr: Column whose null entries should be replaced.
		typ: Fill strategy, one of:
			"const"  - fill with ``msg``;
			"mode"   - fill with the most frequent value;
			"median" - fill with the median;
			"mean"   - fill with the mean;
			"random" - fill each null with a value drawn (with replacement)
			           from the column's non-null values.
		msg: Fill value, only used by the "const" strategy.

	Returns:
		A new Series with the same index, in the same row order, as ``arr``.

	Raises:
		KeyError: If ``typ`` is not one of the strategies above.
		ValueError: If "random" is requested but ``arr`` has nulls and no
			non-null value to draw from.
	"""
	if typ == "const":
		return arr.fillna(msg)
	if typ == "mode":
		return arr.fillna(arr.mode()[0])
	if typ == "median":
		return arr.fillna(arr.median())
	if typ == "mean":
		return arr.fillna(arr.mean())
	if typ == "random":
		isnull = arr.isnull()
		n_missing = int(isnull.sum())
		# Work on a copy so the rows keep their original position; concatenating
		# the filled and untouched parts would move every null row to the front.
		result = arr.copy()
		if n_missing == 0:
			return result
		filled = arr[~isnull]
		if filled.empty:
			raise ValueError("Cannot fill nulls at random: the column has no non-null values.")
		# One vectorised draw instead of one `sample(1)` call per missing row.
		# `.to_numpy()` discards the sampled index so values are assigned by position.
		result[isnull] = filled.sample(n=n_missing, replace=True).to_numpy()
		return result
	raise KeyError("Illegal replace mode.")


#### 删除严重缺失特征

In [None]:
# Number of missing entries per column; columns whose missing share reaches
# hyper.UNACCEPTABLE_MISSED_RATE are removed altogether.
cnt_missed = all_data.isnull().sum(axis=0)
total_rows = all_data.shape[0]  # unaffected by dropping columns below

# Only columns that actually have missing entries are considered.
for key, cnt in cnt_missed[cnt_missed != 0].items():
	rate = cnt / total_rows
	if rate >= hyper.UNACCEPTABLE_MISSED_RATE:
		all_data.drop(key, axis=1, inplace=True)
		printLog(f"Drop key: {key}.")

#### 缺失值填充

In [None]:
# hyper.REPLACE_MODES maps a column name to a tuple describing how to fill it:
#   ("group", <grouping column>, <strategy>)  - fill within each group
#   (<strategy>, <value passed as msg>)       - fill over the whole column
for key, operation in hyper.REPLACE_MODES.items():
	mode: str = operation[0]
	if mode != "group":
		all_data[key] = replaceNull(all_data[key], mode, operation[1])
		continue
	group_name, submode = operation[1], operation[2]
	# The lambda is consumed immediately by transform(), so it always sees
	# the `submode` of the current iteration.
	all_data[key] = all_data.groupby(group_name)[key].transform(lambda group: replaceNull(group, submode))

#### 核查

In [None]:
# Sanity check: no missing value may be left anywhere in the table.
has_missing = all_data.isnull().to_numpy().any()
assert not has_missing

### 极端样本处理

#### 极端样本挖掘

In [None]:
def findOutliers(key1: str, key2: str, val: float) -> None:
	"""Scatter-plot ``key2`` against ``key1`` with every point labelled by its row label.

	The plot is meant for spotting outlier rows by eye; the labels can then be
	used to refer to those rows in ``all_data``.

	Args:
		key1: Column of ``all_data`` plotted on the x axis.
		key2: Column of ``all_data`` plotted on the y axis.
		val: Value shown in the figure title next to the two column names.
	"""
	plt.figure(figsize=(10, 10))
	plt.title(f"key1 = {key1}, key2 = {key2}, val = {val}")
	plt.xlabel(key1)
	plt.ylabel(key2)
	plt.plot(all_data[key1], all_data[key2], "bo", alpha=0.2)
	# Label with the DataFrame index rather than the enumeration position:
	# rows are dropped by index label, and the two only coincide while no row
	# has been removed yet.
	for row_label, x, y in zip(all_data.index, all_data[key1], all_data[key2]):
		plt.text(x, y, str(row_label), fontsize='x-small')
	plt.show()
if hyper.ENABLE_OUTLIERS_DISCOVERY:
	# Correlation of every feature with the target (the target's own entry is
	# removed), strongest positive correlation first.
	corr_with_target = corr_coef[hyper.TARGET].drop(hyper.TARGET).sort_values(ascending=False)
	for key, val in corr_with_target.items():
		findOutliers(str(key), hyper.TARGET, val)

#### 极端样本删除

In [None]:
# Flatten hyper.DROP_LISTS into one collection of row labels and remove them in
# a single call. The set collapses labels listed more than once, and
# errors="ignore" skips labels that are not (or no longer) in the index.
requested_ids = {
	drop_id
	for drop_list in hyper.DROP_LISTS
	for drop_id in drop_list
}
all_data.drop(index=list(requested_ids), inplace=True, errors="ignore")

### 预测目标改进

In [None]:
def showHistogram() -> None:
	"""Draw a histogram of the target column when hyper.DISPLAY_TARGET_HISTOGRAM is truthy."""
	if not hyper.DISPLAY_TARGET_HISTOGRAM:
		return
	plt.figure(figsize=(6, 6))
	sns.histplot(all_data.loc[:, [hyper.TARGET]])
	plt.show()
# Replace the target with log(1 + target); the prediction step maps model
# output back with expm1.
log_target = np.log1p(all_data[hyper.TARGET])
all_data[hyper.TARGET] = log_target
showHistogram()

### 高偏特征处理

In [None]:
# Every numeric column except the prediction target.
numeric_columns = [
	key
	for key in all_data.columns
	if key != hyper.TARGET and pd.api.types.is_numeric_dtype(all_data[key])
]
def print_skews():
	"""Log min, max and skewness of each column in numeric_columns when hyper.PRINT_SKEWS is truthy."""
	if not hyper.PRINT_SKEWS:
		return
	for key in numeric_columns:
		column = all_data[key]
		# Guard: the column must not contain any missing value at this point.
		assert not column.isnull().sum()
		printLog(key, column.min(), column.max(), skew(column))

# Standardise the numeric features, then apply a power transform; the pipeline
# is fitted on train and test rows together.
numeric_block = all_data[numeric_columns]
transformer = make_pipeline(StandardScaler(), PowerTransformer())
all_data[numeric_columns] = transformer.fit_transform(numeric_block)
print_skews()

### 特征工程

In [None]:
# Derived features built from existing columns; "TotalBsmtSF" is removed once
# it has been folded into "TotalSF".
# NOTE(review): the numeric inputs were already scaled/power-transformed by the
# preceding step, so these sums and differences combine transformed values
# rather than raw ones - confirm this ordering is intended.
all_data = all_data.assign(
	YrRemodAfterBuilt=all_data['YearRemodAdd'] - all_data['YearBuilt'],
	TotalSF=all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF'],
	Total_sqr_footage=(all_data['BsmtFinSF1'] + all_data['BsmtFinSF2']
					   + all_data['1stFlrSF'] + all_data['2ndFlrSF']),
	Total_Bathrooms=(all_data['FullBath'] + (0.5 * all_data['HalfBath'])
					 + all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath'])),
	Total_porch_sf=(all_data['OpenPorchSF'] + all_data['3SsnPorch']
					+ all_data['EnclosedPorch'] + all_data['ScreenPorch']
					+ all_data['WoodDeckSF']),
).drop(columns="TotalBsmtSF")

### 特征简化

In [None]:
def getMaxProps() -> list:
	"""Return ``(proportion, column_name)`` pairs for every column of ``all_data``.

	``proportion`` is the count of the column's most frequent value divided by
	the number of rows. The pairs are sorted by proportion, largest first.
	"""
	pairs = [
		(feature.value_counts().max() / len(feature), name)
		for name, feature in all_data.items()
	]
	return sorted(pairs, key=lambda pair: pair[0], reverse=True)
# Columns dominated by one value are collapsed to a boolean
# "equals the dominant value" flag; the target is never touched.
max_props = getMaxProps()
for prop, key in max_props:
	if key == hyper.TARGET or not (prop >= hyper.MINIMUM_MONOTONOUS_PROPOTION):
		continue
	mode = all_data[key].mode()[0]
	all_data[key] = all_data[key] == mode
	printLog(f"Simplify {key} by {mode} with proportion = {prop * 100 :>.2f}%.")

### 特征拆解

In [None]:
# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(data=all_data)

### 过拟合列删除

In [None]:
# Drop columns in which a single value accounts for more than
# hyper.MINIMUM_ABANDONED_PROPOTION of the rows.
dropped_keys = []
max_props = getMaxProps()
for prop, key in max_props:
	# Never discard the prediction target (the simplification step skips it
	# too): the train/test split below reads and drops this column, so
	# removing it here would make that step fail.
	if key == hyper.TARGET:
		continue
	if prop > hyper.MINIMUM_ABANDONED_PROPOTION:
		printLog(f"Drop overfitting feature {key}.")
		dropped_keys.append(key)
all_data.drop(columns=dropped_keys, inplace=True)

## 模型建立、训练与融合

### 数据集初始化

In [None]:
# `.loc` slices by index label and includes both ends. Train rows were stacked
# first, so they carry the labels up to `last_train_label`.
# NOTE(review): assumes data/train.csv holds exactly 1460 rows - confirm.
last_train_label = 1459

train_data = all_data.loc[:last_train_label]
train_labels = train_data[hyper.TARGET]
train_features = train_data.drop(columns=hyper.TARGET)

test_data = all_data.loc[last_train_label + 1:]
test_features = test_data.drop(columns=hyper.TARGET)

train_data.shape
test_data.shape

### 模型建立

In [None]:
# Shuffled K-fold splitter with a fixed seed.
kfolder = KFold(n_splits=hyper.CNT_FOLDS, shuffle=True, random_state=hyper.RANDOM_STATE)

# Baseline pipelines with default hyper-parameters; both cache fitted
# transformers in the "cache" directory.
ridge_steps = (RobustScaler(), RidgeCV())
ridge = make_pipeline(*ridge_steps, memory="cache")
gbr_steps = (GradientBoostingRegressor(),)
gbr = make_pipeline(*gbr_steps, memory="cache")

### 交叉验证

In [None]:
def crossValidationTrain(model: Pipeline, param_grid: dict) -> None:
	"""Grid-search ``param_grid`` for ``model`` on the training set and log the outcome.

	The search is scored with negative mean squared error and does not refit
	the model; only the best score and parameter combination are logged.

	Args:
		model: Estimator (normally a Pipeline) to evaluate.
		param_grid: Parameter grid passed to GridSearchCV.
	"""
	begin_time = time.time()
	# A Pipeline's own class name is always "Pipeline", which makes the log
	# lines of different models indistinguishable; name the final estimator.
	final_estimator = model.steps[-1][1] if hasattr(model, "steps") else model
	printLog(f"Working on {type(final_estimator).__name__}...", to_logs=True)

	# Use the shuffled, seeded splitter built alongside the models; passing the
	# bare fold count would fall back to unshuffled folds and leave `kfolder`
	# unused.
	grid_searcher = GridSearchCV(model, param_grid, cv=kfolder, scoring="neg_mean_squared_error", refit=False, error_score="raise")
	grid_searcher.fit(train_features.copy(), train_labels.copy())
	printLog(f"Done, time consumed: {time.time() - begin_time :>.3f}s, score: {grid_searcher.best_score_}, best_param: {grid_searcher.best_params_}",
			 to_logs=True)

# Optional hyper-parameter searches, each gated by its own switch in `hyper`.
if hyper.RIDGECV_EVAL:
	crossValidationTrain(model=ridge, param_grid=hyper.RIDGECV_PARAM_GRID)

if hyper.GBR_EVAL:
	crossValidationTrain(model=gbr, param_grid=hyper.GBR_PARAM_GRID)

### 最优超参数选择与模型训练

In [None]:
# Rebuild each enabled model with the hyper-parameters chosen in `hyper` and
# fit it on the full training set.
if hyper.RIDGECV_TRAIN:
	ridge_regressor = RidgeCV(**hyper.RIDGECV_PARAM)	# type: ignore
	ridge = make_pipeline(RobustScaler(), ridge_regressor, memory="cache")
	ridge.fit(train_features, train_labels)

if hyper.GBR_TRAIN:
	gbr_regressor = GradientBoostingRegressor(**hyper.GBR_PARAM)
	gbr = make_pipeline(gbr_regressor, memory="cache")
	gbr.fit(train_features, train_labels)

## 结果预测

In [None]:
if hyper.SUBMIT:
	def makePrediction(model: Pipeline) -> np.ndarray:
		"""Predict on the test features and undo the log1p applied to the target."""
		return np.expm1(model.predict(test_features))

	# Weighted blend of the two models' predictions.
	ridge_part = makePrediction(ridge) * hyper.RIDGE_WEIGHT
	gbr_part = makePrediction(gbr) * hyper.GRB_WEIGHT
	test_prediction = ridge_part + gbr_part

## 提交

In [None]:
def submit() -> None:
	"""Write ``test_prediction`` into a copy of the sample submission and save it.

	The output file is placed under ``submissions/`` and named after the
	value returned by ``getTime(as_file_name=True)``.
	"""
	printLog("Saving...")
	submission = pd.read_csv("data/sample_submission.csv")
	submission[hyper.TARGET] = test_prediction

	# NOTE(review): to_csv does not create missing parent directories, so the
	# "submissions" directory has to exist beforehand.
	output_path = f"submissions/{getTime(as_file_name=True)}.csv"
	submission.to_csv(output_path, index=False)
# Saving is skipped entirely unless submission is enabled in `hyper`.
if hyper.SUBMIT:
	submit()

printLog("All done.")