In [335]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.multiclass import OneVsOneClassifier
# from sklearn import datasets
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, LabelEncoder

# для полнного вывода:
pd.set_option('display.max_rows',100)

# загружаем датасеты
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [336]:
# План:
#     1) Отделяем от данных "SalePrice"
#     2) Разделяем данные на численные и классовые
#     3) Анализируем количество пропуском для численных данных, заменяем пропуски медианными значениями
#     4) Столбцы с большим числом пропусков - удаляем
#     5) Анализируем классовые данные с учетом того, что в некоторых столбца NaN значения можно заменить на значение.
#     6) Объединяем численные и классовые данные
#     7) Обучаем модель алгоритмом рандомизированного леса
#     8) Проверяем результат

In [337]:
# отделяем результат от зависимостей 
Y_train = data.SalePrice
train = data.drop(columns=["SalePrice"])
Y_test = test.SalePrice
test = test.drop(columns=["SalePrice"])

In [338]:
# print(data.isna().sum().nlargest(81).head(20))
# sns.heatmap(data.isnull(), yticklabels=False, cbar=False)

In [339]:
# print(test.isna().sum().nlargest(81).head(35))
# sns.heatmap(test.isnull(), yticklabels=False, cbar=False)

In [340]:
# получаем все численные колонки
numerical_columns = [col for col in train.columns if train[col].dtype != 'object']
# получаем все классовые колонки
object_columns = [col for col in train.columns if train[col].dtype == 'object']
if len(object_columns) + len(numerical_columns) == train.shape[1]:
    print("все данные разделены")


все данные разделены


In [341]:
# функция подсчета процента пропушенных значений от общего числа
def count_missing_values(df, set='Train'):
    missing_value_counts_df = df.isnull().sum()
    missing_value_counts_df = missing_value_counts_df[missing_value_counts_df > 0].sort_values(ascending=False)
    
    # calculate percentage of missing values
    missing_value_percentage_df = round(missing_value_counts_df * 100 / len(df), 2).astype(str) + ' %'

    # concat missing count and percentage
    missing_values = pd.concat([missing_value_counts_df, missing_value_percentage_df], axis=1, keys=['Missing Values', 'Percent'])

    #missing_values = pd.DataFrame({set: missing_value_counts_df})
    return missing_values

In [342]:
print("Missing values in train and test numerical data")
print(f"Train data missing values: {train[numerical_columns].isnull().sum().sum()}")
print(f"Test data missing values: {test[numerical_columns].isnull().sum().sum()}")

Missing values in train and test numerical data
Train data missing values: 348
Test data missing values: 348


In [343]:
print("Missing Values in Train Data")
missing_value_counts = count_missing_values(train[numerical_columns])
missing_value_counts

Missing Values in Train Data


Unnamed: 0,Missing Values,Percent
LotFrontage,259,17.74 %
GarageYrBlt,81,5.55 %
MasVnrArea,8,0.55 %


In [344]:
print("Missing Values in Test Data")
missing_value_counts = count_missing_values(test[numerical_columns])
missing_value_counts

Missing Values in Test Data


Unnamed: 0,Missing Values,Percent
LotFrontage,259,17.74 %
GarageYrBlt,81,5.55 %
MasVnrArea,8,0.55 %


In [321]:
# заполним пропущенные численые значения
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# используем значения, где недостаток данных меньше 10%
cols_with_few_missing_train = train.columns[train.isnull().sum() <= 81]
cols_with_few_missing_test = test.columns[test.isnull().sum() <= 81]

# выделяем численные колонки
numerical_cols_with_few_missing_train = train[cols_with_few_missing_train].select_dtypes(include=[np.number]).columns
numerical_cols_with_few_missing_test = test[cols_with_few_missing_test].select_dtypes(include=[np.number]).columns

# заполняем данные
numerical_imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=50, tol= 0.1)
train[numerical_cols_with_few_missing_train] = numerical_imputer.fit_transform(train[numerical_cols_with_few_missing_train])
test[numerical_cols_with_few_missing_test] = numerical_imputer.fit_transform(test[numerical_cols_with_few_missing_test])

# данные, пустые значения которых - это категория!
features_with_na = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning', 'Utilities']

# удаляем излишне пустые колонки
for col in train.columns:
    if (col not in numerical_cols_with_few_missing_train and col not in features_with_na):
        train.drop(columns=[col], axis= 1 , inplace= True)
        
for col in test.columns:
    if (col not in numerical_cols_with_few_missing_train and col not in features_with_na):
        test.drop(columns=[col], axis= 1 , inplace= True)

In [322]:
# заменяем NaN на NA
for feature in features_with_na:
    train[feature].fillna('NA', inplace=True)
    test[feature].fillna('NA', inplace=True)

In [323]:
# проверяем на NaN
for col in train.columns:
    if train[col].isnull().sum() > 0:
        print(f"have Nan in {col}")

In [324]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
 
# переход из объектных к целочисленным категориальным признакам 
label_encoder = LabelEncoder()
x_categorical = train.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = pd.DataFrame(train.select_dtypes(include=[np.number]).values)
X = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1).values

# # Fitting Random Forest Regression to the dataset
regressor = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
 
# # Fit the regressor with x and y data
regressor.fit(X, Y_train)


In [327]:
# Evaluating the model
from sklearn.metrics import mean_squared_error, r2_score

# аналогичные преобразования для тестовой выбоки
label_encoder = LabelEncoder()
x_categorical = test.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = pd.DataFrame(train.select_dtypes(include=[np.number]).values)
X = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1).values  

# Access the OOB Score
oob_score = regressor.oob_score_
print(f'Out-of-Bag Score: {oob_score}')
 
# Making predictions on the same data or new data
predictions = regressor.predict(X)
 
# Evaluating the model
mse = mean_squared_error(Y_test, predictions)
print(f'Mean Squared Error: {mse}')
 
r2 = r2_score(Y, predictions)
print(f'R-squared: {r2}')

Out-of-Bag Score: 0.8567805293404076
Mean Squared Error: 121693840.46952964
R-squared: 0.9807043095193924


In [334]:
# создавем scv-файл
test = pd.read_csv("test.csv")
result = pd.DataFrame({'Id': test['Id'].astype('int32'), 'SalePrice': predictions})
result.to_csv("result.csv", header=True, index=False)

ValueError: array length 1460 does not match index length 1459

In [333]:
result

Unnamed: 0,Id,SalePrice
0,1,206633.00
1,2,175593.27
2,3,221365.85
3,4,154642.00
4,5,264099.00
...,...,...
1455,1456,175778.32
1456,1457,205228.48
1457,1458,261211.61
1458,1459,136358.25
