In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Lift pandas' display limits so printed frames and series are not truncated
# (None means "no limit" for both options).
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [3]:
# Load the raw table; every later cell reads and modifies this `df` in place.
train_path = 'train.csv'
df = pd.read_csv(train_path)

In [None]:
# Plain-text preview of the first rows of the freshly loaded frame.
first_rows = df.head()
print(first_rows)

In [None]:
# DataFrame.info() writes its summary (dtypes, non-null counts, memory usage)
# itself and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
df.info()

In [None]:
# Null count per column; `missing` is reused by the percentage cell below.
missing = df.isna().sum()
columns_with_gaps = missing[missing > 0]
print(columns_with_gaps, '\n')

In [None]:
# Same columns as above, expressed as a percentage of all rows.
row_count = len(df)
missing_percent = missing[missing > 0] / row_count * 100
print(missing_percent)

In [None]:
# Columns that share the Ex/Gd/TA/Fa/Po label set.
rating_columns = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
    'PoolQC',
]
# Put the labels on a 0-10 numeric scale. replace() leaves any value that is
# not a key of the mapping (including missing entries) untouched.
quality_scale = {'Ex': 10, 'Gd': 7.5, 'TA': 5, 'Fa': 2.5, 'Po': 0}
df[rating_columns] = df[rating_columns].replace(quality_scale)

In [None]:
# Map the BsmtExposure labels onto the same 0-10 range used for the quality ratings.
exposure_scale = {'Gd': 10, 'Av': 6.66, 'Mn': 3.33, 'No': 0}
df['BsmtExposure'] = df['BsmtExposure'].replace(exposure_scale)

In [None]:
# Both basement finish-type columns use the same labels, so one mapping
# (0-10 in steps of 2) is applied to each of them.
finish_type_scale = {'GLQ': 10, 'ALQ': 8, 'BLQ': 6, 'Rec': 4, 'LwQ': 2, 'Unf': 0}
for finish_column in ('BsmtFinType1', 'BsmtFinType2'):
    df[finish_column] = df[finish_column].replace(finish_type_scale)

In [11]:
# Exclude FireplaceQu and PoolQC from the list that is mode-imputed further down.
# Filtering (instead of list.remove) keeps this cell safe to re-run:
# list.remove raises ValueError once the names are already gone.
rating_columns = [
    column_name for column_name in rating_columns
    if column_name not in ('FireplaceQu', 'PoolQC')
]

In [12]:
# Add the remaining columns that the mode-imputation loop should also cover.
# Append only names that are not present yet, so re-running the cell does not
# pile up duplicate entries the way list.extend would.
for extra_column in ['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
                     'GarageType', 'GarageYrBlt', 'GarageFinish']:
    if extra_column not in rating_columns:
        rating_columns.append(extra_column)

In [13]:
# Impute every listed column with its most frequent value. Series.mode() may
# return several values; the first one is used.
for column_name in rating_columns:
    most_frequent = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_frequent)

In [None]:
# Recompute the per-column null counts after the mode imputation.
missing = df.isna().sum()
print(missing.loc[missing > 0], '\n')

In [None]:
# Remaining gaps as a percentage of all rows.
still_missing = missing[missing > 0]
print(still_missing / len(df) * 100, '\n')

In [None]:
# Inspect the distinct LotFrontage values before choosing how to impute them.
lot_frontage_values = df['LotFrontage'].unique()
print(lot_frontage_values, '\n')

In [None]:
# Inspect the distinct MasVnrArea values before choosing how to impute them.
veneer_area_values = df['MasVnrArea'].unique()
print(veneer_area_values, '\n')

In [18]:
# Fill missing LotFrontage entries with the column median.
lot_frontage_median = df['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_median)

In [19]:
# Fill missing MasVnrArea entries with the column median.
veneer_area_median = df['MasVnrArea'].median()
df['MasVnrArea'] = df['MasVnrArea'].fillna(veneer_area_median)

In [None]:
# Box plot of SalePrice, drawn before the outlier replacement in the next cell.
sale_price = df['SalePrice']
sns.boxplot(x=sale_price)
plt.show()

In [21]:
# Replace SalePrice values whose |z-score| exceeds 2 with the column median.
# Series.mask is used instead of a .loc assignment: the median of a column can
# be fractional, and writing a non-integral float into an integer column via
# .loc is deprecated silent upcasting (pandas >= 2.1). mask() returns a result
# with a suitable dtype instead.
sale_price = df['SalePrice']
sale_price_is_outlier = np.abs(stats.zscore(sale_price)) > 2
df['SalePrice'] = sale_price.mask(sale_price_is_outlier, sale_price.median())

In [None]:
# Box plot of LotArea, drawn before the outlier replacement in the next cell.
lot_area = df['LotArea']
sns.boxplot(x=lot_area)
plt.show()

In [None]:
# Replace LotArea values whose |z-score| exceeds 1 with the column median.
# NOTE(review): a cutoff of 1 is much tighter than the ones used for the other
# columns in this notebook and will rewrite a large share of rows - confirm it
# is intended.
# Series.mask is used instead of a .loc assignment: the median of a column can
# be fractional, and writing a non-integral float into an integer column via
# .loc is deprecated silent upcasting (pandas >= 2.1). mask() returns a result
# with a suitable dtype instead.
lot_area = df['LotArea']
lot_area_is_outlier = np.abs(stats.zscore(lot_area)) > 1
df['LotArea'] = lot_area.mask(lot_area_is_outlier, lot_area.median())

In [None]:
# Box plot of YearBuilt, drawn before the outlier replacement in the next cell.
year_built = df['YearBuilt']
sns.boxplot(x=year_built)
plt.show()

In [25]:
# Replace YearBuilt values whose |z-score| exceeds 3 with the most frequent
# year (first entry of Series.mode()).
most_common_year = df['YearBuilt'].mode()[0]
year_built_abs_z = np.abs(stats.zscore(df['YearBuilt']))
df.loc[year_built_abs_z > 3, 'YearBuilt'] = most_common_year

In [None]:
# Box plot of GarageArea, drawn before the outlier replacement in the next cell.
garage_area = df['GarageArea']
sns.boxplot(x=garage_area)
plt.show()

In [27]:
# Replace GarageArea values whose |z-score| exceeds 2.75 with the column median.
# Series.mask is used instead of a .loc assignment: the median of a column can
# be fractional, and writing a non-integral float into an integer column via
# .loc is deprecated silent upcasting (pandas >= 2.1). mask() returns a result
# with a suitable dtype instead.
garage_area = df['GarageArea']
garage_area_is_outlier = np.abs(stats.zscore(garage_area)) > 2.75
df['GarageArea'] = garage_area.mask(garage_area_is_outlier, garage_area.median())

In [28]:
# Shared transformer instances. The cells below call fit_transform on them once
# per column, so each instance only remembers the parameters of its last fit.
scaler, min_max_scaler, label_encoder = StandardScaler(), MinMaxScaler(), LabelEncoder()

In [29]:
# Three derived views of SalePrice: standardised, min-max scaled, natural log.
# The scalers expect a 2-D input, hence the single-column frame.
sale_price_frame = df[['SalePrice']]
df['SalePrice_Scaled'] = scaler.fit_transform(sale_price_frame)
df['SalePrice_Normalized'] = min_max_scaler.fit_transform(sale_price_frame)
df['SalePrice_Log'] = np.log(df['SalePrice'])

In [None]:
# Box plot of the log-transformed SalePrice.
sale_price_log = df['SalePrice_Log']
sns.boxplot(x=sale_price_log)
plt.show()

In [31]:
# Three derived views of LotArea: standardised, min-max scaled, natural log.
# The scalers expect a 2-D input, hence the single-column frame.
lot_area_frame = df[['LotArea']]
df['LotArea_Scaled'] = scaler.fit_transform(lot_area_frame)
df['LotArea_Normalized'] = min_max_scaler.fit_transform(lot_area_frame)
df['LotArea_Log'] = np.log(df['LotArea'])

In [None]:
# Box plot of the log-transformed LotArea.
lot_area_log = df['LotArea_Log']
sns.boxplot(x=lot_area_log)
plt.show()

In [33]:
# Standardised and min-max scaled views of YearBuilt (no log column for this one).
year_built_frame = df[['YearBuilt']]
df['YearBuilt_Scaled'] = scaler.fit_transform(year_built_frame)
df['YearBuilt_Normalized'] = min_max_scaler.fit_transform(year_built_frame)

In [None]:
# Three derived views of GarageArea: standardised, min-max scaled, log.
# The scalers expect a 2-D input, hence the single-column frame.
garage_area_frame = df[['GarageArea']]
df['GarageArea_Scaled'] = scaler.fit_transform(garage_area_frame)
df['GarageArea_Normalized'] = min_max_scaler.fit_transform(garage_area_frame)
# log1p, i.e. log(1 + x), rather than log: np.log(0) is -inf (with a
# RuntimeWarning), so any row whose area is 0 would put -inf into the column
# and into the box plot drawn from it. log1p maps 0 to 0 and stays finite.
df['GarageArea_Log'] = np.log1p(df['GarageArea'])

In [None]:
# Box plot of the log-transformed GarageArea.
garage_area_log = df['GarageArea_Log']
sns.boxplot(x=garage_area_log)
plt.show()

In [36]:
# Combined scores: the plain average of each quality/condition pair.
df['OverallState'] = df['OverallQual'].add(df['OverallCond']).div(2)
df['ExterState'] = df['ExterQual'].add(df['ExterCond']).div(2)

In [37]:
# Combined basement score: the average of the five basement columns that this
# notebook has already mapped onto a 0-10 scale (quality, condition, exposure
# and the two finish types).
# BsmtFinType1/2 replace the BsmtFinSF1/2 columns used previously: those are
# never rescaled anywhere in this notebook (presumably raw finished areas), so
# averaging them with 0-10 ratings let them dominate the result.
df['BsmtState'] = (
    df['BsmtQual']
    + df['BsmtCond']
    + df['BsmtExposure']
    + df['BsmtFinType1']
    + df['BsmtFinType2']
) / 5

In [38]:
# Years elapsed between YearRemodAdd and a fixed reference year.
reference_year = 2024
df['YearsSinceRenovation'] = reference_year - df['YearRemodAdd']

In [39]:
# Names of the columns that still have object dtype at this point; these are
# the ones handed to the label encoder in the next cell.
object_typed = df.select_dtypes(include=object)
category_columns = object_typed.columns

In [40]:
# Replace every object-typed column with integer codes. The same encoder is
# refit for each column, so the label-to-code mapping of earlier columns is not
# kept on the encoder.
for column_name in category_columns:
    encoded_values = label_encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values

In [41]:
# Persist the processed frame. index=False keeps the default row index (the
# frame was loaded without an index column and never re-indexed) out of the
# file; otherwise it is written as an extra unnamed first column that shows up
# as a spurious feature when the CSV is read back.
df.to_csv('output.csv', index=False)