In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


## Loading data

In [None]:
# Read the test split from a local CSV file and preview its first rows.
# NOTE(review): the absolute path is specific to one machine; the notebook
# will not run elsewhere without editing it.
test_csv_path = '/home/nadia/Documents/DICODING/MLPemula/house-prices-advanced-regression-techniques/test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Read the training split from a local CSV file and preview its first rows.
# NOTE(review): the absolute path is specific to one machine; the notebook
# will not run elsewhere without editing it.
train_csv_path = '/home/nadia/Documents/DICODING/MLPemula/house-prices-advanced-regression-techniques/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

## Data Cleaning


Understanding data

In [None]:
# Print the column dtypes and non-null counts of the test set.
test.info()

In [None]:
# Print the column dtypes and non-null counts of the training set.
train.info()

In [None]:
# Summary statistics for every column; include="all" adds the non-numeric ones.
train.describe(include="all")

Handling missing values

In [None]:
# Tabulate, per column of `train`, how many entries are missing and what
# percentage of the rows that represents, sorted with the worst columns first.
missing_values = train.isna().sum()
missing_percentage = (missing_values / len(train)) * 100

missing_data = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentage}
).sort_values(by='Missing Values', ascending=False)

# Display only the columns that actually have gaps.
missing_data.loc[missing_data['Missing Values'] > 0]

In [None]:
# Recount the missing entries per column of `train`; the following cells
# filter on this Series. Show only the columns with at least one gap.
missing_values = train.isna().sum()
missing_values.loc[missing_values > 0]

In [None]:
# Columns with fewer than 1000 missing entries. No lower bound is applied,
# so columns without any missing entries are part of `less` as well.
less = missing_values.loc[missing_values < 1000].index
less

In [None]:
# Columns with 1000 or more missing entries.
over = missing_values.loc[missing_values >= 1000].index
over

In [None]:
# Missing-entry counts for the `less` columns that actually contain gaps.
less_counts = missing_values[less]
less_counts[less_counts > 0]

In [None]:
# Numeric columns among `less`; their gaps are filled with the column median.
numeric_features = train[less].select_dtypes(include=["number"]).columns
print(numeric_features)

numeric_medians = train[numeric_features].median()
train[numeric_features] = train[numeric_features].fillna(numeric_medians)

In [None]:
# Fill gaps in the numeric columns with each column's median.
# NOTE(review): this repeats the median fill that the cell defining
# `numeric_features` already performs; presumably a leftover — confirm and remove.
train[numeric_features] = train[numeric_features].fillna(train[numeric_features].median())

In [None]:
# Re-count the gaps in the columns that had between 1 and 999 missing entries
# when `missing_values` was computed, to see what is still left to fill.
had_some_missing = (missing_values > 0) & (missing_values < 1000)
update_miss_vals = train.isnull().sum()[had_some_missing]
update_miss_vals

In [None]:
# Object-dtype (categorical) columns among `less`.
kategorical_features = train[less].select_dtypes(include="object").columns
print(kategorical_features)

In [None]:
# Fill the gaps in every categorical column with that column's first mode
# (its most frequent value).
for column in kategorical_features:
    most_frequent = train[column].mode()[0]
    train[column] = train[column].fillna(most_frequent)

In [None]:
# Count the missing entries left in each of the `less` columns.
train[less].isna().sum()

In [None]:
# Drop the columns listed in `over` (1000 or more missing entries); the result
# is kept under the new name `df`.
df = train.drop(columns=over)

In [None]:
# Preview df after the `over` columns were dropped.
df.head()

In [None]:
# Recount missing entries, this time on `df`, and show any column that still
# has gaps. Note that this rebinds `missing_values`.
missing_values = df.isna().sum()
missing_values.loc[missing_values > 0]

In [None]:
# Preview df again after the missing-value check.
df.head()

In [None]:
# One box plot per numeric feature of df, to eyeball outliers.
for feature in numeric_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Box plot of {feature}')
    plt.show()

In [None]:
# First and third quartile of every numeric feature, and the interquartile
# range between them (each is a Series indexed by feature name).
quartiles = df[numeric_features].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

In [None]:
# Preview df before the outlier filter is applied.
df.head()

In [None]:
# Keep only the rows in which no numeric feature falls outside the
# 1.5 * IQR fences.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
numeric_values = df[numeric_features]
is_outlier = (numeric_values < lower_bound) | (numeric_values > upper_bound)
condition = ~is_outlier.any(axis=1)
df_filtered_numeric = df.loc[condition, numeric_features]

# Re-attach the categorical columns of the surviving rows. Note that this
# rebinds `df` to the filtered frame.
categorical_features = df.select_dtypes(include=['object']).columns
df = pd.concat([df_filtered_numeric, df.loc[condition, categorical_features]], axis=1)

In [None]:
# Preview df after the outlier filter.
df.head()

In [None]:
# Second pass of the outlier filter, using the Q1 / Q3 / IQR values computed
# earlier (they are not re-estimated here): keep only the rows in which no
# numeric feature falls outside the 1.5 * IQR fences.
# The mask is negated with `~`, the logical NOT for boolean data; unary `-`
# is not a supported way to invert a boolean mask.
condition = ~((df[numeric_features] < (Q1 - 1.5 * IQR)) | (df[numeric_features] > (Q3 + 1.5 * IQR))).any(axis=1)
df_filtered_numeric = df.loc[condition, numeric_features]

# Re-attach the categorical columns of the surviving rows; this rebinds `df`.
categorical_features = df.select_dtypes(include=['object']).columns
df = pd.concat([df_filtered_numeric, df.loc[condition, categorical_features]], axis=1)

In [None]:
# Preview df after the second outlier-filter pass.
df.head()

In [None]:
# One box plot per filtered numeric feature, drawn from the filtered df.
for feature in df_filtered_numeric.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Box plot of {feature}')
    plt.show()

cara lain mengatasi outlier

In [None]:
# Sketch (not executed) of other ways to treat outliers in a single column.
# NOTE(review): Q1, Q3 and IQR would have to be that column's scalar values
# here, not the per-feature Series computed elsewhere in this notebook.
#
# Replace outliers with the column median:
# median = df['column_name'].median()
# df['column_name'] = df['column_name'].apply(lambda x: median if x < (Q1 - 1.5 * IQR) or x > (Q3 + 1.5 * IQR) else x)

# Replace outliers with the nearest fence value:
# df['column_name'] = df['column_name'].apply(lambda x: (Q1 - 1.5 * IQR) if x < (Q1 - 1.5 * IQR) else (Q3 + 1.5 * IQR) if x > (Q3 + 1.5 * IQR) else x)

In [None]:
# Side-by-side box plots per numeric feature: the original training data on
# the left, the outlier-filtered data on the right.
for feature in numeric_features:
    fig, ax = plt.subplots(1, 2, figsize=(24, 6))

    sns.boxplot(x=train[feature], ax=ax[0])
    ax[0].set_title(f"{feature} before cleaning")

    # Target the right-hand panel explicitly instead of relying on whichever
    # axes happens to be "current" in matplotlib's global state.
    sns.boxplot(x=df_filtered_numeric[feature], ax=ax[1])
    ax[1].set_title(f"{feature} after cleaning")

    plt.show()




Standardisasi

Standardisasi fitur numerik

In [None]:
# Standardise every numeric feature of df in place (fit and transform on the
# same data).
# NOTE(review): this runs before the train/test split and covers every column
# in `numeric_features` — presumably including the target; confirm that is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numeric_features])
df[numeric_features] = scaled_values

In [None]:
# Side-by-side histograms per numeric feature: the original training data on
# the left, the filtered and standardised df on the right.
for feature in numeric_features:
    fig, ax = plt.subplots(1, 2, figsize=(24, 6))

    sns.histplot(x=train[feature], ax=ax[0])
    ax[0].set_title(f"{feature} before Standardisasi")

    # Target the right-hand panel explicitly instead of relying on whichever
    # axes happens to be "current" in matplotlib's global state.
    sns.histplot(x=df[feature], ax=ax[1])
    ax[1].set_title(f"{feature} after Standardisasi")

    plt.show()


Menangani duplikat

In [None]:
# Flag rows that repeat an earlier row of df, then display them.
duplicates = df.duplicated()

print("Baris duplikat : ")
df.loc[duplicates]

In [None]:
# Disabled: dropping the duplicate rows and printing the result.
# # Drop duplicate rows
# df = df.drop_duplicates()

# print("DataFrame setelah menghapus duplikat:")
# print(df)

In [None]:
# Collect the object-dtype (categorical) columns of df and display them.
category_features = df.select_dtypes(include=['object']).columns
df[category_features]

One Hot Encoding

In [None]:
# One-hot encode the categorical columns; the result is held separately in
# df_one_hot.
df_one_hot = pd.get_dummies(df, columns=category_features)
df_one_hot

In [None]:
# Print the column dtypes and non-null counts of the one-hot encoded frame.
df_one_hot.info()

In [None]:
# Inspect 10 randomly chosen rows; no random_state is passed, so the selection
# is not pinned between runs.
df_one_hot.sample(10)

In [None]:
# Preview df (the one-hot result lives in df_one_hot, not here).
df.head()

Label Encoding

In [None]:
# Label-encode every categorical column into integer codes.
label_encoder = LabelEncoder()

# Work on an explicit copy: pd.DataFrame(df) does not guarantee data that is
# independent of df, so encoding through it could also overwrite df's
# categorical columns.
df_lencoder = df.copy()

# The single encoder is refit for each column, so after the loop it only
# holds the class mapping of the last column processed.
for col in category_features:
    df_lencoder[col] = label_encoder.fit_transform(df[col])

df_lencoder


## Exploratory dan Explanatory Data Analysis

In [None]:
# Preview the label-encoded frame.
df_lencoder.head()

In [None]:
# Tabulate, per column of df_lencoder, how many entries are missing and what
# percentage of the rows that represents, sorted with the worst columns first.
missing_values = df_lencoder.isna().sum()
missing_percentage = (missing_values / len(df_lencoder)) * 100

missing_data = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentage}
).sort_values(by='Missing Values', ascending=False)

# Display only the columns that actually have gaps.
missing_data.loc[missing_data['Missing Values'] > 0]

In [None]:
# Histogram of every column of df_lencoder, laid out on a 4-column grid.
num_vars = df_lencoder.shape[1]

n_cols = 4
n_rows = (num_vars + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, n_rows * 4))
axes = axes.flatten()

for ax, column in zip(axes, df_lencoder.columns):
    df_lencoder[column].hist(ax=ax, bins=20, edgecolor='black')
    ax.set_title(column)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Remove the grid cells that were left without a column to plot.
for unused_ax in axes[num_vars:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for a hand-picked set of columns, placed on a
# 2 x 3 grid (the sixth slot stays unused).
columns_to_plot = ['OverallQual','YearBuilt','LotArea','SaleType','SaleCondition']

plt.figure(figsize=(15, 10))

for position, column in enumerate(columns_to_plot, start=1):
    ax = plt.subplot(2, 3, position)
    sns.histplot(df_lencoder[column], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns of df_lencoder,
# on a fixed colour scale from -1 to 1.
correlation_matrix = df_lencoder.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Correlation of every column with the target, SalePrice.
target_corr = df_lencoder.corr()['SalePrice']

# Rank the features by the strength of that correlation. The target's
# correlation with itself is always 1 and carries no information, so it is
# left out of the ranking.
target_corr_sorted = target_corr.drop('SalePrice').abs().sort_values(ascending=False)

plt.figure(figsize=(10,6))
target_corr_sorted.plot(kind='bar')
plt.title('Correlation with SalePrice')
plt.xlabel('Variables')
# The bars show absolute values, so the axis label says so.
plt.ylabel('Absolute Correlation Coefficient')
plt.show()

## Data Splitting

In [None]:
import sklearn

In [None]:
# Separate the target column from the feature matrix.
target_column = 'SalePrice'
y = df_lencoder[target_column]
X = df_lencoder.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Report the row count of the full feature matrix, of the training split and
# of the test split.
for label, subset in (
    ("Jumlah data: ", X),
    ("Jumlah data latih: ", x_train),
    ("Jumlah data test: ", x_test),
):
    print(label, len(subset))

## Modelling

In [None]:
# Model 1: Least Angle Regression, limited to one non-zero coefficient.
from sklearn import linear_model
lars = linear_model.Lars(n_nonzero_coefs=1)
lars.fit(x_train, y_train)

# Model 2: Linear Regression.
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(x_train, y_train)

# Model 3: Gradient Boosting Regressor with a fixed random_state.
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor(random_state=184)
GBR.fit(x_train, y_train)

Evaluasi Model

In [None]:
# Import exactly the metrics used by the evaluation cells instead of pulling
# every public name of sklearn.metrics into the notebook namespace.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the Lars model on the held-out test split.
pred_lars = lars.predict(x_test)
mae_lars = mean_absolute_error(y_test, pred_lars)
mse_lars = mean_squared_error(y_test, pred_lars)
r2_lars = r2_score(y_test, pred_lars)

data = {
    'MAE': [mae_lars],
    'MSE' : [mse_lars],
    'R2' : [r2_lars]
}

# Results table with one row per model; later cells append their rows to it.
df_results = pd.DataFrame(data, index=['Lars'])

df_results

In [None]:
# Evaluate the linear-regression model on the test split and add its
# MAE / MSE / R2 row to the results table.
pred_LR = LR.predict(x_test)
mae_LR, mse_LR, r2_LR = (
    metric(y_test, pred_LR)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

df_results.loc['Linear Regression'] = [mae_LR, mse_LR, r2_LR]
df_results

In [None]:
# Evaluate the gradient-boosting model on the test split and add its
# MAE / MSE / R2 row to the results table.
pred_GBR = GBR.predict(x_test)
mae_GBR, mse_GBR, r2_GBR = (
    metric(y_test, pred_GBR)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

df_results.loc['GradientBoostingRegressor'] = [mae_GBR, mse_GBR, r2_GBR]
df_results

Menyimpan Model

In [None]:
import joblib

# Persist the fitted gradient-boosting model to the working directory.
joblib_path = 'gbr_model.joblib'
joblib.dump(GBR, joblib_path)

In [None]:
import pickle

# Persist the same fitted model a second time, using the standard-library pickle.
with open('gbr_model.pkl', mode='wb') as model_file:
    pickle.dump(GBR, model_file)

## Deployment dan Monitoring

In [None]:
# Reload the model from both files written by the saving cells.
# Unpickling can execute arbitrary code, so only load files from a trusted
# source (here: the files this notebook wrote itself).
joblib_model = joblib.load('gbr_model.joblib')

with open('gbr_model.pkl', mode='rb') as model_file:
    pickle_model = pickle.load(model_file)