In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score, accuracy_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test splits from CSV files in the working directory.
dTrain, dTest = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Display the training DataFrame as the cell output.
dTrain

In [4]:
# Display the test DataFrame as the cell output.
dTest

In [5]:
# (rows, columns) of the test set.
dTest.shape

In [6]:
# (rows, columns) of the training set.
dTrain.shape

In [7]:
# Count of missing values per training column.
dTrain.isna().sum()

In [8]:
# Column dtypes and non-null counts of the training set.
dTrain.info()

In [9]:
# Repeats the training-set shape check from an earlier cell.
dTrain.shape

In [10]:
# Summary statistics of the training columns.
dTrain.describe()

In [78]:
# Pearson correlation matrix of the training columns, drawn as an annotated heatmap.
tcorr = dTrain.corr(method='pearson')
plt.figure(figsize=(20, 9))
sns.heatmap(data=tcorr, annot=True)

### Berapa hardness rata-rata dari sumber air yang memiliki kadar sodium di atas persentil 75 dan memiliki tingkat kebasaan (alkalinity) di atas rata-rata?

In [61]:
# Mean hardness of the rows whose sodium exceeds the 75th percentile of the
# 'Sodium' column AND whose total alkalinity exceeds that column's mean.
sod_juma_per = dTrain['Sodium'].quantile(0.75)

mean_alka = dTrain['Alkalinity, total'].mean()

# Both conditions must hold for a row to be kept.
filter_data = dTrain[(dTrain['Sodium'] > sod_juma_per) & (dTrain['Alkalinity, total'] > mean_alka)]

mean_hardness = filter_data['Hardness'].mean()

# Fix: the message previously formatted the undefined name `avg_hardness`,
# which raises NameError on a fresh kernel; the computed value is `mean_hardness`.
print(f"Kesadahan air rata-rata yang kadar sodiumnya di atas persentil 75 dan memiliki tingkat kebasaaan (Alkalinity) di atas rata-rata: {round(mean_hardness, 4)} mg/L")

### Apakah ada sumber air yang memiliki tingkat kebasaan (Alkalinity) yang dapat dianggap sebagai outlier? Jelaskan!

In [63]:
# Pull the quartiles and extremes of total alkalinity from a single describe() call.
alka_stats = dTrain['Alkalinity, total'].describe()
nilai_min, nilai_max = alka_stats.loc['min'], alka_stats.loc['max']
q1, median, q3 = alka_stats.loc['25%'], alka_stats.loc['50%'], alka_stats.loc['75%']
iqr = q3 - q1  # interquartile range

In [64]:
# 1.5 * IQR fences: values outside [lower_bound, upper_bound] are treated as outliers.
fence_width = 1.5 * iqr
lower_bound, upper_bound = q1 - fence_width, q3 + fence_width

In [65]:
# Horizontal boxplot of total alkalinity with the IQR-rule outliers overlaid in red.
sns.set_theme(style="whitegrid")
custom_color = sns.color_palette("coolwarm")

fig, ax = plt.subplots(figsize=(12, 6))
# There is a single box and no hue grouping, so colour it explicitly with the first
# palette entry instead of passing `palette` without `hue` (deprecated in recent seaborn).
boxplot = sns.boxplot(x='Alkalinity, total', data=dTrain, color=custom_color[0], ax=ax)

boxplot.set_title('Alkalinity Visualization by Boxplot', fontsize=16)
boxplot.set_xlabel('Alkalinity', fontsize=14)
boxplot.tick_params(axis='both', which='major', labelsize=12)

# Rows falling outside the fences computed in the previous cell.
alkalinity = dTrain['Alkalinity, total']
outliers = dTrain[(alkalinity < lower_bound) | (alkalinity > upper_bound)]

# One vectorised scatter call instead of one call per outlier; y=0 is the row
# at which the points were drawn before.
ax.scatter(outliers['Alkalinity, total'], np.zeros(len(outliers)), color='red', zorder=10)

plt.show()


In [68]:
# Report the alkalinity summary statistics and the fences, one per line.
print(
    f"Median: {median} mg/L",
    f"Q1 (25th percentile): {q1} mg/L",
    f"Q3 (75th percentile): {q3} mg/L",
    f"IQR: {iqr} mg/L",
    f"Lower bound (non-outliers): {lower_bound} mg/L",
    f"Upper bound (non-outliers): {upper_bound} mg/L",
    f"Minimum value (actual): {nilai_min} mg/L",
    f"Maximum value (actual): {nilai_max} mg/L",
    sep="\n",
)

# List the outlying alkalinity values, then how many there are.
alka_outliers = outliers['Alkalinity, total']
print("Outliers Alkalinity : ")
print(alka_outliers)
print("Outliers Alkalinity total: ", alka_outliers.count())

In [12]:
# Columns whose value distributions are plotted in the next cell.
col = [
    'Alkalinity, total',
    'Calcium',
    'Chloride',
    'Fluoride',
    'Magnesium',
    'Nitrate as N',
    'Sodium',
    'Specific Conductivity',
    'Sulfate',
    'Total Dissolved Solids',
    'Hardness',
]

In [13]:
# One bar chart per column showing how often each distinct value occurs.
# NOTE(review): these look like continuous measurements; a bar per distinct value
# may be unreadable and a histogram could serve better — confirm intent.
plt.style.use('ggplot')
bar_colors = sns.color_palette("hls", 8)

for kolom in col:
    plt.figure(figsize=(8, 10))
    counts = dTrain[kolom].value_counts()
    ax = counts.plot(kind='bar', color=bar_colors)
    ax.set(xlabel=kolom, ylabel="Count", title=kolom)


In [14]:
# Target is 'Hardness'; every other training column becomes a feature.
# NOTE(review): the test frame (which has an 'id' column) is later passed whole to
# predict(), so X presumably keeps 'id' as a feature — confirm this is intended.
y = dTrain['Hardness']
X = dTrain.drop(columns='Hardness')

In [15]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = tts(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Linear Regression

In [25]:
# Fit an ordinary linear regression with default settings, then predict the validation split.
lr = LinearRegression().fit(X_train, y_train)

y_val_pred = lr.predict(X_val)

In [26]:
# R² of the linear regression on the validation split.
val_r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

In [27]:
# Print the validation score; the label says "Akurasi" but the value is an R² score.
print("Akurasi tes kedua yg val: ", val_r2)

In [28]:
# Score the linear model on its own training split, to compare with the
# validation score as an overfitting check.
y_train_pred = lr.predict(X_train)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
print("Akurasi tes kedua yg train: ", train_r2)

In [29]:
# Build the test feature matrix and predict with the linear regression.
# Select exactly the columns the models were fitted on, in the training order, so
# that an extra or re-ordered column in test.csv cannot break or silently skew
# predict(). X_test is reused by the later model sections.
X_test = dTest[X_train.columns]
test_pred = lr.predict(X_test)

In [91]:
# Submission table: test ids paired with the linear-regression predictions.
submission2 = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred})

In [92]:
# Write the linear-regression submission without the index column, then confirm on stdout.
submission2.to_csv('submission2.csv', index=False)
print("submit csv kedua: submission2")

In [None]:
# Heuristic overfitting check for the linear regression.
# A fitted model almost always scores at least slightly higher on its training data,
# so a bare `train_r2 > val_r2` test flags nearly every model. Only a gap larger
# than the tolerance below is reported.
OVERFIT_GAP_TOL = 0.05  # tolerated train-minus-validation R² gap (heuristic; tune as needed)

train_r2 = r2_score(y_train, lr.predict(X_train))
r2_gap = train_r2 - val_r2
print(f'Training R² score: {train_r2}')
if r2_gap > OVERFIT_GAP_TOL:
    print(f"The model may be overfitting. The training score exceeds the validation score by {r2_gap:.4f}.")
else:
    print("The model does not appear to be overfitting.")

## LASSO

In [108]:
# Fit a Lasso regression with default settings and score it on the validation split.
las = Lasso().fit(X_train, y_train)
y_val_pred_las = las.predict(X_val)
val_las_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_las)

In [109]:
# Print the Lasso validation score (an R² value, despite the "Akurasi" label).
print("Akurasi tes kedua yg val lasso: ", val_las_r2)

In [110]:
# Score the Lasso model on its own training split, to compare with the
# validation score as an overfitting check.
y_train_pred_las = las.predict(X_train)
train_r2_las = r2_score(y_train, y_train_pred_las)
# The label names the model so this line cannot be confused with the
# linear-regression training score, which printed the identical text.
print("Akurasi tes kedua yg train lasso: ", train_r2_las)

In [112]:
# Predict the test features with the Lasso model (X_test comes from an earlier cell; nothing is loaded here).
test_pred_las = las.predict(X_test)

In [114]:
# Submission table: test ids paired with the Lasso predictions.
submission2las = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred_las})

In [115]:
# Write the Lasso submission without the index column, then confirm on stdout.
submission2las.to_csv('submission2lasso.csv', index=False)
print("submit csv kedua: submission2lasso")

## RIDGE

In [117]:
# Fit a Ridge regression with default settings and score it on the validation split.
rid = Ridge().fit(X_train, y_train)
y_val_pred_rid = rid.predict(X_val)
val_rid_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_rid)

In [118]:
# Print the Ridge validation score (an R² value, despite the "Akurasi" label).
print("Akurasi tes kedua yg val ridge: ", val_rid_r2)

In [119]:
# Score the Ridge model on its own training split to check whether it overfits.
y_train_pred_rid = rid.predict(X_train)
train_r2_rid = r2_score(y_true=y_train, y_pred=y_train_pred_rid)
print("Akurasi tes kedua yg train ridge: ", train_r2_rid)

In [120]:
# Predict the test features with the Ridge model (X_test comes from an earlier cell; nothing is loaded here).
test_pred_rid = rid.predict(X_test)

In [121]:
# Submission table: test ids paired with the Ridge predictions.
submission2rid = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred_rid})

In [123]:
# Write the Ridge submission without the index column, then confirm on stdout.
submission2rid.to_csv('submission2ridge.csv', index=False)
print("submit csv kedua: submission2ridge")

## Decision Tree Regressor

In [None]:
# Fit a decision-tree regressor on the training split.
# The seed is pinned (same value as the train/validation split) so scores and the
# submission file are reproducible across kernel restarts.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train, y_train)

In [129]:
# Validation-split predictions and R² for the decision tree.
y_val_pred_dtr = dtr.predict(X_val)
val_dtr_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_dtr)
print("Akurasi dtr kedua yg val dtr: ", val_dtr_r2)

In [130]:
# Training-split predictions and R² for the decision tree, to compare with validation.
y_train_pred_dtr = dtr.predict(X_train)
train_r2_dtr = r2_score(y_true=y_train, y_pred=y_train_pred_dtr)
print("Akurasi dtr yg train dtr: ", train_r2_dtr)

In [131]:
# Predict the test features with the decision tree (X_test comes from an earlier cell).
test_pred_dtr = dtr.predict(X_test)

In [132]:
# Submission table: test ids paired with the decision-tree predictions.
submission2dtr = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred_dtr})

In [133]:
# Write the decision-tree submission without the index column, then confirm on stdout.
submission2dtr.to_csv('submission2dtr.csv', index=False)
print("submit csv kedua using dtr: submission2dtr")

## Random Forest Regressor

In [None]:
# Fit a random-forest regressor on the training split.
# The seed is pinned (same value as the train/validation split) so the forest's
# random sampling, and therefore the scores and submission, are reproducible.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)

In [18]:
# Validation-split predictions and R² for the random forest.
y_val_pred_rf = rfr.predict(X_val)
val_rf_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_rf)
print("Akurasi rf kedua yg val rf: ", val_rf_r2)

In [19]:
# Training-split predictions and R² for the random forest, to compare with validation.
y_train_pred_rf = rfr.predict(X_train)
train_rf_r2 = r2_score(y_true=y_train, y_pred=y_train_pred_rf)
print("Akurasi rf yg train rf: ", train_rf_r2)

In [30]:
# Predict the test features with the random forest (X_test comes from an earlier cell).
test_pred_rf = rfr.predict(X_test)

In [31]:
# Submission table: test ids paired with the random-forest predictions.
submission2rf = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred_rf})

In [32]:
# Write the random-forest submission without the index column, then confirm on stdout.
submission2rf.to_csv('submission2rf.csv', index=False)
print("submit csv using rf: submission2rf")

## SVR

In [33]:
# Fit a support-vector regressor with default settings on the training split.
# NOTE(review): no feature scaling is applied in the visible cells; SVR is usually
# sensitive to feature scale, so consider standardising the inputs — confirm.
svr = SVR()
svr.fit(X_train, y_train)

In [34]:
# Validation-split predictions and R² for the SVR model.
y_val_pred_svr = svr.predict(X_val)
val_rsv_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_svr)
print("Akurasi svr yg val : ", val_rsv_r2)

In [35]:
# Training-split predictions and R² for the SVR model, to compare with validation.
y_train_pred_svr = svr.predict(X_train)
train_rsv_r2 = r2_score(y_true=y_train, y_pred=y_train_pred_svr)
print("Akurasi svr yg train : ", train_rsv_r2)

## Gradient Boosting Regressor

In [37]:
# Fit a gradient-boosting regressor on the training split.
# The seed is pinned (same value as the train/validation split) so repeated runs
# produce the same model, scores and submission.
gbr = GradientBoostingRegressor(random_state=0)
gbr.fit(X_train, y_train)

In [38]:
# Validation-split predictions and R² for the gradient-boosting model.
y_val_pred_gbr = gbr.predict(X_val)
val_gbr_r2 = r2_score(y_true=y_val, y_pred=y_val_pred_gbr)
print("Akurasi gbr yg val : ", val_gbr_r2)

In [39]:
# Training-split predictions and R² for the gradient-boosting model, to compare with validation.
y_train_pred_gbr = gbr.predict(X_train)
train_gbr_r2 = r2_score(y_true=y_train, y_pred=y_train_pred_gbr)
print("Akurasi gbr yg train : ", train_gbr_r2)

In [40]:
# Predict the test features with the gradient-boosting model (X_test comes from an earlier cell).
test_pred_gbr = gbr.predict(X_test)

In [41]:
# Submission table: test ids paired with the gradient-boosting predictions.
submission2gbr = pd.DataFrame(data={'id': dTest['id'], 'Hardness': test_pred_gbr})

In [42]:
# Write the gradient-boosting submission without the index column, then confirm on stdout.
submission2gbr.to_csv('submission2gbr.csv', index=False)
print("submit csv using gbr: submission2gbr")