1. Menentukan Library yang akan di gunakan

In [47]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

2. Load Dataset

In [48]:
air_dataset = pd.read_csv('monitoringdata.csv')

In [49]:
air_dataset.head()

Unnamed: 0,Agency,Region,siteName,SiteID,LawaSiteID,siteType,Latitude,Longitude,Property,DateCollected,Resample following exceedance?,Value,SwimIcon,SwimmingGuidelinesTestResultDescription
0,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,10/25/2017 12:00,,25.0,green,Suitable for swimming
1,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,10/29/2017 12:00,,2.0,green,Suitable for swimming
2,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/5/2017 12:00,,1.0,green,Suitable for swimming
3,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/13/2017 0:19,,5.0,green,Suitable for swimming
4,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/19/2017 22:34,,2.0,green,Suitable for swimming


In [50]:
air_dataset.shape

(70886, 14)

In [51]:
# Melihat informasi data

air_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70886 entries, 0 to 70885
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Agency                                   70886 non-null  object 
 1   Region                                   70886 non-null  object 
 2   siteName                                 70886 non-null  object 
 3   SiteID                                   70886 non-null  object 
 4   LawaSiteID                               70886 non-null  object 
 5   siteType                                 70886 non-null  object 
 6   Latitude                                 70593 non-null  float64
 7   Longitude                                70593 non-null  float64
 8   Property                                 70886 non-null  object 
 9   DateCollected                            70886 non-null  object 
 10  Resample following exceedance?           56713

In [52]:
air_dataset['SwimmingGuidelinesTestResultDescription'].value_counts()

Suitable for swimming      55454
Unsuitable for swimming     7136
Caution advised             5751
Name: SwimmingGuidelinesTestResultDescription, dtype: int64

In [53]:
air_dataset['Property'].value_counts()

E-coli           32812
Enterococci      26323
Cyanobacteria    11751
Name: Property, dtype: int64

In [54]:
# memisahkan data dan label
X = air_dataset.drop(columns=['Agency','Region','siteName', 'SiteID', 'LawaSiteID','siteType', 'Resample following exceedance?', 'DateCollected','SwimIcon','SwimmingGuidelinesTestResultDescription'], axis=11)
Y = air_dataset['SwimmingGuidelinesTestResultDescription']

In [55]:
print(X)

        Latitude   Longitude Property  Value
0     -38.167698  176.347488   E-coli   25.0
1     -38.167698  176.347488   E-coli    2.0
2     -38.167698  176.347488   E-coli    1.0
3     -38.167698  176.347488   E-coli    5.0
4     -38.167698  176.347488   E-coli    2.0
...          ...         ...      ...    ...
70881 -42.376147  171.242946   E-coli   40.0
70882 -42.376147  171.242946   E-coli  150.0
70883 -42.376147  171.242946   E-coli   36.0
70884 -42.376147  171.242946   E-coli  100.0
70885 -42.376147  171.242946   E-coli  270.0

[70886 rows x 4 columns]


In [56]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70886 entries, 0 to 70885
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   70593 non-null  float64
 1   Longitude  70593 non-null  float64
 2   Property   70886 non-null  object 
 3   Value      70505 non-null  float64
dtypes: float64(3), object(1)
memory usage: 2.2+ MB


In [57]:
data_cleaned = air_dataset.dropna(subset=['SwimmingGuidelinesTestResultDescription'])
X = data_cleaned[['Latitude', 'Longitude', 'Value', 'Property']]
Y = data_cleaned['SwimmingGuidelinesTestResultDescription']

In [58]:
label_mapping = {
    'E-coli': 1,
    'Enterococci': 2,
    'Cyanobacteria': 3
}

X['Property'] = X['Property'].replace(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Property'] = X['Property'].replace(label_mapping)


In [59]:
from sklearn.impute import SimpleImputer

# Ubah tipe data kolom "Property" menjadi objek
X.loc[:, 'Property'] = X['Property'].astype('object')

# Gunakan SimpleImputer dengan strategi "most_frequent" untuk mengisi nilai yang hilang
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Property'] = X['Property'].astype('object')


In [60]:
label_mapping = {
    'Suitable for swimming': 0,
    'Unsuitable for swimming': 1,
    'Caution advised': 2
}

Y = Y.replace(label_mapping)


In [61]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


In [62]:
import numpy as np

unique_values, value_counts = np.unique(Y, return_counts=True)

for value, count in zip(unique_values, value_counts):
    print(f"{value}: {count}")

0: 55454
1: 7136
2: 5751


In [63]:
print(X)

[[-38.167698 176.347488 25.0 1]
 [-38.167698 176.347488 2.0 1]
 [-38.167698 176.347488 1.0 1]
 ...
 [-42.37614725 171.2429464 36.0 1]
 [-42.37614725 171.2429464 100.0 1]
 [-42.37614725 171.2429464 270.0 1]]


In [64]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


3. Standarisasi Data

In [65]:
scaler = StandardScaler()

In [66]:
scaler.fit(X)

In [67]:
standarized_data = scaler.transform(X)

In [68]:
print(standarized_data)

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]
 [ 0.90855572  0.79342662 -0.15313303 -0.93734091]
 [ 0.90855572  0.79342662 -0.15366634 -0.93734091]
 ...
 [-0.71608814 -1.44063093 -0.1350006  -0.93734091]
 [-0.71608814 -1.44063093 -0.10086896 -0.93734091]
 [-0.71608814 -1.44063093 -0.0102068  -0.93734091]]


In [69]:
X = standarized_data

In [70]:
print(X)
print(Y)

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]
 [ 0.90855572  0.79342662 -0.15313303 -0.93734091]
 [ 0.90855572  0.79342662 -0.15366634 -0.93734091]
 ...
 [-0.71608814 -1.44063093 -0.1350006  -0.93734091]
 [-0.71608814 -1.44063093 -0.10086896 -0.93734091]
 [-0.71608814 -1.44063093 -0.0102068  -0.93734091]]
0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


4. Memisahkan Data Training dan Data Testing

In [71]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2,stratify=Y, random_state=2)

In [72]:
print(X.shape, X_train.shape, X_test.shape)

(68341, 4) (54672, 4) (13669, 4)


5. Membuat data latih menggunakan algoritma SVM

In [73]:
classifier = svm.SVC(kernel = 'linear')

In [74]:
classifier.fit(X_train, Y_train)

6. Membuat model evaluasi untuk mengukur tingkat akurasi

In [75]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [76]:
print('Akurasi data training adalah = ', training_data_accuracy)

Akurasi data training adalah =  0.9625036581796897


In [77]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [78]:
print('Akurasi data testing adlaah = ', test_data_accuracy)

Akurasi data testing adlaah =  0.9653961518765088


In [79]:
# Mengimpor library 

from sklearn.linear_model import LogisticRegression
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score 
from sklearn.metrics import classification_report

In [80]:
# Melakukan prediksi pada data uji

logreg = LogisticRegression(multi_class='auto', solver='lbfgs')
logreg.fit(X_train, Y_train)

Y_pred = logreg.predict(X_test)

print("Accuracy_Score : %.2f "% accuracy_score(Y_test,Y_pred))
print("MSE : %.2f "% mean_squared_error(Y_test,Y_pred))
print("MAE : %.2f "% mean_absolute_error(Y_test,Y_pred))
print("RMSE : %.2f "% math.sqrt(mean_squared_error(Y_test,Y_pred)))
print("r_score : %.2f "% r2_score(Y_test,Y_pred))

Accuracy_Score : 0.96 
MSE : 0.11 
MAE : 0.06 
RMSE : 0.33 
r_score : 0.71 


In [81]:
confusion_matrix(Y_test, Y_pred)

array([[11092,     0,     0],
       [  233,  1141,    53],
       [  288,    11,   851]], dtype=int64)

In [82]:
# KLASIFIKASI DENGAN KNN

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
KNN = KNeighborsClassifier()

KNN.fit(X_train, Y_train)
Y_pred6 = KNN.predict(X_test)
score6 = metrics.accuracy_score(Y_test, Y_pred6)
print("Akurasi dengan menggunakan KNN: ", score6)

Akurasi dengan menggunakan KNN:  0.9881483649133075


In [83]:
# Predict 

Y_pred6 = KNN.predict(X_test)
print('Classification Report Test Model (KNeighborsClassifier) :')
print(classification_report(Y_test, Y_pred6))

Classification Report Test Model (KNeighborsClassifier) :
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     11092
           1       0.99      0.97      0.98      1427
           2       0.94      0.92      0.93      1150

    accuracy                           0.99     13669
   macro avg       0.97      0.96      0.97     13669
weighted avg       0.99      0.99      0.99     13669



In [84]:
# Klasifikasi dengan Decision Tree 

from sklearn.tree import DecisionTreeClassifier
DTR = DecisionTreeClassifier()

DTR.fit(X_train, Y_train)
y_pred7 = DTR.predict(X_test)
score7 = metrics.accuracy_score(Y_test, y_pred7)
print("Akurasi dengan menggunakan Decision Tree: ", score7)

Akurasi dengan menggunakan Decision Tree:  0.9999268417587241


In [85]:
# Predict

y_pred7 = DTR.predict(X_test)
print('Classification Report Test Model (DecisionTree) :')
print(classification_report(Y_test, y_pred7))

Classification Report Test Model (DecisionTree) :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11092
           1       1.00      1.00      1.00      1427
           2       1.00      1.00      1.00      1150

    accuracy                           1.00     13669
   macro avg       1.00      1.00      1.00     13669
weighted avg       1.00      1.00      1.00     13669



In [86]:
# Klasifikasi dengan Random Forest

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

rfc.fit(X_train, Y_train)
y_pred8 = rfc.predict(X_test)
score8 = metrics.accuracy_score(Y_test, y_pred8)
print("Akurasi dengan menggunakan Random Forest: ", score8)

Akurasi dengan menggunakan Random Forest:  0.9999268417587241


In [87]:
# Predict

y_pred8 = rfc.predict(X_test)
print('Classification Report Test Model (Random Forest) :')
print(classification_report(Y_test, y_pred8))

Classification Report Test Model (Random Forest) :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11092
           1       1.00      1.00      1.00      1427
           2       1.00      1.00      1.00      1150

    accuracy                           1.00     13669
   macro avg       1.00      1.00      1.00     13669
weighted avg       1.00      1.00      1.00     13669



In [88]:
# Klasifikasi dengan Support Vector Machine

from sklearn.svm import SVC
from sklearn import metrics

svm = SVC()

svm.fit(X_train, Y_train)
y_pred9 = svm.predict(X_test)
score9 = metrics.accuracy_score(Y_test, y_pred9)
print("Akurasi dengan menggunakan Support Vector Machine: ", score9)

Akurasi dengan menggunakan Support Vector Machine:  0.9547150486502305


In [89]:
# Predict

y_pred9 = svm.predict(X_test)
print('Classification Report Test Model (Support Vector Machine) :')
print(classification_report(Y_test, y_pred9))

Classification Report Test Model (Support Vector Machine) :
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     11092
           1       1.00      0.79      0.88      1427
           2       0.92      0.73      0.81      1150

    accuracy                           0.95     13669
   macro avg       0.96      0.84      0.89     13669
weighted avg       0.96      0.95      0.95     13669



7. Membuat Model Prediksi

In [94]:
input_data = (-38.167698, 176.347488, 25, 1)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = rfc.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Anda boleh berenang')

elif(prediction[0] == 1):
    print('Anda tidak boleh berenang')
else :
    print('Berhati-hati')    

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]]
[0]
Anda boleh berenang


8. Simpan Model

In [91]:
import pickle

In [92]:
filename = 'air_model.sav'
pickle.dump(rfc, open(filename, 'wb'))

In [93]:
filename = 'scaler.sav'
pickle.dump(scaler, open(filename, 'wb'))