1. Menentukan Library yang akan di gunakan

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

2. Load Dataset

In [3]:
air_dataset = pd.read_csv('monitoringdata.csv')

In [4]:
air_dataset.head()

Unnamed: 0,Agency,Region,siteName,SiteID,LawaSiteID,siteType,Latitude,Longitude,Property,DateCollected,Resample following exceedance?,Value,SwimIcon,SwimmingGuidelinesTestResultDescription
0,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,10/25/2017 12:00,,25.0,green,Suitable for swimming
1,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,10/29/2017 12:00,,2.0,green,Suitable for swimming
2,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/5/2017 12:00,,1.0,green,Suitable for swimming
3,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/13/2017 0:19,,5.0,green,Suitable for swimming
4,Bay of Plenty Regional Council,Bay of Plenty region,FK325034,Lake Okareka at Steep St Reserve,EBOP-10005,Lake,-38.167698,176.347488,E-coli,11/19/2017 22:34,,2.0,green,Suitable for swimming


In [5]:
air_dataset.shape

(70886, 14)

In [6]:
# Melihat informasi data

air_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70886 entries, 0 to 70885
Data columns (total 14 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Agency                                   70886 non-null  object 
 1   Region                                   70886 non-null  object 
 2   siteName                                 70886 non-null  object 
 3   SiteID                                   70886 non-null  object 
 4   LawaSiteID                               70886 non-null  object 
 5   siteType                                 70886 non-null  object 
 6   Latitude                                 70593 non-null  float64
 7   Longitude                                70593 non-null  float64
 8   Property                                 70886 non-null  object 
 9   DateCollected                            70886 non-null  object 
 10  Resample following exceedance?           56713

In [7]:
air_dataset['SwimmingGuidelinesTestResultDescription'].value_counts()

Suitable for swimming      55454
Unsuitable for swimming     7136
Caution advised             5751
Name: SwimmingGuidelinesTestResultDescription, dtype: int64

In [8]:
air_dataset['Property'].value_counts()

E-coli           32812
Enterococci      26323
Cyanobacteria    11751
Name: Property, dtype: int64

In [9]:
# memisahkan data dan label
X = air_dataset.drop(columns=['Agency','Region','siteName', 'SiteID', 'LawaSiteID','siteType', 'Resample following exceedance?', 'DateCollected','SwimIcon','SwimmingGuidelinesTestResultDescription'], axis=11)
Y = air_dataset['SwimmingGuidelinesTestResultDescription']

In [10]:
print(X)

        Latitude   Longitude Property  Value
0     -38.167698  176.347488   E-coli   25.0
1     -38.167698  176.347488   E-coli    2.0
2     -38.167698  176.347488   E-coli    1.0
3     -38.167698  176.347488   E-coli    5.0
4     -38.167698  176.347488   E-coli    2.0
...          ...         ...      ...    ...
70881 -42.376147  171.242946   E-coli   40.0
70882 -42.376147  171.242946   E-coli  150.0
70883 -42.376147  171.242946   E-coli   36.0
70884 -42.376147  171.242946   E-coli  100.0
70885 -42.376147  171.242946   E-coli  270.0

[70886 rows x 4 columns]


In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70886 entries, 0 to 70885
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   70593 non-null  float64
 1   Longitude  70593 non-null  float64
 2   Property   70886 non-null  object 
 3   Value      70505 non-null  float64
dtypes: float64(3), object(1)
memory usage: 2.2+ MB


In [12]:
data_cleaned = air_dataset.dropna(subset=['SwimmingGuidelinesTestResultDescription'])
X = data_cleaned[['Latitude', 'Longitude', 'Value', 'Property']]
Y = data_cleaned['SwimmingGuidelinesTestResultDescription']

In [13]:
label_mapping = {
    'E-coli': 1,
    'Enterococci': 2,
    'Cyanobacteria': 3
}

X['Property'] = X['Property'].replace(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Property'] = X['Property'].replace(label_mapping)


In [14]:
from sklearn.impute import SimpleImputer

# Ubah tipe data kolom "Property" menjadi objek
X.loc[:, 'Property'] = X['Property'].astype('object')

# Gunakan SimpleImputer dengan strategi "most_frequent" untuk mengisi nilai yang hilang
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Property'] = X['Property'].astype('object')


In [15]:
label_mapping = {
    'Suitable for swimming': 0,
    'Unsuitable for swimming': 1,
    'Caution advised': 2
}

Y = Y.replace(label_mapping)


In [16]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


In [17]:
import numpy as np

unique_values, value_counts = np.unique(Y, return_counts=True)

for value, count in zip(unique_values, value_counts):
    print(f"{value}: {count}")

0: 55454
1: 7136
2: 5751


In [18]:
print(X)

[[-38.167698 176.347488 25.0 1]
 [-38.167698 176.347488 2.0 1]
 [-38.167698 176.347488 1.0 1]
 ...
 [-42.37614725 171.2429464 36.0 1]
 [-42.37614725 171.2429464 100.0 1]
 [-42.37614725 171.2429464 270.0 1]]


In [19]:
print(Y)

0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


3. Standarisasi Data

In [20]:
scaler = StandardScaler()

In [21]:
scaler.fit(X)

In [22]:
standarized_data = scaler.transform(X)

In [23]:
print(standarized_data)

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]
 [ 0.90855572  0.79342662 -0.15313303 -0.93734091]
 [ 0.90855572  0.79342662 -0.15366634 -0.93734091]
 ...
 [-0.71608814 -1.44063093 -0.1350006  -0.93734091]
 [-0.71608814 -1.44063093 -0.10086896 -0.93734091]
 [-0.71608814 -1.44063093 -0.0102068  -0.93734091]]


In [24]:
X = standarized_data

In [25]:
print(X)
print(Y)

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]
 [ 0.90855572  0.79342662 -0.15313303 -0.93734091]
 [ 0.90855572  0.79342662 -0.15366634 -0.93734091]
 ...
 [-0.71608814 -1.44063093 -0.1350006  -0.93734091]
 [-0.71608814 -1.44063093 -0.10086896 -0.93734091]
 [-0.71608814 -1.44063093 -0.0102068  -0.93734091]]
0        0
1        0
2        0
3        0
4        0
        ..
70881    0
70882    0
70883    0
70884    0
70885    2
Name: SwimmingGuidelinesTestResultDescription, Length: 68341, dtype: int64


4. Memisahkan Data Training dan Data Testing

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2,stratify=Y, random_state=2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)

(68341, 4) (54672, 4) (13669, 4)


5. Membuat data latih menggunakan algoritma SVM

In [28]:
classifier = svm.SVC(kernel = 'linear')

In [29]:
classifier.fit(X_train, Y_train)

6. Membuat model evaluasi untuk mengukur tingkat akurasi

In [30]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [31]:
print('Akurasi data training adalah = ', training_data_accuracy)

Akurasi data training adalah =  0.9625036581796897


In [32]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [33]:
print('Akurasi data testing adlaah = ', test_data_accuracy)

Akurasi data testing adlaah =  0.9653961518765088


7. Membuat Model Prediksi

In [37]:
input_data = (-38.167698, 176.347488, 25, 1)

input_data_as_numpy_array = np.array(input_data)

input_data_reshape = input_data_as_numpy_array.reshape(1,-1)

std_data = scaler.transform(input_data_reshape)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
    print('Anda boleh berenang')

elif(prediction[0] == 1):
    print('Anda tidak boleh berenang')
else :
    print('Berhati-hati')    

[[ 0.90855572  0.79342662 -0.14086697 -0.93734091]]
[0]
Anda boleh berenang


8. Simpan Model

In [35]:
import pickle

In [36]:
filename = 'air_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [None]:
filename = 'scaler.sav'
pickle.dump(scaler, open(filename, 'wb'))