## Import Library

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Import Data

In [3]:
train_df = pd.read_csv('data_train.csv')
x_train = train_df.drop(columns=['CropEncode'])
y_train = train_df['CropEncode']
x_train.head()

Unnamed: 0,Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall
0,17,16,14,16.396243,92.181519,6.625539,102.944161
1,37,79,19,27.543848,69.347863,7.143943,69.408782
2,7,73,25,27.521856,63.132153,7.288057,45.208411
3,101,70,48,25.360592,75.031933,6.012697,116.553145
4,0,17,30,35.474783,47.972305,6.279134,97.790725


In [4]:
test_df = pd.read_csv('data_test.csv')
x_test = test_df.drop(columns=['CropEncode'])
y_test = test_df['CropEncode']
x_test.head()

Unnamed: 0,Nitrogen,Phosphorus,Potassium,Temperature,Humidity,pH_Value,Rainfall
0,101,17,47,29.494014,94.729813,6.185053,26.308209
1,98,8,51,26.179346,86.522581,6.259336,49.43051
2,59,62,49,43.360515,93.351916,6.941497,114.778071
3,44,60,55,34.280461,90.555616,6.825371,98.540477
4,30,137,200,22.9143,90.704756,5.603413,118.604465


## SVM

In [5]:
from sklearn import svm

clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)

In [6]:
from sklearn.metrics import mean_squared_error
from math import sqrt

train_pred = clf.predict(x_train)
mse = mean_squared_error(y_train, train_pred)
rmse = sqrt(mse)
print('Train RMSE: ', rmse)

Train RMSE:  1.0702591020190138


In [7]:
test_pred = clf.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
rmse = sqrt(mse)
print('Test RMSE: ', rmse)

Test RMSE:  1.4946875623657638


## SVM with GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [11]:
grid_search.best_params_

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

In [12]:
clf_grid = svm.SVC(C=10, gamma='scale', kernel='rbf')
clf_grid.fit(x_train, y_train)

In [13]:
train_pred_grid = clf_grid.predict(x_train)
mse = mean_squared_error(y_train, train_pred_grid)
rmse = sqrt(mse)
print('Train RMSE: ', rmse)

Train RMSE:  1.1453959220207737


In [14]:
test_pred_grid = clf_grid.predict(x_test)
mse = mean_squared_error(y_test, test_pred_grid)
rmse = sqrt(mse)
print('Test RMSE: ', rmse)

Test RMSE:  1.4174240399721283


## Hasil

Skor RMSE sebelum dan sudah dilakukan tuning:
- SVM (Model import dari sklearn):
  - Train = 1.0702591020190138
  - Test = 1.4946875623657638
- GridSearchCV:
  - Train = 1.1453959220207737
  - Test = 1.4174240399721283

Kesimpulan:
- Setelah tuning, RMSE pada data test menurun dari 1.4946875623657638 menjadi 1.4174240399721283. Penurunan ini menunjukkan bahwa model yang telah dituning memiliki kemampuan yang lebih baik dalam memprediksi data yang belum pernah dilihat sebelumnya, sehingga generalisasi model menjadi lebih baik.
- RMSE pada data train sedikit meningkat dari 1.0702591020190138 menjadi 1.1453959220207737. Peningkatan kecil ini menunjukkan bahwa model yang dituning mungkin sedikit kurang overfit terhadap data latih, yang sering kali merupakan trade-off yang diinginkan untuk meningkatkan kinerja pada data uji.