In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from tslearn.metrics import dtw
from tslearn.utils import to_time_series
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time

In [17]:
folder_name = 'sawah_classification/'

collected_data = ["campaka"]
curr_kec = collected_data[0]

#dtw_knn_2 -> datasetnya lebih besar
# https://code.earthengine.google.com/52607160b523092795f576172fb238b3

labeled_df = pd.read_csv(folder_name + "label_sawah_nonsawah_campaka.csv").drop([".geo", "system:index"], axis=1)
wilayah_df = pd.read_csv(folder_name + "full_campaka.csv").drop("system:index", axis=1)
cluster_geo = wilayah_df[['cluster_id', '.geo']]
wilayah_df = wilayah_df.drop(".geo", axis=1)

In [18]:
wilayah_df.head()

Unnamed: 0,1_1_1_20200110T030101_20200110T030652_T48MYT,1_1_1_20200115T030049_20200115T031557_T48MYT,1_1_1_20200120T030021_20200120T030612_T48MYT,1_1_1_20200125T030009_20200125T031245_T48MYT,1_1_1_20200130T025941_20200130T031830_T48MYT,1_1_1_20200204T025919_20200204T031238_T48MYT,1_1_1_20200209T025851_20200209T030437_T48MYT,1_1_1_20200214T025819_20200214T031912_T48MYT,1_1_1_20200219T025751_20200219T030336_T48MYT,1_1_1_20200224T025719_20200224T031654_T48MYT,...,2_LE07_121065_20221231,2_LE07_121065_20240112,2_LE07_122065_20200125,2_LE07_122065_20201226,2_LE07_122065_20220114,2_LE07_122065_20220524,2_LE07_122065_20230105,2_LE07_122065_20230225,2_LE07_122065_20230302,cluster_id
0,0.70105,0.70105,0.70105,0.46564,0.342311,0.342311,0.784618,0.785765,0.785765,0.774083,...,0.62262,0.771118,0.70105,0.047598,0.655566,0.79032,0.666707,0.054712,0.054712,314340425
1,,0.549854,0.549854,0.549854,0.425542,0.425542,0.531134,0.523425,0.523425,0.359632,...,0.400261,0.533082,0.549854,0.050287,0.49681,0.517567,0.419083,-0.320322,-0.320322,1179512119
2,0.634526,0.679525,0.679525,0.599508,0.333265,0.333265,0.702724,0.703249,0.703249,0.668738,...,0.611588,0.67301,0.679525,0.059639,0.67648,0.745061,0.713188,0.04668,0.04668,2092296463
3,0.332606,0.681451,0.681451,0.681451,0.341275,0.341275,0.814456,0.814456,0.814456,0.788976,...,0.765344,0.809831,0.681451,0.064449,0.798185,0.828702,0.804812,0.048421,0.048421,762622032
4,0.318706,0.612317,0.612317,0.612317,0.352568,0.352568,0.770156,0.771311,0.771311,0.719891,...,0.737324,0.782656,0.612317,0.049978,0.81908,0.839136,0.758973,0.012685,0.012685,-176244505


In [19]:
columns = wilayah_df.columns.tolist()
renamed_columns = []

for col in columns:
    colname = ""
    if "T48MYT" in col:
        colname = col.split("_")[3][:8]
    elif "LC08" in col:
        colname = col.split("_")[5]
    elif "LE07" in col:
        colname = col.split("_")[3]
    renamed_columns.append(colname)

renamed_columns = renamed_columns[:-1]

old_new_col = dict(zip(columns, renamed_columns))

In [20]:
wilayah_df = wilayah_df.rename(columns=old_new_col)
# print(wilayah_df)
# menghapus kolom yg namanya duplikat
wilayah_df = wilayah_df.loc[:, ~wilayah_df.columns.duplicated(keep='first')]

# mengurutkan kolom berdasarkan urutan tanggalnya
wilayah_df = wilayah_df.reindex(sorted(wilayah_df.columns), axis=1)

wilayah_df = wilayah_df.ffill(axis=1)
wilayah_df = wilayah_df.bfill(axis=1)

In [21]:
wilayah_labeled = wilayah_df[wilayah_df['cluster_id'].isin(labeled_df['cluster_id'])]
wilayah_not_labeled = wilayah_df[~wilayah_df['cluster_id'].isin(labeled_df['cluster_id'])]
print("labeled data: " + str(wilayah_labeled.shape))
print("non-labeled data: " + str(wilayah_not_labeled.shape))

labeled data: (98, 411)
non-labeled data: (3717, 411)


In [22]:
wilayah_labeled = pd.merge(wilayah_labeled, labeled_df, on='cluster_id', how='left')

X_wk = wilayah_labeled.iloc[:, :-2]
y_wk = wilayah_labeled['label']
print(X_wk.shape)

X_train, X_test, y_train, y_test = train_test_split(X_wk, y_wk, test_size=0.25, random_state=42, stratify=y_wk)

(99, 410)


In [23]:
def dtw_score(x, y):
    x_formatted = to_time_series(x)
    y_formatted = to_time_series(y)
    dtw_score = dtw(x_formatted, y_formatted)
    itakura_dtw = dtw(x_formatted, y_formatted, global_constraint="itakura", itakura_max_slope=2)
    sakoe_chiba_dtw = dtw(x_formatted, y_formatted, global_constraint="sakoe_chiba", sakoe_chiba_radius=2)
    return sakoe_chiba_dtw
    
    

knn_1 = KNeighborsClassifier(n_neighbors = 5, metric=dtw_score)
knn_1.fit(X_train, y_train)

y_pred = knn_1.predict(X_test)

In [24]:
def evaluate(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=np.unique(y_test), 
                yticklabels=np.unique(y_test))
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()
evaluate(y_test, y_pred)

In [25]:
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}


# print(scoring)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

stratified_cv_scores = cross_val_score(knn_1, X_wk, y_wk, cv=skf, scoring='accuracy')

print("\nStratified k-fold results:")
for i, score in enumerate(stratified_cv_scores):
    print(f"Fold {i+1}: {score:.4f}")

print(f"\nAverage accuracy (stratified): {np.mean(stratified_cv_scores):.4f}")
print(f"Standard deviation (stratified): {np.std(stratified_cv_scores):.4f}")
#with larger dataset

start_time = time.time()
final_model = KNeighborsClassifier(n_neighbors=5, metric=dtw_score)
final_model.fit(X_wk, y_wk)

pickle.dump(final_model, open(folder_name + 'saved_models/' + f'classification_{curr_kec}.pkl', 'wb'))
predict_all = final_model.predict(wilayah_not_labeled.iloc[:, :-1])
print("--- %s seconds ---" % (time.time() - start_time))


Stratified k-fold results:
Fold 1: 1.0000
Fold 2: 0.8000
Fold 3: 0.8000
Fold 4: 1.0000
Fold 5: 0.9000
Fold 6: 1.0000
Fold 7: 1.0000
Fold 8: 1.0000
Fold 9: 1.0000
Fold 10: 1.0000

Average accuracy (stratified): 0.9500
Standard deviation (stratified): 0.0806


In [27]:
unlabeled_df = wilayah_not_labeled.copy()
unlabeled_df['label'] = predict_all
predicted_df = unlabeled_df.copy()

model_df = wilayah_labeled.copy()
final_df = pd.concat([model_df, predicted_df], axis=0)
final_df.shape

(3816, 412)

In [28]:
sawah_clusters = final_df[final_df['label'] == 'sawah']
cluster_ids = sawah_clusters['cluster_id'].tolist()

In [29]:
print(cluster_ids)

[-917794643.0, 311245178.0, -1650217684.0, 2124792546.0, -688633003.0, 1227641531.0, 1388155823.0, -1270827916.0, 1052306339.0, 915438257.0, -1132344047.0, 214820671.0, 343287607.0, -1344971223.0, 1789762246.0, -1449462029.0, -452163700.0, -1029889705.0, -1627081551.0, 599283644.0, 587744584.0, -263205183.0, -193013299.0, 197816647.0, 135003530.0, -2142798840.0, 2098126633.0, -1340001761.0, -1874608788.0, -1741580654.0, -371864521.0, 455897423.0, 35335500.0, 98334114.0, 344793503.0, 172473050.0, 131791723.0, 185122031.0, -1727544214.0, -523515288.0, 377971710.0, 1951536363.0, -1362720936.0, 921279356.0, -65749960.0, 142836573.0, -836217663.0, 2074493430.0, -904121058.0, 79397419.0, 951742797.0, 2092296463.0, -2106052113.0, 1233805816.0, -693147847.0, 355636100.0, 1805711005.0, -814502746.0, 362115070.0, 421884622.0, -178900516.0, 1417642363.0, 1611174765.0, -921939249.0, -2001110783.0, 857631799.0, 437925902.0, 1231212897.0, -608069479.0, -824688566.0, -653032671.0, -172145166.0, 11985