In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from tslearn.metrics import dtw
from tslearn.utils import to_time_series
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from datetime import datetime
import os

In [2]:
class SawahClassification_KNNDTW:
    """k-NN classifier with a DTW distance for labelling time-series clusters as 'sawah'.

    The constructor loads a per-cluster time-series CSV and a label CSV, cleans
    the series, and splits the clusters into labeled / unlabeled sets.
    ``tune()`` grid-searches the DTW constraint, ``k`` and the year window with
    stratified k-fold cross-validation, ``save_model()`` / ``load_model()``
    persist and restore the winning parameters, and ``predict()`` labels either
    the unlabeled clusters or another CSV and writes the 'sawah' rows to disk.
    """

    def __init__(self,
                 filename, # name of the cluster time-series CSV
                 labeled_filename, # name of the labeling CSV
                 folder_name = 'raw_clusters_ts/',
                 output_folder = 'sawah_classification/',
                 metric = 'accuracy', # or precision
                 evi = True, # EVI or NDVI
                 random_state = 42
                ):
        """Store the configuration and immediately run ``preprocess()``.

        Args:
            filename: CSV name inside ``folder_name``; prefixed with ``"evi_"``
                when ``evi`` is true.
            labeled_filename: CSV name inside ``labeling/`` holding the labels.
            folder_name: Folder containing the time-series CSV files.
            output_folder: Folder receiving predictions and saved parameters.
            metric: Stored on the instance only; ``k_fold`` always scores
                accuracy regardless of this value.
            evi: Whether to read the ``evi_``-prefixed file.
            random_state: Seed used for the stratified k-fold shuffling.
        """
        self.labeled_filename = labeled_filename
        self.labeling_folder = 'labeling/'
        self.experiment_folder = 'experiment/'
        self.filename = "evi_" + filename if evi else filename
        self.kecamatan = filename.split(".")[0]
        self.evi = evi
        self.folder_name = folder_name
        self.output_folder = output_folder
        self.metric = metric
        self.best_params = {'acc': 0, 'dtw': None, 'k': None, 'year': None}
        self.best_model = None
        # Feature columns the best model was fitted on (set by tune()).
        self.best_columns = None
        self.wilayah_df = None
        self.labeled_merged = None
        self.wilayah_not_labeled = None
        self.labeled_df = None
        self.X_sawah = None
        self.y_sawah = None
        self.final_df = None
        self.sawah_clusters = None
        self.random_state = random_state
        self.preprocess()

    def preprocess(self):
        """Load both CSVs, clean the time series and build ``X_sawah`` / ``y_sawah``.

        Raises:
            KeyError: If the labeling CSV lacks ``.geo`` / ``system:index``, or
                the time-series CSV lacks ``cluster_id``.
        """
        start_time = time.time()
        wilayah_df = pd.read_csv(self.folder_name + self.filename)
        try:
            wilayah_df = wilayah_df.drop([".geo", "system:index"], axis=1)
        except KeyError:
            # The metadata columns are optional in the time-series file. Only
            # the "column not found" case is tolerated; other errors propagate.
            pass
        self.labeled_df = pd.read_csv(self.labeling_folder + self.labeled_filename).drop([".geo", "system:index"], axis=1)

        # Drop columns whose names are duplicated, keeping the first occurrence.
        wilayah_df = wilayah_df.loc[:, ~wilayah_df.columns.duplicated(keep='first')]

        # Sort the columns by name so that date columns end up in date order.
        wilayah_df = wilayah_df.reindex(sorted(wilayah_df.columns), axis=1)
        cluster_id = wilayah_df['cluster_id']
        # NOTE(review): this slice drops the first and the last sorted column.
        # The last one is presumably 'cluster_id'; confirm that the first one
        # is not a real date column that should be kept.
        wilayah_df = wilayah_df.iloc[:, 1:-1].interpolate(axis=1)
        # Fill gaps at the start / end of each series that interpolation left.
        wilayah_df = wilayah_df.bfill(axis=1)
        wilayah_df = wilayah_df.ffill(axis=1)
        wilayah_df['cluster_id'] = cluster_id

        self.wilayah_df = wilayah_df

        is_labeled = wilayah_df['cluster_id'].isin(self.labeled_df['cluster_id'])
        wilayah_labeled = wilayah_df[is_labeled]
        self.wilayah_not_labeled = wilayah_df[~is_labeled]
        print("Labeled data: " + str(wilayah_labeled.shape))
        print("Non-labeled data: " + str(self.wilayah_not_labeled.shape))

        self.labeled_merged = pd.merge(wilayah_labeled, self.labeled_df, on='cluster_id', how='left')
        # The last two merged columns are treated as non-features
        # (presumably 'cluster_id' and 'label' -- confirm against the label CSV).
        self.X_sawah = self.labeled_merged.iloc[:, :-2]
        self.y_sawah = self.labeled_merged['label']
        print("--- Preprocessing done in %s seconds ---" % (time.time() - start_time))

    def _year_columns(self, start_year, year_span):
        """Return the feature columns whose 4-digit year prefix is in [start_year, start_year + year_span)."""
        return [str(col) for col in self.X_sawah.columns
                if start_year <= int(col[:4]) < start_year + year_span]

    @staticmethod
    def _dtw_grid(dtw_tuner):
        """Return (constraints, slopes, radii); empty slope/radius lists become ``[None]``.

        An empty list would otherwise make the nested search loops run zero
        times, which is exactly what the default ``dtw_tuner`` used to do.
        """
        constraints = list(dtw_tuner['constraint'])
        slopes = list(dtw_tuner['slope']) or [None]
        radii = list(dtw_tuner['radius']) or [None]
        return constraints, slopes, radii

    def tune(self, dtw_tuner = None, knn_tuner = None, date_tuner = None, output_file='loaded_model.txt'):
        '''
        Grid-search DTW parameters, k and the year window; keep the best model.

        Every combination is scored with ``k_fold`` and logged to
        ``experiment/<output_file>``. The best configuration is then refitted
        on all labeled data, restricted to the columns of its own year window.

        dtw_tuner = {constraint: ['default', 'sakoe_chiba', 'itakura'], slope:[1, 2, 3, 4, 5], radius: [1,3,5,7]}
        knn_tuner = [3, 5, 7, 9, 11]
        date_tuner = {'start': [2011, 2012, 2013, 2014, 2015], 'span': [3, 5, 7, 10]}

        An empty ``slope`` or ``radius`` list is treated as ``[None]``.

        Raises:
            ValueError: If no combination produced a usable score (for example
                an empty ``knn_tuner`` or an empty year list).
        '''
        output_file = self.experiment_folder + output_file
        log_dir = os.path.dirname(output_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        if dtw_tuner is None:
            dtw_tuner = {
                'constraint': ['default'],
                'slope': [],
                'radius': []
            }

        if knn_tuner is None:
            knn_tuner = [5]

        if date_tuner is None:
            date_tuner = {
                'start': [2020],
                'span': [5]
            }

        constraints, slopes, radii = self._dtw_grid(dtw_tuner)

        with open(output_file, 'w') as f:
            # Write header information
            f.write("=" * 80 + "\n")
            f.write(f"SAWAH CLASSIFICATION KNN-DTW TUNING EXPERIMENT\n")
            f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Dataset: {self.filename}\n")
            f.write(f"Labels: {self.labeled_filename}\n")
            f.write(f"EVI mode: {self.evi}\n")
            f.write("=" * 80 + "\n\n")

            # Write experiment parameters
            f.write("EXPERIMENT PARAMETERS:\n")
            f.write(f"DTW constraints: {dtw_tuner['constraint']}\n")
            f.write(f"DTW slopes: {dtw_tuner['slope']}\n")
            f.write(f"DTW radii: {dtw_tuner['radius']}\n")
            f.write(f"KNN values: {knn_tuner}\n")
            f.write(f"Start years: {date_tuner['start']}\n")
            f.write(f"Year spans: {date_tuner['span']}\n\n")

        # Encode the labels as integers so cross-validation is cheaper.
        le = LabelEncoder()
        y_encoded = le.fit_transform(self.y_sawah)

        # Count the combinations that will actually be evaluated.
        total_iterations = (len(constraints) * len(slopes) *
                            len(radii) * len(knn_tuner) *
                            len(date_tuner['start']) * len(date_tuner['span']))

        print(f"Total kombinasi yang diuji: {total_iterations}\n")

        with open(output_file, 'a') as f:
            f.write(f"Total combinations to test: {total_iterations}\n\n")
            f.write("RESULTS:\n")
            f.write("-" * 80 + "\n")
            f.write("{:<5} {:<15} {:<20} {:<15} {:<10} {:<15}\n".format(
                "Iter", "Year Range", "DTW Parameters", "k", "Accuracy", "Time (s)"))
            f.write("-" * 80 + "\n")

        print("Mulai hyperparameter tuning...")
        start_time = time.time()

        # -inf lets a 0.0 score still be selected; NaN scores never compare
        # greater and are therefore skipped.
        best_acc = float('-inf')
        best_params = {
            'dtw': None,
            'k': None,
            'year': None,
            'acc': 0
        }
        best_model = None
        best_columns = None

        iteration = 0
        for start_year in date_tuner['start']:
            for year_span in date_tuner['span']:
                # Restrict the features to the requested year window.
                year_columns = self._year_columns(start_year, year_span)
                X_subset = self.X_sawah[year_columns]

                for constraint in constraints:
                    for slope in slopes:
                        for radius in radii:

                            dtw_params = {
                                'constraint': constraint,
                                'slope': slope,
                                'radius': radius
                            }

                            distance_metric = self.get_metric_func(dtw_params)

                            for k in knn_tuner:
                                iteration_start_time = time.time()
                                iteration += 1
                                print(f"{iteration}/{total_iterations} ", end="")

                                knn = KNeighborsClassifier(
                                    n_neighbors=k,
                                    metric=distance_metric
                                )

                                curr_acc = self.k_fold(knn, X=X_subset, y=y_encoded)

                                iteration_time = time.time() - iteration_start_time
                                dtw_str = f"{constraint}"
                                if slope is not None:
                                    dtw_str += f", slope={slope}"
                                if radius is not None:
                                    dtw_str += f", rad={radius}"

                                with open(output_file, 'a') as f:
                                    f.write("{:<5} {:<15} {:<20} {:<15} {:<10.4f} {:<15.2f}\n".format(
                                        iteration,
                                        f"{start_year}-{start_year+year_span}",
                                        dtw_str,
                                        k,
                                        curr_acc,
                                        iteration_time
                                    ))

                                if curr_acc > best_acc:
                                    best_acc = curr_acc
                                    best_params = {
                                        'dtw': dtw_params,
                                        'k': k,
                                        'year': {
                                            'start': start_year,
                                            'span': year_span
                                        },
                                        'acc': curr_acc
                                    }
                                    best_model = knn
                                    # Remember this window's columns; X_subset
                                    # is overwritten by later year windows.
                                    best_columns = year_columns
                                    with open(output_file, 'a') as f:
                                        f.write(f">>> NEW BEST: Accuracy={best_acc:.4f} <<<\n")
                                    print(f"\nNew best accuracy: {best_acc:.4f} with parameters: {best_params}")

        if best_model is None:
            raise ValueError(
                "tune() evaluated no usable parameter combination; "
                "check that knn_tuner, date_tuner and dtw_tuner['constraint'] are non-empty"
            )

        # Refit the winner on all labeled data, using the ORIGINAL string
        # labels because predict() compares predictions against 'sawah'.
        self.best_params = best_params
        self.best_columns = best_columns
        self.best_model = best_model.fit(self.X_sawah[best_columns], self.y_sawah)

        total_time = time.time() - start_time

        with open(output_file, 'a') as f:
            f.write("-" * 80 + "\n\n")
            f.write("SUMMARY:\n")
            f.write(f"Total tuning time: {total_time:.2f} seconds\n")
            f.write(f"Best accuracy: {best_params['acc']:.4f}\n")
            f.write(f"Best parameters:\n")
            f.write(f"{best_params}")

        print(f"--- Tuning completed in {total_time:.2f} seconds ---")
        print(f"Best parameters: {best_params}")

    def k_fold(self, model, X=None, y=None, folds=10):
        """Return the mean stratified k-fold accuracy of ``model``.

        Args:
            model: Unfitted estimator passed to ``cross_val_score``.
            X: Feature frame; defaults to ``self.X_sawah``.
            y: Targets; defaults to ``self.y_sawah``.
            folds: Number of stratified, shuffled folds.

        Returns:
            Mean of the per-fold accuracy scores.
        """
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.random_state)

        X_sawah = X if X is not None else self.X_sawah
        y_sawah = y if y is not None else self.y_sawah

        # Scoring is fixed to accuracy; self.metric is not consulted here.
        stratified_cv_scores = cross_val_score(model, X_sawah, y_sawah, cv=skf, scoring='accuracy')
        return np.mean(stratified_cv_scores)

    def get_metric_func(self, dtw_params):
        """Build a two-argument DTW distance callable for ``KNeighborsClassifier``.

        Args:
            dtw_params: Dict with keys ``'constraint'`` (``'itakura'``,
                ``'sakoe_chiba'`` or anything else for unconstrained DTW),
                ``'slope'`` and ``'radius'``.

        Returns:
            A function ``(x, y) -> DTW distance``. DTW is computed exactly once
            per call, with the selected global constraint.
        """
        constraint = dtw_params['constraint']
        slope = dtw_params['slope']
        radius = dtw_params['radius']

        def dtw_distance(x, y):
            x_formatted = to_time_series(x)
            y_formatted = to_time_series(y)
            if constraint == 'itakura':
                return dtw(x_formatted, y_formatted, global_constraint="itakura", itakura_max_slope=slope)
            if constraint == 'sakoe_chiba':
                return dtw(x_formatted, y_formatted, global_constraint="sakoe_chiba", sakoe_chiba_radius=radius)
            return dtw(x_formatted, y_formatted)
        return dtw_distance

    def save_model(self):
        """Pickle ``best_params`` to ``<output_folder>saved_models/params_<file stem>.pkl``.

        Only the parameters are stored, not the fitted estimator;
        ``load_model`` rebuilds the model by re-running ``tune``. Prints a hint
        and does nothing when no model has been tuned or loaded yet.
        """
        if self.best_model is None:
            print("Optimized model not found, try to run tune() or load_model() first")
            return

        print("Saving...")
        model_dir = self.output_folder + 'saved_models/'
        os.makedirs(model_dir, exist_ok=True)
        file_stem = self.filename.split(".")[0]
        with open(model_dir + f'params_{file_stem}.pkl', 'wb') as f:
            pickle.dump(self.best_params, f)
        print(f"Model successfully saved in /saved_models")

    def load_model(self, params_filename):
        """Restore saved parameters and refit the model by re-running ``tune`` on them.

        Args:
            params_filename: Pickle file name inside ``<output_folder>saved_models/``.
        """
        print("Loading model...")
        start_time = time.time()
        # SECURITY: pickle.load can execute arbitrary code; only load parameter
        # files that this class wrote itself.
        with open(self.output_folder + 'saved_models/' + params_filename, 'rb') as f:
            best_params = pickle.load(f)
        dtw_tuner = {'constraint': [best_params['dtw']['constraint']], 'slope':[best_params['dtw']['slope']], 'radius': [best_params['dtw']['radius']]}
        knn_tuner = [best_params['k']]
        date_tuner = {'start': [best_params['year']['start']], 'span': [best_params['year']['span']]}

        # A single-combination "search": cross-validates once, then fits.
        self.tune(dtw_tuner, knn_tuner, date_tuner)
        print(f"---Model loaded in {time.time()-start_time:.2f} seconds---")

    def predict(self, kecamatan_filename=None):
        """Label clusters with the best model and write the 'sawah' rows to a CSV.

        Args:
            kecamatan_filename: Optional CSV name inside ``folder_name``. When
                given, that file is classified; otherwise the unlabeled
                clusters of the training file are classified and combined with
                the labeled ones.

        The full labeled result is kept in ``self.final_df``; the rows
        predicted as ``'sawah'`` (without the ``label`` column) are kept in
        ``self.sawah_clusters`` and written to ``<output_folder><file stem>.csv``.

        Raises:
            RuntimeError: If no model has been tuned or loaded yet.
        """
        if self.best_model is None:
            raise RuntimeError("Optimized model not found, try to run tune() or load_model() first")

        print("Predicting entire dataset...")
        start_time = time.time()
        year_columns = self._year_columns(self.best_params['year']['start'],
                                          self.best_params['year']['span'])

        if kecamatan_filename is not None:
            kec_df = pd.read_csv(self.folder_name + kecamatan_filename)

            # Align the new file to the training features; columns it lacks
            # become NaN and are filled from neighbouring dates.
            adjusted_df = kec_df.reindex(year_columns, axis=1)
            adjusted_df = adjusted_df.interpolate(axis=1)
            adjusted_df = adjusted_df.ffill(axis=1)
            adjusted_df = adjusted_df.bfill(axis=1)

            y_pred = self.best_model.predict(adjusted_df)

            # Metadata columns are optional, as in preprocess().
            final_df = kec_df.drop(columns=[".geo", "system:index"], errors='ignore')
            final_df['label'] = y_pred
        elif self.wilayah_not_labeled.empty:
            # Nothing to predict: every cluster already has a label.
            final_df = self.labeled_merged.copy()
        else:
            predict_res = self.wilayah_not_labeled.copy()
            predict_res['label'] = self.best_model.predict(self.wilayah_not_labeled[year_columns])
            final_df = pd.concat([self.labeled_merged.copy(), predict_res], axis=0)

        self.final_df = final_df

        # drop() returns a new frame, so its result must be kept.
        sawah_clusters = final_df[final_df['label'] == 'sawah'].drop("label", axis=1)
        self.sawah_clusters = sawah_clusters

        output_filename = self.filename if kecamatan_filename is None else kecamatan_filename
        if self.output_folder:
            os.makedirs(self.output_folder, exist_ok=True)
        # Built by concatenation: nesting the same quote type inside an
        # f-string is a SyntaxError before Python 3.12.
        sawah_clusters.to_csv(self.output_folder + output_filename.split(".")[0] + ".csv")
        print("--- Prediction finished in %s seconds ---" % (time.time() - start_time))

In [3]:
# Build the classifier for the Campaka training data. The constructor runs
# preprocess() immediately and, because evi defaults to True, reads
# 'raw_clusters_ts/evi_campaka_label_only.csv'.
test_tuning = SawahClassification_KNNDTW(
    filename='campaka_label_only.csv',
    labeled_filename='label_sawah_campaka.csv',
)

Labeled data: (725, 1035)
Non-labeled data: (0, 1035)
--- Preprocessing done in 0.18820786476135254 seconds ---


In [4]:
# Single-point search grid, passed to tune() in a later cell.
# Presumably taken from row 18 of an earlier tuning log (confirm the source):
# 18    2015-2020       sakoe_chiba, slope=1, rad=15 11              0.8605     181.29

dtw_tuner = dict(constraint=['sakoe_chiba'], slope=[1], radius=[15])
knn_tuner = [11]
date_tuner = dict(start=[2015], span=[5])

In [6]:
# Persist the tuned parameters (best_params only, not the fitted estimator).
# save_model() just prints a hint when no model has been tuned or loaded yet.
# NOTE(review): this cell sits above the tune() cell in the notebook, although
# the execution counts show tune() ran first (In [5]); on a top-to-bottom
# "Run All" nothing would be saved here.
test_tuning.save_model()

Saving...
Model successfully saved in /saved_models


In [7]:
# Restore saved parameters; load_model() re-runs tune() on that single
# combination, so this repeats the full cross-validation (see the timing below).
# NOTE(review): save_model() on this instance writes
# 'params_evi_campaka_label_only.pkl' (derived from self.filename), so
# 'params_evi_campaka.pkl' is a different file -- confirm it is the intended one.
test_tuning.load_model('params_evi_campaka.pkl')

Loading model...
Total kombinasi yang diuji: 1

Mulai hyperparameter tuning...
1/1 
New best accuracy: 0.8566 with parameters: {'dtw': {'constraint': 'sakoe_chiba', 'slope': 1, 'radius': 15}, 'k': 11, 'year': {'start': 2015, 'span': 5}, 'acc': 0.8566473000683527}
--- Tuning completed in 386.69 seconds ---
Best parameters: {'dtw': {'constraint': 'sakoe_chiba', 'slope': 1, 'radius': 15}, 'k': 11, 'year': {'start': 2015, 'span': 5}, 'acc': 0.8566473000683527}
---Model loaded in 386.73 seconds---


In [6]:
# Classify every cluster in raw_clusters_ts/evi_cilaku.csv with the tuned model
# and write the rows predicted as 'sawah' to sawah_classification/evi_cilaku.csv.
# Requires a model from tune() or load_model(); the recorded run took over an hour.
test_tuning.predict("evi_cilaku.csv")

Predicting entire dataset...
--- Prediction finished in 4008.861605167389 seconds ---


In [5]:
# Evaluate the single configuration defined above and fit the final model;
# the per-iteration log is written to experiment/temp_sawah.txt.
test_tuning.tune(
    dtw_tuner=dtw_tuner,
    knn_tuner=knn_tuner,
    date_tuner=date_tuner,
    output_file='temp_sawah.txt',
)

Total kombinasi yang diuji: 1

Mulai hyperparameter tuning...
1/1 
New best accuracy: 0.8566 with parameters: {'dtw': {'constraint': 'sakoe_chiba', 'slope': 1, 'radius': 15}, 'k': 11, 'year': {'start': 2015, 'span': 5}, 'acc': 0.8566473000683527}
--- Tuning completed in 346.71 seconds ---
Best parameters: {'dtw': {'constraint': 'sakoe_chiba', 'slope': 1, 'radius': 15}, 'k': 11, 'year': {'start': 2015, 'span': 5}, 'acc': 0.8566473000683527}


# Viz persawahan

In [8]:
# Prediction outputs (one CSV per kecamatan) to summarise below.
classified_sawah = [
    'evi_warungkondang.csv',
    'evi_gekbrong.csv',
    'evi_cugenang.csv',
    'evi_cianjur.csv',
    'evi_cilaku.csv',
    'evi_campaka.csv',
    'evi_cibeber.csv',
]

In [10]:
# For each prediction file: report the estimated area and export the cluster
# ids as one comma-separated line under sawah_classification/cluster_ids/.
classified_sawah = [
    'evi_warungkondang.csv',
    'evi_gekbrong.csv',
    'evi_cugenang.csv',
    'evi_cianjur.csv',
    'evi_cilaku.csv',
    'evi_campaka.csv',
    'evi_cibeber.csv',
]
for sawah in classified_sawah:
    df = pd.read_csv("sawah_classification/" + sawah)
    # NOTE(review): 81 / 100 presumably converts a per-cluster area into
    # hectares (0.81 ha per row) -- confirm the cluster footprint.
    luas = (len(df) * 81) / 100
    print(f"untuk sawah {sawah} mempunyai luas: {luas} hektar")
    cluster_ids = ', '.join(str(cid) for cid in df['cluster_id'].tolist())

    with open('sawah_classification/cluster_ids/' + sawah.split(".")[0] + '.txt', 'w') as f:
        f.write(cluster_ids)

untuk sawah evi_warungkondang.csv mempunyai luas: 1938.33 hektar
untuk sawah evi_gekbrong.csv mempunyai luas: 870.75 hektar
untuk sawah evi_cugenang.csv mempunyai luas: 1230.39 hektar
untuk sawah evi_cianjur.csv mempunyai luas: 994.68 hektar
untuk sawah evi_cilaku.csv mempunyai luas: 2556.36 hektar
untuk sawah evi_campaka.csv mempunyai luas: 2090.61 hektar
untuk sawah evi_cibeber.csv mempunyai luas: 3027.78 hektar


In [12]:
# NOTE(review): test_df is not defined in any cell visible here, so this cell
# relies on leftover kernel state and would fail after Restart & Run All
# unless it is defined elsewhere -- define it explicitly or remove the cell.
test_df.shape

(1228, 1049)

In [200]:
# Scratch calculation left over from an interactive session (execution count
# 200); nothing else in the visible notebook uses the result.
581 * 100

58100