In [None]:
import scipy.io
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import time
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
import shutil
import itertools
import xgboost as xgb
import scikitplot as skplt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils import class_weight

In [None]:
import warnings
# Silence every Python warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings emitted by the libraries used below — consider narrowing it to
# specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

## Label Names

In [None]:
# Human-readable name for each class id: the position in this list is the
# numeric label used in the dataset (id 0 is the background).
indianpines_class_names = [
    'background',
    'alfalfa',
    'corn-notill',
    'corn-min',
    'corn',
    'grass/pasture',
    'grass/trees',
    'grass/pasture-mowed',
    'hay-windrowed',
    'oats',
    'soybeans-notill',
    'soybeans-min',
    'soybean-clean',
    'wheat',
    'woods',
    'bldg-grass-tree-drives',
    'stone-steel towers',
]

## Define Draw Classification Map

In [None]:
def draw_classification_map(classifier_name, prediction, X, y, train_indices, test_indices,
                            map_shape=(145, 145)):
    """Render and save a scene-wide label map mixing ground truth and predictions.

    Training pixels are painted with their true label taken from ``y``; test
    pixels are painted with the matching entry of ``prediction``. Pixels that
    belong to neither index set keep the value 0.

    Parameters
    ----------
    classifier_name : str
        Used in the figure title and as the prefix of the saved PNG file name.
    prediction : array-like
        Predicted labels; ``prediction[i]`` belongs to pixel ``test_indices[i]``.
    X : array-like
        Only ``X.shape[0]`` (the total number of pixels) is read.
    y : array-like
        Label of every pixel, indexed by flat pixel position.
    train_indices, test_indices : sequence of int
        Flat pixel positions of the training and test samples.
    map_shape : tuple of int, optional
        ``(rows, cols)`` of the scene. Defaults to ``(145, 145)``, the size that
        was previously hard-coded.

    Raises
    ------
    IndexError
        If ``prediction`` holds fewer entries than ``test_indices``.
    ValueError
        If ``X.shape[0]`` does not equal ``rows * cols`` (raised by ``reshape``).
    """
    labels = np.asarray(y)
    predicted = np.asarray(prediction)
    # Force an integer dtype so the positions are always valid array indices.
    train_idx = np.asarray(train_indices, dtype=int)
    test_idx = np.asarray(test_indices, dtype=int)

    if predicted.shape[0] < test_idx.shape[0]:
        raise IndexError(
            f'prediction has {predicted.shape[0]} entries but '
            f'{test_idx.shape[0]} test indices were given'
        )

    # Start from an all-zero map, then overwrite the known positions.
    clmap = np.zeros(X.shape[0], dtype=np.result_type(labels.dtype, predicted.dtype))
    clmap[train_idx] = labels[train_idx]
    # Assigned second so that a test position wins if an index appears in both sets.
    clmap[test_idx] = predicted[:test_idx.shape[0]]

    plt.figure(figsize=(10, 10))
    plt.imshow(clmap.reshape(map_shape), cmap='jet')
    plt.colorbar()
    plt.axis('off')
    plt.title(f'Classification Map ({classifier_name})')
    plt.savefig(f'{classifier_name}_classification_map.png')
    plt.show()

## Define Distribution Plot

In [None]:
def distribution_plot(n, df):
    """Plot the value distribution (histogram plus KDE curve) of one band.

    Parameters
    ----------
    n : int
        Band number; the column ``'band-<n>'`` of ``df`` is plotted.
    df : pandas.DataFrame
        Frame containing the band columns.
    """
    plt.figure(figsize=(16, 6))
    # sns.distplot is deprecated; this is the histplot call recommended by
    # seaborn as its replacement (density-normalised histogram + KDE overlay).
    sns.histplot(df['band-' + str(n)], color='mediumSpringGreen', bins=100,
                 kde=True, stat='density', kde_kws={'cut': 3},
                 alpha=0.4, edgecolor=(1, 1, 1, 0.4))
    plt.xlabel('Band - ' + str(n), fontsize=14)
    plt.title('Distribution Plot of Band - ' + str(n), fontsize=16)
    plt.show()

## Define Box Plot

In [None]:
def box_plot(n, df):
    """Draw one box per class for the values of band ``n``.

    Reads the ``'class'`` and ``'band-<n>'`` columns of ``df``.
    """
    band_column = f'band-{n}'

    plt.figure(figsize=(16, 6))
    sns.boxplot(x=df["class"], y=df[band_column], width=0.3)

    # Labelling calls are independent of each other.
    plt.xlabel('Class', fontsize=14)
    plt.ylabel(f'Band-{n}', fontsize=14)
    plt.title('Box Plot', fontsize=16)
    plt.show()

## Define Bar Plot

In [None]:
def bar_plot(df):
    """Plot the number of rows per class, annotating each bar with its share.

    The percentage written above a bar is that bar's height relative to the
    total number of rows in ``df``.
    """
    total_rows = df.shape[0]

    plt.figure(figsize=(14, 8))
    ax = sns.countplot(x='class', data=df[['class']])

    for bar in ax.patches:
        count = bar.get_height()
        share = 100 * count / total_rows
        # Place the label slightly right of the bar's left edge, just above it.
        ax.annotate(f'{share:.1f}%', (bar.get_x() + 0.1, count + 5))

    plt.title('Bar Plot', fontsize=16)
    plt.xlabel('class', fontsize=14)
    plt.ylabel('Class count with percentage', fontsize=14)
    plt.show()

## Define List to Array

In [None]:
def list2array(X, isdata=True):
    """Convert a sequence into a NumPy array.

    Parameters
    ----------
    X : iterable
        Items to convert; each item is converted individually and the results
        are stacked into one array.
    isdata : bool, optional
        ``True`` (default) converts to ``float``, which is the behaviour of the
        single-argument form. ``False`` converts to ``int``, for sequences of
        positions that will be used as indices.

    Returns
    -------
    numpy.ndarray
    """
    item_dtype = float if isdata else int
    return np.array([np.asarray(item, dtype=item_dtype) for item in X])

## Define Confusion Matrix

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title='Confusion Matrix',
                          normalize=False,
                          cmap=plt.cm.Blues):
    """Plot, save and show the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    classes : sequence of str
        Tick labels for both axes.
        NOTE(review): assumed to list every label in the same order that
        ``confusion_matrix`` uses for its rows/columns — confirm with callers.
    title : str, optional
        Figure title; the figure is also saved as ``'<title>.png'``.
    normalize : bool, optional
        If True, every row is divided by its sum so cells show per-class rates.
    cmap : matplotlib colormap, optional
        Colormap used for the matrix image.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no true samples stay 0 instead of becoming NaN; a NaN would
        # also turn the text-colour threshold below into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized Confusion Matrix")
    else:
        print('Confusion Matrix')

    plt.figure(figsize=(15, 15))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.savefig(f'{title}.png')
    plt.show()

## Define Save Report

In [None]:
def save_report(filename, metrics, report, best_params, train_acc):
    """Write an evaluation summary to a text file, replacing any existing file.

    The file contains, in order: a header line, ``report``, ``train_acc``,
    ``metrics``, a "Best Parameters" header and ``best_params``. Every value is
    converted with ``str``.
    """
    with open(filename, "w") as file:
        sections = (
            "Classification Report:\n",
            str(report),
            f'\n{str(train_acc)}',
            f"\n{str(metrics)}\n",
            "\nBest Parameters:\n",
            str(best_params),
        )
        file.writelines(sections)

## Classification Pipeline

In [None]:
from sklearn.model_selection import StratifiedKFold


def run_classification_pipeline(X_train, X_test, y_train, y_test, classifier_name, classifier, param_grid, cv_folds=5):
    """Grid-search a SMOTE + classifier pipeline and evaluate the best model.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels used for the grid search.
    X_test, y_test : array-like
        Held-out features and labels used for the final evaluation.
    classifier_name : str
        Label used in the printed summary line.
    classifier : estimator
        Installed as the ``'classifier'`` step of the pipeline, so keys of
        ``param_grid`` must use the ``'classifier__'`` prefix.
    param_grid : dict
        Grid passed to ``GridSearchCV``.
    cv_folds : int, optional
        Number of stratified, shuffled folds (default 5).

    Returns
    -------
    tuple
        ``(best_model, grid_search, best_params, y_pred)`` where ``y_pred`` are
        the best model's predictions for ``X_test``.
    """
    start_time = time.time()

    # SMOTE is a pipeline step rather than a pre-processing pass, so the
    # resampling is part of every fit performed by the grid search.
    pipeline = ImbPipeline(steps=[
        ('resampling', SMOTE(sampling_strategy='auto', random_state=1)),
        ('classifier', classifier),
    ])

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)

    train_acc = grid_search.score(X_train, y_train)
    # Scoring is 'accuracy' and y_pred comes from the same best estimator, so
    # reuse it instead of predicting the test set a second time.
    test_acc = accuracy_score(y_test, y_pred)

    print(f'{classifier_name} Train_Acc={train_acc:.3f}, Test_Cla_Acc={test_acc:.3f}, time_cost={time.time()-start_time:.3f}')

    return best_model, grid_search, best_params, y_pred


## Load Data

In [None]:
df_soil = pd.read_csv("dataset/dataset.csv")
# Class distribution. Printed explicitly: only the last expression of a cell is
# displayed automatically, so a bare expression here would show nothing.
print(df_soil.loc[:, 'class'].value_counts())
# Every column except the last is a feature; the last column is the label.
X = df_soil.iloc[:, :-1].values
y = df_soil.iloc[:, -1].values
print(X.shape, y.shape)
# Scene geometry and label-space constants.
height = 145
width = 145
band = 220
num_classes = 17
df_soil.head()

## Plot Bands

In [None]:
def plot_band(dataset, band_no=None, height=145, width=145):
    """Show one spectral band of the scene as an image.

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per pixel. A ``'class'`` column, if present, is excluded so the
        label can never be drawn as if it were a band.
    band_no : int, optional
        0-based position of the band column to show. A random band is chosen
        when omitted.
    height, width : int, optional
        Scene size in pixels; default to the previously hard-coded 145 x 145.
    """
    bands = dataset.drop(columns='class', errors='ignore')
    n_bands = bands.shape[1]
    if band_no is None:
        band_no = np.random.randint(n_bands)

    cube = bands.values.reshape(height, width, n_bands)

    plt.figure(figsize=(8, 6))
    plt.imshow(cube[:, :, band_no], cmap='jet')
    # Bands are numbered from 1 elsewhere in this notebook, so shift the 0-based position.
    plt.title(f'Band-{band_no + 1}', fontsize=14)
    plt.axis('off')
    plt.colorbar()
    plt.show()
plot_band(df_soil)

## Visualizing ground truth of the image.

In [None]:
plt.figure(figsize=(8, 6))
# Labels are stored one per pixel; fold them back into the scene grid.
ground_truth = df_soil['class'].values.reshape(height, width)
plt.imshow(ground_truth)
plt.axis('off')
# Class ids run from 0 to num_classes - 1, so every one of them gets a tick
# (a fixed range(0, 16) would leave out id 16).
plt.colorbar(ticks=range(num_classes))
plt.show()

## Visualizing Spectral Signatures

In [None]:
def plot_signature(df):
    """Plot the spectral signature of one randomly chosen pixel.

    Every column of ``df`` except the last is plotted as a band value; the last
    column is shown in the legend as the pixel's class. The band axis is
    derived from the frame width instead of being fixed at 220 bands.
    """
    pixel_no = np.random.randint(df.shape[0])
    print("Pixel No: ",pixel_no)

    signature = df.iloc[pixel_no, :-1].values.tolist()
    # Bands are numbered from 1 on the x axis.
    band_numbers = range(1, len(signature) + 1)

    plt.figure(figsize=(12, 6))
    plt.plot(band_numbers, signature, 'b--', label= f'Class - {df.iloc[pixel_no, -1]}')
    plt.legend()
    plt.title(f'Pixel({pixel_no}) signature', fontsize=14)
    plt.xlabel('Band Number', fontsize=14)
    plt.ylabel('Pixel Intensity', fontsize=14)
    plt.show()
plot_signature(df_soil)

## Box Plot

In [None]:
# Per-class box plot of band 50.
box_plot(50, df_soil)

In [None]:
# Distribution plot of band 50 over all pixels.
distribution_plot(50, df_soil)

# Uncomment to draw the distribution of every band (numbered 1 to 220):
# for i in range(1, 221):
#     distribution_plot(i, df_soil)

## Splitting Data into Training and Test Sets

In [None]:
# Stratified 70/30 split. The pixel positions are split together with the data
# so each sample can later be placed back at its location in the scene.
X_train, X_test, y_train, y_test, train_indexes, test_indexes = \
    train_test_split(X, y, range(X.shape[0]),train_size=0.7, random_state=123, stratify=y)
# Integer NumPy copies of the position lists (integer dtype keeps them usable as indices).
train_indices = np.asarray(train_indexes, dtype=int)
test_indices = np.asarray(test_indexes, dtype=int)

## Print Data Summary

In [None]:
def print_data_summary_1(y_train,y_test,y,num_classes):
    """Print how many samples of each class are in the train, test and full sets.

    Parameters
    ----------
    y_train, y_test, y : array-like
        Labels of the training split, the test split and the whole dataset.
    num_classes : int
        Class ids ``0 .. num_classes - 1`` are tabulated; row names come from
        ``indianpines_class_names``.
    """
    class_ids = range(num_classes)
    train_labels = np.asarray(y_train)
    test_labels = np.asarray(y_test)
    all_labels = np.asarray(y)

    # Each column is counted from its own label array, so the Test column
    # reflects y_test itself rather than being inferred as Total - Train.
    df = pd.DataFrame(
        {
            'Train': [np.count_nonzero(train_labels == i) for i in class_ids],
            'Test': [np.count_nonzero(test_labels == i) for i in class_ids],
            'Total': [np.count_nonzero(all_labels == i) for i in class_ids],
        },
        index=[indianpines_class_names[i] for i in class_ids],
    )
    print('Summary of training and testing samples:')
    print(df)
    print("Training samples: %d" % len(y_train))
    print("Test samples: %d" % len(y_test))
print_data_summary_1(y_train, y_test, y, num_classes)


In [None]:
# 'balanced' class weights computed from the training labels, one per class
# present in y_train.
# NOTE(review): class_wt is not referenced by any later cell visible here —
# confirm whether it is still needed.
class_wt = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

## Min-Max Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

## Define Classifiers

In [None]:
# Maps a display name to (estimator, parameter grid). Grid keys carry the
# 'classifier__' prefix because the estimator is installed as the pipeline step
# named 'classifier'.
classifiers = {
    'KNN': (KNeighborsClassifier(), {'classifier__n_neighbors': [3]}),
    'SVM_RBF': (SVC(probability=True), {
        'classifier__C': [1, 10, 50, 100, 0.1],
        'classifier__kernel': ['rbf'],
        'classifier__gamma': ['scale'],
    }),
    'SVM_Poly': (SVC(probability=True), {
        'classifier__C': [1, 10, 50, 100, 0.1],
        'classifier__kernel': ['poly'],
        'classifier__degree': [1, 2, 3, 4, 5, 6],
        'classifier__gamma': ['scale'],
    }),
    'Xgboost': (xgb.XGBClassifier(), {
        'classifier__colsample_bytree': [1.0],
        'classifier__eta': [0.1],
        'classifier__gamma': [0],
        'classifier__max_depth': [20],
        'classifier__min_child_weight': [10],
        # Spelled 'n_estimators' / 'n_jobs': the former 'n_estimator' and
        # 'nthreads' keys are not XGBClassifier parameters, so the intended
        # 3000 boosting rounds were never applied.
        # NOTE(review): 3000 rounds at depth 20 is expensive — confirm the budget.
        'classifier__n_estimators': [3000],
        'classifier__n_jobs': [-1],
        # 'num_class' is intentionally not set: the hard-coded 16 disagreed with
        # the 17 labels in the data, and XGBClassifier derives the class count
        # from the training labels.
        'classifier__objective': ['multi:softmax'],
        'classifier__subsample': [1.0],
        'classifier__tree_method': ['auto'],
    }),
    'Random_Forest': (RandomForestClassifier(), {
        'classifier__n_estimators': [100],
        'classifier__min_samples_split': [2],
        'classifier__min_samples_leaf': [1],
    }),
    'Gradient_Boosting': (GradientBoostingClassifier(), {'classifier__n_estimators': [300]}),
    # Two candidates, each a single hidden layer (400 units or 500 units).
    'MLP': (MLPClassifier(), {'classifier__hidden_layer_sizes': [400, 500]})
}

In [None]:
# Names of the models to evaluate, in execution order.
selected_classifiers = [
    'Random_Forest',
    'KNN',
    'SVM_RBF',
    'Xgboost',
    'Gradient_Boosting',
    'MLP',
    'SVM_Poly',
]
# Keep only the names that are actually defined; unknown names are skipped silently.
classifiers_to_run = dict(
    (name, classifiers[name])
    for name in selected_classifiers
    if name in classifiers
)


In [None]:
# Display the estimators and parameter grids selected for the run.
classifiers_to_run

## Run Classification Pipeline

In [None]:
# Train, evaluate and report every selected classifier in turn.
for name, (classifier, params) in classifiers_to_run.items():
    best_model, grid_search, best_param, prediction = run_classification_pipeline(
        X_train=X_train_scaler, y_train=y_train,
        X_test=X_test_scaler, y_test=y_test,
        classifier_name=name, classifier=classifier, param_grid=params)

    # Compute each metric once and reuse it for the console output and the saved report.
    test_accuracy = accuracy_score(y_test, prediction)
    class_report = classification_report(y_test, prediction)
    train_accuracy = grid_search.score(X_train_scaler, y_train)

    print(f"Results for {name}:")
    # accuracy_score returns a fraction in [0, 1]; the '%' format spec converts
    # it to a real percentage instead of appending '%' to the raw fraction.
    print(f'Accuracy: {test_accuracy:.2%}')
    print(f"{name} Class Report: \n", class_report)
    plot_confusion_matrix(y_test, prediction, classes=indianpines_class_names, normalize=True, title=f'{name} Confusion Matrix')
    prob = best_model.predict_proba(X_test_scaler)
    skplt.metrics.plot_roc(y_test, prob, title=f'{name} ROC Curves', figsize=(15, 15))
    # The split's position lists place each sample back at its pixel in the scene.
    draw_classification_map(name, prediction, X, y, train_indexes, test_indexes)
    save_report(f'{name}_report.txt', test_accuracy, class_report, best_param, train_accuracy)
    print("-" * 80)
