In [1]:
!pip install tab2img



In [2]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statistics import mean

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,Activation,Dropout 
from tensorflow.keras.layers import BatchNormalization 
from tensorflow.keras.utils import to_categorical

from tab2img.converter import Tab2Img

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

### **Read CSV files**

In [7]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,Category of Trade,Volume/ Amount,Volume Type,Load Port,Discharge Port,Dangerous Goods,Organisation
0,7,26726,1,27,34,2,73
1,5,2600,1,44,54,2,69
2,7,10710,1,8,0,2,12
3,7,21611,1,8,40,2,12
4,7,5139,1,6,0,2,12


# **Seperate features and target**

In [8]:
# seperate features and target column
features = ["Category of Trade", "Volume Type", "Load Port", "Discharge Port", "Dangerous Goods"]
target = "Organisation"

# copy the feqatures and class dataframe in X and y variable
X = df[features]
y = df[target]

# **Scale features and train test split**

In [9]:
# scalling feature value between the range of 0-1
ss = StandardScaler()
X = ss.fit_transform(X)

# **customize functions**

In [22]:
#list of evaluation scores
evalution_scores = []

# save saller
pkl_file = open('target_encoder.pkl', 'rb')
le_target = pickle.load(pkl_file) 

# function for getting class labels via class integar using label encoder
def get_classes(y_test):
    y_test = y_test.unique()
    print(len(y_test))
    return le_target.inverse_transform(y_test)

# get unique classes in test set
def get_str_classes(y_test):
    y_test = y_test.unique()
    return [str(int) for int in y_test]

# split dataset in training and testing set 
def get_train_test_split():
    return train_test_split(X, y, test_size=0.2, random_state=42)


# split data into train, test and validation set with splitted feature and class
def get_train_test_val_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.22, random_state=2)
    return X_train, X_test, X_val, y_train, y_test, y_val

# split data into train, test and validation set with joined feature and class
def get_train_test_val_split_df():
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    train, val = train_test_split(train, test_size=0.25, random_state=2)
    return train, test, val


# compute the accuracy using true and predicted labels
def compute_accuracy(y_true, y_pred):
    correct_predictions = 0
    # iterate over each label and check
    for true, predicted in zip(y_true, y_pred):
        if true == predicted:
            correct_predictions += 1
    # compute the accuracy
    accuracy = correct_predictions/len(y_true)
    return accuracy


# compute other evaluation measures and print
def make_predictions(y_test, y_pred):
    accuracy=accuracy_score(y_test, y_pred)
    P, R, F, S = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    
    accuracy = "{:.4f}".format(accuracy)
    P = "{:.4f}".format(P)
    R = "{:.4f}".format(R)
    F = "{:.4f}".format(F)

    print("Accuracy Score :", accuracy)
    print("=======================\n")

    print("Precision Score :", P)
    print("========================\n")

    print("Recall Score :", R)
    print("=====================\n")

    print("F1 Score :", F)
    print("=================\n")

    return accuracy, P, R, F


def DL_preprocessing(train, target):
    target = target.to_numpy()
    model = Tab2Img()
    images = model.fit_transform(train, target)
    
    size = (200, 200)
    reshaped_images = []
    for img in images:
        img = np.resize(img, (img.shape[0], img.shape[1], 3))
        reshaped_images.append(tf.keras.preprocessing.image.smart_resize(
            img, size, interpolation='bilinear'
        ))
    reshaped_images = np.array(reshaped_images)
    target = tf.keras.utils.to_categorical(target, 81)
    return reshaped_images, target


def plot_graph(history):
    print(history.history.keys())
    # summarize history for accuracy
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

# **Machine Learning Models**
## **Random Forest**

In [11]:
X_train, X_test, y_train, y_test = get_train_test_split()
print("\nshape of training features :", X_train.shape)
print("=======================================")
print("\n shape of training labels :", y_train.shape)
print("====================================")
print("\n shape of testing features :", X_test.shape)
print("======================================")
print("\n shape of testing labels :", y_test.shape)
print("==================================")


shape of training features : (17271, 5)

 shape of training labels : (17271,)

 shape of testing features : (4318, 5)

 shape of testing labels : (4318,)


In [12]:
# model initialization
clf = RandomForestClassifier(n_estimators=10)
# model training
clf = clf.fit(X_train, y_train)

In [13]:
# model predictions
y_pred = clf.predict(X_test)
# evaluate results
Acc, P, R, F = make_predictions(y_test, y_pred)
evalution_scores.append(["Random Forest", Acc, P, R, F])

Accuracy Score : 0.7909

Precision Score : 0.7942

Recall Score : 0.7909

F1 Score : 0.7639



# **Gradient Boosting**

In [14]:
X_train, X_test, y_train, y_test = get_train_test_split()
print("\nshape of training features :", X_train.shape)
print("=======================================")
print("\n shape of training labels :", y_train.shape)
print("====================================")
print("\n shape of testing features :", X_test.shape)
print("======================================")
print("\n shape of testing labels :", y_test.shape)
print("==================================")


shape of training features : (17271, 5)

 shape of training labels : (17271,)

 shape of testing features : (4318, 5)

 shape of testing labels : (4318,)


In [15]:
# model predictions
clf =  GradientBoostingClassifier(n_estimators=10)
# model training
clf = clf.fit(X_train, y_train)

In [16]:
# model predictions
y_pred = clf.predict(X_test)
# evaluate results
Acc, P, R, F = make_predictions(y_test, y_pred)
evalution_scores.append(["Random Forest", Acc, P, R, F])

Accuracy Score : 0.6971

Precision Score : 0.7046

Recall Score : 0.6971

F1 Score : 0.6725



# **DL Models**
## **Preprocessing**

In [17]:
train, target = DL_preprocessing(X, y)

In [27]:
np.save('data.npy', train) # save
np.save('target.npy', target) # save

In [29]:
train = np.load('data.npy', dtype='uint8')
target = np.load('target.npy', dtype='uint8') # load

TypeError: load() got an unexpected keyword argument 'dtype'

## **Resnet**

In [18]:
X_train, X_test, X_val, y_train, y_test, y_val = get_train_test_val_split(train, target)
print("\nshape of training features :", X_train.shape)
print("=======================================")
print("\n shape of training labels :", y_train.shape)
print("====================================")
print("\n shape of testing features :", X_test.shape)
print("======================================")
print("\n shape of testing labels :", y_test.shape)
print("==================================")
print("\n shape of validation features :", X_val.shape)
print("========================================")
print("\n shape of validation labels :", y_val.shape)
print("=====================================")


shape of training features : (15155, 200, 200, 3)

 shape of training labels : (15155, 81)

 shape of testing features : (2159, 200, 200, 3)

 shape of testing labels : (2159, 81)

 shape of validation features : (4275, 200, 200, 3)

 shape of validation labels : (4275, 81)


In [19]:
# input shape configure
input_t = tf.keras.Input(shape=(200, 200, 3))
res_model = tf.keras.applications.ResNet50(include_top=False, 
                    weights="imagenet", input_tensor=input_t)

for layer in res_model.layers[:143]:
    layer.trainable = False

# to_res = (224, 224)
model = tf.keras.models.Sequential()  
model.add(res_model)
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(81, activation='softmax'))

In [24]:
# early stopping
check_point = tf.keras.callbacks.ModelCheckpoint(filepath="cifar10.h5",
                                          monitor="val_acc  ",
                                          mode="max",
                                          save_best_only=True,
                                          )

# model compilation
model.compile(loss='categorical_crossentropy',
              optimizer= tf.keras.optimizers.RMSprop(lr=2e-5),
              metrics=['accuracy'])
 
# model training
history = model.fit(X_train, y_train, batch_size=256, epochs=10, verbose=1,
                validation_data=(X_val, y_val),
                callbacks=[check_point])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

KeyboardInterrupt: 

In [None]:
print(model.summary())
plot_graph(history)