In [1]:
import pandas as pd
import os, sys

module_path = os.path.abspath(os.path.join("../.."))
if module_path not in sys.path:
    sys.path.append(module_path)
from motion_marmot.simple_scene_classifier import SimpleSceneClassifier

ssc = SimpleSceneClassifier("POC SSC")

In [2]:
print(ssc)

SimpleSceneClassifier(name=POC SSC)


In [3]:
file_name = "../../data/scene.csv"
data = pd.read_csv(file_name)
data["scaled_avg"] = data["avg"] / (data["width"] * data["height"])
data["scaled_std"] = data["std"] / (data["width"] * data["height"])
x = data.iloc[:, -2:].values
y = data["scene"]
print(data)

           avg       std  width  height  scene    scaled_avg  scaled_std
0     0.336788  2.415257   1920    1080      2  1.624168e-07    0.000001
1     0.408609  2.721680   1920    1080      2  1.970531e-07    0.000001
2     0.451202  3.356803   1920    1080      2  2.175937e-07    0.000002
3     0.529960  5.591995   1920    1080      2  2.555749e-07    0.000003
4     0.458626  3.192411   1920    1080      2  2.211736e-07    0.000002
...        ...       ...    ...     ...    ...           ...         ...
6396  0.354914  2.165539   1920    1080      3  1.711585e-07    0.000001
6397  0.473684  3.005130   1920    1080      3  2.284357e-07    0.000001
6398  0.446071  2.955398   1920    1080      3  2.151191e-07    0.000001
6399  0.505503  2.854361   1920    1080      3  2.437805e-07    0.000001
6400  0.555382  4.046104   1920    1080      3  2.678348e-07    0.000002

[6401 rows x 7 columns]


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.5, random_state=100
)
model = ssc.train_model(train_x, train_y)
prediction = model.predict(test_x)
accuracy = accuracy_score(prediction, test_y)
print(f"Accuracy: {accuracy}")
print(classification_report(prediction, test_y))

Accuracy: 0.8294283036551078
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       860
           1       0.85      0.75      0.79       457
           2       0.87      0.82      0.84      1622
           3       0.42      0.69      0.53       262

    accuracy                           0.83      3201
   macro avg       0.77      0.80      0.78      3201
weighted avg       0.85      0.83      0.84      3201



In [5]:
data_pool_path = f"{module_path}/../../DataPool/ssc-dataset"
scene_dirs = os.listdir(f"{data_pool_path}/scene-by-jumbo-id")

labeled_df = pd.DataFrame()
for scene_dir in scene_dirs:
    file_name = f"{data_pool_path}/scene-by-jumbo-id/{scene_dir}/scene.csv"
    data = pd.read_csv(
        file_name,
        header=None,
        names=["total", "avg", "std", "variance", "width", "height", "scene"],
    )
    data["jumbo_id"] = scene_dir
    labeled_df = labeled_df.append(data)

In [6]:
y = labeled_df["scene"].values
x = labeled_df.drop(columns=["scene", "jumbo_id"]).values

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.5, random_state=100
)

k = 4
knn = KNeighborsClassifier(n_neighbors=k)
scores = cross_val_score(knn, train_x, train_y, cv=5)
scores

array([0.87664042, 0.87270341, 0.84362681, 0.89750329, 0.85939553])

In [8]:
knn.fit(train_x, train_y)
knn_prediction = knn.predict(test_x)


In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

accuracy_score(knn_prediction, test_y)

0.8828473863934857

In [10]:
print(classification_report(knn_prediction, test_y))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90      1853
           1       0.94      0.90      0.92      1066
           2       0.73      0.90      0.80       657
           3       0.73      0.92      0.81       231

    accuracy                           0.88      3807
   macro avg       0.84      0.89      0.86      3807
weighted avg       0.90      0.88      0.89      3807



In [11]:
print(confusion_matrix(knn_prediction, test_y))

[[1598   44  200   11]
 [  24  962   16   64]
 [  54   10  589    4]
 [   4   12    3  212]]


In [12]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

tree = DecisionTreeClassifier()
scores = cross_val_score(tree, train_x, train_y, cv=5)
scores

array([0.94750656, 0.96194226, 0.91852825, 0.94218134, 0.9500657 ])

In [13]:
tree.fit(train_x, train_y)
tree_prediction = tree.predict(test_x)

accuracy_score(tree_prediction, test_y)

0.9437877593905962

In [14]:
print(classification_report(tree_prediction, test_y))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1693
           1       0.96      0.97      0.96      1025
           2       0.90      0.91      0.90       805
           3       0.89      0.92      0.90       284

    accuracy                           0.94      3807
   macro avg       0.93      0.94      0.93      3807
weighted avg       0.94      0.94      0.94      3807



In [15]:
print(confusion_matrix(tree_prediction, test_y))

[[1614    3   69    7]
 [   3  990   10   22]
 [  63   11  729    2]
 [   0   24    0  260]]


Seems like pure $Decision Tree$ is totally beat $KNN$ classifier.

\# TODO: need to do parameters tunning