In [1]:
import os
import json
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import altair as alt

In [2]:
data_dir = 'dataset/larger'

In [3]:
with open(os.path.join(data_dir, 'dataset_x.json'), 'r') as f:
    X = np.array(json.load(f))
X.shape

(12185, 18)

In [4]:
with open(os.path.join(data_dir, 'dataset_y.json'), 'r') as f:
    _y = [l.index(1) for l in json.load(f)]
    y = np.array([i for i in _y])
    # y = np.array([i[0] * 24 >= 75 for i in json.load(f)])
count = [0,0,0,0,0,0]
for i in y:
    count[i] += 1
print(count)
print(y)

[0, 492, 283, 788, 1139, 9483]
[5 5 5 ... 5 5 5]


In [5]:
category = [
    "good",
    "fair",
    "moderate",
    "poor",
    "very poor",
    "extremely poor",
]

In [6]:
def dist_chart(data):
    return alt.Chart.from_dict({
        "width": 600,
        "height": 300,
        "title": "Distribution of the data points for each air quality category",
        "data": {
            "values": [{"num_points": c, "category": category[i]} for i, c in enumerate(data)]
        },
        "encoding": {
            "x": {"field": "num_points", "type": "quantitative", "title": "Number of data points"},
            "y": {"field": "category", "type": "nominal", "sort": None, "title": "Category"},
        },
        "layer": [
            {"mark": "bar"},
            {
                "mark": {"type": "text", "dx": 2, "align": "left"},
                "encoding": {
                    "text": {"field": "num_points", "type": "quantitative"}
                }
            }
        ]
    })

In [7]:
dist_chart(count)

In [8]:
import random
to_keep = []
for i in range(y.shape[0]):
    if y[i] == 5:
        if random.random() < 0.3 / 5.0:
            to_keep.append(i)
    elif y[i] == 4:
        if random.random() < 1 / 2:
            to_keep.append(i)
    elif y[i] == 3:
        if random.random() < 3 / 5:
            to_keep.append(i)
    # if y[i]:
    #     if random.random() < 1.5 / 5.0:
    #         to_keep.append(i)
    else:
        to_keep.append(i)
count2 = [0,0,0,0,0,0]
for i in y[to_keep]:
    count2[int(i)] += 1
print(count2)

[0, 492, 283, 480, 564, 557]


In [9]:
dist_chart(count2)

In [10]:
# y = X[:, 14]

In [11]:
# import random
# X = np.array([[random.random(), random.random(), random.random(), random.random(), random.random(), random.random(), random.random(), i % 10] for i in range(1000)])
# y = np.array([i % 10 for i in range(1000)])

In [38]:
columns = [5, 7, 9, 10, 14, 15, 16, 17]

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X[to_keep, :][:, columns], y[to_keep], random_state = 0)

In [40]:
X_train.shape

(1782, 8)

In [41]:
X_test.shape

(594, 8)

In [42]:
model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100),
    random_state=1,
    max_iter=500,
)
# model = LinearSVC(random_state=0)

In [43]:
clf = OneVsOneClassifier(model).fit(X_train, y_train)

In [44]:
sum(clf.predict(X_test) == y_test)

280

In [45]:
y_test.shape

(594,)

In [46]:
sum(y_test)

1883

In [47]:
accuracy = clf.score(X_test, y_test)

In [48]:
accuracy

0.4713804713804714

In [49]:
cm = confusion_matrix(y_test, clf.predict(X_test))

In [50]:
cm

array([[73, 15, 10, 11, 10],
       [23, 20, 13,  8, 14],
       [13,  7, 38, 31, 27],
       [ 6,  8, 18, 72, 41],
       [ 5,  6,  8, 40, 77]])