In [2]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
path = './train/train_data.csv'

df = pd.read_csv(path)
df

Unnamed: 0,filen_name,label
0,train0001.png,8
1,train0002.png,8
2,train0003.png,8
3,train0004.png,8
4,train0005.png,8
...,...,...
4995,train4996.png,6
4996,train4997.png,6
4997,train4998.png,6
4998,train4999.png,6


In [24]:
train_file_name = df['filen_name']
train_label = df['label']

# image 파일을 불러온뒤 변수에 저장
train_image = []
for file in train_file_name:
    train_image.append(Image.open('./train/' + file))
image_to_number = np.array([np.array(image).flatten() for image in train_image])


In [25]:
import umap
reducer = umap.UMAP(random_state = 42, n_neighbors = 20)
reducer.fit(image_to_number)

UMAP(n_neighbors=20, random_state=42, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True})

In [26]:
embedding = reducer.transform(image_to_number)
assert(np.all(embedding == reducer.embedding_))
embedding.shape

(5000, 2)

In [43]:
X_train,X_test,y_train,y_test = train_test_split(embedding, train_label, test_size =0.3, shuffle=True)

In [44]:
from sklearn import tree,svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
tree_clf = tree.DecisionTreeClassifier()
svm_clf = svm.SVC(gamma=0.001)
gnb_clf = GaussianNB()
rf_clf = RandomForestClassifier(max_depth = 2,random_state=0)
knn_clf = KNeighborsClassifier(n_neighbors = 5, metric='euclidean')

model_list = {'tree' : tree_clf, 'svm' : svm_clf, 'gnb' : gnb_clf, 'rf' : rf_clf, 'knn' : knn_clf}
accuracy = {}

for i,clf in tqdm(model_list.items()):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_train)
    accuracy[i] = np.mean(cross_val_score(clf,X_train,y_train, scoring='accuracy', cv=3))
    
#tree_clf.fit(X_train,y_train)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.07it/s]


In [45]:
accuracy

{'tree': 0.9017141880070532,
 'svm': 0.8257131140673848,
 'gnb': 0.8982836562746347,
 'rf': 0.7505787368764523,
 'knn': 0.9297147641717657}

In [46]:
knn_clf.predict(X_train)[2194]

1

In [47]:
test_df = pd.read_csv('./test/test_data.csv') 

In [48]:
test_image = []
for file in test_df['file_name']:
    test_image.append(Image.open('./test/' + file))
test_number = np.array([np.array(image).flatten() for image in test_image])

reducer = umap.UMAP(random_state = 42, n_neighbors = 20)
reducer.fit(image_to_number)
embedding = reducer.transform(image_to_number)
assert(np.all(embedding == reducer.embedding_))

In [52]:
submission = pd.read_csv('./sample_submission.csv') 
preds = knn_clf.predict(embedding)
submission['label'] = preds
submission.to_csv('submission.csv', index=False)

In [51]:
preds

array([8, 8, 8, ..., 6, 6, 6], dtype=int64)

In [22]:
knn_clf.predict_proba(embedding)[0]

array([0. , 0. , 0. , 0.2, 0. , 0. , 0. , 0. , 0.8, 0. ])

In [23]:
preds[:10]

array([8, 8, 8, 8, 8, 8, 9, 8, 8, 8], dtype=int64)