In [1]:
import zipfile
import os

# Path of the zipped training set; it is unpacked into the current working
# directory so later cells can read the images via relative "./train/..." paths.
base_dir = "/kaggle/input/cian-datafest-2019/train.zip"

with zipfile.ZipFile(base_dir, mode='r') as archive:
    archive.extractall()

In [88]:
import cv2
import numpy as np
import pandas as pd

# Scratch check on a single indoor picture: read it, scale the pixel values
# by 1/255, then shrink it to 10x10 with area interpolation.
img = cv2.resize(
    cv2.imread("./train/indoor/3.jpg") / 255.,
    (10, 10),
    interpolation=cv2.INTER_AREA,
)

In [98]:
def get_img_data(type_: str, size: tuple = (10, 10)):
    """Load every ``.jpg`` under ``./train/{type_}/`` as one flat feature row.

    Each image is read with OpenCV, scaled by 1/255, resized to ``size`` with
    ``cv2.INTER_AREA`` and flattened; the class label is appended as the last
    value of the row.

    Parameters
    ----------
    type_ : str
        Sub-directory of ``./train`` to read. ``"indoor"`` is labelled 0,
        any other value is labelled 1.
    size : tuple of int, optional
        Target ``(width, height)`` handed to ``cv2.resize``. The default
        ``(10, 10)`` reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per readable image: the flattened pixels followed by the
        label. Empty when the directory holds no readable ``.jpg`` file.

    Warns
    -----
    UserWarning
        Once for each file that ``cv2.imread`` cannot decode; such files are
        skipped instead of aborting the whole load.
    """
    import warnings  # local import so this cell does not depend on another cell's imports

    directory = f"./train/{type_}/"
    d_type_ = 0 if type_ == "indoor" else 1

    rows = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything seeded on top of it) reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".jpg"):
            continue
        path = f"{directory}{filename}"
        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports an unreadable file by returning None, not by raising.
            warnings.warn(f"skipping unreadable image: {path}")
            continue
        img = cv2.resize(img / 255., size, interpolation=cv2.INTER_AREA)
        rows.append(np.append(img.reshape(-1), d_type_))
    return pd.DataFrame(rows)

In [99]:
# Feature rows for the images under ./train/indoor/ (labelled 0 by get_img_data).
data_indoor = get_img_data(type_="indoor")

In [100]:
# Feature rows for the images under ./train/outdoor/ (labelled 1 by get_img_data).
data_outdoor = get_img_data(type_="outdoor")

In [120]:
# Stack the indoor rows above the outdoor rows and renumber the index from 0.
df_final = pd.concat(
    objs=[data_indoor, data_outdoor],
    axis=0,
    ignore_index=True,
)

In [140]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

In [130]:
# Column 300 is used as the target; every remaining column is a feature.
y = df_final[300]
X = df_final.drop(columns=[300])

In [131]:
# Shared 3-fold splitter; shuffling is seeded so every evaluation below
# sees the same folds.
kf = KFold(
    n_splits=3,
    random_state=42,
    shuffle=True,
)

In [149]:
# k-NN classifier with k=9, fitted on the full feature matrix.
neigh = (
    KNeighborsClassifier(n_neighbors=9)
    .fit(X, y)
)

In [150]:
# Per-fold accuracy of the k=9 classifier over the shared KFold splitter.
cross_val_score(estimator=neigh, X=X, y=y, scoring="accuracy", cv=kf)

array([0.8388916 , 0.83959655, 0.83270065])

In [147]:
# Grid-search the neighbour count of a k-NN pipeline, scored by accuracy on
# the shared KFold splitter.
knn_pipe = Pipeline(steps=[('knn', KNeighborsClassifier())])

# Candidate values of k for the pipeline's 'knn' step.
params = [{'knn__n_neighbors': [3, 5, 7, 9]}]

gs_knn = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    cv=kf,
    scoring='accuracy',
)
gs_knn.fit(X, y)

# Display the winning parameter combination.
gs_knn.best_params_

{'knn__n_neighbors': 9}

In [148]:
# NOTE: this scores the fitted search on the very same X, y that were passed
# to gs_knn.fit, so it is a training-set accuracy rather than a held-out
# estimate; compare it with the per-fold cross-validation scores.
gs_knn.score(X=X, y=y)

0.8732692238169264