In [1]:
from raif_hack.settings import TRAIN_PATH, TARGET, CATEGORICAL_OHE_FEATURES, NUM_FEATURES, TEST_PATH
from raif_hack.data import get_preprocessor

import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv(str(TRAIN_PATH))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
train["date"] = pd.to_datetime(train["date"])

In [4]:
numeric_features = train[NUM_FEATURES].std().sort_values(ascending=False).index.tolist()[:50]

In [5]:
len(numeric_features)

50

In [6]:
train = train.sort_values("date")

In [7]:
train_all = train.copy()
y_all = train[TARGET].values

In [8]:
from datetime import datetime

In [9]:
test = train[train["date"] >= datetime(2020, 8, 1)].reset_index()
train = train[train["date"] < datetime(2020, 8, 1)].reset_index()

In [10]:
test = test[test["price_type"] == 1].reset_index()

In [11]:
x_train = train[CATEGORICAL_OHE_FEATURES + numeric_features]
y_train = train[TARGET].values

x_test = train[CATEGORICAL_OHE_FEATURES + numeric_features]
y_test = train[TARGET].values

In [12]:
preproc = get_preprocessor(numeric_features=numeric_features)

In [13]:
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

In [14]:
import faiss


class FaissKNeighbors:
    def __init__(self, k=5):
        self.index = None
        self.k = k

    def fit(self, X):
        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X.astype(np.float32))

    def predict(self, X):
        distances, indices = self.index.search(X.astype(np.float32), k=self.k)
        return distances, indices

In [15]:
%%time

preproc = preproc.fit(x_train)



CPU times: user 2.2 s, sys: 331 ms, total: 2.53 s
Wall time: 2.53 s


In [16]:
inference = pd.read_csv(TEST_PATH)
inference = np.ascontiguousarray(preproc.transform(inference))

In [17]:
x_train = np.ascontiguousarray(preproc.transform(x_train))
x_test = np.ascontiguousarray(preproc.transform(x_test))
train_all = np.ascontiguousarray(preproc.transform(train_all))

In [18]:
%%time

model = FaissKNeighbors(k=2)
model.fit(x_train)

CPU times: user 34.6 ms, sys: 16.9 ms, total: 51.5 ms
Wall time: 49.9 ms


In [None]:
%%time

distances, neighbors = model.predict(x_test)

In [None]:
from numba import njit

@njit
def predict_knn(neighbors, known_targets):    
    predictions = []
    for k_neighbors in neighbors:
        predictions.append(known_targets[k_neighbors].mean().item())
        
    return np.array(predictions)

In [None]:
%%time

predictions = predict_knn(neighbors, y_test)

In [None]:
from raif_hack.metrics import deviation_metric

In [None]:
# Score `predictions` against `y_test` with deviation_metric (imported from raif_hack.metrics).
deviation_metric(y_test, predictions)

In [None]:
%%time

# Build a new 2-neighbour index, this time over `train_all` (the transformed
# matrix of the whole date-sorted training frame). Rebinds `model`.
model = FaissKNeighbors(k=2)
model.fit(train_all)

In [None]:
%%time

# Neighbours of the `inference` rows; rebinds `distances` and `neighbors`.
distances, neighbors = model.predict(inference)

In [None]:
%%time

# `y_all` was taken from the same sorted frame as `train_all`, so neighbour
# positions index it directly. Rebinds `predictions`.
predictions = predict_knn(neighbors, y_all)

In [None]:
inference = pd.read_csv(TEST_PATH)

In [None]:
inference.shape

In [None]:
predictions.shape

In [None]:
preds = pd.DataFrame({
    "id": inference["id"].values,
    "per_square_meter_price": predictions
})

In [None]:
sample_submission = pd.read_csv("../data/test_submission.csv")[["id"]]

In [None]:
sample_submission = sample_submission.merge(preds[["id", "per_square_meter_price"]], on="id", how="left")

In [None]:
sample_submission.to_csv("submission.csv", index=False)