In [1]:
import pickle
from os import path, mkdir

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.interpolate import Rbf
from sklearn.model_selection import train_test_split

from localization import utils, dataset
from localization.models.rbf_model import Model

In [2]:
# Fix the project's random seed (33) so reruns of the notebook are repeatable.
# NOTE(review): which generators get seeded is decided inside utils.make_deterministic — confirm.
utils.make_deterministic(33)

def radial_log_basis_function(model, r):
    """Logarithmic radial basis function: ``log(r + model.epsilon)``.

    Args:
        model: Object exposing an ``epsilon`` attribute that is added to the
            radius before the logarithm is taken.
        r: Radius (scalar or array-like accepted by NumPy).

    Returns:
        The natural logarithm of the shifted radius, element-wise.
    """
    shifted_radius = r + model.epsilon
    return np.log(shifted_radius)

# Attach the custom basis function to scipy's Rbf class *before* unpickling.
# Presumably the pickled model holds Rbf objects that look this function up
# on the class — TODO confirm against how the model was trained/saved.
Rbf.radial_log_basis_function = radial_log_basis_function
# SECURITY: pickle.load can execute arbitrary code while deserialising; only
# load model files that come from a trusted source.
with open('output/filtered_model.bin', 'rb') as inp:
    model: Model = pickle.load(inp)


In [5]:
# --- Dataset layout ---------------------------------------------------------
num_buildings = 3
# building id -> number of floors in that building
num_floors_in_each_building = {0: 4, 1: 4, 2: 5}

# --- Generation parameters --------------------------------------------------
sigma = 6.75            # standard deviation passed to the normal draws
num_samples = 2         # repetitions per stored point (used by the non-RBF generation cell)
new_samples = 100_000   # 1_000_000
batch_size = 10_000     # 10_000

# --- Randomness and output location -----------------------------------------
rng = np.random.default_rng(seed=33)
data_dir = "data/generated-sample-5/"

In [6]:
# Load the dataset stored under data/filtered.
data = dataset.load_ujiindoor_loc(data_folder='data/filtered')
# Router identifiers selected by the model from the dataset's columns.
# Later cells iterate over `routers` and concatenate it with a list of extra
# column names, so it is presumably a list of column-name strings — TODO confirm.
routers = model.get_all_routers_in_this_floor(data.get_full_df().columns)

### Generation based on RBF $\mu$ and $\phi$

In [None]:
# Draw `new_samples` synthetic fingerprints in a single pass: pick a building,
# a floor inside it and one of the positions stored on the model for that
# building, then sample every router's signal from the fitted mu/phi surfaces.
# generated_data = pd.DataFrame(columns=data.get_full_df().columns)

b = rng.integers(0, num_buildings, size=new_samples)
f = np.array([rng.integers(0, num_floors_in_each_building[building]) for building in b])
idxs = [rng.integers(0, len(model.x_building[building])) for building in b]
# Look positions up one (building, index) pair at a time, exactly like the
# batched cell does. Indexing the outer container with the nested list
# `[[b, idxs]]` does not select element `idx` of building `b`.
x = np.array([model.x_building[building][idx] for (building, idx) in zip(b, idxs)])
y = np.array([model.y_building[building][idx] for (building, idx) in zip(b, idxs)])
# Alternative: sample positions uniformly between the first and last stored coordinate.
# x = np.array([rng.uniform(model.x_building[building][0], model.x_building[building][-1]) for building in b])
# y = np.array([rng.uniform(model.y_building[building][0], model.y_building[building][-1]) for building in b])

gen_data = {r: [] for r in routers}
for i in range(new_samples):
    # The per-(building, floor) lookup does not depend on the router: hoist it.
    mu_by_router = model.mu_rbf[b[i]][f[i]]
    for r in routers:
        if r in mu_by_router:
            mu = mu_by_router[r](x[i], y[i])
            # phi is used as a Bernoulli probability. Clamp it to [0, 1]:
            # rng.binomial raises ValueError for values outside that range and
            # the interpolated surface is not guaranteed to stay inside it.
            phi = np.clip(model.phi_rbf[b[i]][f[i]][r](x[i], y[i]), 0, 1)
            # Signal strength ~ Normal(mu, sigma), zeroed when the router is "not seen".
            gen_data[r].append(rng.normal(mu, sigma) * rng.binomial(1, phi))
        else:
            # Router has no fitted surface on this floor: mark as missing.
            gen_data[r].append(np.nan)

for r in routers:
    gen_data[r] = np.array(gen_data[r])

gen_data["BUILDINGID"] = b
gen_data["FLOOR"] = f
# The remaining dataset metadata columns (SPACEID, RELATIVEPOSITION, USERID,
# PHONEID, TIMESTAMP, LATITUDE, LONGITUDE) are not generated by this cell.
gen_data["x"] = x
gen_data["y"] = y
generated_data = pd.DataFrame(gen_data)

In [20]:
# Missing readings (NaN) become 0.0 and every negative value is raised to 0.
generated_data = generated_data.fillna(0.0).clip(lower=0)
# Show ten random rows as a sanity check.
generated_data.sample(10)

Unnamed: 0,WAP001,WAP002,WAP007,WAP008,WAP009,WAP013,WAP014,WAP017,WAP018,WAP019,...,WAP512,WAP513,WAP514,WAP516,WAP517,WAP518,BUILDINGID,FLOOR,x,y
286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,31.653991,0.0,0.0,2,0,0.0,21.050275
402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.508494,5.122161,0.0,2,4,0.0,0.0
253,0.0,0.0,1.401449,0.0,0.0,8.060981,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,19.080972,24.269905
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.083097,0.0,0.0,2,2,0.0,0.0
372,0.0,0.0,0.0,0.0,0.0,0.0,28.046214,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0,0.0
110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,2,6.351711,73.395808
767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,22.211284,23.018716,0.0,2,2,0.0,32.174821
458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,2,0.0,39.783516
234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,2,4.138234,49.555294
661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.481993,0.0,0.0,2,4,0.0,0.0


In [None]:
# Hold out 20% of the generated rows; the remaining 80% form the training set.
# No random_state is passed, so the shuffle draws from NumPy's global random state.
train_df, test_df = train_test_split(generated_data, test_size=0.2)

In [None]:
# Persist the split. The output folder is created on demand (same guard as the
# batched export) because to_csv fails when the target directory is missing.
if not path.exists('data/generated'):
    mkdir('data/generated')
train_df.to_csv('data/generated/trainingData.csv', index=False)
test_df.to_csv('data/generated/validationData.csv', index=False)

### Generation based on RBF $\mu$ and $\phi$ - batched

In [7]:
# Prepare the output folder, then start both CSV files with a header row only;
# the batched loop appends its rows to these files afterwards.
if not path.exists(data_dir):
    mkdir(data_dir)

generated_data = pd.DataFrame(
    columns=routers + ["BUILDINGID", "FLOOR", "x", "y"] + ["LATITUDE", "LONGITUDE"]
)
generated_data.to_csv(f'{data_dir}/trainingData.csv', index=False)
generated_data.to_csv(f'{data_dir}/validationData.csv', index=False)

In [8]:
# Generate the synthetic dataset in batches and append each batch to the CSV
# files whose header row was written beforehand.
# Note: a remainder of new_samples % batch_size samples is not generated.
num_batches = new_samples // batch_size

# Column order of the header already written to both CSV files. Rows are
# appended with header=False, so every batch must be written in exactly this
# order; otherwise values land under the wrong column names (the dict below
# inserts LATITUDE/LONGITUDE before x/y).
csv_columns = list(routers) + ["BUILDINGID", "FLOOR", "x", "y"] + ["LATITUDE", "LONGITUDE"]

for _ in tqdm(range(num_batches)):
    # Random building, a floor inside it, and one stored position of that building.
    b = rng.integers(0, num_buildings, size=batch_size)
    f = np.array([rng.integers(0, num_floors_in_each_building[building]) for building in b])
    idxs = [rng.integers(0, len(model.x_building[building])) for building in b]
    x = np.array([model.x_building[building][idx] for (building, idx) in zip(b, idxs)])
    y = np.array([model.y_building[building][idx] for (building, idx) in zip(b, idxs)])
    # Alternative: sample positions uniformly between the first and last stored coordinate.
    # x = np.array([rng.uniform(model.x_building[building][0], model.x_building[building][-1]) for building in b])
    # y = np.array([rng.uniform(model.y_building[building][0], model.y_building[building][-1]) for building in b])

    gen_data = {r: [] for r in routers}
    for i in range(batch_size):
        # The per-(building, floor) lookup does not depend on the router: hoist it.
        mu_by_router = model.mu_rbf[b[i]][f[i]]
        for r in routers:
            if r in mu_by_router:
                mu = mu_by_router[r](x[i], y[i])
                # phi is used as a Bernoulli probability. Clamp it to [0, 1]:
                # rng.binomial raises ValueError for values outside that range.
                phi = np.clip(model.phi_rbf[b[i]][f[i]][r](x[i], y[i]), 0, 1)
                # Signal strength ~ Normal(mu, sigma), zeroed when the router is "not seen".
                gen_data[r].append(rng.normal(mu, sigma) * rng.binomial(1, phi))
            else:
                # Router has no fitted surface on this floor: mark as missing.
                gen_data[r].append(np.nan)

    for r in gen_data:
        gen_data[r] = np.array(gen_data[r])

    gen_data.update({
        "BUILDINGID": b,
        "FLOOR": f,
        # LATITUDE/LONGITUDE are placeholders only (the clip below turns -1 into 0).
        "LATITUDE": -1 * np.ones(batch_size),
        "LONGITUDE": -1 * np.ones(batch_size),
        "x": x,
        "y": y
    })
    # Reorder to the header's column order before appending.
    generated_data = pd.DataFrame(gen_data)[csv_columns]
    generated_data = generated_data.fillna(0.0)
    generated_data = generated_data.clip(lower=0)
    train_df, test_df = train_test_split(generated_data, test_size=0.1)

    train_df.to_csv(f'{data_dir}/trainingData.csv', index=False, mode='a', header=False)
    test_df.to_csv(f'{data_dir}/validationData.csv', index=False, mode='a', header=False)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [05:55<00:00, 35.55s/it]


### Generation based on already calculated points ($\mu_b$ and $\phi_b$)

In [None]:
# Generate `num_samples` noisy copies of the per-point mu/phi values for every
# (building, floor) and stack them into one frame.
# NOTE(review): this cell treats model.mu_rbf[...] / model.phi_rbf[...] entries
# as arrays (np.tile, np.clip), whereas the RBF-based cells call them as
# functions — confirm that the loaded model matches what this cell expects.

# Start from an empty frame carrying the dataset's columns so they come first
# in the result. Per-floor frames are collected and concatenated once at the
# end, because concatenating inside the loop re-copies all previous rows.
generated_frames = [pd.DataFrame(columns=data.get_full_df().columns)]

for building in range(num_buildings):
    for floor in tqdm(range(model.num_floors_in_each_building[building])):
        filtered_dataset = data.get_floor_data(building=building, floor=floor)
        X_train = filtered_dataset.get_full_df()
        # Local name on purpose: the notebook-level `routers` list is used by
        # the RBF-based cells and must not be overwritten here.
        floor_routers = model.get_all_routers_in_this_floor(X_train.columns)
        gen_data = {}

        for router in floor_routers:
            if model.checking_non_null_minimum_percentage_of_samples(X_train, router):
                # phi is a Bernoulli probability: keep it inside [0, 1], since
                # np.random.binomial raises ValueError outside that range.
                phi_building = np.clip(model.phi_rbf[building][floor][router], 0, 1)
                mu_building = model.mu_rbf[building][floor][router]

                # Normal(mu, sigma) draw, zeroed where the Bernoulli draw says "not seen".
                gen_data[router] = np.random.normal(np.tile(mu_building, num_samples), sigma) * \
                                   np.random.binomial(1, np.tile(phi_building, num_samples))

        # first router of third building of first floor has a different shape:
        # right-pad shorter arrays with zeros so all columns have equal length
        if building == 2 and floor == 0:
            max_length = max(len(arr) for arr in gen_data.values())
            for key in gen_data:
                if len(gen_data[key]) < max_length:
                    gen_data[key] = np.pad(gen_data[key], (0, max_length - len(gen_data[key])), constant_values=0)

        floor_frame = pd.DataFrame(gen_data)
        n_rows = floor_frame.shape[0]
        # Constant metadata columns for this (building, floor).
        constant_columns = {
            "BUILDINGID": building,
            "FLOOR": floor,
            "SPACEID": 0,
            "RELATIVEPOSITION": 0,
            "USERID": -1,
            "PHONEID": -1,
            "TIMESTAMP": -1,
            "LATITUDE": 0,
            "LONGITUDE": 0,
        }
        for column, value in constant_columns.items():
            floor_frame.loc[:, column] = [value] * n_rows
        # Positions repeat once per generated copy, matching the np.tile above.
        floor_frame.loc[:, "x"] = np.tile(model.x_building[building], num_samples)
        floor_frame.loc[:, "y"] = np.tile(model.y_building[building], num_samples)
        generated_frames.append(floor_frame)

generated_data = pd.concat(generated_frames, ignore_index=True)

100%|██████████| 4/4 [00:05<00:00,  1.45s/it]
100%|██████████| 4/4 [00:19<00:00,  4.80s/it]
100%|██████████| 5/5 [00:35<00:00,  7.18s/it]


In [None]:
# Missing readings (NaN) become 0.0 and every negative value is raised to 0.
generated_data = generated_data.fillna(0.0).clip(lower=0)
# Show ten random rows as a sanity check.
generated_data.sample(10)

Unnamed: 0,WAP001,WAP002,WAP005,WAP006,WAP007,WAP008,WAP009,WAP010,WAP011,WAP012,...,LATITUDE,FLOOR,BUILDINGID,SPACEID,RELATIVEPOSITION,USERID,PHONEID,TIMESTAMP,x,y
20769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,1,0,0,0,0,0,0.0,10.356031
41120,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.66574,14.53368,...,0,1,2,0,0,0,0,0,0.63656,0.0
28798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2,1,0,0,0,0,0,0.0,0.0
26655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2,1,0,0,0,0,0,0.0,28.898784
37325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,2,0,0,0,0,0,0.0,4.14314
48268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.581022,17.183868,...,0,2,2,0,0,0,0,0,0.63656,34.761567
44526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.516054,32.029452,...,0,1,2,0,0,0,0,0,0.0,0.0
38978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,2,0,0,0,0,0,15.385154,0.0
16829,0.0,0.0,0.0,7.754054,0.0,12.717207,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,17.879972,0.0
50731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.17306,0.0,...,0,2,2,0,0,0,0,0,0.0,31.845526


In [None]:
# Hold out 20% of the generated rows; the remaining 80% form the training set.
# No random_state is passed, so the shuffle draws from NumPy's global random state.
train_df, test_df = train_test_split(generated_data, test_size=0.2)

In [None]:
# Persist the split. The output folder is created on demand (same guard as the
# batched export) because to_csv fails when the target directory is missing.
if not path.exists('data/generated'):
    mkdir('data/generated')
train_df.to_csv('data/generated/trainingData.csv', index=False)
test_df.to_csv('data/generated/validationData.csv', index=False)

: 

In [None]:
# Display the generated frame as the cell output.
generated_data

### Generation based on probs

In [None]:
# Bayes-style reweighting for router WAP007 in building 0, floor 0:
#   p(p | b, f, r, x, y) = p(x, y | b, f, r, p) * p(p) / p(x, y)
# evaluated for p = 0..105 and stacked along the first axis.
wap_masks = model.power_probability_masks[0][0]['WAP007']
wap_prior = model.power_prior_probability_distribution[0][0]['WAP007']

# Constant p(x, y): one over the size of the first axis of a mask.
p_xy = 1/wap_masks[0].shape[0]

probs = np.array([wap_masks[p] * wap_prior[p] / p_xy for p in range(106)])

In [None]:
# Shape of the full array, then of the column that the sampling cell below
# draws from. `probs[:][0]` is just `probs[0]` (the first row), not that
# column, so it is replaced with `probs[:, 0]`.
print(probs.shape)
print(probs[:, 0].shape)

In [None]:
# Draw 20 indices in [0, 106) weighted by the first column of `probs`,
# normalised so the weights sum to 1.
first_column = probs[:, 0]
np.random.choice(106, 20, p=first_column / first_column.sum())

In [None]:
# Reload the dataset from the top-level data folder. This rebinds `data`, which
# earlier cells bound to the data/filtered dataset, so rerunning those cells
# after this one will see a different dataset.
data = dataset.load_ujiindoor_loc(data_folder='data')
# Keep only building 0, floor 0.
# NOTE(review): the effect of reset_means=True is defined in the dataset module — confirm.
data_fil = data.get_floor_data(building=0, floor=0, reset_means=True)

# Build the map arrays for building 0 from that floor's data.
x_building, y_building = model.construct_building_map(data_fil, 0)

In [None]:
# Full frame of the selected floor (building 0, floor 0).
X_train = data_fil.get_full_df()

In [None]:
# Display the frame's shape as the cell output.
X_train.shape