# Overview
- nb013で作ったデータのEDA

In [1]:
import subprocess

# Print the short commit hash of the current checkout next to the notebook outputs.
cmd = "git rev-parse --short HEAD"
raw_output = subprocess.check_output(cmd.split())
hash = raw_output.strip().decode('utf-8')  # NOTE: this name shadows the builtin hash()
print(hash)

cf1ce7c


# Const

In [2]:
NB = '014'

# Input files share one root directory.
_DIR_INPUT = './../data_ignore/input/'
DIR_TRAIN = _DIR_INPUT + 'train/'
DIR_TEST = _DIR_INPUT + 'test/'
DIR_WIFI = _DIR_INPUT + 'wifi/'
PATH_SUB = _DIR_INPUT + 'sample_submission.csv'
PATH_99_SUB = './../data/input/floor_99per_acc_sub.csv'

# Per-notebook output directories, keyed by the notebook number.
DIR_SAVE_IGNORE = './../data_ignore/nb/' + NB + '/'
DIR_SAVE = './../data/nb/' + NB + '/'

# Import everything I need:)

In [3]:
import os
import time
import yaml
import json
import types
import random
import pickle
import builtins
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from icecream import ic
from ipdb import set_trace as st
import matplotlib.pyplot as plt
from dataclasses import dataclass
# from tqdm import tqdm
from fastprogress import progress_bar, master_bar
from glob import glob
from loguru import logger
from collections import OrderedDict
from lmfit import Parameters, Minimizer, report_fit


# Function

In [4]:
def imports():
    """Yield ``(name, value)`` for every module and every callable in this module's globals.

    A value that is both a module and callable is yielded once per matching check.
    """
    for name, val in globals().items():
        # imported modules
        if isinstance(val, types.ModuleType):
            yield (name, val)

        # functions, classes and any other object exposing __call__
        if hasattr(val, '__call__'):
            yield (name, val)


def noglobal(f):
    '''
    Rebuild ``f`` so that its global namespace contains only modules and callables.

    The returned function reuses ``f``'s code object, positional defaults and
    closure, but its globals are ``dict(imports())``. Reading any other
    notebook-level variable from inside the function therefore fails with
    ``NameError`` instead of silently picking up kernel state.

    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    isolated = types.FunctionType(f.__code__,
                                  dict(imports()),
                                  f.__name__,
                                  f.__defaults__,
                                  f.__closure__
                                  )
    # types.FunctionType only accepts positional defaults; without the copies below,
    # keyword-only defaults and the remaining function metadata would be lost.
    isolated.__kwdefaults__ = f.__kwdefaults__
    isolated.__doc__ = f.__doc__
    isolated.__qualname__ = f.__qualname__
    isolated.__module__ = f.__module__
    isolated.__dict__.update(f.__dict__)
    return isolated


def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean of (Euclidean x/y error + 15 * absolute floor error) over all samples.

    ``xhat``/``yhat``/``fhat`` are the predictions and ``x``/``y``/``f`` the targets;
    the sum is divided by ``xhat.shape[0]``.
    """
    squared_dist = np.power(xhat-x, 2) + np.power(yhat-y, 2)
    floor_penalty = 15 * np.abs(fhat-f)
    per_sample = np.sqrt(squared_dist) + floor_penalty
    n_samples = xhat.shape[0]
    return per_sample.sum()/n_samples

def seed_everything(seed=42):
    """Seed Python's ``random``, ``PYTHONHASHSEED``, NumPy and (when installed) PyTorch.

    Args:
        seed: integer seed applied to every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # torch is not among this notebook's top-level imports, so import it here and
    # skip the torch-specific seeding when it is not installed.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between runs,
    # which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

In [5]:
@dataclass
class ReadData:
    """Arrays parsed from one trace file by ``read_data_file``, one field per record type."""
    acce: np.ndarray
    acce_uncali: np.ndarray
    gyro: np.ndarray
    gyro_uncali: np.ndarray
    magn: np.ndarray
    magn_uncali: np.ndarray
    ahrs: np.ndarray
    wifi: np.ndarray
    ibeacon: np.ndarray
    waypoint: np.ndarray


def read_data_file(data_filename):
    """Parse a tab-separated trace file into a ``ReadData`` of NumPy arrays.

    Blank lines and lines starting with ``#`` are skipped. The second field of
    each remaining line selects the record type:

    - the seven sensor types -> ``[int(ts), float, float, float]`` from fields 0, 2, 3, 4
    - ``TYPE_WIFI``          -> fields 0, 2, 3, 4, 6 kept as strings
    - ``TYPE_BEACON``        -> field 0, fields 2-4 joined with ``_``, field 6 (strings)
    - ``TYPE_WAYPOINT``      -> ``[int(ts), float, float]`` from fields 0, 2, 3

    Lines with any other type are ignored.
    """
    # Record types that share the "timestamp + three floats" layout.
    xyz_rows = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi_rows = []
    beacon_rows = []
    waypoint_rows = []

    with open(data_filename, 'r', encoding='utf-8') as file:
        for raw_line in file:
            text = raw_line.strip()
            if not text or text[0] == '#':
                continue

            fields = text.split('\t')
            record_type = fields[1]

            if record_type in xyz_rows:
                xyz_rows[record_type].append(
                    [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]
                )
            elif record_type == 'TYPE_WIFI':
                # sys_ts, ssid, bssid, rssi, lastseen_ts
                wifi_rows.append([fields[0], fields[2], fields[3], fields[4], fields[6]])
            elif record_type == 'TYPE_BEACON':
                # ts, uuid_major_minor, rssi
                beacon_id = '_'.join([fields[2], fields[3], fields[4]])
                beacon_rows.append([fields[0], beacon_id, fields[6]])
            elif record_type == 'TYPE_WAYPOINT':
                waypoint_rows.append([int(fields[0]), float(fields[2]), float(fields[3])])

    return ReadData(
        np.array(xyz_rows['TYPE_ACCELEROMETER']),
        np.array(xyz_rows['TYPE_ACCELEROMETER_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_GYROSCOPE']),
        np.array(xyz_rows['TYPE_GYROSCOPE_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD']),
        np.array(xyz_rows['TYPE_MAGNETIC_FIELD_UNCALIBRATED']),
        np.array(xyz_rows['TYPE_ROTATION_VECTOR']),
        np.array(wifi_rows),
        np.array(beacon_rows),
        np.array(waypoint_rows),
    )

In [6]:
def trace_plot(site, floor):
    """Draw the waypoint path of every training trace of one floor on its floor image.

    Args:
        site: site id used as the directory name under the train/metadata folders.
        floor: floor name used as the sub-directory name.

    Returns:
        The matplotlib figure holding the floor image and one line per trace.
    """
    # trace
    path_trace_list = sorted(glob(f'./../data_ignore/input/train/{site}/{floor}/*'))

    # json
    path_json = f'./../data_ignore/input/metadata/{site}/{floor}/floor_info.json'
    with open(path_json) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]

    # show image, stretched so that the axes use the width/height from floor_info.json
    title = f'site: {site}\nfloor: {floor}'
    path_img = f'./../data_ignore/input/metadata/{site}/{floor}/floor_image.png'
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    fig.patch.set_facecolor('white')
    fig.suptitle(title, fontsize=18)
    # np.asarray() reads the pixels inside the with-block, so the file can be closed afterwards
    with Image.open(path_img) as im:
        ax.imshow(np.asarray(im), extent=(0, width_meter, 0, height_meter))

    for path_trace in path_trace_list:
        waypoints = read_data_file(path_trace).waypoint
        if waypoints.ndim != 2:
            # A trace without waypoint rows parses to an empty 1-D array, which
            # cannot be column-indexed; there is nothing to draw for it.
            continue
        ax.plot(waypoints[:, 1], waypoints[:, 2], linewidth=2, alpha=0.5)
    return fig

In [7]:
def get_site_floor_fig(site, floor):
    """Create a figure showing the floor image with axes scaled to the floor_info.json size.

    Args:
        site: site id used as the directory name under the metadata folder.
        floor: floor name used as the sub-directory name.

    Returns:
        ``(fig, ax, width_meter, height_meter)`` where the last two are the
        ``map_info`` width and height read from ``floor_info.json``.
    """
    # json
    path_json = f'./../data_ignore/input/metadata/{site}/{floor}/floor_info.json'
    with open(path_json) as json_file:
        json_data = json.load(json_file)
    width_meter = json_data["map_info"]["width"]
    height_meter = json_data["map_info"]["height"]

    # Scale the longer side to 10 inches. round() can return 0 for a very
    # elongated floor, and matplotlib rejects a zero figure size, so clamp to 1.
    s = max(width_meter, height_meter)
    w = max(1, round(10*width_meter/s))
    h = max(1, round(10*height_meter/s))

    # show image
    path_img = f'./../data_ignore/input/metadata/{site}/{floor}/floor_image.png'
    fig, ax = plt.subplots(1, 1, figsize=(w, h))
    fig.patch.set_facecolor('white')
    title = f'site: {site}\nfloor: {floor}'
    fig.suptitle(title, fontsize=18)
    # np.asarray() reads the pixels inside the with-block, so the file can be closed afterwards
    with Image.open(path_img) as im:
        ax.imshow(np.asarray(im), extent=(0, width_meter, 0, height_meter))
    return fig, ax, width_meter, height_meter


def gaussian2D(x, y, height, cen_x, cen_y, sig):
    """Evaluate an isotropic 2-D Gaussian with peak ``height`` centred at ``(cen_x, cen_y)``.

    ``sig`` is the shared standard deviation for both axes; ``x`` and ``y`` may be
    scalars or arrays.
    """
    scaled_dx = (cen_x-x)/sig
    scaled_dy = (cen_y-y)/sig
    exponent = -(scaled_dx**2 + scaled_dy**2)/2.0
    return height*np.exp(exponent)


def residuals(p, x, y, z):
    """Return ``z`` minus the 2-D Gaussian described by the parameters in ``p``.

    ``p`` is indexed by ``"height"``, ``"centroid_x"``, ``"centroid_y"`` and ``"sig"``;
    each entry's ``.value`` is used.
    """
    model = gaussian2D(
        x,
        y,
        p["height"].value,
        p["centroid_x"].value,
        p["centroid_y"].value,
        p["sig"].value,
    )
    return (z - model)

In [8]:
@noglobal
def fitting(x_arr, y_arr, rssi_arr, n_grid, width_meter, height_meter):
    """Fit one 2-D Gaussian to a surface built from per-point Gaussians.

    A Gaussian with sigma 20 is placed at each ``(x_arr[i], y_arr[i])``, weighted
    by ``exp(rssi_arr[i])`` normalised by the largest such weight, and summed on
    an ``n_grid`` x ``n_grid`` mesh spanning the floor. A single Gaussian is then
    fitted to that surface by least squares.

    Returns:
        ``(best_params, best_fit)``: the fitted ``[height, centroid_x,
        centroid_y, sig]`` values and the fitted Gaussian evaluated on the mesh.
    """
    grid_x, grid_y = np.meshgrid(np.linspace(0, width_meter, n_grid),
                                 np.linspace(0, height_meter, n_grid))
    c = np.exp(np.max(rssi_arr))

    # Accumulate one Gaussian per observation.
    surface = np.zeros([n_grid, n_grid])
    for i in range(len(x_arr)):
        weight = 1/c*np.exp(rssi_arr[i])
        surface += gaussian2D(grid_x, grid_y, weight, x_arr[i], y_arr[i], 20)

    # Initial guess: centre of the floor, wide sigma.
    start = Parameters()
    start.add("height", value=np.max(1/c*np.exp(rssi_arr)), min=0.5)
    start.add("centroid_x", value=width_meter/2, min=0, max=width_meter)
    start.add("centroid_y", value=height_meter/2, min=0, max=height_meter)
    start.add("sig", value=100)

    minimizer = Minimizer(residuals, start, fcn_args=(grid_x, grid_y, surface))
    out = minimizer.leastsq()
    # Parameters keep insertion order: height, centroid_x, centroid_y, sig
    best_params = [param.value for param in out.params.values()]
    height, cen_x, cen_y, sig = best_params
    best_fit = gaussian2D(grid_x, grid_y, height, cen_x, cen_y, sig)
    return best_params, best_fit


@noglobal
def get_near_waypoints(df_wifi_ssid_bssid, df_waypoint, sec):
    """Pair each wifi observation with the waypoints recorded close to it in time.

    Args:
        df_wifi_ssid_bssid: frame with ``lastseen_ts`` and ``rssi`` columns.
        df_waypoint: frame with ``timestamp``, ``x`` and ``y`` columns.
        sec: half-width of the matching window. Timestamps are divided by 1000
            before comparison (presumably milliseconds -> seconds; confirm
            against the data source).

    Returns:
        ``(x_arr, y_arr, rssi_arr)``: the unique matched waypoint coordinates
        and the mean rssi of all observations matched to each of them.
    """
    # Loop-invariant: convert the waypoint timestamps once instead of per row.
    ts_wp_sec = df_waypoint['timestamp'].values/1000.0
    x_arr = []
    y_arr = []
    rssi_arr = []
    for idx, row in df_wifi_ssid_bssid.iterrows():
        lastseen_sec = row.lastseen_ts/1000.0
        rssi = row['rssi']
        # half-open window [lastseen - sec, lastseen + sec)
        logics = (lastseen_sec - sec <= ts_wp_sec) & (ts_wp_sec < lastseen_sec + sec)
        if np.sum(logics) != 0:
            near_wp = df_waypoint[logics]
            # extend() appends in place; `list = list + other` copied the whole
            # list on every match and made the loop quadratic.
            x_arr.extend(near_wp.x.tolist())
            y_arr.extend(near_wp.y.tolist())
            rssi_arr.extend([rssi]*len(near_wp))

    # Collapse repeated waypoints into one row with the mean rssi.
    df = pd.DataFrame({'x': x_arr, 'y': y_arr, 'rssi': rssi_arr})
    df = df.groupby(['x', 'y']).mean().reset_index()
    x_arr = df.x.values
    y_arr = df.y.values
    rssi_arr = df.rssi.values
    return x_arr, y_arr, rssi_arr

# Preparation

<br>

set

In [9]:
pd.set_option('display.max_rows', 500)

# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(DIR_SAVE_IGNORE, exist_ok=True)
os.makedirs(DIR_SAVE, exist_ok=True)

<br>

load dataset

In [10]:
# Read the sample submission CSV; its site_path_timestamp column is used below to list the sites.
sample_submission = pd.read_csv(PATH_SUB)

# EDA

In [394]:
# The site id is the part of site_path_timestamp before the first underscore.
site_ids = sample_submission.site_path_timestamp.map(lambda val: val.split('_')[0])
site_list = sorted(set(site_ids.tolist()))

In [395]:
# Pick the first site and build the path of its bssid-position CSV written under data/nb/013.
idx_site = 0
site = site_list[idx_site]
path = f'./../data/nb/013/nb013_bssid_position_{site}.csv'

In [396]:
# Load the bssid-position table for the selected site.
df_bssid_pos = pd.read_csv(path)

<br>

bssid にいくつの ssid が紐付いているか

In [40]:
idx_bssid = 0
# Among the first 33 bssids, show those that appear under more than one ssid.
for bssid in df_bssid_pos['bssid'].unique()[:33]:
    rows_of_bssid = df_bssid_pos[df_bssid_pos.bssid == bssid]
    if rows_of_bssid.ssid.nunique() == 1:
        continue
    print(bssid)
    display(rows_of_bssid[['ssid', 'floor']].sort_values('ssid'))
    print('')

b0b1a58ed86926fdd2a0c584a85834d1cc9d2e0c


Unnamed: 0,ssid,floor
3355,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F2
5040,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F3
5620,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F4
10,da39a3ee5e6b4b0d3255bfef95601890afd80709,B1
1828,da39a3ee5e6b4b0d3255bfef95601890afd80709,F1
3587,da39a3ee5e6b4b0d3255bfef95601890afd80709,F2



3520a54bb4efb13307fc259eec13fa1b53931949


Unnamed: 0,ssid,floor
3426,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F2
5110,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F3
5612,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F4
16,da39a3ee5e6b4b0d3255bfef95601890afd80709,B1
1810,da39a3ee5e6b4b0d3255bfef95601890afd80709,F1
3621,da39a3ee5e6b4b0d3255bfef95601890afd80709,F2



7969541b4a6e4aedb64983b046fc8bdae157dc97


Unnamed: 0,ssid,floor
3389,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F2
5095,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F3
5611,78f32ce289cdaccfc83a01ef3cbf4af52942c063,F4
32,da39a3ee5e6b4b0d3255bfef95601890afd80709,B1
1816,da39a3ee5e6b4b0d3255bfef95601890afd80709,F1
3582,da39a3ee5e6b4b0d3255bfef95601890afd80709,F2





<br>

bssidとssidでgroupbyした場合、階数によって位置が大きく異なるのか、stdがそれなりに大きいものがある。。。

In [73]:
# Aggregate only the numeric coordinates: the grouping keys already form the index,
# and mean/std of the string key columns raises TypeError on pandas >= 2.0.
df_bssid_pos.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_x,bssid_y,bssid_y
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std
bssid,ssid,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
000840e5c600de293cea57f13326f273c86c3988,7182afc4e5c212133d5d7d76eb3df6c24618302b,109.360597,38.202702,103.353381,5.283929
005246b6f51feb1a069e8f005d3e6aba2591b65b,737c5bea6f3540ca472ca57c814cb8e4218682ea,31.989152,6.14544,59.272956,11.647493
0076ff7a084cb2ac8c146139965ab1be296e72c4,9dfbe9dd0ea190e67cc9aad18150fc020708279d,117.134146,59.199621,111.699868,3.203099
0089ad1dd75b13e2c3ceda344988c9f89a83a2f9,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,139.756293,,89.002372,
009a3ed672be7bd1b9c4437b43a53296771af098,5731b8e08abc69d4c4d685c58164059207c93310,31.74787,0.610813,54.314438,2.870258
00ad587dcb9c7ce3788b92e22777a22ee0efea31,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,62.648435,51.026609,119.100129,8.221451
00af060fc145ee6a6a50475efa57b91cbf54237f,d839a45ebe64ab48b60a407d837fb01d3c0dfef9,74.964696,59.321907,81.120781,24.671024
00bcc61bdea4d52d050822d66952dd707c2fcdf3,5c8cc1443c4b580c96f3b830c90e3d3cc257409e,84.849745,4.278969,32.91235,10.712961
00c8933965e23cd2bb890ca08ee7f8a22f933df8,506a2ed42a391ab1c3e09b9674cdb59088d9c45b,128.740464,,123.742192,
00ef32991244ca6ce16c9ee3b910886401766bf6,f6cfaba131effbbf44d7a7b45d6312e7ac413ab5,55.813966,,50.988179,


<br>

bssid, ssid, floor で表示(groupby意味ない。見た目だけいい感じにした)
- bssid_x がfloor毎にブレていても、n_samples_rssi_over_m60 が大きな値を取っているところを信用すればよさそう！

In [75]:
feat1 = 'n_samples_rssi_over_m60'
feat2 = 'n_samples_rssi_over_m70'
# Aggregate only the numeric columns: the grouping keys already form the index,
# and the mean of the string key columns raises TypeError on pandas >= 2.0.
df_bssid_pos.groupby(['bssid', 'ssid', 'floor'])[['bssid_x', 'bssid_y', feat1, feat2]].agg(['mean']).head(20).style.bar(subset=[feat1, feat2], color=['teal'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bssid_x,bssid_y,n_samples_rssi_over_m60,n_samples_rssi_over_m70
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean,mean
bssid,ssid,floor,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
000840e5c600de293cea57f13326f273c86c3988,7182afc4e5c212133d5d7d76eb3df6c24618302b,F1,81.466819,101.107457,18,37
000840e5c600de293cea57f13326f273c86c3988,7182afc4e5c212133d5d7d76eb3df6c24618302b,F2,88.323433,97.842757,3,6
000840e5c600de293cea57f13326f273c86c3988,7182afc4e5c212133d5d7d76eb3df6c24618302b,F3,102.516362,110.253787,0,1
000840e5c600de293cea57f13326f273c86c3988,7182afc4e5c212133d5d7d76eb3df6c24618302b,F4,165.135775,104.209521,0,3
005246b6f51feb1a069e8f005d3e6aba2591b65b,737c5bea6f3540ca472ca57c814cb8e4218682ea,F2,36.334634,51.036935,0,0
005246b6f51feb1a069e8f005d3e6aba2591b65b,737c5bea6f3540ca472ca57c814cb8e4218682ea,F3,27.643669,67.508977,0,0
0076ff7a084cb2ac8c146139965ab1be296e72c4,9dfbe9dd0ea190e67cc9aad18150fc020708279d,F3,75.273692,109.434935,0,8
0076ff7a084cb2ac8c146139965ab1be296e72c4,9dfbe9dd0ea190e67cc9aad18150fc020708279d,F4,158.9946,113.964801,0,7
0089ad1dd75b13e2c3ceda344988c9f89a83a2f9,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,B1,139.756293,89.002372,0,0
009a3ed672be7bd1b9c4437b43a53296771af098,5731b8e08abc69d4c4d685c58164059207c93310,F1,32.179781,56.344017,0,12


# 品質で絞ってみる
- ssid, bssidでgroupbyしたときの bssid_x,y の stdの平均を比較する
- 良い絞り方とは、stdが小さいかつ、n_samplesがそれなりに残ってる事と定義する

## とりあえず絞る

In [220]:
# default (no filtering)
print(f'n_samples: {len(df_bssid_pos)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples: 6556


bssid_x  mean    91.517198
         std     38.972670
bssid_y  mean    97.787566
         std     13.355605
dtype: float64

In [221]:
# n_samples >= n
n = 30
df_bssid_pos_grp = df_bssid_pos.query('n_samples >= @n')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
display(df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean())

n_samples_new: 3260


bssid_x  mean    89.984640
         std     27.690216
bssid_y  mean    93.328755
         std     10.202957
dtype: float64

In [222]:
n = 30
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m70 >= @n')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 750


bssid_x  mean    76.358474
         std     17.766991
bssid_y  mean    86.322051
         std      8.439526
dtype: float64

In [223]:
n = 31
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m65 >= @n')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 282


bssid_x  mean    78.056344
         std     19.561230
bssid_y  mean    79.828682
         std      9.077671
dtype: float64

In [333]:
n = 21
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m60 >= @n')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 254


bssid_x  mean    72.264548
         std      6.115453
bssid_y  mean    78.407226
         std      6.285392
dtype: float64

In [225]:
n = 10
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m55 >= @n')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 347


bssid_x  mean    76.689268
         std     45.486007
bssid_y  mean    90.348508
         std     13.781077
dtype: float64

---> n_samples_rssi_over_m60 で絞ると一番stdが小さくなる！！でもサンプル数が254個しかないんだよな...

## n_samples >= 30 の条件のもと絞ってみる

In [238]:
n = 40
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m70 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 322


bssid_x  mean    85.188499
         std     15.829805
bssid_y  mean    85.388175
         std     10.450296
dtype: float64

In [252]:
n = 35
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m65 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 178


bssid_x  mean    88.228703
         std     18.130738
bssid_y  mean    83.778134
         std      8.753632
dtype: float64

In [290]:
n = 17
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m60 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 357


bssid_x  mean    74.876323
         std      6.110074
bssid_y  mean    81.475961
         std      5.287771
dtype: float64

In [282]:
n = 5
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m55 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 604


bssid_x  mean    87.016805
         std     18.632011
bssid_y  mean    92.275059
         std      8.876705
dtype: float64

# あと1サイトぐらい見とく

In [334]:
# Switch to the site at index 5 and reload its bssid-position table; the cell displays the row count.
idx_site = 5
site = site_list[idx_site]
path = f'./../data/nb/013/nb013_bssid_position_{site}.csv'
df_bssid_pos = pd.read_csv(path)
len(df_bssid_pos)

1356

---> 小さめ

## n_samples >= 30 の条件のもと絞ってみる

In [335]:
n = 40
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m70 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 36


bssid_x  mean    12.710567
         std           NaN
bssid_y  mean    64.248375
         std           NaN
dtype: float64

In [336]:
n = 35
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m65 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 20


bssid_x  mean    12.842954
         std           NaN
bssid_y  mean    55.263272
         std           NaN
dtype: float64

In [346]:
n = 17
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m60 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 44


bssid_x  mean    13.880577
         std      5.732712
bssid_y  mean    59.181389
         std     11.233474
dtype: float64

In [339]:
n = 5
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m55 >= @n & n_samples >= 30')
print(f'n_samples_new: {len(df_bssid_pos_grp)}')
# Aggregate only the numeric coordinates; string key columns break mean/std on pandas >= 2.0.
df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].agg(['mean', 'std']).mean()

n_samples_new: 72


bssid_x  mean    15.538480
         std      2.977633
bssid_y  mean    60.166884
         std     19.428094
dtype: float64

# 全部のサイトでbssid_xを確認。サンプル比率が5%になるように閾値調整

In [393]:
for site in site_list:
    print(f'site: {site}')
    path = f'./../data/nb/013/nb013_bssid_position_{site}.csv'
    df_bssid_pos = pd.read_csv(path)

    # Raise the threshold n until at most 5% of the rows survive the filter.
    # If no n in 0..99 gets there, the loop ends with n = 99 and that result is reported.
    for n in range(0, 100):
        df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m60 >= @n & n_samples >= 20')
        rate = len(df_bssid_pos_grp)/len(df_bssid_pos)
        if rate <= 0.05:
            break
    # Only the std of the final selection is reported, so compute it once after the scan.
    bssid_x_std = df_bssid_pos_grp.groupby(['bssid', 'ssid'])['bssid_x'].agg('std').mean()
    print(f'n_samples_new : {len(df_bssid_pos_grp)}')
    print(f'n             : {n}')
    print(f'std           : {bssid_x_std:.3f}')
    print(f'rate          : {len(df_bssid_pos_grp)/len(df_bssid_pos):.3f}')
    print(f'n_ssid_unique : {df_bssid_pos_grp.ssid.nunique()}')
    print(f'n_bssid_unique: {df_bssid_pos_grp.bssid.nunique()}')
    print('')

site: 5a0546857ecc773753327266
n_samples_new : 307
n             : 19
std           : 7.055
rate          : 0.047
n_ssid_unique : 66
n_bssid_unique: 295

site: 5c3c44b80379370013e0fd2b
n_samples_new : 289
n             : 9
std           : 24.509
rate          : 0.041
n_ssid_unique : 76
n_bssid_unique: 252

site: 5d27075f03f801723c2e360f
n_samples_new : 594
n             : 14
std           : 11.417
rate          : 0.047
n_ssid_unique : 149
n_bssid_unique: 561

site: 5d27096c03f801723c31e5e0
n_samples_new : 111
n             : 17
std           : 11.366
rate          : 0.047
n_ssid_unique : 69
n_bssid_unique: 104

site: 5d27097f03f801723c320d97
n_samples_new : 146
n             : 15
std           : 6.092
rate          : 0.045
n_ssid_unique : 42
n_bssid_unique: 140

site: 5d27099f03f801723c32511d
n_samples_new : 63
n             : 15
std           : 4.920
rate          : 0.046
n_ssid_unique : 38
n_bssid_unique: 57

site: 5d2709a003f801723c3251bf
n_samples_new : 67
n             : 14
std   

# bssidはssidの違いで異なる場所にあるのか？
- 結果としては、同じ位置にあると考えてよさそう
- ただし、低クオリティがあるとその限りではない

In [471]:
# Go back to the site at index 0 and reload its bssid-position table.
idx_site = 0
site = site_list[idx_site]
path = f'./../data/nb/013/nb013_bssid_position_{site}.csv'
df_bssid_pos = pd.read_csv(path)

<br>

↓ low quality だと結構ブレてる

In [472]:
# Count, per bssid, how many (bssid, ssid) groups have a bssid_x mean.
ssid_count_per_bssid = df_bssid_pos.groupby(['bssid', 'ssid'])[['bssid_x']].mean().groupby('bssid').count()
n_ssid_in_bssid = ssid_count_per_bssid.bssid_x.values
logics = n_ssid_in_bssid != 1
bssids = ssid_count_per_bssid.index[logics]
# Show the mean position per ssid for the first 10 such bssids.
for bssid in bssids[:10]:
    rows_of_bssid = df_bssid_pos.query('bssid == @bssid')
    display(rows_of_bssid.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
01e25e4a25acd32baf5137b3031151f751fadbb4,0aeea673f8f57dc553bb8f338b0ff99004d25e14,124.815375,109.044916
01e25e4a25acd32baf5137b3031151f751fadbb4,c08b184f502ecb49441f201c7ed5648cfd1919e7,89.171743,95.402943


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
02a1be3a5dab38320f879489d8a1e0f2a72768b3,78f32ce289cdaccfc83a01ef3cbf4af52942c063,124.742915,108.662573
02a1be3a5dab38320f879489d8a1e0f2a72768b3,da39a3ee5e6b4b0d3255bfef95601890afd80709,92.32911,98.484289


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
04af33baa87957b484fdf8d5f17a64f972af7f2b,78f32ce289cdaccfc83a01ef3cbf4af52942c063,134.577576,56.792184
04af33baa87957b484fdf8d5f17a64f972af7f2b,da39a3ee5e6b4b0d3255bfef95601890afd80709,84.104587,38.963214


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
069d1a6cd62e727b197abd1c5efb2f678157150a,78f32ce289cdaccfc83a01ef3cbf4af52942c063,57.172896,71.510932
069d1a6cd62e727b197abd1c5efb2f678157150a,da39a3ee5e6b4b0d3255bfef95601890afd80709,74.360013,73.45359


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
0acd5446bf9e351dc039c99e653c4e398171a9b2,78f32ce289cdaccfc83a01ef3cbf4af52942c063,58.943197,72.035407
0acd5446bf9e351dc039c99e653c4e398171a9b2,da39a3ee5e6b4b0d3255bfef95601890afd80709,75.3423,78.519359


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
0ae129da9fb6e90b624d53a7e34a3edbb0a7244b,78f32ce289cdaccfc83a01ef3cbf4af52942c063,121.647626,103.555996
0ae129da9fb6e90b624d53a7e34a3edbb0a7244b,da39a3ee5e6b4b0d3255bfef95601890afd80709,91.732763,89.350313


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
0e005683e100c08b4b74c76e6392c3dd86490690,78f32ce289cdaccfc83a01ef3cbf4af52942c063,83.06101,110.205428
0e005683e100c08b4b74c76e6392c3dd86490690,da39a3ee5e6b4b0d3255bfef95601890afd80709,68.590869,99.019439


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
0eafcd2f8e1fbad0cc7b0ee227d18c772d562e6d,78f32ce289cdaccfc83a01ef3cbf4af52942c063,66.544287,57.329467
0eafcd2f8e1fbad0cc7b0ee227d18c772d562e6d,da39a3ee5e6b4b0d3255bfef95601890afd80709,50.825542,54.239964


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
11035847ca0854c99104df042eb0c28912b47855,78f32ce289cdaccfc83a01ef3cbf4af52942c063,114.04214,74.530871
11035847ca0854c99104df042eb0c28912b47855,da39a3ee5e6b4b0d3255bfef95601890afd80709,86.233712,28.125248


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
11fa59b18892d70b03751701eff1707495ccf618,78f32ce289cdaccfc83a01ef3cbf4af52942c063,122.804459,105.238002
11fa59b18892d70b03751701eff1707495ccf618,da39a3ee5e6b4b0d3255bfef95601890afd80709,104.992342,94.211801


<br>

↓ high quality だと割と安定している

In [473]:
# Keep only rows passing both sample-count thresholds, then repeat the multi-ssid check.
df_bssid_pos_grp = df_bssid_pos.query('n_samples_rssi_over_m60 >= 17 & n_samples >= 20')
ssid_count_per_bssid = df_bssid_pos_grp.groupby(['bssid', 'ssid'])[['bssid_x']].mean().groupby('bssid').count()
n_ssid_in_bssid = ssid_count_per_bssid.bssid_x.values
logics = n_ssid_in_bssid != 1
bssids = ssid_count_per_bssid.index[logics]
for bssid in bssids[:10]:
    rows_of_bssid = df_bssid_pos_grp.query('bssid == @bssid')
    display(rows_of_bssid.groupby(['bssid', 'ssid'])[['bssid_x', 'bssid_y']].mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
296ea9e41acd823ca04a211edefa7cc6457728a4,78f32ce289cdaccfc83a01ef3cbf4af52942c063,42.01813,56.430527
296ea9e41acd823ca04a211edefa7cc6457728a4,da39a3ee5e6b4b0d3255bfef95601890afd80709,40.203757,69.259264


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
b42376db2f3421b282eaa7a344545193b1e41a92,78f32ce289cdaccfc83a01ef3cbf4af52942c063,37.168394,51.571003
b42376db2f3421b282eaa7a344545193b1e41a92,da39a3ee5e6b4b0d3255bfef95601890afd80709,42.062732,66.432616


Unnamed: 0_level_0,Unnamed: 1_level_0,bssid_x,bssid_y
bssid,ssid,Unnamed: 2_level_1,Unnamed: 3_level_1
fb61c720398cb95914f05f1abe37f51eee51279f,78f32ce289cdaccfc83a01ef3cbf4af52942c063,79.779645,41.78853
fb61c720398cb95914f05f1abe37f51eee51279f,da39a3ee5e6b4b0d3255bfef95601890afd80709,89.343475,31.706719
