In [1]:
%load_ext autoreload
%autoreload 2

%load_ext dotenv
%dotenv

In [2]:
import re

import numpy as np
import pandas as pd
import os
import featuretools as ft
from shl.prepare import normalize_epoch_time, normalize_lat_long, fillna_agg_by_label
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import \
    strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, \
    strip_short, stem_text

from shl.features import WifiFeature



In [3]:
# Raw WiFi scans for every split, with the scan timestamp column normalised.
train_wifi, test_wifi, validate_wifi = [
    normalize_epoch_time(pd.read_parquet(wifi_path), 'Epoch time [ms]')
    for wifi_path in (
        '../data/train/WiFi.parquet',
        '../data/test/WiFi.parquet',
        '../data/validate/WiFi.parquet',
    )
]

# Label frames for every split (the test split only ships a label index file).
train_label, test_label, validate_label = [
    normalize_epoch_time(pd.read_parquet(label_path), 'epoch_time')
    for label_path in (
        '../data/train/Label.parquet',
        '../data/test/Label_idx.parquet',
        '../data/validate/Label.parquet',
    )
]

In [4]:
# Attach every WiFi reading to its label row (left join keeps labels without
# readings), order newest/strongest first and keep only the inspection columns.
train_wifi_with_labels = (
    train_label
    .merge(train_wifi, how='left', on='epoch_time_id')
    .sort_values(by=['epoch_time_id', 'RSSI'], ascending=False)
    .loc[:, ['epoch_time', 'label', 'SSID', 'RSSI']]
)
train_wifi_with_labels

Unnamed: 0,epoch_time,label,SSID,RSSI
9073182,1499267852000,5,Virgin Media,-78.0
9073178,1499267852000,5,Virgin Media,-82.0
9073179,1499267852000,5,VM1689925,-85.0
9073181,1499267852000,5,BTWifi-with-FON,-86.0
9073180,1499267852000,5,BTWifi-X,-87.0
...,...,...,...,...
3,1490431583000,4,VM8236218,-48.0
4,1490431583000,4,Virgin Media,-48.0
1,1490431583000,4,TP-LINK_F5B890,-64.0
5,1490431583000,4,TALKTALKB93F0E,-71.0


In [5]:
# Per-label subsets of the labelled readings, named after the label codes they select.
train_wifi_with_labels_run = train_wifi_with_labels.loc[lambda frame: frame['label'].isin([2,3])]
train_wifi_with_labels_car = train_wifi_with_labels.loc[lambda frame: frame['label'].isin([5])]
train_wifi_with_labels_bus = train_wifi_with_labels.loc[lambda frame: frame['label'].isin([6])]

In [6]:
def normalize_ssid(ssid: pd.Series):
    """Collapse device-specific SSIDs into coarser, vendor-level names.

    Three passes run in order on every value:

    1. Remove every run that starts with two digits and continues with digits
       or upper-case letters (``VM8236218`` -> ``VM``).
    2. If what is left starts with one of the listed vendor prefixes, keep
       only that prefix (``BTHub3-PHSH`` -> ``BTHub``).
    3. Strip surrounding whitespace.

    NOTE(review): the prefix list is case-sensitive and contains ``BTWiFi``;
    values spelled ``BTWifi-...`` are therefore left untouched - confirm that
    this is intended.

    :param ssid: series of SSID strings
    :return: series of normalised SSID strings, same index as ``ssid``
    """
    serial_pattern = "[0-9]{2}[0-9A-Z]+"
    vendor_pattern = "^(BTHub|BTWiFi|BTOpenzone|DIRECT|EE(?:-BrightBox)?|HP-Print|MY WIFI|NETGEAR|PLUSNET|SKY|TALKTALK|TNCAP|VodafoneMobileWiFi|XLN For Small Biz|VM|ZyXEL|EXT2-BTHub|EXT2-PLUSNET).*$"

    without_serial = ssid.str.replace(serial_pattern, "", regex=True)
    vendor_only = without_serial.str.replace(vendor_pattern, "\\1", regex=True)
    return vendor_only.str.strip()

def get_strongest_ssid_by_rssi(wifi: pd.DataFrame, n_top: int = 2, **limit_kwargs) -> pd.DataFrame:
    """Return the ``n_top`` strongest readings of every ``epoch_time_id``.

    Readings are first restricted to an RSSI window, then ordered strongest
    first within each timestamp, and finally truncated per timestamp.

    :param wifi: WiFi readings with ``epoch_time_id`` and ``RSSI`` columns
    :param n_top: maximum number of rows kept per ``epoch_time_id``
    :param limit_kwargs: forwarded to ``limit_most_strongest``
        (``rssi_min`` / ``rssi_max``)
    :return: data frame (not a series) of the selected rows; ``reset_index()``
        turns the previous index into a regular column
    """
    in_window = limit_most_strongest(wifi, **limit_kwargs)
    strongest_first = sort_by_most_strongets_rssi(in_window)
    return strongest_first.groupby(by='epoch_time_id').head(n_top).reset_index()

def limit_most_strongest(wifi: pd.DataFrame, rssi_min: int = -70, rssi_max: int = 0):
    """Keep only the readings whose RSSI lies in ``[rssi_min, rssi_max]``.

    Both bounds are inclusive.

    :param wifi: WiFi readings with an ``RSSI`` column
    :param rssi_min: weakest signal strength that is still kept
    :param rssi_max: strongest signal strength that is still kept
    :return: the matching rows of ``wifi``
    """
    rssi_window = f'{rssi_min} <= RSSI <= {rssi_max}'
    return wifi.query(rssi_window)

def sort_by_most_strongets_rssi(wifi: pd.DataFrame):
    """Order readings by timestamp (ascending), strongest RSSI first within each.

    :param wifi: WiFi readings with ``epoch_time_id`` and ``RSSI`` columns
    :return: sorted copy of ``wifi``; the original index labels are kept
    """
    # Column name -> "sort ascending?"
    sort_spec = {'epoch_time_id': True, 'RSSI': False}
    return wifi.sort_values(by=list(sort_spec), ascending=list(sort_spec.values()))

def drop_duplicates_ssid(wifi: pd.DataFrame, col: str):
    """Keep the strongest reading per (timestamp, ``col``) pair and drop incomplete rows.

    After ordering strongest-first, only the first row of every
    ``(epoch_time_id, col)`` combination survives. Empty strings are then
    turned into NaN and every row containing a NaN is removed. Note that this
    last step looks at *all* columns, not only ``col``.

    :param wifi: WiFi readings with ``epoch_time_id`` and ``RSSI`` columns
    :param col: column whose duplicates are collapsed per timestamp
    :return: de-duplicated readings without empty or missing values
    """
    strongest_first = sort_by_most_strongets_rssi(wifi)
    deduplicated = strongest_first.drop_duplicates(subset=['epoch_time_id', col], keep='first')
    blanks_as_nan = deduplicated.replace('', np.nan)
    return blanks_as_nan.dropna()


def print_unique(ssid):
    """Print the sorted distinct values of ``ssid`` and, on the next line, their count as a shape tuple."""
    distinct = ssid.unique()
    print(np.sort(distinct), distinct.shape, sep='\n')

def prepare_ssid_feature(wifi: pd.DataFrame, col: str = 'SSID'):
    """Build one row per timestamp holding its strongest normalised ``col`` values.

    The values of ``col`` are normalised, de-duplicated per timestamp and
    reduced to the strongest readings (defaults of
    ``get_strongest_ssid_by_rssi``). The survivors are numbered within each
    timestamp and spread into columns ``<col>_1``, ``<col>_2``, ...

    :param wifi: WiFi readings with ``epoch_time_id``, ``RSSI`` and ``col``
    :param col: column to normalise and pivot
    :return: wide frame with ``epoch_time_id`` plus one column per rank,
        sorted by ``epoch_time_id``
    """
    normalized = wifi.assign(**{col: normalize_ssid(wifi[col])})
    strongest = get_strongest_ssid_by_rssi(drop_duplicates_ssid(normalized, col))

    # 1-based rank of every reading inside its timestamp becomes the column name
    rank_within_timestamp = strongest.groupby(by='epoch_time_id').cumcount() + 1
    strongest['pivot_variable'] = rank_within_timestamp.map(lambda rank: f"{col}_{rank}")

    # Long -> wide: one column per rank
    wide = strongest.pivot(index="epoch_time_id", columns="pivot_variable", values=col)
    return wide.reset_index().sort_values(by='epoch_time_id')

def concat_ssid(wifi: pd.DataFrame, col: str = 'SSID', to_string: bool = True, top_n: int = 5, **limit_kwargs) -> pd.DataFrame:
    """Aggregate the strongest distinct ``col`` values of every timestamp into one cell.

    :param wifi: WiFi readings with ``epoch_time_id``, ``RSSI`` and ``col``
    :param col: column whose values are collected
    :param to_string: join the values with single spaces when true, otherwise
        return them as a list
    :param top_n: maximum number of values kept per timestamp
    :param limit_kwargs: RSSI window, forwarded to ``get_strongest_ssid_by_rssi``
    :return: frame with columns ``epoch_time_id`` and ``col``
    """
    def _as_sentence(values):
        return ' '.join(values)

    def _as_list(values):
        return list(np.hstack(values.values))

    combine = _as_sentence if to_string else _as_list
    strongest = get_strongest_ssid_by_rssi(drop_duplicates_ssid(wifi, col), top_n, **limit_kwargs)
    return strongest.groupby(by='epoch_time_id')[col].apply(combine).reset_index()

def concat_normalized_bssid(wifi: pd.DataFrame, **limit_kwargs) -> pd.DataFrame:
    """Collect, per timestamp, the strongest distinct BSSID prefixes as a list.

    Every BSSID is cut down to its first eight characters
    (``c0:05:c2:b9:e3:71`` -> ``c0:05:c2``; presumably the vendor part of the
    address - confirm) before the values are aggregated by ``concat_ssid``.

    :param wifi: WiFi readings with ``epoch_time_id``, ``RSSI`` and ``BSSID``
    :param limit_kwargs: forwarded to ``concat_ssid`` (``top_n``, RSSI window)
    :return: frame with columns ``epoch_time_id`` and ``BSSID`` (list cells)
    """
    bssid_prefix = wifi['BSSID'].astype(str).str.slice(start=0, stop=8)
    return concat_ssid(wifi.assign(BSSID=bssid_prefix), col='BSSID', to_string=False, **limit_kwargs)
# features_train_ssid = prepare_ssid_feature(train_wifi)
# print(features_train_ssid)
#
# features_train_ssid.to_parquet('../data/train/features_wifi_ssid.parquet')
# prepare_ssid_feature(test_wifi).to_parquet('../data/test/features_wifi_ssid.parquet')
# prepare_ssid_feature(validate_wifi).to_parquet('../data/validate/features_wifi_ssid.parquet')

def create_wifi_features(wifi: pd.DataFrame) -> pd.DataFrame:
    """Build per-timestamp SSID, Capabilities and BSSID features for two RSSI bands.

    For a strong band (-70..0 dBm) and a weak band (-100..-71 dBm) the up to
    five strongest distinct values of each column are aggregated and the six
    partial results are outer-joined on ``epoch_time_id``. A cell is missing
    when a timestamp has no reading in the corresponding band.

    The bands are inclusive on both ends, so the weak band has to stop at -71:
    with the former upper bound of -69 every reading at -70 or -69 dBm was
    counted in both bands.

    Resulting columns: ``epoch_time_id``, ``SSID_high``, ``SSID_low``,
    ``Capabilities``, ``Capabilities_low``, ``BSSID``, ``BSSID_low``.
    The strong-band ``Capabilities`` and ``BSSID`` columns carry no ``_high``
    suffix because merge suffixes only apply to colliding names; the names are
    kept as they are so that existing consumers of the frame keep working.

    :param wifi: WiFi readings with ``epoch_time_id``, ``RSSI``, ``SSID``,
        ``Capabilities`` and ``BSSID`` columns
    :return: one row per ``epoch_time_id`` with the columns listed above
    """
    high_rssi = {
        'top_n': 5,
        'rssi_min': -70,
        'rssi_max': 0,
    }
    low_rssi = {
        'top_n': 5,
        'rssi_min': -100,
        # -71, not -69: keeps the two inclusive bands disjoint
        'rssi_max': -71,
    }
    join = {'on': 'epoch_time_id', 'how': "outer"}

    features = concat_ssid(wifi, col='SSID', **high_rssi)
    features = features.merge(concat_ssid(wifi, col='SSID', **low_rssi), suffixes=("_high", "_low"), **join)
    features = features.merge(concat_ssid(wifi, col='Capabilities', **high_rssi), suffixes=("", "_high"), **join)
    features = features.merge(concat_ssid(wifi, col='Capabilities', **low_rssi), suffixes=("", "_low"), **join)
    features = features.merge(concat_normalized_bssid(wifi, **high_rssi), suffixes=("", "_high"), **join)
    features = features.merge(concat_normalized_bssid(wifi, **low_rssi), suffixes=("", "_low"), **join)
    return features

# Train split: build the features, align them with the labels
# (fillna_agg_by_label comes from shl.prepare), show and persist them.
features_train_ssid_concat = fillna_agg_by_label(
    create_wifi_features(train_wifi),
    train_label,
)
display(features_train_ssid_concat)
features_train_ssid_concat.to_parquet('../data/train/features_wifi_ssid_cap_bssid_concat.parquet')

# Test and validation splits are only written to disk, in that order.
for split_wifi, split_label, target_path in (
    (test_wifi, test_label, '../data/test/features_wifi_ssid_cap_bssid_concat.parquet'),
    (validate_wifi, validate_label, '../data/validate/features_wifi_ssid_cap_bssid_concat.parquet'),
):
    fillna_agg_by_label(create_wifi_features(split_wifi), split_label).to_parquet(target_path)

Unnamed: 0,epoch_time_id,SSID_high,SSID_low,Capabilities,Capabilities_low,BSSID,BSSID_low
0,1490431583000,VM8236218 Virgin Media TP-LINK_F5B890,TALKTALKB93F0E PlusnetWireless0647C9,[WPA2-PSK-CCMP+TKIP][WPS][ESS] [WPA-PSK-CCMP+T...,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,"[c0:05:c2, d2:05:c2, 90:f6:52]","[98:e7:f5, 30:91:8f]"
1,1490431584000,VM8236218 Virgin Media TP-LINK_F5B890,TALKTALKB93F0E PlusnetWireless0647C9,[WPA2-PSK-CCMP+TKIP][WPS][ESS] [WPA-PSK-CCMP+T...,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,"[c0:05:c2, d2:05:c2, 90:f6:52]","[98:e7:f5, 30:91:8f]"
2,1490431585000,VM8236218 Virgin Media TP-LINK_F5B890,TALKTALKB93F0E PlusnetWireless0647C9,[WPA2-PSK-CCMP+TKIP][WPS][ESS] [WPA-PSK-CCMP+T...,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,"[c0:05:c2, d2:05:c2, 90:f6:52]","[98:e7:f5, 30:91:8f]"
3,1490431586000,VM8236218 Virgin Media TP-LINK_F5B890,PlusnetWireless0647C9,[WPA2-PSK-CCMP+TKIP][WPS][ESS] [WPA-PSK-CCMP+T...,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,"[c0:05:c2, d2:05:c2, 90:f6:52]",[30:91:8f]
4,1490431587000,VM8236218 Virgin Media TP-LINK_F5B890,TALKTALKB93F0E PlusnetWireless0647C9,[WPA2-PSK-CCMP+TKIP][WPS][ESS] [WPA2-EAP-CCMP]...,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,"[c0:05:c2, d2:05:c2, 90:f6:52]","[98:e7:f5, 30:91:8f]"
...,...,...,...,...,...,...,...
1362443,1499268293000,,Virgin Media VM1689925 BTWifi-with-FON BTWifi-X,,[WPA2-EAP-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,,"[52:0d:10, d2:05:c2, 40:0d:10, 8e:10:d4, ae:10..."
1362444,1499268294000,,Virgin Media VM1689925 BTWifi-with-FON BTWifi-X,,[WPA2-EAP-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,,"[52:0d:10, d2:05:c2, 40:0d:10, 8e:10:d4, ae:10..."
1362445,1499268295000,,Virgin Media VM1689925 BTWifi-with-FON BTWifi-X,,[WPA2-EAP-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,,"[52:0d:10, d2:05:c2, 40:0d:10, 8e:10:d4, ae:10..."
1362446,1499268296000,,Virgin Media VM1689925 BTWifi-with-FON BTWifi-X,,[WPA2-EAP-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,,"[52:0d:10, d2:05:c2, 40:0d:10, 8e:10:d4, ae:10..."


In [7]:
# Inspect the raw feature frame of the validation split (before label alignment; nothing is stored).
create_wifi_features(validate_wifi)

Unnamed: 0,epoch_time_id,SSID_high,SSID_low,Capabilities,Capabilities_low,BSSID,BSSID_low
0,1497426496000,SKY484B1 BTHub3-PHSH BTWifi-with-FON BTWifi-X ...,EXT2-VM7640780 TheDarkSide VM7640780 Virgin Media,[WPA2-PSK-CCMP][WPS][ESS] [WPA-PSK-CCMP+TKIP][...,[WPA2-PSK-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,"[70:50:af, 00:03:d8, 02:03:d8, 22:03:d8, 74:44...","[1c:a5:32, 00:1d:aa, c0:05:c2, d2:05:c2, 22:39..."
1,1497426497000,SKY484B1 BTHub3-PHSH BTWifi-with-FON virginmed...,EXT2-VM7640780 TheDarkSide VM7640780 Virgin Media,[WPA2-PSK-CCMP][WPS][ESS] [WPA-PSK-CCMP+TKIP][...,[WPA2-PSK-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,"[70:50:af, 00:03:d8, 02:03:d8, 74:44:01, 22:03...","[1c:a5:32, 00:1d:aa, c0:05:c2, d2:05:c2, 22:39..."
2,1497426498000,SKY484B1 BTWifi-with-FON BTHub3-PHSH virginmed...,EXT2-VM7640780 TheDarkSide VM7640780 Virgin Media,[WPA2-PSK-CCMP][WPS][ESS] [WPA-PSK-CCMP+TKIP][...,[WPA2-PSK-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,"[70:50:af, 02:03:d8, 00:03:d8, 74:44:01, 22:03...","[1c:a5:32, 00:1d:aa, c0:05:c2, d2:05:c2, 22:39..."
3,1497426499000,SKY484B1 BTWifi-with-FON BTHub3-PHSH virginmed...,EXT2-VM7640780 TheDarkSide VM7640780 Virgin Media,[WPA2-PSK-CCMP][WPS][ESS] [WPA-PSK-CCMP+TKIP][...,[WPA2-PSK-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,"[70:50:af, 02:03:d8, 00:03:d8, 74:44:01, 22:03...","[1c:a5:32, 00:1d:aa, c0:05:c2, d2:05:c2, 22:39..."
4,1497426500000,SKY484B1 BTHub3-PHSH BTWifi-with-FON BTWifi-X ...,EXT2-VM7640780 TheDarkSide VM7640780 Virgin Media,[WPA2-PSK-CCMP][WPS][ESS] [WPA-PSK-CCMP+TKIP][...,[WPA2-PSK-CCMP][ESS] [WPA-PSK-CCMP+TKIP][WPA2-...,"[70:50:af, 00:03:d8, 02:03:d8, 22:03:d8, 74:44...","[1c:a5:32, 00:1d:aa, c0:05:c2, d2:05:c2, 22:39..."
...,...,...,...,...,...,...,...
128078,1500388975000,,TNCAPE25613,,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,,[c4:ea:1d]
128079,1500389036000,,TNCAPE25613,,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,,[c4:ea:1d]
128080,1500389095000,,TNCAPE25613,,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,,[c4:ea:1d]
128081,1500389158000,,TNCAPE25613,,[WPA-PSK-TKIP][WPA2-PSK-CCMP][WPS][ESS]\n,,[c4:ea:1d]


In [32]:
# Join the persisted train features back onto the labels (left join keeps
# every label row) and order the result chronologically.
features_wifi_train_with_labels = (
    train_label
    .merge(features_train_ssid_concat, how='left', on='epoch_time_id')
    .sort_values(by=['epoch_time_id'])
)
features_wifi_train_with_labels

In [26]:
# Raw validation readings of a single timestamp, to cross-check the feature row above.
validate_wifi.query('epoch_time_id == 1497426496000')

Unnamed: 0,Epoch time [ms],Ignore1,Ignore2,Unknown,BSSID,SSID,RSSI,Frequency [MHz],Capabilities,epoch_time_id
0,1497426495755,94062335923,25013238017,11,70:50:af:2d:ed:d9,SKY484B1,-36,2462,[WPA2-PSK-CCMP][WPS][ESS],1497426496000
1,1497426495755,94062335923,25013238017,11,74:44:01:f9:12:4c,virginmedia7072875,-66,2437,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS],1497426496000
2,1497426495755,94062335923,25013238017,11,22:03:d8:03:39:62,BTWifi-X,-66,2462,[WPA-EAP-CCMP+TKIP][WPA2-EAP-CCMP+TKIP-preauth...,1497426496000
3,1497426495755,94062335923,25013238017,11,1c:a5:32:a2:fd:1b,EXT2-VM7640780,-72,2437,[WPA2-PSK-CCMP][ESS],1497426496000
4,1497426495755,94062335923,25013238017,11,00:03:d8:03:39:62,BTHub3-PHSH,-65,2462,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS],1497426496000
5,1497426495755,94062335923,25013238017,11,c0:05:c2:b9:e3:71,VM7640780,-84,2437,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS],1497426496000
6,1497426495755,94062335923,25013238017,11,d2:05:c2:b9:e3:71,Virgin Media,-85,2437,[WPA2-EAP-CCMP][ESS],1497426496000
7,1497426495755,94062335923,25013238017,11,22:39:96:ba:99:08,BTWifi-X,-89,2412,[WPA-EAP-CCMP+TKIP][WPA2-EAP-CCMP+TKIP-preauth...,1497426496000
8,1497426495755,94062335923,25013238017,11,02:03:d8:03:39:62,BTWifi-with-FON,-65,2462,[ESS],1497426496000
9,1497426495755,94062335923,25013238017,11,fa:8f:ca:56:88:62,,-45,2462,[ESS],1497426496000


In [20]:
# Spot-check concat_ssid (default column and RSSI window) on two consecutive train timestamps.
concat_ssid(train_wifi.query('epoch_time_id == 1497272170000 or epoch_time_id == 1497272171000'))

Unnamed: 0,epoch_time_id,SSID
0,1497272170000,Virgin Media
1,1497272171000,Virgin Media WiFi


In [21]:
# Raw train readings behind the two timestamps used in the concat_ssid spot check.
train_wifi.query('epoch_time_id == 1497272170000 or epoch_time_id == 1497272171000')

Unnamed: 0,Epoch time [ms],Ignore1,Ignore2,Unknown,BSSID,SSID,RSSI,Frequency [MHz],Capabilities,epoch_time_id
10209930,1497272170202,15304015012245,130219565604,40,2c:36:f8:0f:2d:89,Wifi Extra,-65,5500,[WPA2-EAP+FT/EAP-CCMP][ESS],1497272170000
10209931,1497272170202,15304015012245,130219565604,40,c8:f9:f9:29:43:1b,EE WiFi-Auto,-65,5320,[WPA2-EAP+FT/EAP-CCMP][ESS],1497272170000
10209932,1497272170202,15304015012245,130219565604,40,c8:f9:f9:29:43:1f,222666328,-65,5320,[WPA2-EAP+FT/EAP-CCMP][ESS],1497272170000
10209933,1497272170202,15304015012245,130219565604,40,2c:36:f8:0f:2d:8b,EE WiFi-Auto,-65,5500,[WPA2-EAP+FT/EAP-CCMP][ESS],1497272170000
10209934,1497272170202,15304015012245,130219565604,40,2c:36:f8:0f:2d:8f,222666328,-65,5500,[WPA2-EAP+FT/EAP-CCMP][ESS],1497272170000
...,...,...,...,...,...,...,...,...,...,...
10210005,1497272171210,15305023205474,130219565604,40,c8:f9:f9:29:43:13,Virgin Media WiFi,-55,2437,[ESS],1497272171000
10210006,1497272171210,15305023205474,130219565604,40,2c:36:f8:0f:2d:83,Virgin Media WiFi,-70,2437,[ESS],1497272171000
10210007,1497272171210,15305023205474,130219565604,40,2c:36:f8:0f:2e:83,Virgin Media WiFi,-77,2412,[ESS],1497272171000
10210008,1497272171210,15305023205474,130219565604,40,2c:36:f8:0f:28:f3,Virgin Media WiFi,-83,2462,[ESS],1497272171000


In [5]:
# Distinct SSID values across the train, test and validation splits.
pd.concat([train_wifi['SSID'], test_wifi['SSID'], validate_wifi['SSID']]).unique()

array(['VM8236218', 'VM201213-2G', 'VM702835-2G_EXT', ...,
       'PAH_Colleague', 'HP-Print-c7-LaserJet 100', 'BTHub6-293G'],
      dtype=object)

In [None]:
%%time

# WifiFeature (from shl.features) is constructed from the train readings only and
# then applied to every split; presumably this fits it on train - TODO confirm.
wifi_feature = WifiFeature(train_wifi)
# Each transformed frame is aligned with the labels of its own split.
train_wifi_features = fillna_agg_by_label(wifi_feature.transform(train_wifi), train_label)
validate_wifi_features = fillna_agg_by_label(wifi_feature.transform(validate_wifi), validate_label)
test_wifi_features = fillna_agg_by_label(wifi_feature.transform(test_wifi), test_label)

In [None]:
# Display the label-aligned WifiFeature output of the validation split.
validate_wifi_features

In [None]:
# Keep the feature names reported by the WifiFeature instance and print them.
wifi_feature_names = wifi_feature.get_feature_names()
print(wifi_feature_names)

In [None]:
# Pass the train, validation and test feature frames to WifiFeature.hist
# (presumably plots per-split histograms - TODO confirm in shl.features).
wifi_feature.hist([train_wifi_features, validate_wifi_features, test_wifi_features])

In [None]:
# Persist the WifiFeature output of every split next to its source data.
train_wifi_features.to_parquet('../data/train/features_wifi_names.parquet')
test_wifi_features.to_parquet('../data/test/features_wifi_names.parquet')
validate_wifi_features.to_parquet('../data/validate/features_wifi_names.parquet')

In [None]:
# Column dtypes of the train feature frame that was written to parquet.
train_wifi_features.dtypes