In [1]:
import re

import numpy as np
import pandas as pd
import os
import featuretools as ft
from shl.prepare import normalize_epoch_time, normalize_lat_long
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.parsing.preprocessing import \
    strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, \
    strip_short, stem_text

from shl.features import WifiFeature

%load_ext autoreload
%autoreload 2



In [2]:
def _load_normalized(path: str, time_column: str) -> pd.DataFrame:
    """Read one parquet file and pass it through ``normalize_epoch_time`` on ``time_column``."""
    return normalize_epoch_time(pd.read_parquet(path), time_column)


# Raw WiFi scans for every split; their timestamp column is 'Epoch time [ms]'.
train_wifi, test_wifi, validate_wifi = (
    _load_normalized(path, 'Epoch time [ms]')
    for path in (
        '../data/train/WiFi.parquet',
        '../data/test/WiFi.parquet',
        '../data/validate/WiFi.parquet',
    )
)

# Labels for every split; their timestamp column is 'epoch_time'.
# The test split is read from 'Label_idx.parquet' rather than 'Label.parquet'.
train_label, test_label, validate_label = (
    _load_normalized(path, 'epoch_time')
    for path in (
        '../data/train/Label.parquet',
        '../data/test/Label_idx.parquet',
        '../data/validate/Label.parquet',
    )
)

In [3]:
# Left-join the training labels with the WiFi scans on 'epoch_time_id' and display the result.
# Every label row is kept and is repeated once per matching WiFi row, so the joined frame is
# much longer than the label table. The result is only displayed, not stored.
train_label.merge(train_wifi, how='left', on='epoch_time_id')


Unnamed: 0,epoch_time,label,epoch_time_id,Epoch time [ms],Ignore1,Ignore2,Unknown,BSSID,SSID,RSSI,Frequency [MHz],Capabilities
0,1490431583000,4,1490431583000,1.490432e+12,7.138649e+11,2.426589e+08,6.0,c0:05:c2:29:40:ff,VM8236218,-43.0,5220.0,[WPA2-PSK-CCMP+TKIP][WPS][ESS]
1,1490431583000,4,1490431583000,1.490432e+12,7.138649e+11,2.426589e+08,6.0,c0:05:c2:29:40:f9,VM8236218,-48.0,2412.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
2,1490431583000,4,1490431583000,1.490432e+12,7.138649e+11,2.426589e+08,6.0,d2:05:c2:29:40:f9,Virgin Media,-48.0,2412.0,[WPA2-EAP-CCMP][ESS]
3,1490431583000,4,1490431583000,1.490432e+12,7.138649e+11,2.426589e+08,6.0,90:f6:52:f5:b8:90,TP-LINK_F5B890,-64.0,2452.0,[WPA-PSK-CCMP][WPA2-PSK-CCMP][WPS][ESS]
4,1490431583000,4,1490431583000,1.490432e+12,7.138649e+11,2.426589e+08,6.0,98:e7:f5:b9:3f:14,TALKTALKB93F0E,-71.0,2432.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
...,...,...,...,...,...,...,...,...,...,...,...,...
9073178,1499267852000,5,1499267852000,1.499268e+12,2.513964e+13,1.496746e+11,5.0,52:0d:10:1a:29:49,Virgin Media,-78.0,2437.0,[WPA2-EAP-CCMP][ESS]
9073179,1499267852000,5,1499267852000,1.499268e+12,2.513964e+13,1.496746e+11,5.0,d2:05:c2:bb:1b:69,Virgin Media,-82.0,2462.0,[WPA2-EAP-CCMP][ESS]
9073180,1499267852000,5,1499267852000,1.499268e+12,2.513964e+13,1.496746e+11,5.0,40:0d:10:1a:29:49,VM1689925,-85.0,2437.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
9073181,1499267852000,5,1499267852000,1.499268e+12,2.513964e+13,1.496746e+11,5.0,ae:10:d4:0e:d6:d5,BTWifi-X,-87.0,2462.0,[WPA-EAP-CCMP+TKIP][WPA2-EAP-CCMP+TKIP-preauth...


In [4]:
def normalize_ssid(ssid: pd.Series):
    """Collapse device-specific SSIDs into coarser network-family names.

    Three passes are applied in order:

    1. every run of two digits followed by at least one more digit or uppercase
       letter is removed (e.g. serial-like suffixes);
    2. if what remains starts with one of the listed provider/vendor prefixes,
       only that prefix is kept;
    3. surrounding whitespace is trimmed.

    :param ssid: Series of raw SSID strings.
    :return: Series of normalized SSID strings, same index as the input.
    """
    # Pass 1: drop serial-number-like fragments anywhere in the name.
    without_serials = ssid.str.replace("[0-9]{2}[0-9A-Z]+", "", regex=True)

    # Pass 2: reduce known families to their bare prefix. Matching is case-sensitive.
    # NOTE(review): the sample output in this notebook contains 'BTWifi-X', which the
    # 'BTWiFi' alternative below does not match - confirm whether that is intended.
    family_only = without_serials.str.replace(
        "^(BTHub|BTWiFi|BTOpenzone|DIRECT|EE(?:-BrightBox)?|HP-Print|MY WIFI|NETGEAR|PLUSNET|SKY|TALKTALK|TNCAP|VodafoneMobileWiFi|XLN For Small Biz|VM|ZyXEL|EXT2-BTHub|EXT2-PLUSNET).*$",
        "\\1",
        regex=True,
    )

    # Pass 3: tidy up whitespace left behind by the removals.
    return family_only.str.strip()

def get_strongest_ssid_by_rssi(wifi: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
    """Keep the ``top_n`` strongest rows (highest RSSI) for every ``epoch_time_id``.

    :param wifi: frame with at least the columns ``RSSI`` and ``epoch_time_id``.
    :param top_n: maximum number of rows kept per ``epoch_time_id`` (default 5).
    :return: DataFrame ordered by descending RSSI; the previous index is kept
        as a regular column by ``reset_index()``.
    """
    # A stable sort keeps rows with equal RSSI in their input order, so which of
    # several equally strong networks makes the cut does not depend on the sort
    # algorithm's internals.
    ranked = wifi.sort_values(by='RSSI', ascending=False, kind='stable')
    return ranked.groupby(by='epoch_time_id').head(top_n).reset_index()

def print_unique(ssid):
    """Print the sorted distinct values of ``ssid``, then the shape of the distinct-value array."""
    distinct = ssid.unique()
    # Two lines of output: the sorted values first, the shape tuple second.
    print(np.sort(distinct), distinct.shape, sep="\n")

def prepare_ssid_feature(wifi: pd.DataFrame) -> pd.DataFrame:
    """Build one row per ``epoch_time_id`` holding its strongest normalized SSIDs.

    Steps:

    1. normalize the ``SSID`` column with ``normalize_ssid``;
    2. keep, for every (``epoch_time_id``, normalized ``SSID``) pair, the row with
       the highest RSSI;
    3. keep the strongest rows per ``epoch_time_id`` via
       ``get_strongest_ssid_by_rssi``;
    4. pivot those rows into columns ``SSID_1``, ``SSID_2``, ... ordered by
       descending signal strength.

    :param wifi: frame with the columns ``epoch_time_id``, ``SSID`` and ``RSSI``.
    :return: DataFrame with ``epoch_time_id`` plus one ``SSID_<rank>`` column per
        rank, sorted by ``epoch_time_id``. Ranks an epoch does not have are NaN.
    """
    normalized = (
        wifi.assign(SSID=lambda frame: normalize_ssid(frame['SSID']))
        # Order by signal strength *before* de-duplicating: keep='first' then
        # retains the strongest row of each normalized SSID instead of whichever
        # row happened to come first in the input.
        .sort_values(by='RSSI', ascending=False, kind='stable')
        .drop_duplicates(subset=['epoch_time_id', 'SSID'], keep='first')
    )

    strongest = get_strongest_ssid_by_rssi(normalized)

    # Rank within each epoch (1 = strongest) becomes the pivot column name.
    rank = strongest.groupby(by='epoch_time_id').cumcount() + 1
    strongest = strongest.assign(pivot_variable="SSID_" + rank.astype(str))

    # Turn the ranked rows into columns.
    return (
        strongest.pivot(index="epoch_time_id", columns="pivot_variable", values="SSID")
        .reset_index()
        .sort_values(by='epoch_time_id')
    )



# Optional vocabulary check (disabled): pass the SSID column, not the whole frame, e.g.
#   print_unique(normalize_ssid(get_strongest_ssid_by_rssi(train_wifi)['SSID']))

# Training features are kept in a variable so they can be inspected before saving.
features_train_ssid = prepare_ssid_feature(train_wifi)
print(features_train_ssid)

features_train_ssid.to_parquet('../data/train/features_wifi_ssid.parquet')

# Test and validation features are written straight to disk.
for split_wifi, target_path in (
    (test_wifi, '../data/test/features_wifi_ssid.parquet'),
    (validate_wifi, '../data/validate/features_wifi_ssid.parquet'),
):
    prepare_ssid_feature(split_wifi).to_parquet(target_path)

pivot_variable  epoch_time_id        SSID_1 SSID_2    SSID_3       SSID_4  \
0               1490430923000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
1               1490430924000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
2               1490430925000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
3               1490430926000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
4               1490430927000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
...                       ...           ...    ...       ...          ...   
1050729         1499268457000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
1050730         1499268458000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
1050731         1499268459000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
1050732         1499268460000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   
1050733         1499268461000  Virgin Media     VM  TALKTALK  TP-LINK_F5B   

pivot_variable SSID_5  
0                 NaN  
1                 NaN  
2  

In [5]:
# Distinct raw (un-normalized) SSID values across the train, test and validate scans.
pd.concat([train_wifi['SSID'], test_wifi['SSID'], validate_wifi['SSID']]).unique()

array(['VM8236218', 'VM201213-2G', 'VM702835-2G_EXT', ...,
       'PAH_Colleague', 'HP-Print-c7-LaserJet 100', 'BTHub6-293G'],
      dtype=object)

In [None]:
%%time

# Build the WifiFeature from the training scans only, then apply that same instance
# to every split.
# NOTE(review): presumably the constructor learns its state (e.g. a vocabulary) from
# train_wifi so that validation/test are encoded consistently - confirm in shl.features.
wifi_feature = WifiFeature(train_wifi)
train_wifi_features = wifi_feature.transform(train_wifi)
validate_wifi_features = wifi_feature.transform(validate_wifi)
test_wifi_features = wifi_feature.transform(test_wifi)

In [None]:
# Display the transformed validation features for a visual check.
validate_wifi_features

In [None]:
# Ask the WifiFeature instance for its feature names and print them.
wifi_feature_names = wifi_feature.get_feature_names()
print(wifi_feature_names)

In [None]:
# Pass the train, validate and test feature frames (in that order) to WifiFeature.hist.
# NOTE(review): presumably draws per-feature histograms to compare the splits - confirm
# in shl.features.
wifi_feature.hist([train_wifi_features, validate_wifi_features, test_wifi_features])

In [None]:
# Persist the WifiFeature output of every split as parquet inside that split's data folder.
train_wifi_features.to_parquet('../data/train/features_wifi_names.parquet')
test_wifi_features.to_parquet('../data/test/features_wifi_names.parquet')
validate_wifi_features.to_parquet('../data/validate/features_wifi_names.parquet')

In [None]:
# Show the dtype of every column in the training feature frame.
train_wifi_features.dtypes