In [1]:
import re

import numpy as np
import pandas as pd
import os
import featuretools as ft
from shl.prepare import normalize_epoch_time, normalize_lat_long
from shl.features import fetch_location, NoLocationFoundException
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
import time
from tqdm import tqdm
import csv

from shl.features import WifiFeature

%load_ext autoreload
%load_ext dotenv
%autoreload 2
%dotenv

In [2]:
train_cell = normalize_epoch_time(pd.read_parquet('../data/train/Cells.parquet'), 'epoch_time')
train_location = normalize_epoch_time(pd.read_parquet('../data/train/Location.parquet'), 'epoch_time')
test_cell = normalize_epoch_time(pd.read_parquet('../data/test/Cells.parquet'), 'epoch_time')
test_location = normalize_epoch_time(pd.read_parquet('../data/test/Location.parquet'), 'epoch_time')
validate_cell = normalize_epoch_time(pd.read_parquet('../data/validate/Cells.parquet'), 'epoch_time')
validate_location = normalize_epoch_time(pd.read_parquet('../data/validate/Location.parquet'), 'epoch_time')

train_label = normalize_epoch_time(pd.read_parquet('../data/train/Label.parquet'), 'epoch_time')
test_label = normalize_epoch_time(pd.read_parquet('../data/test/Label_idx.parquet'), 'epoch_time')
validate_label = normalize_epoch_time(pd.read_parquet('../data/validate/Label.parquet'), 'epoch_time')

In [4]:
fetch_location(train_cell.query('epoch_time == 1490448212563'))

{'Latitude': 51.4723711, 'Longitude': -0.4135758, 'accuracy': 873}

In [5]:
def fetch_cell_with_missing_location(cell: pd.DataFrame, location: pd.DataFrame) -> pd.DataFrame:
    return cell.merge(location, on='epoch_time_id', how="left").sort_values(by='epoch_time_id').query('accuracy != accuracy and cell_type == cell_type')

print(fetch_cell_with_missing_location(train_cell, train_location)['epoch_time_id'].value_counts().shape)
print(fetch_cell_with_missing_location(validate_cell, validate_location)['epoch_time_id'].value_counts().shape)
print(fetch_cell_with_missing_location(test_cell, test_location)['epoch_time_id'].value_counts().shape)

(424308,)
(47129,)
(165563,)


In [6]:
print(train_location['epoch_time_id'].value_counts().shape)
print(test_location['epoch_time_id'].value_counts().shape)
print(validate_location['epoch_time_id'].value_counts().shape)

(908631,)
(560792,)
(101228,)


In [7]:
def convert_cell_to_location(cells_data, location_data, file_path: str, step: int = 60):
    with open(file_path, "w") as file:
        csv_writer = csv.DictWriter(file, fieldnames=['epoch_time_id', 'Latitude', 'Longitude', 'accuracy'])
        csv_writer.writeheader()
        cells_groups = fetch_cell_with_missing_location(cells_data, location_data).groupby('epoch_time_id')
        for i, (epoch_time, cells) in enumerate(tqdm(cells_groups, total=len(cells_groups))):
            if i % step != 0:
                continue
            try:
                csv_writer.writerow({'epoch_time_id': epoch_time, **fetch_location(cells)})
            except NoLocationFoundException as e:
                print(f'Missing location for:\n{e.cells}')
                continue
            finally:
                time.sleep(0.5)
            file.flush()

convert_cell_to_location(train_cell, train_location, '../data/train/features_cell_to_location.csv', step=60)
convert_cell_to_location(test_cell, train_location, '../data/train/features_cell_to_location.csv', step=60)
convert_cell_to_location(validate_cell, train_location, '../data/train/features_cell_to_location.csv', step=60)



  2%|▏         | 8520/424308 [02:53<2:20:47, 49.22it/s]


KeyboardInterrupt: 

In [19]:
train_cell['epoch_time_id'].value_counts().shape

(1236268,)

In [18]:
train_cell.query('epoch_time == 1490448212563')

Unnamed: 0,epoch_time,num_entries,cell_type,isRegistered,ci,MCC,MNC,PCI,TAC,asuLevel,dBm,level,cid,lac,PSC,epoch_time_id
89,1490448212563,2,LTE,1,128059300.0,234,10,425.0,4096.0,41,-99,3,,,,1490448213000
90,1490448212563,2,LTE,0,2147484000.0,234,10,424.0,2147484000.0,39,-101,3,,,,1490448213000


In [24]:
train_cell['ci'].astype('int64', errors='ignore')

0          1.280004e+08
1          2.147484e+09
2          2.147484e+09
3          1.280004e+08
4          2.147484e+09
               ...     
4474375             NaN
4474376             NaN
4474377             NaN
4474378             NaN
4474379             NaN
Name: ci, Length: 4474380, dtype: float64

In [3]:
validate_cell_with_label = validate_label.merge(train_cell, how='left', on='epoch_time_id').sort_values(['epoch_time_id', 'cell_type', 'dBm'])[['epoch_time_id', 'label', 'cell_type', 'dBm', 'asuLevel', 'ci']]

#.loc[:, ['epoch_time', 'label', 'SSID', 'RSSI']].sort_values(['epoch_time', 'RSSI'], ascending=[True, False]).groupby('epoch_time').agg({'label' : 'first', 'SSID' : 'first'}).reset_index()

In [10]:
validate_cell_with_label.to_csv('./validate_cell_with_label.csv')