In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

# constants
pca_components = 8  # I used 8 for a dataset with 14 features

In [104]:
df = pd.read_csv("hackathon_kpis_anonymised/ml_models_dataset_with_zone_v2.csv", index_col='timestamp')
df.head()

Unnamed: 0_level_0,cell_name,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,...,tech_freq_W,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-04 00:00:00+00:00,22_21Q,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,1.0,0.09955,0.0006786382,...,0,0,0,0,0,0,1,0,0,0
2019-11-04 00:00:00+00:00,02_31Q,0.006013,0.0,0.006013,0.000781,0.006013,0.006013,1.0,0.09955,2.322707e-08,...,0,0,0,0,0,0,0,1,0,0
2019-11-04 00:00:00+00:00,25_11W,0.001574,0.001574,0.001574,0.001574,0.001574,0.001574,1.0,1.0,0.001573807,...,1,0,0,0,0,0,1,0,0,0
2019-11-04 00:00:00+00:00,28_11Y,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
2019-11-04 00:00:00+00:00,12_21W,0.001032,0.013902,0.0,0.091727,0.0,0.0,1.0,1.0,0.0005298183,...,1,0,0,0,0,0,1,0,0,0


In [105]:
# Help functions

def get_cell_df(cell_name):
  return df[df['cell_name'] == cell_name]

def add_aggregated_time_information(data, window_size=5):
  time = np.zeros(shape=data.shape)
  for i in range(window_size, time.shape[0]):
    time[i] = np.mean(data[i-window_size:i], axis=0)

  return np.concatenate((data, time), axis=1)

In [106]:
def getIsolationForestScore(df, pca_components):
    """
    df: pandas dataframe, preprocessed (only numerical values)
    pca_components: int, number of components to reduce to with pca

    returns: list of floats, score for each example in df
    """
    pca = PCA(n_components=pca_components)
    data = pca.fit_transform(df)
    data = add_aggregated_time_information(data)
    model =  IsolationForest(contamination = 0.1, random_state=42)
    model.fit(data)
    return model.score_samples(data)

def runIsolationForestForAllCells(df, pca_components):
    """
    df: pandas dataframe, index: timestamp - cell_name must be included - preprocessed
    """
    scores_per_cell = {}
    #df['anom_scores'] = np.zeros(df.shape[0])
    for i, cell in enumerate(df['cell_name'].unique()):
        # Hardcode dropping of cells with very little variation
        if cell == '29_11Q':
            continue
        if cell == '28_21Y':
            continue
        
        # Get df with only the current cell_name-rows
        df_temp = get_cell_df(cell)
        df_temp = df_temp.drop(columns='cell_name')
        # Reduce dimentions, fit iForest
        scores = getIsolationForestScore(df_temp, pca_components)
        # Add scores to original df
        #df.loc[df['cell_name'] == cell, 'anom_scores'] = scores
        scores_per_cell[cell] = {'score': scores, 'timestep':df_temp.index} # Low = abnormal
        
        if i%50 == 0:
            print('.... done with:', i)
    return scores_per_cell

In [107]:
#df_with_scores = runIsolationForestForAllCells(df, pca_components)
scores_per_cell = runIsolationForestForAllCells(df, pca_components)

.... done with: 0
.... done with: 50
.... done with: 100
.... done with: 150
.... done with: 200
.... done with: 250
.... done with: 300
.... done with: 350
.... done with: 400


In [None]:
#cell_names_to_plot = ['00_11Z', '22_21Q', '13_21X'] # Not 29_11Q or 28_21Y
cell_name_to_plot = df['cell_name'].unique()

cell_names_to_plots = [cell for cell in cell_name_to_plot if cell != '29_11Q']
cell_names_to_plots2 = [cell for cell in cell_names_to_plots if cell != '28_21Y']

for cell_to_plot in cell_names_to_plots2[:10]:
    
  plt.scatter(scores_per_cell[cell_to_plot]['timestep'], scores_per_cell[cell_to_plot]['score'])

plt.show()

In [108]:
import json

for key, val in scores_per_cell.items():
    val['score'] = list(val['score'])
    val['timestep'] = list(val['timestep'])

saved = scores_per_cell

In [109]:
with open('iforest_scores.json', 'w') as outfile:
    json.dump(scores_per_cell, outfile)

# Extra: save as DF for visualization

In [93]:
import json
from utils import normalize
import numpy as np
from tqdm.auto import tqdm

df = pd.read_csv("hackathon_kpis_anonymised/ml_models_dataset_with_zone_v2.csv")
df.head()

Unnamed: 0,timestamp,cell_name,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,...,tech_freq_W,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4
0,2019-11-04 00:00:00+00:00,22_21Q,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,1.0,0.09955,...,0,0,0,0,0,0,1,0,0,0
1,2019-11-04 00:00:00+00:00,02_31Q,0.006013,0.0,0.006013,0.000781,0.006013,0.006013,1.0,0.09955,...,0,0,0,0,0,0,0,1,0,0
2,2019-11-04 00:00:00+00:00,25_11W,0.001574,0.001574,0.001574,0.001574,0.001574,0.001574,1.0,1.0,...,1,0,0,0,0,0,1,0,0,0
3,2019-11-04 00:00:00+00:00,28_11Y,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,1,0,0
4,2019-11-04 00:00:00+00:00,12_21W,0.001032,0.013902,0.0,0.091727,0.0,0.0,1.0,1.0,...,1,0,0,0,0,0,1,0,0,0


In [25]:
with open('preds/isolation_forest_scores_v1.json') as f:
    saved = json.load(f)

In [94]:
saved.keys()

dict_keys(['22_21Q', '02_31Q', '25_11W', '28_11Y', '12_21W', '09_12Q', '19_32Q', '05_22Q', '01_21Z', '23_21W', '19_21V', '04_31Q', '25_31R', '12_21X', '12_11Q', '12_21Y', '05_12Q', '25_22Q', '23_22W', '12_11W', '01_31Z', '01_22W', '14_12Q', '00_32Q', '00_22Q', '01_21Y', '03_32Q', '14_31X', '06_21R', '12_12W', '13_12Q', '19_21X', '23_31W', '13_32Q', '27_11Q', '24_11Q', '09_32Q', '26_11Q', '19_32W', '15_11V', '25_11R', '03_12Q', '13_32W', '07_31R', '01_21R', '26_31Q', '14_21Y', '00_31Q', '14_22Q', '08_21Y', '19_12W', '23_32W', '20_21Q', '09_31R', '01_12W', '01_32Q', '18_22Q', '04_21Z', '15_21Y', '13_11Q', '17_22Q', '17_11Q', '07_22Q', '15_31X', '22_31Y', '13_31V', '18_32Q', '00_21V', '13_21X', '25_21W', '07_31Q', '00_31V', '20_21Z', '01_11W', '12_22Q', '23_11W', '17_32Q', '14_31Z', '25_31W', '01_31V', '11_31V', '06_32Q', '22_11Z', '15_32Q', '00_21Y', '22_31R', '04_11Q', '04_22Q', '04_31Z', '07_12Q', '14_32Q', '05_32Q', '06_22W', '21_11X', '05_12W', '12_21R', '15_22Q', '05_31Q', '24_22Q',

In [95]:
dfs = []

for cell_name, data in tqdm(saved.items()):
    cell_df = df.loc[df['cell_name'] == cell_name]
    
    scores = normalize(-np.array(data['score']))
    timestamps = data['timestep']
    score_df = pd.DataFrame(zip(timestamps, scores), columns=['timestamp', 'score'])
    
    merged_df = cell_df.merge(score_df, on='timestamp')
    merged_df = cell_df.reset_index().merge(score_df, on='timestamp').set_index('index')
    
    print(cell_df.index)
    print(merged_df.index)
    
    for col in merged_df.columns:
        if col in ['timestamp', 'cell_name', 'score', 'index']:
            continue
        merged_df[col] = merged_df['score']
    merged_df.drop(columns='score', inplace=True)
    
    merged_df.to_pickle('preds/iforest_'+cell_name+'.pkl')
    dfs.append(merged_df)
    break
merged_df

  0%|          | 0/403 [00:00<?, ?it/s]

Int64Index([      0,     177,     608,     623,     995,    1060,    1398,
               1548,    1688,    1984,
            ...
            1038736, 1038869, 1039079, 1040395, 1040639, 1040919, 1041162,
            1041467, 1041719, 1042441],
           dtype='int64', length=2694)
Int64Index([      0,     177,     608,     623,     995,    1060,    1398,
               1548,    1688,    1984,
            ...
            1038736, 1038869, 1039079, 1040395, 1040639, 1040919, 1041162,
            1041467, 1041719, 1042441],
           dtype='int64', name='index', length=2694)


Unnamed: 0_level_0,timestamp,cell_name,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,...,tech_freq_W,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2019-11-04 00:00:00+00:00,22_21Q,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
177,2019-11-04 01:00:00+00:00,22_21Q,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
608,2019-11-04 05:00:00+00:00,22_21Q,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
623,2019-11-04 06:00:00+00:00,22_21Q,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,...,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090,0.083090
995,2019-11-04 09:00:00+00:00,22_21Q,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,...,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131,0.695131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040919,2020-12-21 07:00:00+00:00,22_21Q,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,...,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754,0.490754
1041162,2020-12-21 09:00:00+00:00,22_21Q,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,...,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454,0.097454
1041467,2020-12-21 12:00:00+00:00,22_21Q,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,...,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934,0.107934
1041719,2020-12-21 15:00:00+00:00,22_21Q,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,...,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413,0.074413


In [53]:
big_df = pd.concat(dfs)

  0%|          | 0/403 [00:00<?, ?it/s]

In [70]:
dfs = []

for cell_name, data in tqdm(saved.items()):
    cell_df = df.loc[df['cell_name'] == cell_name]
    
    scores = normalize(-np.array(data['score']))
    timestamps = data['timestep']
    score_df = pd.DataFrame(zip(timestamps, scores), columns=['timestamp', 'score'])
    
    merged_df = score_df.set_index('timestamp').join(cell_df.set_index('timestamp'))
    
    for col in merged_df.columns:
        if col in ['timestamp', 'cell_name', 'score']:
            continue
        merged_df[col] = merged_df['score']
    merged_df.drop(columns='score', inplace=True)
    
    merged_df.to_pickle('preds/iforest_'+cell_name+'.pkl')
    dfs.append(merged_df)
    break

big_df = pd.concat(dfs)
big_df

  0%|          | 0/403 [00:00<?, ?it/s]

Unnamed: 0_level_0,cell_name,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,...,tech_freq_W,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-04 00:00:00+00:00,22_21Q,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,...,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999
2019-11-04 01:00:00+00:00,22_21Q,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,...,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999
2019-11-04 05:00:00+00:00,22_21Q,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,...,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999,0.082999
2019-11-04 06:00:00+00:00,22_21Q,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,...,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520,0.116520
2019-11-04 09:00:00+00:00,22_21Q,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,...,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736,0.566736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-21 07:00:00+00:00,22_21Q,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,...,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640,0.440640
2020-12-21 09:00:00+00:00,22_21Q,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,...,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149,0.251149
2020-12-21 12:00:00+00:00,22_21Q,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,...,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670,0.235670
2020-12-21 15:00:00+00:00,22_21Q,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,...,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859,0.089859


In [56]:
big_df.sort_index().to_pickle('preds/iforest.pkl')