In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

# constants
pca_components = 8  # I used 8 for a dataset with 14 features

In [10]:
df = pd.read_csv("ml_models_dataset_with_zone.csv", index_col='timestamp')
df.head()

Unnamed: 0_level_0,cell_name,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,...,tech_freq_W,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-04 00:00:00+00:00,22_21Q,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,1.0,0.09955,0.0006786382,...,0,0,0,0,0,0,1,0,0,0
2019-11-04 00:00:00+00:00,02_31Q,0.006013,0.0,0.006013,0.000781,0.006013,0.006013,1.0,0.09955,2.322707e-08,...,0,0,0,0,0,0,0,1,0,0
2019-11-04 00:00:00+00:00,25_11W,0.001574,0.001574,0.001574,0.001574,0.001574,0.001574,1.0,1.0,0.001573807,...,1,0,0,0,0,0,1,0,0,0
2019-11-04 00:00:00+00:00,28_11Y,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
2019-11-04 00:00:00+00:00,12_21W,0.001032,0.013902,0.0,0.091727,0.0,0.0,1.0,1.0,0.0005298183,...,1,0,0,0,0,0,1,0,0,0


In [11]:
# Help functions

def get_cell_df(cell_name):
  return df[df['cell_name'] == cell_name]

def add_aggregated_time_information(data, window_size=5):
  time = np.zeros(shape=data.shape)
  for i in range(window_size, time.shape[0]):
    time[i] = np.mean(data[i-window_size:i], axis=0)

  return np.concatenate((data, time), axis=1)

In [48]:
def getIsolationForestScore(df, pca_components):
    """
    df: pandas dataframe, preprocessed (only numerical values)
    pca_components: int, number of components to reduce to with pca

    returns: list of floats, score for each example in df
    """
    pca = PCA(n_components=pca_components)
    data = pca.fit_transform(df)
    data = add_aggregated_time_information(data)
    model =  IsolationForest(contamination = 0.1, random_state=42)
    model.fit(data)
    return model.score_samples(data)

def runIsolationForestForAllCells(df, pca_components):
    """
    df: pandas dataframe, index: timestamp - cell_name must be included - preprocessed
    """
    df['anom_scores'] = np.zeros(df.shape[0])
    for i, cell in enumerate(df['cell_name'].unique()[:5]):
        # Hardcode dropping of cells with very little variation
        if cell == '29_11Q':
            continue
        if cell == '28_21Y':
            continue
        
        # Get df with only the current cell_name-rows
        df_temp = get_cell_df(cell)
        df_temp = df_temp.drop(columns='cell_name')
        # Reduce dimentions, fit iForest
        scores = getIsolationForestScore(df_temp, pca_components)
        # Add scores to original df
        df.loc[df['cell_name'] == cell, 'anom_scores'] = scores
        print(df)
        if i%50 == 0:
            print('.... done with:', i)
    return df

In [49]:
df_with_scores = runIsolationForestForAllCells(df, pca_components)

                          cell_name  ho_failure_rate  num_voice_attempts  \
timestamp                                                                  
2019-11-04 00:00:00+00:00    22_21Q         0.000679            0.000679   
2019-11-04 00:00:00+00:00    02_31Q         0.006013            0.000000   
2019-11-04 00:00:00+00:00    25_11W         0.001574            0.001574   
2019-11-04 00:00:00+00:00    28_11Y         0.000000            0.000000   
2019-11-04 00:00:00+00:00    12_21W         0.001032            0.013902   
...                             ...              ...                 ...   
2020-12-21 23:00:00+00:00    00_11Y         0.000000            0.005561   
2020-12-21 23:00:00+00:00    00_31R         0.333333            0.001854   
2020-12-21 23:00:00+00:00    03_31Q         0.000339            0.000000   
2020-12-21 23:00:00+00:00    13_31Q         0.000152            0.000000   
2020-12-21 23:00:00+00:00    21_11X         0.333333            0.002780   

           

                          cell_name  ho_failure_rate  num_voice_attempts  \
timestamp                                                                  
2019-11-04 00:00:00+00:00    22_21Q         0.000679            0.000679   
2019-11-04 00:00:00+00:00    02_31Q         0.006013            0.000000   
2019-11-04 00:00:00+00:00    25_11W         0.001574            0.001574   
2019-11-04 00:00:00+00:00    28_11Y         0.000000            0.000000   
2019-11-04 00:00:00+00:00    12_21W         0.001032            0.013902   
...                             ...              ...                 ...   
2020-12-21 23:00:00+00:00    00_11Y         0.000000            0.005561   
2020-12-21 23:00:00+00:00    00_31R         0.333333            0.001854   
2020-12-21 23:00:00+00:00    03_31Q         0.000339            0.000000   
2020-12-21 23:00:00+00:00    13_31Q         0.000152            0.000000   
2020-12-21 23:00:00+00:00    21_11X         0.333333            0.002780   

           

                          cell_name  ho_failure_rate  num_voice_attempts  \
timestamp                                                                  
2019-11-04 00:00:00+00:00    22_21Q         0.000679            0.000679   
2019-11-04 00:00:00+00:00    02_31Q         0.006013            0.000000   
2019-11-04 00:00:00+00:00    25_11W         0.001574            0.001574   
2019-11-04 00:00:00+00:00    28_11Y         0.000000            0.000000   
2019-11-04 00:00:00+00:00    12_21W         0.001032            0.013902   
...                             ...              ...                 ...   
2020-12-21 23:00:00+00:00    00_11Y         0.000000            0.005561   
2020-12-21 23:00:00+00:00    00_31R         0.333333            0.001854   
2020-12-21 23:00:00+00:00    03_31Q         0.000339            0.000000   
2020-12-21 23:00:00+00:00    13_31Q         0.000152            0.000000   
2020-12-21 23:00:00+00:00    21_11X         0.333333            0.002780   

           

In [52]:
df_with_scores.describe()

Unnamed: 0,ho_failure_rate,num_voice_attempts,voice_drop_rate,num_data_attempts,voice_setup_failure_rate,voice_tot_failure_rate,avail_period_duration,bandwidth,throughput_rate,data_setup_failure_rate,...,tech_freq_X,tech_freq_Y,tech_freq_Z,zone_-1,zone_0,zone_1,zone_2,zone_3,zone_4,anom_scores
count,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,...,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0,1042688.0
mean,0.1533228,0.01087738,0.0007895436,0.01824066,0.0009470607,0.0009869756,0.9990099,0.343508,0.0002523121,0.002379716,...,0.08109041,0.07512698,0.1261844,0.0001256368,0.2540913,0.3129258,0.1979298,0.1379943,0.09693312,-0.005023199
std,0.1713096,0.02641081,0.00804665,0.03614324,0.01473337,0.009447519,0.0234538,0.3452026,0.001820752,0.0184173,...,0.272974,0.2635963,0.3320573,0.01120808,0.4353494,0.4636847,0.3984391,0.3448942,0.2958668,0.04527398
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.8721487
25%,3.194319e-05,0.0,0.0,0.001442963,0.0,0.0,1.0,0.09954977,1.74605e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.001573807,0.001853568,0.0,0.005785091,0.0,0.0,1.0,0.09954977,8.982702e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.3333333,0.009267841,9.856101e-05,0.01891738,5.506305e-05,0.0001727429,1.0,0.4997499,0.0002741481,0.001250855,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [None]:
#cell_names_to_plot = ['00_11Z', '22_21Q', '13_21X'] # Not 29_11Q or 28_21Y
cell_name_to_plot = df['cell_name'].unique()

cell_names_to_plots = [cell for cell in cell_name_to_plot if cell != '29_11Q']
cell_names_to_plots2 = [cell for cell in cell_names_to_plots if cell != '28_21Y']

for cell_to_plot in cell_names_to_plots2:
    
  plt.scatter(df_with_scores[df_with_scores['cell_name'] == cell_to_plot].index, df_with_scores[df_with_scores['cell_name'] == cell_to_plot]['anom_scores'])

plt.show()