# Isolation Forest

## Data preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the KPI table; the path is relative to the notebook's working directory.
df = pd.read_csv("hackathon_kpis_anonymised.csv")
# Preview the first five rows.
df.head(n=5)

In [None]:
# Column dtypes, non-null counts and memory usage of the raw table.
df.info()
# Number of rows recorded per cell. A bare groupby only builds a lazy GroupBy
# object (nothing is computed or shown), so aggregate it to get a real summary.
df.groupby('cell_name').size()


In [None]:
# Keep only the entries of `nans` whose NaN-per-timestep ratio is below 0.15.
# NOTE(review): `nans` is not defined in any visible cell; presumably it maps a
# key to {'total_nans': ..., 'num_timesteps': ...} -- confirm where it is built.
nan_ratios = (
  (key, val['total_nans'] / val['num_timesteps']) for key, val in nans.items()
)
nans_2 = {key: perc for key, perc in nan_ratios if perc < 0.15}

nans_2

In [None]:
def get_cell_df(cell_name):
  """Return the rows of the global ``df`` whose 'cell_name' equals ``cell_name``."""
  belongs_to_cell = df['cell_name'] == cell_name
  return df[belongs_to_cell]

In [None]:
def preprocess(df):
  """Impute, clean and time-index a KPI frame.

  Missing values in numeric columns are replaced by that column's median,
  the 'cell_name' column is dropped, and the rows are sorted by and indexed
  on 'timestamp'.

  Args:
    df: DataFrame with at least 'cell_name' and 'timestamp' columns.

  Returns:
    A new DataFrame indexed by 'timestamp'; the input is not modified.
    A column without any observed value has no median and therefore
    stays NaN.
  """
  # numeric_only=True: 'cell_name' (and a string-typed 'timestamp') have no
  # median, and pandas >= 2.0 raises TypeError for such columns instead of
  # silently skipping them.
  medians = df.median(numeric_only=True)
  # Filling with a Series matches its index against the column names, so each
  # numeric column receives its own median.
  df_temp = df.fillna(medians)
  df_temp = df_temp.drop(columns='cell_name')
  df_temp = df_temp.sort_values('timestamp')
  return df_temp.set_index('timestamp')

In [None]:
def dont_use_df(df):
  """Tell whether ``df`` still contains at least one missing value.

  The notebook calls this on the output of ``preprocess``; after median
  imputation a remaining NaN presumably means a whole column was NaN, in
  which case the frame should be skipped.
  """
  nans_per_column = df.isna().sum()
  total_nans = nans_per_column.sum()
  return total_nans > 0

In [None]:
def add_aggregated_time_information(data, window_size=5):
  """Append trailing-window means to every row of ``data``.

  For each row ``i >= window_size`` the extra features are the column-wise
  mean of the ``window_size`` rows preceding ``i`` (row ``i`` itself is not
  included). The first ``window_size`` rows get zeros instead.

  Args:
    data: 2-D array with one row per timestep.
    window_size: Number of preceding rows averaged for each row.

  Returns:
    Array with the same number of rows and twice the columns: the original
    values followed by the trailing means.
  """
  trailing_mean = np.zeros(shape=data.shape)
  n_rows = trailing_mean.shape[0]
  for row in range(window_size, n_rows):
    previous_rows = data[row - window_size:row]
    trailing_mean[row] = np.mean(previous_rows, axis=0)

  return np.concatenate((data, trailing_mean), axis=1)

## Isolation forest

In [None]:
# Fit one Isolation Forest per cell on PCA-compressed KPIs augmented with
# trailing-window means, and keep the per-timestep anomaly scores.
skipped = []          # cells that could not be modelled (empty or NaNs left)
scores_per_cell = {}  # cell name -> {'score': scores, 'timestep': timestamps}
for cell in df['cell_name'].unique():
  df_temp = get_cell_df(cell)
  df_temp = preprocess(df_temp)
  # An empty frame cannot be fitted, and NaNs that survive preprocessing
  # would make PCA fail, so both cases are recorded and skipped.
  if df_temp.empty or dont_use_df(df_temp):
    skipped.append(cell)
    continue
  # PCA raises ValueError when n_components exceeds min(n_samples, n_features),
  # so cap the 8 requested components for cells with few rows or few KPIs.
  n_components = min(8, *df_temp.shape)
  pca = PCA(n_components=n_components)
  data = pca.fit_transform(df_temp)
  data = add_aggregated_time_information(data)
  # NOTE: `contamination` only sets the decision threshold used by
  # predict()/decision_function(); the raw score_samples() values stored below
  # do not depend on it.
  model = IsolationForest(contamination=0.1, random_state=42)
  model.fit(data)
  scores_per_cell[cell] = {'score': model.score_samples(data), 'timestep': df_temp.index}  # Low = abnormal

In [None]:
# Overlay the anomaly scores of two cells (the 6th and 7th distinct cell names).
cell_names = df['cell_name'].unique()
cell_name_to_plot = cell_names[5]
cell_name_to_plot2 = cell_names[6]
plotted_any = False
for name in (cell_name_to_plot, cell_name_to_plot2):
  # Cells listed in `skipped` have no entry in scores_per_cell; report them
  # instead of failing with a KeyError.
  if name not in scores_per_cell:
    print("No scores available for cell", name)
    continue
  cell_scores = scores_per_cell[name]
  plt.scatter(cell_scores['timestep'], cell_scores['score'], label=str(name))
  plotted_any = True
plt.xlabel('timestamp')
plt.ylabel('score (low = abnormal)')
if plotted_any:
  # The legend is what tells the two scatter series apart.
  plt.legend()
plt.show()