In [None]:
!pip install pycaret[full]

Collecting pycaret[full]
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret[full])
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret[full])
  Downloading pyod-1.1.3.tar.gz (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalance

In [None]:
import pandas as pd

In [None]:
import plotly.graph_objects as go
import plotly.express as px

def plot_predict(predict_df: pd.DataFrame, wtr: str, title: str, model_name: str):
  """Plot one column over the frame's index, mark anomalies in red, show and save the figure.

  Args:
    predict_df: Frame whose index supplies the x-axis values. It must contain the
      column named by ``wtr`` and an ``Anomaly`` column in which 1 marks an outlier.
    wtr: Name of the column drawn on the y-axis.
    title: Title of the figure.
    model_name: File stem of the PNG written to the ``models_plot`` directory.
  """
  # Line chart: frame index on the x-axis, the requested column on the y-axis.
  fig = px.line(predict_df, x=predict_df.index, y=wtr, title=title, template = 'plotly_dark')

  # Pick the anomalous rows once with a boolean mask. Looking each label up with
  # `predict_df.loc[label][wtr]` is slow and returns a whole Series (not a scalar)
  # whenever an index label occurs more than once.
  anomalies = predict_df.loc[predict_df['Anomaly'] == 1, wtr]

  # Overlay the anomalies as red markers on top of the line.
  fig.add_trace(go.Scatter(x=anomalies.index, y=anomalies.tolist(), mode = 'markers',
                  name = 'Anomaly',
                  marker=dict(color='red',size=10)))

  fig.show()
  # NOTE(review): static image export presumably needs plotly's image engine
  # (kaleido) to be installed - confirm for the target environment.
  fig.write_image(f"/content/drive/MyDrive/RedLabHack/models_plot/{model_name}.png")

In [None]:
def prepare_data(data: pd.DataFrame, WITH_RESAMPLING = True):
  """Turn the raw metrics frame into a time-indexed frame with calendar features.

  The input frame is not modified; a new frame is returned.

  Args:
    data: Raw frame with a ``point`` timestamp column and the metadata columns
      listed in ``metadata_cols`` below.
    WITH_RESAMPLING: When true, calendar feature columns derived from the
      ``point`` index are appended. Despite the name, no resampling is performed.

  Returns:
    A new frame indexed by ``point`` (parsed to datetimes), without the metadata
    columns and, if requested, with the columns ``day``, ``day_name``,
    ``day_of_year``, ``week_of_year``, ``hour``, ``minute`` and ``is_weekday``.

  Raises:
    KeyError: If ``point`` or one of the metadata columns is missing.
  """
  metadata_cols = ['agent_version', 'labels', 'account_id', 'name', 'language', 'app_name', 'app_id', 'host', 'display_host']

  # Build a new frame instead of using inplace operations, so the caller's frame
  # survives and the function can be called again on the same input.
  prepared = (
    data
    .drop(columns=metadata_cols)
    .assign(point=lambda frame: pd.to_datetime(frame['point']))
    .set_index('point', drop=True)
  )

  if WITH_RESAMPLING:
    def as_int64(values):
      # Plain int64 arrays, the dtype a list of Python ints would produce.
      # The cast requires every timestamp to be valid (no NaT).
      return values.astype('int64').to_numpy()

    # Create features from the date, vectorised over the whole index.
    stamps = prepared.index
    prepared['day'] = as_int64(stamps.day)
    prepared['day_name'] = stamps.day_name().to_numpy()
    prepared['day_of_year'] = as_int64(stamps.dayofyear)
    # ISO week number, the same value Timestamp.weekofyear reports.
    prepared['week_of_year'] = as_int64(stamps.isocalendar()['week'])
    prepared['hour'] = as_int64(stamps.hour)
    prepared['minute'] = as_int64(stamps.minute)
    # Holds the ISO weekday number (1 = Monday ... 7 = Sunday), not a boolean
    # flag; the column name is kept for compatibility.
    prepared['is_weekday'] = as_int64(stamps.dayofweek + 1)

  return prepared


In [None]:
# Load the filtered dataset CSV into a DataFrame.
data = pd.read_csv(
  filepath_or_buffer="/content/drive/MyDrive/RedLabHack/Dataset/filtered_data.csv",
)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,account_id,name,point,call_count,total_call_time,total_exclusive_time,min_call_time,max_call_time,sum_of_squares,instances,language,app_name,app_id,scope,host,display_host,pid,agent_version,labels
0,1,Apdex,2024-04-15 23:32:00,3896.0,1.5,0.5,0.5,0.5,0.0,1.0,java,[GMonit] Collector,17592186045423,,575f5ba20b4b,575f5ba20b4b,1,8.5.0,{}
1,1,Apdex,2024-04-15 23:33:00,3916.5,0.0,1.0,0.5,0.5,0.0,1.0,java,[GMonit] Collector,17592186045423,,575f5ba20b4b,575f5ba20b4b,1,8.5.0,{}
2,1,Apdex,2024-04-15 23:34:00,3991.5,0.5,1.0,0.5,0.5,0.0,1.0,java,[GMonit] Collector,17592186045423,,575f5ba20b4b,575f5ba20b4b,1,8.5.0,{}
3,1,Apdex,2024-04-15 23:35:00,3990.5,0.5,0.5,0.5,0.5,0.0,1.0,java,[GMonit] Collector,17592186045423,,575f5ba20b4b,575f5ba20b4b,1,8.5.0,{}
4,1,Apdex,2024-04-15 23:36:00,3912.5,2.0,1.0,0.5,0.5,0.0,1.0,java,[GMonit] Collector,17592186045423,,575f5ba20b4b,575f5ba20b4b,1,8.5.0,{}


In [None]:
# Print column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86566 entries, 0 to 86565
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   account_id            86566 non-null  int64  
 1   name                  86566 non-null  object 
 2   point                 86566 non-null  object 
 3   call_count            86566 non-null  float64
 4   total_call_time       86566 non-null  float64
 5   total_exclusive_time  86566 non-null  float64
 6   min_call_time         86566 non-null  float64
 7   max_call_time         86566 non-null  float64
 8   sum_of_squares        86566 non-null  float64
 9   instances             86566 non-null  float64
 10  language              86566 non-null  object 
 11  app_name              86566 non-null  object 
 12  app_id                86566 non-null  int64  
 13  scope                 0 non-null      float64
 14  host                  86566 non-null  object 
 15  display_host       

In [None]:
# Index the frame by timestamp, drop metadata columns and add calendar features.
data = prepare_data(data, WITH_RESAMPLING=True)

In [None]:
# Preview the first ten rows of the prepared frame.
data.head(n=10)

Unnamed: 0_level_0,call_count,total_call_time,total_exclusive_time,min_call_time,max_call_time,sum_of_squares,instances,scope,pid,day,day_name,day_of_year,week_of_year,hour,minute,is_weekday
point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-04-15 23:32:00,3896.0,1.5,0.5,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,32,1
2024-04-15 23:33:00,3916.5,0.0,1.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,33,1
2024-04-15 23:34:00,3991.5,0.5,1.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,34,1
2024-04-15 23:35:00,3990.5,0.5,0.5,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,35,1
2024-04-15 23:36:00,3912.5,2.0,1.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,36,1
2024-04-15 23:37:00,3892.5,2.5,2.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,37,1
2024-04-15 23:38:00,3738.5,1.5,3.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,38,1
2024-04-15 23:39:00,3643.0,1.5,2.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,39,1
2024-04-15 23:40:00,3698.5,1.5,0.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,40,1
2024-04-15 23:41:00,3669.0,1.0,0.0,0.5,0.5,0.0,1.0,,1,15,Monday,106,16,23,41,1


In [None]:
# Collapse rows that share a timestamp into one row per timestamp.
# Metric columns are summed, as before. Calendar features are identical for every
# row of a timestamp, so they are carried over with 'first': summing them would
# turn e.g. hour 23 into 46 and 'Monday' into 'MondayMonday'.
# NOTE(review): min_call_time / max_call_time are still summed, unchanged from
# the previous behaviour - confirm whether min / max would be more appropriate.
calendar_cols = {'day', 'day_name', 'day_of_year', 'week_of_year', 'hour', 'minute', 'is_weekday'}
agg_by_column = {col: ('first' if col in calendar_cols else 'sum') for col in data.columns}
data = data.groupby(level=0).agg(agg_by_column)

In [None]:
from pycaret.anomaly import *

In [None]:
# Initialise the PyCaret anomaly experiment on the aggregated frame;
# session_id fixes the random seed of the experiment.
setup(data=data, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(43272, 16)"
2,Transformed data shape,"(43272, 28)"
3,Numeric features,15
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


<pycaret.anomaly.oop.AnomalyExperiment at 0x7d4e5f4a59f0>

In [None]:
# Catalogue of every detector PyCaret makes available.
all_models = models()

# Value passed as `fraction=` to create_model for every detector.
fraction = 0.05

# Ids of the detectors that are trained and compared below.
all_models_ids = [
  'cluster',
  'cof',
  'knn',
  'lof',
  'mcd',
  'sod',
]

- Clustering-Based Local Outlier (cluster)
- Connectivity-Based Local Outlier (cof)
- K-Nearest Neighbors (knn)
- Local Outlier Factor (lof)
- Minimum Covariance Determinant (mcd)
- Subspace Outlier Detection (sod)

In [None]:
# Fit every detector in `all_models_ids`, label the training data, persist the
# fitted model and export its anomaly plot.
# The functional PyCaret API (the names brought in by `from pycaret.anomaly import *`)
# is used here: it acts on the experiment created by the `setup(...)` call, whereas
# no `exp` object is defined in the cells above, so `exp.create_model(...)` raises
# NameError on a fresh kernel.
for model_id in all_models_ids:
  model = create_model(model_id, fraction=fraction)
  # Training frame with the Anomaly / Anomaly_Score columns attached.
  model_anomalies = assign_model(model)
  # NOTE(review): not used inside this loop; kept because later cells may read it.
  model_predict = predict_model(model, data)
  save_model(model, model_only=True, model_name=f"/content/drive/MyDrive/RedLabHack/Saved_model/{model_id}")
  plot_predict(model_anomalies, "Anomaly_Score", f"Calls anomalies detection : {model_id} : Fraction: {fraction}", model_id)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

Model Successfully Saved


Processing:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Fit the k-nearest-neighbours detector, then attach its labels and scores
# to the training frame.
model = create_model(model='knn', fraction=fraction)
model_anomalies = assign_model(model=model)

Processing:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Preview the first ten labelled rows.
model_anomalies.head(n=10)

Unnamed: 0_level_0,call_count,total_call_time,total_exclusive_time,min_call_time,max_call_time,sum_of_squares,instances,scope,pid,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-04-15 23:00:00,199012.5,1137.89502,1139.39502,14.002987,138.587555,1088.380737,56.0,0.0,56,15,Monday,106,16,23,1,0,14640.516688
2024-04-16 00:00:00,357381.0,1676.259766,1653.259766,30.006462,267.23822,2607.42627,120.0,0.0,120,16,Tuesday,107,16,0,2,0,5385.215956
2024-04-16 01:00:00,273943.0,924.430115,901.430115,30.00655,86.944229,111.735596,120.0,0.0,120,16,Tuesday,107,16,1,2,0,2742.354935
2024-04-16 02:00:00,228604.5,715.322388,690.822388,30.006557,59.895206,42.265999,120.0,0.0,120,16,Tuesday,107,16,2,2,0,1784.752227
2024-04-16 03:00:00,218871.5,1068.339478,1051.839478,30.006735,150.534348,1369.345825,120.0,0.0,120,16,Tuesday,107,16,3,2,0,1588.920397
2024-04-16 04:00:00,217443.0,751.734436,727.734436,30.006546,117.307449,527.938965,120.0,0.0,120,16,Tuesday,107,16,4,2,0,1231.084428
2024-04-16 05:00:00,236363.5,1104.629639,1054.129639,30.00647,199.728363,1110.362915,120.0,0.0,120,16,Tuesday,107,16,5,2,0,1070.08089
2024-04-16 06:00:00,272986.0,1193.455078,1132.455078,30.006342,154.499741,496.99118,120.0,0.0,120,16,Tuesday,107,16,6,2,0,1840.679106
2024-04-16 07:00:00,336379.5,2378.216309,2277.716309,30.006191,873.610657,68495.992188,120.0,0.0,120,16,Tuesday,107,16,7,2,0,23733.004264
2024-04-16 08:00:00,391277.5,3271.618408,3122.118408,30.006245,1012.115662,21019.722656,120.0,0.0,120,16,Tuesday,107,16,8,2,0,16694.514225


In [None]:
# Keep only the rows the detector flagged as anomalous (Anomaly == 1).
model_anomalies.loc[model_anomalies['Anomaly'].eq(1)]

Unnamed: 0_level_0,call_count,total_call_time,total_exclusive_time,min_call_time,max_call_time,sum_of_squares,instances,scope,pid,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
point,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-04-16 15:00:00,506899.5,7624.06543,7294.56543,30.006207,2855.081299,774487.3,120.0,0.0,120,16,Tuesday,107,16,15,2,1,234612.3
2024-04-16 16:00:00,499421.0,6038.006836,5824.006836,30.006245,1565.705322,468122.2,120.0,0.0,120,16,Tuesday,107,16,16,2,1,89692.51
2024-04-16 19:00:00,479174.5,5768.54541,5580.04541,30.006144,1309.948486,491140.5,120.0,0.0,120,16,Tuesday,107,16,19,2,1,101160.7
2024-04-18 12:00:00,564519.5,5817.526855,6128.026855,30.00626,1934.099609,357401.4,120.0,0.0,120,18,Thursday,109,16,12,4,1,57558.71
2024-04-19 11:00:00,935065.5,10873.005859,10317.505859,30.006281,3172.061523,245461.3,120.0,0.0,120,19,Friday,110,16,11,5,1,228225.1
2024-04-19 12:00:00,748922.0,8410.811523,7823.811523,30.006044,2838.305176,211163.6,120.0,0.0,120,19,Friday,110,16,12,5,1,84490.79
2024-04-19 15:00:00,668835.5,10813.501953,10171.001953,30.005997,3434.873291,981377.5,120.0,0.0,120,19,Friday,110,16,15,5,1,404945.8
2024-04-19 16:00:00,612700.5,7432.439453,7200.939453,29.006207,2961.650635,414817.7,120.0,0.0,120,19,Friday,110,16,16,5,1,89743.44
2024-04-19 19:00:00,552382.5,7804.53125,7358.03125,30.006281,2133.059326,593561.6,120.0,0.0,120,19,Friday,110,16,19,5,1,143537.2
2024-04-20 10:00:00,508365.0,8493.126953,8224.126953,30.006224,2707.082275,1015007.0,120.0,0.0,120,20,Saturday,111,16,10,6,1,394108.7
