In [251]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# be careful with that:
import warnings
warnings.filterwarnings('ignore')

In [252]:
# load data

data_heatmap = pd.read_csv('/Users/xyc/Desktop/ENGI 4800/Criteo/A_data_heatmap_300x250_3p.csv')

In [253]:
# expand so that each row corresponds to 1 click:

data_heatmap_expanded = data_heatmap.loc[data_heatmap.index.repeat(data_heatmap['clicks'])].reset_index(drop=True)
data_heatmap_expanded['clicks'] = 1

click_stat = data_heatmap_expanded.groupby(['click_x','click_y'])['clicks'].count().reset_index()

In [254]:
data_heatmap

Unnamed: 0.1,Unnamed: 0,domain,grid_id,click_x,click_y,display_height,display_width,clicks
0,0,ID_1763,333519,179,4,250,300,1
1,1,ID_1763,333519,262,185,250,300,1
2,2,ID_1763,333519,122,188,250,300,1
3,3,ID_1763,333519,263,197,250,300,1
4,4,ID_1763,333519,242,159,250,300,1
...,...,...,...,...,...,...,...,...
1935421,1935421,ID_1501,333519,163,145,250,300,8
1935422,1935422,ID_1501,333519,41,139,250,300,4
1935423,1935423,ID_1501,333519,213,97,250,300,2
1935424,1935424,ID_1501,333519,271,245,250,300,1


In [255]:
data_heatmap_expanded

Unnamed: 0.1,Unnamed: 0,domain,grid_id,click_x,click_y,display_height,display_width,clicks
0,0,ID_1763,333519,179,4,250,300,1
1,1,ID_1763,333519,262,185,250,300,1
2,2,ID_1763,333519,122,188,250,300,1
3,3,ID_1763,333519,263,197,250,300,1
4,4,ID_1763,333519,242,159,250,300,1
...,...,...,...,...,...,...,...,...
2584618,1935425,ID_1501,333519,136,107,250,300,1
2584619,1935425,ID_1501,333519,136,107,250,300,1
2584620,1935425,ID_1501,333519,136,107,250,300,1
2584621,1935425,ID_1501,333519,136,107,250,300,1


In [256]:
# Binning clicks in nr_of_x_bins, nr_of_y_bins:

NR_OF_X_BINS = 61
NR_OF_Y_BINS = 51

max_width = data_heatmap['display_width'].max()
max_height = data_heatmap['display_height'].max()

width_bins = np.linspace(1, max_width, NR_OF_X_BINS)
height_bins = np.linspace(1, max_height, NR_OF_Y_BINS)

width_bins_max = len(width_bins)-2 # start at 0
height_bins_max = len(height_bins)-2

data_heatmap_expanded['click_x_bin'] = pd.cut(data_heatmap_expanded['click_x'], 
                                      bins = width_bins, 
                                      labels=False, 
                                      include_lowest=True)

data_heatmap_expanded['click_y_bin'] = pd.cut(data_heatmap_expanded['click_y'], 
                                      bins=height_bins, 
                                      labels=False, 
                                      include_lowest=True)


aggregated_clicks = data_heatmap_expanded.groupby(['grid_id',
                                           'domain', 
                                           'click_x_bin', 
                                           'click_y_bin']).size().reset_index(name='clicks_sum')


#aggregated_clicks['clicks_sum'].astype('int64');

In [257]:
# add empty bins --> easier to generate vectors
# takes a few seconds

from itertools import product

domains_grids = data_heatmap[['domain', 'grid_id']].drop_duplicates()

aux = pd.DataFrame(list(product(range(0,width_bins_max+1), 
                                range(0,height_bins_max+1))), 
                   columns=['click_x_bin', 'click_y_bin'])

domains_grids['key'] = 1
aux['key'] = 1

expanded_set = pd.merge(domains_grids, aux, on='key').drop('key', axis=1)

data_binned = pd.merge(expanded_set, aggregated_clicks, 
                  on = ['domain', 'grid_id', 'click_x_bin', 'click_y_bin'], 
                  how = 'left').fillna(0)
data_binned['clicks_sum'] = data_binned['clicks_sum'].astype('int64')
#data_binned

In [258]:
# for clustering

data_binned[(data_binned.domain == 'ID_1763') &
            (data_binned.grid_id == 333519)]['clicks_sum'].to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

In [259]:
## K-NN

In [260]:
# Reshape data into a 50*60 (or 60*50) vector for each grid_id and domain combination
heatmap_vectors = data_binned.pivot_table(index=['grid_id', 'domain'], columns=['click_x_bin', 'click_y_bin'], values='clicks_sum', fill_value=0).reset_index()
heatmap_vectors_flat = heatmap_vectors.drop(columns=['grid_id', 'domain']).values

In [261]:
heatmap_vectors_flat

array([[ 0,  0,  0, ...,  0,  0,  0],
       [54, 36, 12, ...,  0,  0,  0],
       [ 0,  0,  1, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [262]:
# Initialize NearestNeighbors (k-NN)
knn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='euclidean')

# Fit the model
knn.fit(heatmap_vectors_flat)

# Calculate the distances and indices of the k-neighbors for each point
distances, indices = knn.kneighbors(heatmap_vectors_flat)

# Calculate the anomaly score as the mean distance to the k-neighbors
anomaly_scores = distances.mean(axis=1)

# Convert anomaly scores into a DataFrame
anomaly_scores_df = pd.DataFrame({
    'grid_id': heatmap_vectors['grid_id'],
    'domain': heatmap_vectors['domain'],
    'anomaly_score': anomaly_scores
})

# Identify potential anomalies by setting a threshold, for example using a percentile
threshold = np.percentile(anomaly_scores, 95)  # Consider top 5% as potential anomalies
anomaly_scores_df['is_anomaly'] = anomaly_scores_df['anomaly_score'] > threshold

anomaly_scores_df

Unnamed: 0,grid_id,domain,anomaly_score,is_anomaly
0,333346,ID_1,22.896447,False
1,333346,ID_10,159.845436,True
2,333346,ID_1002,23.771816,False
3,333346,ID_1005,62.837153,False
4,333346,ID_1010,17.883612,False
...,...,...,...,...
1728,333519,ID_989,29.311979,False
1729,333519,ID_990,17.780857,False
1730,333519,ID_995,15.671548,False
1731,333519,ID_996,44.081592,False


In [263]:
anomalies_count_per_grid = anomaly_scores_df.groupby('grid_id')['is_anomaly'].sum().reset_index(name='anomaly_count')
total_domains_per_grid = anomaly_scores_df.groupby('grid_id').size().reset_index(name='total_domains')
anomaly_analysis = pd.merge(anomalies_count_per_grid, total_domains_per_grid, on='grid_id')
anomaly_analysis['anomaly_percentage'] = (anomaly_analysis['anomaly_count'] / anomaly_analysis['total_domains']) * 100

anomaly_analysis, anomaly_scores_df[anomaly_scores_df['is_anomaly']].reset_index(drop=True)

(   grid_id  anomaly_count  total_domains  anomaly_percentage
 0   333346             56            861            6.504065
 1   333519             31            872            3.555046,
     grid_id   domain  anomaly_score  is_anomaly
 0    333346    ID_10     159.845436        True
 1    333346  ID_1037     134.008816        True
 2    333346  ID_1062     125.865737        True
 3    333346  ID_1163     100.554652        True
 4    333346  ID_1165     107.320468        True
 ..      ...      ...            ...         ...
 82   333519   ID_357     170.458783        True
 83   333519   ID_398     118.845367        True
 84   333519   ID_458     127.543661        True
 85   333519   ID_720     158.635198        True
 86   333519    ID_83     181.986617        True
 
 [87 rows x 4 columns])

In [264]:
# "Clearly broken" domain IDs for grid_id 333519 and 333346 from Martin
clearly_broken_333519 = set([
    'ID_1184', 'ID_1281', 'ID_1305', 'ID_1353', 'ID_1448', 'ID_1522',
    'ID_1544', 'ID_162', 'ID_1682', 'ID_1690', 'ID_1824', 'ID_1888',
    'ID_1929', 'ID_2076', 'ID_2097', 'ID_2226', 'ID_2249', 'ID_2268',
    'ID_2331', 'ID_2339', 'ID_2386', 'ID_2396', 'ID_2438', 'ID_258',
    'ID_2609', 'ID_2680', 'ID_2863', 'ID_2883', 'ID_2908', 'ID_3061',
    'ID_3243', 'ID_3250', 'ID_3314', 'ID_3382', 'ID_3397', 'ID_3402',
    'ID_3420', 'ID_3459', 'ID_3470', 'ID_3540', 'ID_398', 'ID_409',
    'ID_484', 'ID_489', 'ID_526', 'ID_549', 'ID_580', 'ID_665',
    'ID_810', 'ID_84', 'ID_857', 'ID_86', 'ID_905', 'ID_927', 'ID_934',
    'ID_962', 'ID_986'
])

clearly_broken_333346 = set([
    'ID_1247', 'ID_162', 'ID_2534', 'ID_2742', 'ID_526', 'ID_2201',
       'ID_1165', 'ID_743', 'ID_199', 'ID_2145', 'ID_2569', 'ID_643',
       'ID_1305', 'ID_3180', 'ID_3158', 'ID_136', 'ID_293', 'ID_1753',
       'ID_1849', 'ID_2226', 'ID_1462', 'ID_626', 'ID_2863', 'ID_3243',
       'ID_3250', 'ID_1708', 'ID_1238', 'ID_580', 'ID_84', 'ID_2568',
       'ID_2340', 'ID_1803', 'ID_3470', 'ID_139', 'ID_2619', 'ID_2908',
       'ID_1281', 'ID_3308', 'ID_2883', 'ID_1320', 'ID_1333', 'ID_1062',
       'ID_149', 'ID_260', 'ID_599', 'ID_1513', 'ID_3402', 'ID_1888',
       'ID_2972', 'ID_398', 'ID_2339', 'ID_1030', 'ID_3382', 'ID_2076',
       'ID_1646', 'ID_1077', 'ID_10', 'ID_1153', 'ID_1533', 'ID_2609',
       'ID_1214', 'ID_810', 'ID_2097', 'ID_2386', 'ID_1585', 'ID_549',
       'ID_3420', 'ID_3397', 'ID_2006', 'ID_561', 'ID_1542', 'ID_1570',
       'ID_1819', 'ID_1413', 'ID_1240', 'ID_793', 'ID_831', 'ID_3188',
       'ID_1437', 'ID_556', 'ID_1134', 'ID_314', 'ID_2836', 'ID_1483',
       'ID_2676', 'ID_665', 'ID_15', 'ID_2755', 'ID_2194', 'ID_1682',
       'ID_2775', 'ID_1923', 'ID_1846', 'ID_1262', 'ID_3314', 'ID_1448',
       'ID_1770', 'ID_2626', 'ID_3296', 'ID_489', 'ID_1963', 'ID_1268',
       'ID_1568', 'ID_1094', 'ID_2267', 'ID_409', 'ID_1650', 'ID_1522',
       'ID_927', 'ID_2302', 'ID_1929', 'ID_1435', 'ID_1353'
])

In [265]:
# Filter anomaly domains for grid_id 333519
anomaly_domains_333519 = set(anomaly_scores_df[(anomaly_scores_df['grid_id'] == 333519) & (anomaly_scores_df['is_anomaly'])]['domain'])

# Find the overlap between detected anomalies and "Clearly broken" domains
overlap_333519 = anomaly_domains_333519.intersection(clearly_broken_333519)

# Calculate the overlap percentage
overlap_percentage_333519 = (len(overlap_333519) / len(clearly_broken_333519)) * 100

print(f"Overlap for grid_id 333519: {overlap_333519}")
print(f"Overlap percentage: {overlap_percentage_333519}%")

Overlap for grid_id 333519: {'ID_2076', 'ID_398'}
Overlap percentage: 3.508771929824561%


In [267]:
# Filter anomaly domains for grid_id 333346
anomaly_domains_333346 = set(anomaly_scores_df[(anomaly_scores_df['grid_id'] == 333346) & (anomaly_scores_df['is_anomaly'])]['domain'])

# Find the overlap between detected anomalies and "Clearly broken" domains
overlap_333346 = anomaly_domains_333346.intersection(clearly_broken_333346)

# Calculate the overlap percentage
overlap_percentage_333346 = (len(overlap_333346) / len(clearly_broken_333346)) * 100

print(f"Overlap for grid_id 333346: {overlap_333346}")
print(f"Overlap percentage: {overlap_percentage_333346}%")

Overlap for grid_id 333346: {'ID_489', 'ID_2569', 'ID_1165', 'ID_3180', 'ID_1568', 'ID_10', 'ID_2836', 'ID_1448', 'ID_1062', 'ID_2883', 'ID_2339', 'ID_3382', 'ID_2076', 'ID_2609'}
Overlap percentage: 12.389380530973451%
