In [1]:
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
from itertools import product

class HeatmapDataset(Dataset):
    """Synthetic click-heatmap dataset whose label is the number of click clusters.

    Every call to ``__getitem__`` generates a brand-new random sample; the index
    is ignored, so the same ``idx`` does not return the same image twice.

    Args:
        num_samples: Value reported by ``__len__``.
        width: Extent of the click area along x; clicks are binned over [0, width].
        height: Extent of the click area along y; clicks are binned over [0, height].
        num_clusters: Accepted for interface compatibility but not used; the
            number of clusters is drawn at random for every sample.
        points_per_cluster: Number of clicks sampled for each cluster.
        nr_of_x_bins: Number of bin edges along x; also the width of the
            returned image.
        nr_of_y_bins: Number of bin edges along y; also the height of the
            returned image.
        transform: Optional callable applied to the image tensor in
            ``__getitem__``.
    """

    def __init__(self, num_samples, width, height, num_clusters, points_per_cluster, nr_of_x_bins, nr_of_y_bins, transform=None):
        self.num_samples = num_samples
        self.width = width
        self.height = height
        # num_clusters is intentionally not stored: each sample draws its own count.
        self.points_per_cluster = points_per_cluster
        self.nr_of_x_bins = nr_of_x_bins
        self.nr_of_y_bins = nr_of_y_bins
        self.transform = transform

    def __len__(self):
        """Return the nominal number of samples in the dataset."""
        return self.num_samples

    def __getitem__(self, idx):
        """Generate a fresh ``(image, label)`` pair; ``idx`` is not used."""
        image, label = self.generate_synthetic_heatmap()
        if self.transform is not None:
            image = self.transform(image)
        return image, label

    def generate_synthetic_heatmap(self):
        """Sample random Gaussian click clusters and rasterise them into a heatmap.

        Returns:
            tuple: ``(image, label)`` where ``image`` is a float32 tensor of shape
            ``(1, nr_of_y_bins, nr_of_x_bins)`` with values in [0, 255] and
            ``label`` is the number of clusters that were drawn (0 to 11).
        """
        # Number of clusters for this sample: an integer in [0, 12).
        num_clusters = np.random.randint(0, 12)

        # Each cluster is an axis-aligned Gaussian with per-axis variance in [0, 10).
        clusters = []
        for _ in range(num_clusters):
            center = (np.random.randint(0, self.width), np.random.randint(0, self.height))
            cov_matrix = np.array([[np.random.rand() * 10, 0], [0, np.random.rand() * 10]])
            clusters.append(np.random.multivariate_normal(center, cov_matrix, self.points_per_cluster))

        if clusters:
            clicks = np.asarray(np.concatenate(clusters, axis=0), dtype=float).reshape(-1, 2)
        else:
            # Zero clusters is a valid draw; it yields an all-zero heatmap.
            clicks = np.empty((0, 2), dtype=float)

        # N edges define N - 1 cells per axis.
        x_edges = np.linspace(0, self.width, self.nr_of_x_bins)
        y_edges = np.linspace(0, self.height, self.nr_of_y_bins)

        # The output grid is nr_of_y_bins x nr_of_x_bins, i.e. one row and one
        # column larger than the number of cells. The spare row/column sit at
        # index 0 (top/left) and stay zero, so cell (y, x) lands at (y + 1, x + 1).
        row_offset = self.nr_of_y_bins - (len(y_edges) - 1)
        col_offset = self.nr_of_x_bins - (len(x_edges) - 1)
        counts = np.zeros((self.nr_of_y_bins, self.nr_of_x_bins), dtype=np.float64)

        if len(clicks) > 0:
            x_bin = np.asarray(
                pd.cut(clicks[:, 0], bins=x_edges, labels=False, include_lowest=True), dtype=float
            )
            y_bin = np.asarray(
                pd.cut(clicks[:, 1], bins=y_edges, labels=False, include_lowest=True), dtype=float
            )
            # pd.cut yields NaN for clicks outside the binned range; drop those.
            inside = ~(np.isnan(x_bin) | np.isnan(y_bin))
            rows = y_bin[inside].astype(int) + row_offset
            cols = x_bin[inside].astype(int) + col_offset
            # Accumulate at the true cell position so empty rows/columns are
            # preserved instead of being collapsed.
            np.add.at(counts, (rows, cols), 1)

        # Scale to 0..255 and truncate to uint8; guard the all-empty case.
        peak = counts.max()
        if peak > 0:
            image = np.uint8(counts * 255 / peak)
        else:
            image = np.zeros(counts.shape, dtype=np.uint8)

        # Label is the number of clusters that were generated.
        label = num_clusters

        # Add a leading channel dimension for CNN compatibility.
        image = torch.from_numpy(image).float().unsqueeze(0)
        return image, label

# Example usage: settings for the synthetic dataset.
# The module-level names below are kept as-is because other code in this
# notebook may read them (NR_OF_X_BINS / NR_OF_Y_BINS in particular).
num_samples, width, height = 1000, 60, 50
num_clusters = 5  # passed to the constructor
points_per_cluster = 100
NR_OF_X_BINS, NR_OF_Y_BINS = 61, 51

dataset = HeatmapDataset(
    num_samples=num_samples,
    width=width,
    height=height,
    num_clusters=num_clusters,
    points_per_cluster=points_per_cluster,
    nr_of_x_bins=NR_OF_X_BINS,
    nr_of_y_bins=NR_OF_Y_BINS,
)

# Draw one synthetic sample and show it together with its label.
import matplotlib.pyplot as plt

image, label = dataset[0]
# squeeze() drops the channel dimension so imshow receives a 2-D array.
plt.imshow(image.squeeze(), cmap='hot')
plt.title(f'Number of Clusters: {label}')
plt.colorbar()
plt.show()


# TO IGNORE

In [2]:
# Run the external img_pipeline.py script through the shell. Its printed output
# labels the three arguments as grid id (333519), number of components (200)
# and data directory (the training CSV path).
!python3 img_pipeline.py 333519 200 '../datasets/train/data_heatmap_train.csv'

# Load the heatmap table from temp/heatmaps.csv.
# NOTE(review): presumably this file is written by img_pipeline.py above — confirm.
df = pd.read_csv('temp/heatmaps.csv')

Grid ID:  333519
Nb components:  200
Data directory:  ../datasets/train/data_heatmap_train.csv


TODO: 
1. Plot as heatmap

In [3]:
# TODO: plot as heatmap
# (The original line `plot as heatmap = 0` was not valid Python and made the
# whole cell fail with a SyntaxError; it is kept here as a TODO note instead.)
df

Unnamed: 0.1,Unnamed: 0,domain,grid_id,clicks_sum_for_0_0,clicks_sum_for_0_1,clicks_sum_for_0_2,clicks_sum_for_0_3,clicks_sum_for_0_4,clicks_sum_for_0_5,clicks_sum_for_0_6,...,clicks_sum_for_59_40,clicks_sum_for_59_41,clicks_sum_for_59_42,clicks_sum_for_59_43,clicks_sum_for_59_44,clicks_sum_for_59_45,clicks_sum_for_59_46,clicks_sum_for_59_47,clicks_sum_for_59_48,clicks_sum_for_59_49
0,0,ID_1000,333519,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,ID_1010,333519,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,ID_1021,333519,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,ID_1024,333519,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,4,ID_1055,333519,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,605,ID_978,333519,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,606,ID_990,333519,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,607,ID_995,333519,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,608,ID_996,333519,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
