In [None]:
from pathlib import Path
from typing import Any, cast, TYPE_CHECKING
import kagglehub
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import tensor
import torch.nn as nn

from digix.utility.data import clean_numeric_data, load_data
from digix.analysis.evaluation import evaluate_synthetic_data_quality

if TYPE_CHECKING:
    from torch import Tensor
    from torch.optim import Optimizer
    from matplotlib.axes import Axes

In [None]:
# Directory that the try/except blocks further down read saved state dicts (*.pt) from.
model_path = Path("models")

In [None]:
# Fetch the dataset through kagglehub; the returned location is used as the dataset root directory.
data_path: Path = Path(kagglehub.dataset_download("xiaojiu1414/digix-global-ai-challenge"))

# Load the feeds + ads training CSVs, then keep a seeded 1% random sample with a fresh 0..n-1 index.
training_data = load_data(
    feeds_path=data_path / "train" / "train_data_feeds.csv",
    ads_path=data_path / "train" / "train_data_ads.csv"
).sample(frac=0.01, random_state=42).reset_index(drop=True)  # pyright: ignore[reportUnknownMemberType]

# Two successive seeded 80/20 splits: 20% is held out as test first, then 20% of the
# remainder becomes validation (roughly 64% / 16% / 20% of the sample).
training_data, test_data = train_test_split(training_data, test_size=0.2, random_state=42)
training_data, validation_data = train_test_split(training_data, test_size=0.2, random_state=42)


# Numeric columns of the training split. 'label' has to be one of them, because it is
# dropped / selected by name from this column subset below.
numeric_features = training_data.select_dtypes(include='number').columns.tolist()

# For each split: features = numeric columns minus 'label' (passed through clean_numeric_data),
# target = the 'label' column.
X_train_numeric = training_data[numeric_features].drop(columns=['label'])
y_train_numeric = training_data[numeric_features]['label']
X_train_numeric = clean_numeric_data(X_train_numeric)

X_validation_numeric = validation_data[numeric_features].drop(columns=['label'])
y_validation_numeric = validation_data[numeric_features]['label']
X_validation_numeric = clean_numeric_data(X_validation_numeric)

X_test_numeric = test_data[numeric_features].drop(columns=['label'])
y_test_numeric = test_data[numeric_features]['label']
X_test_numeric = clean_numeric_data(X_test_numeric)

In [None]:
# Display the cleaned numeric training features.
X_train_numeric

In [None]:
# Display the training features without the listed *_id columns.
# The result is only shown, not assigned: X_train_numeric itself is unchanged.
X_train_numeric.drop(columns=[
    "log_id", "user_id", "adv_id", "site_id", "adv_prim_id", "slot_id"
])

In [None]:
# Report how many distinct values each candidate categorical column takes in the training split.
for column_name in (
    "residence",
    "gender",
    "city",
    "city_rank",
    "device_name",
    "net_type",
    "series_dev",
    "series_group",
    "emui_dev",
):
    print(f'{X_train_numeric[column_name].nunique()} "{column_name}" categories')

In [None]:
class GenerativeNetwork(nn.Module):
    """Residual MLP that maps conditioning features to one non-negative score per class.

    Architecture: input linear layer -> ``n_hidden_layers`` residual blocks
    (``relu(x + linear(x))``) -> output linear layer, with ReLU after every
    layer including the output.

    Args:
        input_size: Number of input features.
        n_hidden_layers: Number of residual hidden blocks.
        hidden_layer_size: Width of the input projection and of each hidden block.
        n_classes: Number of output scores.
        transformer: Accepted but not used anywhere in this class.

    NOTE(review): the output scores pass through ReLU before being returned, so
    they are never negative; confirm that is intended when they are fed to
    ``CrossEntropyLoss`` / ``softmax`` as logits.
    """

    def __init__(
        self,
        input_size: int,
        n_hidden_layers: int,
        hidden_layer_size: int,
        n_classes: int,
        transformer: bool = False
    ):
        super().__init__()  # pyright: ignore[reportUnknownMemberType]
        self.input_size: int = input_size
        self.input_layer = nn.Linear(input_size, hidden_layer_size)

        # Built incrementally; submodule names stay "hidden_layers.<i>" so saved state dicts still load.
        residual_blocks = nn.ModuleList()
        for _ in range(n_hidden_layers):
            residual_blocks.append(nn.Linear(hidden_layer_size, hidden_layer_size))
        self.hidden_layers = residual_blocks

        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_layer_size, n_classes)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Return ReLU-activated class scores for ``inputs`` (cast to float first)."""
        relu = self.activation
        hidden = relu(self.input_layer(inputs.float()))
        for block in self.hidden_layers:
            residual = block(hidden)
            hidden = relu(hidden + residual)
        scores = self.output_layer(hidden)
        return relu(scores)

class GenerativeNormalNetwork(nn.Module):
    """Residual MLP that outputs two values per row: a location and a positive scale.

    Architecture: input linear layer -> ``n_hidden_layers`` residual blocks
    (``relu(x + linear(x))``) -> linear layer with 2 outputs. The last output
    column is passed through ReLU and shifted by ``1e-9`` so it is strictly
    positive; the first column is returned unconstrained.

    Args:
        input_size: Number of input features.
        n_hidden_layers: Number of residual hidden blocks.
        hidden_layer_size: Width of the input projection and of each hidden block.
    """

    def __init__(
        self,
        input_size: int,
        n_hidden_layers: int,
        hidden_layer_size: int,
    ):
        super().__init__()  # pyright: ignore[reportUnknownMemberType]
        self.input_size: int = input_size
        self.input_layer = nn.Linear(self.input_size, hidden_layer_size)
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_layer_size, hidden_layer_size)
            for _ in range(n_hidden_layers)
        ])
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_layer_size, 2)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Return a ``(..., 2)`` tensor: ``[..., 0]`` is the location, ``[..., 1]`` the positive scale."""
        x = self.activation(self.input_layer(inputs.float()))
        for layer in self.hidden_layers:
            x = self.activation(x + layer(x))
        raw = self.output_layer(x)
        # Assemble a new tensor instead of writing into `raw` in place: the values are the
        # same for 2-D input, but no tensor in the autograd graph is mutated and inputs
        # with any number of leading dimensions (including none) are handled.
        location = raw[..., :1]
        scale = self.activation(raw[..., 1:]) + 1e-9
        return torch.cat([location, scale], dim=-1)
    
class GenerativeExponentialNetwork(nn.Module):
    """Residual MLP that outputs a single strictly positive value per row.

    Architecture: input linear layer -> ``n_hidden_layers`` residual blocks
    (``relu(x + linear(x))``) -> linear layer with 1 output, followed by ReLU
    and a ``1e-9`` offset.

    Args:
        input_size: Number of input features.
        n_hidden_layers: Number of residual hidden blocks.
        hidden_layer_size: Width of the input projection and of each hidden block.
    """

    def __init__(
        self,
        input_size: int,
        n_hidden_layers: int,
        hidden_layer_size: int,
    ):
        super().__init__()  # pyright: ignore[reportUnknownMemberType]
        self.input_size: int = input_size
        self.input_layer = nn.Linear(input_size, hidden_layer_size)

        # Built incrementally; submodule names stay "hidden_layers.<i>".
        residual_blocks = nn.ModuleList()
        for _ in range(n_hidden_layers):
            residual_blocks.append(nn.Linear(hidden_layer_size, hidden_layer_size))
        self.hidden_layers = residual_blocks

        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(hidden_layer_size, 1)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Return the positive output for ``inputs`` (cast to float first)."""
        relu = self.activation
        hidden = relu(self.input_layer(inputs.float()))
        for block in self.hidden_layers:
            residual = block(hidden)
            hidden = relu(hidden + residual)
        positive = relu(self.output_layer(hidden))
        return positive + 1e-9


In [None]:
# One-hot encode each categorical training column. The matching 1-D tensor holds, per row,
# the column position of the active category (argmax over the one-hot row), which is the
# class-index form used as the target of the classification models.
city_one_hot: pd.DataFrame = pd.get_dummies(X_train_numeric["city"])
city: "Tensor" = tensor(city_one_hot.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

city_rank_one_hot: pd.DataFrame = pd.get_dummies(X_train_numeric["city_rank"])
city_rank: "Tensor" = tensor(city_rank_one_hot.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

gender_one_hot: pd.DataFrame = pd.get_dummies(X_train_numeric["gender"])
gender: "Tensor" = tensor(gender_one_hot.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

net_one_hot: pd.DataFrame = pd.get_dummies(X_train_numeric["net_type"])
# Fixed: the indices were previously derived from city_rank_one_hot (copy-paste slip).
net: "Tensor" = tensor(net_one_hot.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

age_one_hot: pd.DataFrame = pd.get_dummies(X_train_numeric["age"])
age: "Tensor" = tensor(age_one_hot.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

In [None]:
# One-hot encode the test split with the SAME category set (and column order) as the
# training encoding. Encoding the test split independently can yield a different number
# or order of columns, which breaks models sized from the training columns and makes the
# argmax class indices incomparable between splits.
# Caveat: a test value never seen in training becomes an all-zero row, whose argmax is 0.
city_one_hot_test: pd.DataFrame = pd.get_dummies(
    X_test_numeric["city"].astype(pd.CategoricalDtype(categories=city_one_hot.columns))
)
city_test: "Tensor" = tensor(city_one_hot_test.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

city_rank_one_hot_test: pd.DataFrame = pd.get_dummies(
    X_test_numeric["city_rank"].astype(pd.CategoricalDtype(categories=city_rank_one_hot.columns))
)
city_rank_test: "Tensor" = tensor(city_rank_one_hot_test.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

gender_one_hot_test: pd.DataFrame = pd.get_dummies(
    X_test_numeric["gender"].astype(pd.CategoricalDtype(categories=gender_one_hot.columns))
)
gender_test: "Tensor" = tensor(gender_one_hot_test.to_numpy()).long().argmax(dim=1)  # pyright: ignore[reportUnknownMemberType]

In [None]:
# Display the one-hot encoded city_rank frame (one column per distinct training value).
city_rank_one_hot

In [None]:
# Sanity check: shape of the city and gender one-hot matrices concatenated column-wise.
torch.cat([
    tensor(city_one_hot.to_numpy()),
    tensor(gender_one_hot.to_numpy())
], dim=1).shape

In [None]:
def train(
    model: "nn.Module",
    train: "Tensor",
    y: "Tensor",
    criterion: "nn.Module",
    optimizer: "Optimizer",
    epochs: int,
    silent: bool = False
):
    """Fit ``model`` with full-batch gradient steps and return the per-epoch losses.

    Every epoch runs one forward pass over the whole ``train`` tensor, evaluates
    ``criterion(model(train), y)``, and takes a single optimizer step.

    Args:
        model: Network to optimise; it is switched to training mode.
        train: Input features, passed to ``model`` in one batch.
        y: Targets handed to ``criterion`` together with the model output.
        criterion: Loss module called as ``criterion(output, y)``.
        optimizer: Optimizer already bound to the parameters of ``model``.
        epochs: Number of optimisation steps; must be at least 1.
        silent: When false, print the loss after every epoch.

    Returns:
        List of ``epochs`` floats, the loss measured before each update.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("Must specify at least 1 epoch")

    model.train()
    loss_history: list[float] = []
    for epoch_number in range(1, epochs + 1):
        # Forward pass and loss on the full batch.
        batch_loss = criterion(model(train), y)

        # Clear stale gradients, backpropagate, update the parameters.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_value = batch_loss.item()
        loss_history.append(loss_value)
        if silent:
            continue
        print(f'Epoch {epoch_number} training loss: {loss_value}')

    return loss_history

# One output class per distinct city_rank value seen in training.
num_city_ranks: int = len(city_rank_one_hot.columns)
# First model of the chain: a single noise input is mapped to city_rank class scores.
city_rank_model = GenerativeNetwork(
    input_size=1,
    n_hidden_layers=6,
    hidden_layer_size=128,
    n_classes=num_city_ranks
)
try:
    # Reuse saved weights when a checkpoint exists under model_path ...
    city_rank_model.load_state_dict(
        torch.load(model_path / "city_rank.pt")  # pyright: ignore[reportUnknownMemberType]
    )
except FileNotFoundError:
    # ... otherwise fit from scratch: 100 full-batch epochs on uniform random inputs
    # (one per training row) against the city_rank class indices.
    city_rank_optimizer = torch.optim.AdamW(params=city_rank_model.parameters(), lr=1e-2)
    city_rank_criterion = nn.CrossEntropyLoss()

    train(
        city_rank_model,
        train=torch.rand((city_rank.shape[0], 1)),
        y=city_rank,
        optimizer=city_rank_optimizer,
        criterion=city_rank_criterion,
        epochs=10**2
    )

In [None]:
class LogRegression(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size`` scores.

    The raw linear outputs are returned; no softmax or other activation is applied here.
    """

    # Base class is spelled nn.Module: the bare name `Module` is never imported in this notebook.
    def __init__(self, input_size: int, output_size: int):
        super().__init__()  # pyright: ignore[reportUnknownMemberType]
        self.regression = nn.Linear(input_size, output_size)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Return the linear scores for ``inputs`` (cast to float first)."""
        return self.regression(inputs.float())
    

class NormalRegression(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size`` outputs.

    The raw linear outputs are returned; no activation is applied here.
    """

    # Base class is spelled nn.Module: the bare name `Module` is never imported in this notebook.
    def __init__(self, input_size: int, output_size: int):
        super().__init__()  # pyright: ignore[reportUnknownMemberType]
        self.regression = nn.Linear(input_size, output_size)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Return the linear outputs for ``inputs`` (cast to float first)."""
        return self.regression(inputs.float())
    

In [None]:
# Linear baseline for city_rank. The class count comes from the training encoding
# (num_city_ranks) rather than a hard-coded 4, so it always matches the targets.
simple_city_rank_model = LogRegression(input_size=1, output_size=num_city_ranks)

simple_city_rank_optimizer = torch.optim.AdamW(params=simple_city_rank_model.parameters(), lr=1e-2)
simple_city_rank_criterion = nn.CrossEntropyLoss()

# Trailing ';' keeps the returned list of 10,000 losses out of the cell output.
train(
    simple_city_rank_model,
    train=torch.rand((city_rank.shape[0], 1)),
    y=city_rank,
    optimizer=simple_city_rank_optimizer,
    criterion=simple_city_rank_criterion,
    epochs=10**4
);

In [None]:
# Histogram of the test-split city_rank class indices ...
plt.hist(city_rank_test, alpha=0.5)

# ... overlaid with one class index per test row sampled from the linear baseline
# (softmax over its scores for uniform random inputs, then a multinomial draw).
with torch.no_grad():
    plt.hist(
        torch.multinomial(torch.softmax(
            simple_city_rank_model(torch.rand((city_rank_test.shape[0], 1))),
            dim=1
        ), num_samples=1),
        alpha=0.5
    )



In [None]:
# One output class per distinct gender value seen in training.
num_genders: int = len(gender_one_hot.columns)
# Second model of the chain: gender class scores conditioned on the one-hot city_rank.
gender_model = GenerativeNetwork(
    input_size=num_city_ranks,
    n_hidden_layers=6,
    hidden_layer_size=128,
    n_classes=num_genders
)

try:
    # Reuse saved weights when a checkpoint exists under model_path ...
    gender_model.load_state_dict(
        torch.load(model_path / "gender.pt")  # pyright: ignore[reportUnknownMemberType]
    )
except FileNotFoundError:
    # ... otherwise fit from scratch for 500 full-batch epochs.
    gender_optimizer = torch.optim.AdamW(params=gender_model.parameters(), lr=1e-5)
    gender_criterion = nn.CrossEntropyLoss()

    train(
        gender_model,
        train=tensor(city_rank_one_hot.to_numpy()).long(),  # pyright: ignore[reportUnknownMemberType]
        y=gender,
        optimizer=gender_optimizer,
        criterion=gender_criterion,
        epochs=10**2 * 5
    )

In [None]:
# Linear baseline for gender given the one-hot city_rank. Both dimensions come from the
# training encodings (num_city_ranks / num_genders) rather than hard-coded 4 and 3.
simple_gender_model = LogRegression(input_size=num_city_ranks, output_size=num_genders)

simple_gender_optimizer = torch.optim.AdamW(params=simple_gender_model.parameters(), lr=1e-2)
simple_gender_criterion = nn.CrossEntropyLoss()

# Trailing ';' keeps the returned list of 10,000 losses out of the cell output.
train(
    simple_gender_model,
    train=tensor(city_rank_one_hot.to_numpy()).long(),  # pyright: ignore[reportUnknownMemberType]
    y=gender,
    optimizer=simple_gender_optimizer,
    criterion=simple_gender_criterion,
    epochs=10**4
);

In [None]:
# Normalised histogram of the test-split gender class indices, one bin per gender class ...
plt.hist(gender_test, alpha=0.5, bins=num_genders, density=True)

# ... overlaid with class indices sampled from the linear baseline, conditioned on the
# test-split city_rank one-hot rows (softmax over the scores, then a multinomial draw).
with torch.no_grad():
    plt.hist(
        torch.multinomial(torch.softmax(
            simple_gender_model(tensor(city_rank_one_hot_test.to_numpy())),
            dim=1
        ), num_samples=1),
        alpha=0.5,
        bins=num_genders,
        density=True
    )



In [None]:
# One output class per distinct age value seen in training.
num_ages: int = len(age_one_hot.columns)
# Third model of the chain: age class scores conditioned on [city_rank, gender] one-hots.
age_model = GenerativeNetwork(
    input_size=num_city_ranks + num_genders,
    n_hidden_layers=6,
    hidden_layer_size=12,
    n_classes=num_ages
)

try:
    # Reuse saved weights when a checkpoint exists under model_path ...
    age_model.load_state_dict(
        torch.load(model_path / "age.pt")  # pyright: ignore[reportUnknownMemberType]
    )
except FileNotFoundError:
    # ... otherwise fit from scratch for 1,000 full-batch epochs.
    age_optimizer = torch.optim.AdamW(params=age_model.parameters(), lr=1e-4)
    age_criterion = nn.CrossEntropyLoss()

    train(
        age_model,
        # Column order [city_rank, gender] defines the input layout this model expects.
        train=torch.cat([
            tensor(city_rank_one_hot.to_numpy()),
            tensor(gender_one_hot.to_numpy())
        ], dim=1),
        y=age,
        optimizer=age_optimizer,
        criterion=age_criterion,
        epochs=10**3
    )

In [None]:
# These definitions were commented out, so on a fresh kernel the train() call below (and
# the plotting cell after it) failed with NameError. They are restored exactly as written.
simple_age_model = GenerativeNetwork(
    input_size=num_city_ranks + num_genders,
    n_hidden_layers=6,
    hidden_layer_size=12,
    n_classes=num_ages
)
simple_age_optimizer = torch.optim.AdamW(params=simple_age_model.parameters(), lr=1e-4)
simple_age_criterion = nn.CrossEntropyLoss()

# Trailing ';' keeps the returned list of 10,000 losses out of the cell output.
train(
    simple_age_model,
    # Column order [city_rank, gender] defines the input layout this model expects.
    train=torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy())
    ], dim=1),
    y=age,
    optimizer=simple_age_optimizer,
    criterion=simple_age_criterion,
    epochs=10**4
);

In [None]:
# Histogram of the training age class indices ...
plt.hist(age, alpha=0.5)

# ... overlaid with class indices sampled from simple_age_model, conditioned on the
# training [city_rank, gender] one-hot rows (softmax over the scores, then a multinomial draw).
with torch.no_grad():
    plt.hist(
        torch.multinomial(torch.softmax(
            simple_age_model(torch.cat([
                tensor(city_rank_one_hot.to_numpy()),
                tensor(gender_one_hot.to_numpy())
            ], dim=1)),
            dim=1
        ), num_samples=1),
        alpha=0.5
    )



In [None]:
# One output class per distinct city value seen in training.
num_cities = len(city_one_hot.columns)
# Fourth model of the chain: city class scores conditioned on [city_rank, gender, age] one-hots.
city_model = GenerativeNetwork(
    input_size=num_city_ranks + num_genders + num_ages,
    n_hidden_layers=6,
    hidden_layer_size=48,
    n_classes=num_cities
)

try:
    # Reuse saved weights when a checkpoint exists under model_path ...
    city_model.load_state_dict(
        torch.load(model_path / "city.pt")  # pyright: ignore[reportUnknownMemberType]
    )
except FileNotFoundError:
    # ... otherwise fit from scratch for 2,000 full-batch epochs.
    city_optimizer = torch.optim.AdamW(params=city_model.parameters(), lr=1e-4)
    city_criterion = nn.CrossEntropyLoss()

    train(
        city_model,
        # Column order [city_rank, gender, age] defines the input layout this model expects.
        train=torch.cat([
            tensor(city_rank_one_hot.to_numpy()),
            tensor(gender_one_hot.to_numpy()),
            tensor(age_one_hot.to_numpy()),
        ], dim=1),
        y=city,
        optimizer=city_optimizer,
        criterion=city_criterion,
        epochs=10**3 * 2
    )

In [None]:
# Histogram of the training city class indices (half as many bins as there are cities) ...
plt.hist(city, alpha=0.5, bins=num_cities // 2)

# ... overlaid with class indices sampled from city_model, conditioned on the training
# [city_rank, gender, age] one-hot rows (softmax over the scores, then a multinomial draw).
with torch.no_grad():
    plt.hist(
        torch.multinomial(torch.softmax(
            city_model(torch.cat([
                tensor(city_rank_one_hot.to_numpy()),
                tensor(gender_one_hot.to_numpy()),
                tensor(age_one_hot.to_numpy())
            ], dim=1)),
            dim=1
        ), num_samples=1),
        alpha=0.5,
        bins=num_cities // 2
    )



In [None]:
class NormalNegLogLikelihood(nn.Module):
    """Mean Gaussian negative log-likelihood of ``log(y)`` (additive constant dropped).

    ``predicted[:, 0]`` is read as the mean and ``predicted[:, 1]`` as the standard
    deviation of a normal distribution over ``log(y)``. Per row the loss is

        log(std) + 0.5 * ((log(y) - mean) / std) ** 2

    The 0.5 factor on the squared term was missing before; without it the loss is
    minimised at a standard deviation sqrt(2) times too large.

    Args:
        predicted: Tensor indexed as ``[:, 0]`` (mean) and ``[:, 1]`` (std, must be > 0).
        y: Targets; the logarithm is taken, so zero entries give ``-inf`` and an
            infinite loss.

    Returns:
        Scalar tensor: the mean loss over rows.

    NOTE(review): the targets are log-transformed here, while the sampling cells draw
    ``torch.normal(mean, std)`` without exponentiating; confirm which space is intended.
    """

    def forward(self, predicted: "Tensor", y: "Tensor"):
        mean = predicted[:, 0]
        std = predicted[:, 1]
        standardized = (torch.log(y) - mean) / std
        return (torch.log(std) + 0.5 * standardized.square()).mean()

unique_category_criterion = NormalNegLogLikelihood()


In [None]:

import math


# Two-output (location, scale) network for unique_news_categories, conditioned on the
# [city_rank, gender, age, city] one-hot rows. Unlike the categorical models above, this
# cell does not try to load a saved checkpoint first; it always trains.
unique_category_model = GenerativeNormalNetwork(
    input_size=num_city_ranks + num_genders + num_ages + num_cities,
    n_hidden_layers=6,
    hidden_layer_size=48
)

unique_category_optimizer = torch.optim.AdamW(params=unique_category_model.parameters(), lr=1e-4)

# 3,000 full-batch epochs; the trailing ';' suppresses the returned loss list.
train(
    unique_category_model,
    # Column order [city_rank, gender, age, city] defines the input layout this model expects.
    train=torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        tensor(city_one_hot.to_numpy())
    ], dim=1),
    y=tensor(X_train_numeric["unique_news_categories"].to_numpy()),
    optimizer=unique_category_optimizer,
    criterion=unique_category_criterion,
    epochs=10**3 * 3
);

In [None]:
# The model and criterion definitions were commented out, so on a fresh kernel this cell
# failed with NameError. They are restored exactly as written.
simple_unique_category_model = GenerativeNormalNetwork(
    input_size=num_city_ranks + num_genders + num_ages + num_cities,
    n_hidden_layers=0,
    hidden_layer_size=12
)

simple_unique_category_optimizer = torch.optim.AdamW(params=simple_unique_category_model.parameters(), lr=1e-5)

simple_unique_category_criterion = NormalNegLogLikelihood()

# 3,000 full-batch epochs; the trailing ';' suppresses the returned loss list.
train(
    simple_unique_category_model,
    # Column order [city_rank, gender, age, city] defines the input layout this model expects.
    train=torch.cat([
        tensor(city_rank_one_hot.to_numpy()),  # pyright: ignore
        tensor(gender_one_hot.to_numpy()),  # pyright: ignore
        tensor(age_one_hot.to_numpy()),  # pyright: ignore
        tensor(city_one_hot.to_numpy())  # pyright: ignore
    ], dim=1),
    y=tensor(X_train_numeric["unique_news_categories"].to_numpy()),  # pyright: ignore
    optimizer=simple_unique_category_optimizer,
    criterion=simple_unique_category_criterion,
    epochs=10**3 * 3
);

In [None]:
# Normalised histogram of the observed unique_news_categories values ...
plt.hist(X_train_numeric["unique_news_categories"], alpha=0.5, density=True)
with torch.no_grad():
    # ... overlaid with one normal draw per training row, using the (location, scale)
    # pair predicted by simple_unique_category_model for that row's one-hot features.
    category_pred = simple_unique_category_model(torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        tensor(city_one_hot.to_numpy())
    ], dim=1))
    plt.hist(
        torch.normal(category_pred[:, 0], category_pred[:, 1]),
        alpha=0.5,
        density=True
    )
# torch.cat([
#         tensor(city_rank_one_hot_test.to_numpy()),
#         tensor(gender_one_hot_test.to_numpy()),
#         tensor(city_one_hot_test.to_numpy())
#     ], dim=1).float()

In [None]:
class ExponentialNegLogLikelihood(nn.Module):
    """Mean of ``predicted * y - log(predicted)`` over all elements.

    This is the negative log-density of an exponential distribution when
    ``predicted`` is read as its rate parameter.

    Args:
        predicted: Positive tensor (the logarithm is taken).
        y: Tensor that multiplies element-wise with ``predicted``.

    Returns:
        Scalar tensor: the mean loss.
    """

    def forward(self, predicted: "Tensor", y: "Tensor"):
        per_element = predicted * y - torch.log(predicted)
        return per_element.mean()

# Two-output (location, scale) network for total_upvotes, conditioned on the
# [city_rank, gender, age, city] one-hot rows. It is trained with NormalNegLogLikelihood;
# the ExponentialNegLogLikelihood defined in this cell is not used here.
upvote_model = GenerativeNormalNetwork(
    input_size=num_city_ranks + num_genders + num_ages + num_cities,
    n_hidden_layers=6,
    hidden_layer_size=12
)
upvote_optimizer = torch.optim.AdamW(params=upvote_model.parameters(), lr=1e-4)
upvote_criterion = NormalNegLogLikelihood()

# 3,000 full-batch epochs; the trailing ';' suppresses the returned loss list.
# NOTE(review): the criterion takes log(y); if total_upvotes contains zeros the loss
# becomes infinite. Confirm against the data.
train(
    upvote_model,
    train=torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        tensor(city_one_hot.to_numpy())
    ], dim=1),
    y=tensor(X_train_numeric["total_upvotes"].to_numpy()),
    optimizer=upvote_optimizer,
    criterion=upvote_criterion,
    epochs=10**3 * 3
);

In [None]:
# Normalised histogram of the observed total_upvotes values ...
plt.hist(X_train_numeric["total_upvotes"], alpha=0.5, density=True)
with torch.no_grad():
    # ... overlaid with one normal draw per training row from the (location, scale) pair
    # predicted by upvote_model; negative draws are clipped to zero by relu.
    upvote_pred = upvote_model(torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        tensor(city_one_hot.to_numpy())
    ], dim=1))
    plt.hist(
        torch.relu(torch.normal(upvote_pred[:, 0], upvote_pred[:, 1])),
        alpha=0.5,
        density=True
    )
# torch.cat([
#         tensor(city_rank_one_hot_test.to_numpy()),
#         tensor(gender_one_hot_test.to_numpy()),
#         tensor(city_one_hot_test.to_numpy())
#     ], dim=1).float()

In [None]:
# Two-output (location, scale) network for avg_refresh_times, conditioned on the
# [city_rank, gender, age] one-hot rows only (the city one-hot is left out).
refresh_model = GenerativeNormalNetwork(
    input_size=num_city_ranks + num_genders + num_ages,
    n_hidden_layers=6,
    hidden_layer_size=5
)
refresh_optimizer = torch.optim.AdamW(params=refresh_model.parameters(), lr=1e-3)
refresh_criterion = NormalNegLogLikelihood()

# 1,000 full-batch epochs; the trailing ';' suppresses the returned loss list.
train(
    refresh_model,
    # Column order [city_rank, gender, age] defines the input layout this model expects.
    train=torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        # tensor(city_one_hot.to_numpy())
    ], dim=1),
    y=tensor(X_train_numeric["avg_refresh_times"].to_numpy()),
    optimizer=refresh_optimizer,
    criterion=refresh_criterion,
    epochs=10**3
);

In [None]:
# Normalised histogram of the observed avg_refresh_times values ...
plt.hist(X_train_numeric["avg_refresh_times"], alpha=0.5, density=True)
with torch.no_grad():
    # ... overlaid with one normal draw per training row from refresh_model's prediction.
    # The feature blocks must be concatenated in the order the model was trained on,
    # [city_rank, gender, age]; age and gender were swapped here before, so each input
    # column was fed the wrong feature.
    refresh_pred = refresh_model(torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        # tensor(city_one_hot.to_numpy())
    ], dim=1))
    plt.hist(
        # Negative draws are clipped to zero by relu.
        torch.relu(torch.normal(refresh_pred[:, 0], refresh_pred[:, 1])),
        alpha=0.5,
        density=True
    )
# torch.cat([
#         tensor(city_rank_one_hot_test.to_numpy()),
#         tensor(gender_one_hot_test.to_numpy()),
#         tensor(city_one_hot_test.to_numpy())
#     ], dim=1).float()

# Transformer

In [None]:
class GenerativeTransformer(nn.Module):
    """One attention block with residual connections between two linear projections.

    Pipeline: linear projection to ``model_dim`` -> swap the first two axes ->
    self-attention added residually -> feed-forward MLP added residually ->
    linear projection to ``output_dim``.

    Args:
        input_dim: Size of the last axis of the input.
        output_dim: Size of the last axis of the output.
        model_dim: Embedding width used by the attention and feed-forward layers.
        feed_forward_dim: Hidden width of the feed-forward MLP.
        n_heads: Number of attention heads.

    NOTE(review): the first two axes are swapped before attention and never swapped
    back, so the output keeps the swapped layout; confirm callers expect that.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        model_dim: int,
        feed_forward_dim: int,
        n_heads: int,
    ):
        super().__init__()  # pyright: ignore
        self.input_dim: int = input_dim
        self.output_dim: int = output_dim
        self.model_dim: int = model_dim

        self.input_projection = nn.Linear(input_dim, model_dim)
        self.attention = nn.MultiheadAttention(model_dim, num_heads=n_heads)
        expand = nn.Linear(model_dim, feed_forward_dim)
        contract = nn.Linear(feed_forward_dim, model_dim)
        self.feed_forward = nn.Sequential(expand, nn.ReLU(), contract)
        self.output_projection = nn.Linear(model_dim, output_dim)

    def forward(self, inputs: "Tensor") -> "Tensor":
        """Run the block on ``inputs`` and return the projected result (axes 0 and 1 swapped)."""
        embedded = self.input_projection(inputs).transpose(0, 1)
        attended, _weights = self.attention(embedded, embedded, embedded)
        after_attention = embedded + attended
        after_feed_forward = after_attention + self.feed_forward(after_attention)
        return self.output_projection(after_feed_forward)
    
# Smoke test of the transformer block on the city_rank one-hot rows. The input projection
# must accept one feature per city_rank class; the previous hard-coded input_dim=2 did not
# match the width of the matrix fed in below and raised a shape error.
transformer_model = GenerativeTransformer(
    input_dim=num_city_ranks,
    output_dim=1,
    model_dim=16,
    feed_forward_dim=16*4,
    n_heads=4
)
# transformer_model(tensor([
#     [[1.0, 2.0]],
#     [[1.0, 2.0]]
# ]))
# unsqueeze(1) inserts a length-1 axis after the row axis before the float cast.
transformer_model(tensor(city_rank_one_hot.to_numpy()).unsqueeze(1).float())


In [None]:
@torch.no_grad()
def generate(n: int) -> "pd.DataFrame":
    """Draw ``n`` synthetic rows by sampling through the chained models.

    The categorical features are sampled one after another in the order the models
    were trained: city_rank (from uniform noise), gender | city_rank,
    age | city_rank, gender and city | city_rank, gender, age. Each sampled value is
    one-hot encoded before it is fed to the next model, because that is the input
    format every model was trained on. The two numeric features are then drawn from
    normal distributions whose parameters are predicted from the sampled categoricals.

    Fixes compared with the previous version:
      * probabilities were pushed through ``softmax`` a second time before being
        reused as inputs;
      * downstream models received soft probability vectors instead of one-hot rows,
        and the values finally reported were sampled independently of what the
        downstream models had been conditioned on;
      * the return annotation claimed a Tensor although a DataFrame is returned;
      * ``squeeze()`` collapsed the result to a scalar for ``n == 1``.

    Args:
        n: Number of rows to generate.

    Returns:
        DataFrame with columns ``city_rank``, ``gender``, ``age``, ``city``,
        ``unique_news_categories`` and ``total_upvotes``.
    """

    def sample_one_hot(scores: "Tensor") -> "tuple[Tensor, Tensor]":
        # Sample one class per row from softmax(scores); return (indices, one-hot rows).
        probabilities = torch.softmax(scores, dim=1)
        indices = torch.multinomial(probabilities, num_samples=1).squeeze(1)
        one_hot = nn.functional.one_hot(indices, num_classes=probabilities.shape[1]).float()
        return indices, one_hot

    noise: "Tensor" = torch.rand((n, 1))
    city_rank_indices, city_rank_features = sample_one_hot(city_rank_model(noise))
    gender_indices, gender_features = sample_one_hot(gender_model(city_rank_features))

    # The conditioning matrix grows in the same column order used during training.
    x = torch.cat([city_rank_features, gender_features], dim=1)
    age_indices, age_features = sample_one_hot(age_model(x))

    x = torch.cat([x, age_features], dim=1)
    city_indices, city_features = sample_one_hot(city_model(x))

    x = torch.cat([x, city_features], dim=1)
    unique_category_params = unique_category_model(x)
    unique_category_samples = torch.normal(
        mean=unique_category_params[:, 0],
        std=unique_category_params[:, 1]
    )

    upvote_params = upvote_model(x)
    # Negative draws are clipped to zero.
    upvote_samples = torch.relu(torch.normal(
        mean=upvote_params[:, 0],
        std=upvote_params[:, 1]
    ))

    # Map class indices back to the original category values via the one-hot column labels.
    return pd.DataFrame({
        "city_rank": city_rank_one_hot.columns[city_rank_indices.tolist()],
        "gender": gender_one_hot.columns[gender_indices.tolist()],
        "age": age_one_hot.columns[age_indices.tolist()],
        "city": city_one_hot.columns[city_indices.tolist()],
        "unique_news_categories": unique_category_samples.long().numpy(),
        "total_upvotes": upvote_samples.long().numpy()
    })

generated_x = generate(10**5)

city_bins = num_cities // 2

# Sample one city per training row from city_model, conditioned on that row's true
# [city_rank, gender, age] one-hots. Runs under no_grad because no gradients are needed.
with torch.no_grad():
    sampled_city_indices = torch.multinomial(torch.softmax(
        city_model(torch.cat([
            tensor(city_rank_one_hot.to_numpy()),
            tensor(gender_one_hot.to_numpy()),
            tensor(age_one_hot.to_numpy())
        ], dim=1)),
        dim=1
    ), num_samples=1).squeeze(1)

fig, axes = plt.subplots(2, 1)

# Top panel: model samples conditioned on real training features vs. the real training cities.
# The legend is built from `label=`: hist() returns (counts, bins, patches) tuples, which
# are not valid legend handles, and legend() was previously also given the labels twice.
axes[0].hist(
    city_one_hot.columns[sampled_city_indices.tolist()],
    alpha=0.5, density=True, bins=city_bins, label="Generated"
)
axes[0].hist(X_train_numeric["city"], alpha=0.5, density=True, bins=city_bins, label="Actual")
axes[0].legend()
axes[0].set_title("Training Data Predictions")

# Bottom panel: fully generated rows vs. the held-out test cities.
axes[1].hist(generated_x["city"], alpha=0.5, density=True, bins=city_bins, label="Generated")
axes[1].hist(X_test_numeric["city"], alpha=0.5, density=True, bins=city_bins, label="Actual")
axes[1].legend()
axes[1].set_title("Generated Data")

for ax in axes:
    ax.set_xlabel("City ID")
    ax.set_ylabel("Density")

fig.set_figheight(8)
fig.tight_layout()
# fig.savefig("nn_city_train.png")

In [None]:
model_dir = Path("models")
# torch.save does not create missing directories, so make sure the target exists.
model_dir.mkdir(parents=True, exist_ok=True)

# File stem -> model. city_model is included: the city cell tries to load "city.pt",
# but that file was never written before, so the model was retrained on every run.
models_to_save = {
    "city_rank": city_rank_model,
    "gender": gender_model,
    "age": age_model,
    "city": city_model,
    "unique_category": unique_category_model,
    "upvote": upvote_model,
    "refresh": refresh_model,
}
for file_stem, trained_model in models_to_save.items():
    torch.save(  # pyright: ignore[reportUnknownMemberType]
        trained_model.state_dict(),
        model_dir / f"{file_stem}.pt"
    )

In [None]:
# Number of distinct device_name values in the training split.
X_train_numeric["device_name"].nunique()

In [None]:
# Round-trip check of the saved checkpoint: rebuild the network with the same
# hyper-parameters as unique_category_model, load the weights written by the save cell,
# and compare its samples with the observed values.
plt.hist(X_train_numeric["unique_news_categories"], alpha=0.5, density=True)
loaded_category_model = GenerativeNormalNetwork(
    input_size=num_city_ranks + num_genders + num_ages + num_cities,
    n_hidden_layers=6,
    hidden_layer_size=48
)
loaded_category_model.load_state_dict(torch.load(model_dir / "unique_category.pt"))
with torch.no_grad():
    # One normal draw per training row from the predicted (location, scale) pair.
    category_pred = loaded_category_model(torch.cat([
        tensor(city_rank_one_hot.to_numpy()),
        tensor(gender_one_hot.to_numpy()),
        tensor(age_one_hot.to_numpy()),
        tensor(city_one_hot.to_numpy())
    ], dim=1))
    plt.hist(
        torch.normal(category_pred[:, 0], category_pred[:, 1]),
        alpha=0.5,
        density=True
    )
# torch.cat([
#         tensor(city_rank_one_hot_test.to_numpy()),
#         tensor(gender_one_hot_test.to_numpy()),
#         tensor(city_one_hot_test.to_numpy())
#     ], dim=1).float()

In [None]:
# Display the synthetic rows produced by generate().
generated_x

In [None]:
# Columns added by the later models of the chain. drop_columns[i:] removes the last
# (3 - i) of them, so i = 0..3 compares progressively wider column sets.
drop_columns = ["city", "unique_news_categories", "total_upvotes"]
# Bootstrap sample of the real training rows, restricted to the generated columns.
# Seeded so the evaluation is reproducible across runs.
x_sample = X_train_numeric.sample(10**5, replace=True, random_state=42)[generated_x.columns]
fig, ax = plt.subplots()
ax.plot(
    range(1, 5),
    [
        # `evaluate_propensity_score` is never imported in this notebook (NameError on a
        # fresh kernel). The imported evaluate_synthetic_data_quality returns a dict whose
        # "propensity_score_auc" entry is plotted as the propensity score further down.
        # NOTE(review): presumably the same metric; confirm against digix.analysis.evaluation.
        evaluate_synthetic_data_quality(
            generated_x.drop(columns=drop_columns[i:]),
            x_sample.drop(columns=drop_columns[i:])
        )["propensity_score_auc"] for i in range(4)
    ],
    color="grey",
    linewidth=1
)
ax.set_xticks(range(1, 5))
ax.set_ylim(0, 1.02)
ax.set_xlabel("Model")
ax.set_ylabel("Propensity Score")
ax.grid(alpha=0.2)


In [None]:
# One quality report per column set: entry i compares generated vs. real rows after
# dropping drop_columns[i:] from both (i = 3 drops nothing). The plotting cell reads the
# keys "propensity_score_auc", "privacy_risk_distance", "precision" and "recall" from each.
performance_results: list[dict[str, Any]] = [
    evaluate_synthetic_data_quality(
        cast(pd.DataFrame, generated_x).drop(columns=drop_columns[i:]),  # pyright: ignore[reportUnknownMemberType]
        cast(pd.DataFrame, x_sample).drop(columns=drop_columns[i:])  # pyright: ignore[reportUnknownMemberType]
    ) for i in range(4)
]

In [None]:
fig, axes = plt.subplots(2, 2)  # pyright: ignore[reportUnknownMemberType]

# (panel, key in each performance_results entry, y-axis label), in row-major panel order.
metric_panels = (
    (axes[0, 0], "propensity_score_auc", "Propensity Score"),
    (axes[0, 1], "privacy_risk_distance", "Privacy Risk"),
    (axes[1, 0], "precision", "Precision"),
    (axes[1, 1], "recall", "Recall"),
)
model_numbers = range(1, 5)

for ax, metric_key, y_label in metric_panels:
    # One grey line per panel: the metric across the four column sets.
    ax.plot(  # pyright: ignore[reportUnknownMemberType]
        model_numbers,
        [result[metric_key] for result in performance_results],
        linewidth=1,
        color="grey"
    )
    ax.set_ylabel(y_label)
    # Shared axis styling for every panel.
    ax.set_ylim(-0.02, 1.02)
    ax.set_xlabel("Model")
    ax.set_xticks(model_numbers)
    ax.grid(alpha=0.2)

fig.set_figwidth(10)
fig.set_figheight(8)
fig.tight_layout()
fig.savefig("nn_performance.png")  # pyright: ignore[reportUnknownMemberType]


In [None]:
fig, axes = plt.subplots(2, 2)  # pyright: ignore[reportUnknownMemberType]

# Every panel overlays the observed training values ("Actual") with values sampled from
# the corresponding model ("Predicted"). Legend entries come from `label=`: previously
# the two histogram results were stored under swapped names and passed to legend() as
# handles, although hist() returns (counts, bins, patches) tuples rather than artists.
# NOTE(review): "+ 2" shifts class indices back to category values, which assumes the
# categories are consecutive integers starting at 2; confirm against the data.

# --- City rank -------------------------------------------------------------------
city_rank_bins = [x - 0.5 for x in range(2, 7)]
axes[0, 0].hist(  # pyright: ignore[reportUnknownMemberType]
    cast(pd.DataFrame, X_train_numeric)["city_rank"].astype(int),
    bins=city_rank_bins,
    density=True,
    alpha=0.5,
    label="Actual"
)
with torch.no_grad():
    axes[0, 0].hist(
        torch.multinomial(torch.softmax(
            city_rank_model(torch.rand((city_rank.shape[0], 1))),
            dim=1
        ), num_samples=1) + 2,
        bins=city_rank_bins,
        density=True,
        alpha=0.5,
        label="Predicted"
    )
axes[0, 0].set_xticks(range(2, 6))  # pyright: ignore[reportUnknownMemberType]
axes[0, 0].set_title("City Rank")  # pyright: ignore[reportUnknownMemberType]
axes[0, 0].legend()  # pyright: ignore[reportUnknownMemberType]

# --- Gender ----------------------------------------------------------------------
gender_bins = [x - 0.5 for x in range(2, 6)]
axes[0, 1].hist(  # pyright: ignore[reportUnknownMemberType]
    cast(pd.DataFrame, X_train_numeric)["gender"].astype(int),
    bins=gender_bins,
    density=True,
    alpha=0.5,
    label="Actual"
)
with torch.no_grad():
    axes[0, 1].hist(
        torch.multinomial(torch.softmax(
            gender_model(
                tensor(city_rank_one_hot.to_numpy())  # pyright: ignore[reportUnknownMemberType]
            ),
            dim=1
        ), num_samples=1) + 2,
        bins=gender_bins,
        density=True,
        alpha=0.5,
        label="Predicted"
    )
axes[0, 1].set_xticks(range(2, 5))  # pyright: ignore[reportUnknownMemberType]
axes[0, 1].set_title("Gender")  # pyright: ignore[reportUnknownMemberType]

# --- Age -------------------------------------------------------------------------
age_bins = [x - 0.5 for x in range(2, 11)]
axes[1, 0].hist(  # pyright: ignore[reportUnknownMemberType]
    cast(pd.DataFrame, X_train_numeric)["age"].astype(int),
    bins=age_bins,
    density=True,
    alpha=0.5,
    label="Actual"
)
with torch.no_grad():
    axes[1, 0].hist(
        torch.multinomial(torch.softmax(
            age_model(torch.cat([
                tensor(city_rank_one_hot.to_numpy()),  # pyright: ignore[reportUnknownMemberType]
                tensor(gender_one_hot.to_numpy())  # pyright: ignore[reportUnknownMemberType]
            ], dim=1)),
            dim=1
        ), num_samples=1) + 2,
        bins=age_bins,
        density=True,
        alpha=0.5,
        label="Predicted"
    )
axes[1, 0].set_xticks(range(2, 10))  # pyright: ignore[reportUnknownMemberType]
axes[1, 0].set_title("Age")  # pyright: ignore[reportUnknownMemberType]

# --- Unique news categories ------------------------------------------------------
axes[1, 1].hist(  # pyright: ignore[reportUnknownMemberType]
    cast(pd.DataFrame, X_train_numeric)["unique_news_categories"].astype(int),
    density=True,
    alpha=0.5,
    label="Actual"
)
with torch.no_grad():
    category_pred = unique_category_model(torch.cat([
        tensor(city_rank_one_hot.to_numpy()),  # pyright: ignore[reportUnknownMemberType]
        tensor(gender_one_hot.to_numpy()),  # pyright: ignore[reportUnknownMemberType]
        tensor(age_one_hot.to_numpy()),  # pyright: ignore[reportUnknownMemberType]
        tensor(city_one_hot.to_numpy()),  # pyright: ignore[reportUnknownMemberType]
    ], dim=1))
    axes[1, 1].hist(
        torch.normal(category_pred[:, 0], category_pred[:, 1]),
        density=True,
        alpha=0.5,
        label="Predicted"
    )
axes[1, 1].set_title("Unique News Categories")

# Shared styling for all four panels.
for row in axes:
    for ax in row:
        ax.grid(alpha=0.2)
        ax.set_ylabel("Density")  # pyright: ignore[reportUnknownMemberType]


fig.set_figwidth(10)
fig.set_figheight(8)
fig.tight_layout()
fig.savefig("nn_training.png")  # pyright: ignore[reportUnknownMemberType]


In [None]:
# Distinct city_rank values present in the training split.
X_train_numeric["city_rank"].unique()