In [1]:
import os
import yaml
import warnings
import pandas as pd

from modules.scoring import ScoreSeed
from modules.multigraph import NNGraph

# Suppress every warning from this point on: no category or module filter is
# given, so the "ignore" action applies to all of them.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

### Load the config file

In [2]:
# Parse the YAML configuration into `config`.
# `yaml.safe_load` replaces the bare `yaml.load(f)` call:
#   * PyYAML >= 6 raises TypeError when `yaml.load` is called without a Loader;
#   * the safe loader only constructs plain Python types (dict, list, str,
#     numbers, ...), which is all the lookups on `config` below require.
with open("config.yaml") as f:
    config = yaml.safe_load(f)

In [3]:
# Pull the two configuration sections apart once, then read individual
# settings from them.
score_cfg = config["score"]
dataset_cfg = config["dataset_params"]

# Scoring settings (`eps` is coerced to float whatever type the config holds).
nn = score_cfg["nn"]
eps = float(score_cfg["eps"])

# Column groups of the data set.
cat_cols = dataset_cfg["cat_cols"]
list_cols = dataset_cfg["list_cols"]

# Data directory and the file names inside it.
data_root_dir = dataset_cfg["root_dir"]
data_file = dataset_cfg["output_file_name"]
prob_file = dataset_cfg["feat_count_file_name"]

# Output path handed to the scorer further down.
out_path = "data/adform/extn.csv"

### Load the clean data

In [4]:
# Location of the cleaned data file inside the configured data directory.
clean_data_file = os.path.join(data_root_dir, data_file)
# Load it as a DataFrame.
df = pd.read_json(clean_data_file)

### Take a random sample of 50,000 users who clicked the ad as the seed set

In [5]:
# Build the seed set: a random sample of 50,000 rows whose `click` flag is 1.
# Keep only the rows where the ad was clicked.
df = df[df["click"]==1]
# A fixed `random_state` makes the draw reproducible, so a Restart & Run All
# yields the same seed set (and therefore the same results further down).
# `sample` draws without replacement by default and raises ValueError when
# fewer than 50,000 rows are available.
df = df.sample(n=50000, random_state=42)
# Only the `id` column is needed to identify the seed users.
seed_df = df[["id"]]
# `index=False` keeps the DataFrame index out of the file, so reading it back
# does not produce a stray "Unnamed: 0" column next to `id`.
seed_df.to_csv(os.path.join(data_root_dir, "seed.csv"), index=False)

In [6]:
# Read the persisted seed set back from disk.
# `usecols=["id"]` loads only the identifier column, so an index column that was
# written alongside it (if any) is not pulled in as "Unnamed: 0".
seed_df = pd.read_csv(os.path.join(data_root_dir, "seed.csv"), usecols=["id"])

In [7]:
# Plain Python list of the seed users' ids, in file order.
seed_ids = seed_df["id"].tolist()

### Load the feature counts data

In [8]:
# Path of the clean data file (passed to the scorer and re-read later on).
data_path = os.path.join(data_root_dir, data_file)

# Feature-count table stored next to it in the data directory.
prob_file_path = os.path.join(data_root_dir, prob_file)
prob_vals = pd.read_csv(prob_file_path)

In [9]:
# Graph stored under "<root>/cat_graph".
cat_graph = NNGraph(os.path.join(data_root_dir, "cat_graph"))

# One graph per entry of `list_cols`, stored under "<root>/<column>_graph",
# kept in the same order as `list_cols`.
list_graphs = [
    NNGraph(os.path.join(data_root_dir, f"{col}_graph"))
    for col in list_cols
]

### Create the scorer object

In [10]:
# Assemble the scorer from everything prepared above. Arguments are passed
# positionally, so their order must match the ScoreSeed constructor.
scorer = ScoreSeed(
    seed_ids,
    data_path,
    prob_vals,
    cat_graph,
    list_graphs,
    cat_cols,
    list_cols,
    out_path,
    nn,
    eps,
)

In [11]:
# Run the scoring step. The stage messages shown beneath this cell are emitted
# while it runs; `scorer.extension` is read from the scorer afterwards.
scorer.score()

Reading data...
Calculating Information Value...
Calculating seed set distribution...
Extracting Neighbors...
Scoring Neighbors...
Saving output file...


### Reload the full data set

In [12]:
# Re-read the complete data file: `df` was narrowed to the sampled clickers
# when the seed set was built, so it no longer holds every row.
df = pd.read_json(data_path)

### Create extension sets

In [13]:
# Ids produced by the scorer; each extension set below is built from a leading
# slice of this sequence.
extension_ids = scorer.extension

# For every cut-off, keep the rows of `df` whose id is among the first `top_k`
# extension ids. The generator is unpacked straight into the five named sets.
extn_5x, extn_10x, extn_15x, extn_20x, extn_25x = (
    df[df["id"].isin(extension_ids[:top_k])]
    for top_k in (50000, 100000, 150000, 200000, 250000)
)

### Calculate Recall Rates for the different extension sets

In [14]:
# Mean of the `click` column within each extension set.
# NOTE(review): this is the share of clicked rows inside each set, which is
# usually called precision rather than recall — confirm the intended naming.
recall_5x, recall_10x, recall_15x, recall_20x, recall_25x = (
    extn["click"].mean()
    for extn in (extn_5x, extn_10x, extn_15x, extn_20x, extn_25x)
)

In [15]:
# Show the five rates on one line, smallest extension set first.
recall_rates = (recall_5x, recall_10x, recall_15x, recall_20x, recall_25x)
print(*recall_rates)

0.3273 0.29042 0.26364666666666664 0.24218 0.23364
