<a href="https://colab.research.google.com/github/fellowship/deep-and-wide-bandit/blob/dev/EDA_LargeSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install torch
#!pip install transformers

# Mount Drive

In [2]:
from google.colab import drive
drive.mount("/content/GDrive")

Mounted at /content/GDrive


In [3]:
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
#from collections import defaultdict
import pickle as pkl
#from transformers import BertTokenizer, BertModel
#import torch

#Formatting .describe() calls s.t. floats are displayed in non-scientific notation
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [4]:
os.chdir(os.getcwd())
zip_path_l = ["/content/GDrive/MyDrive/Bandit_Project/July_Dec_2019/July_Dec_2019.zip",
               "/content/GDrive/MyDrive/Bandit_Project/Jan_June_2020/Jan_June_2020.zip", 
               "/content/GDrive/MyDrive/Bandit_Project/Jul_Sep_2020/Jul_Sep_2020.zip"]

file_name_l = ['/content/GDrive/MyDrive/Bandit_Project/July_Dec_2019/sends_july_dec_2019_2000',
               '/content/GDrive/MyDrive/Bandit_Project/Jan_June_2020/sends_jan_june_2020_2000',
               '/content/GDrive/MyDrive/Bandit_Project/Jul_Sep_2020/sends_july_sept_2020_2000']

year_l = ["2019", "2020", "2020"] #For saving

folder_name_l = [os.path.split(zip_path)[0] for zip_path in zip_path_l]

for (zip_path, file_name, folder_name) in zip(zip_path_l, file_name_l, folder_name_l):
  
  if not(os.path.exists(file_name)):
    
    with ZipFile(zip_path, 'r') as zip_obj:

      zip_obj.extractall(folder_name)

In [5]:
col_names = ['riid','send_dt','launch_id','sub_sourcedev','opened','unsub',
             'rev_3dv2','aq_dt','aq_mo','aq_dow','aq_period','sends_since_last_open',
             'wk1_opens','wk4_opens','mo3_opens','mo6_opens','yr1_opens',
             'wk1_clicks','wk4_clicks','mo3_clicks','mo6_clicks',
             'rev_dec','rev_raw','snds_dec','snds_opens_dec','wk1_vis',
             'wk4_vis','mo3_vis','mo6_vis','wk1_pv','wk4_pv','mo3_pv','mo6_pv',
             'wk1_secs','wk4_secs','mo3_secs','mo6_secs','prev_optouts',
             'last_order','last_click','last_cart','last_visit','last_ded_open',
             'sent_time','offer_signature_id','dynamic_content_signature_id',
             'message_size','open_cnt','first_open_time','last_open_time',
             'hrs_to_first_open','hrs_to_last_open','click_cnt','first_click_time',
             'last_click_time','offer_category','hrs_to_first_click','hrs_to_last_click',
             'opt_out_cnt','first_opt_out_time','last_opt_out_time','opt_out_source',
             'opt_out_reason','hrs_to_first_opt_out','hrs_to_last_opt_out','campaign_id',
             'subject','marketing_strategy','campaign_type']

In [6]:
#Inspect the first 1e5 rows of one of the dataframes
df = pd.read_csv(file_name_l[0], sep="\t", header=None, names=col_names, nrows=1e5)

# Finalizing data preprocessing steps

**QUESTION:** How do we split the train-valid-test set?

1.   For each of the provided files, we employ a 80-20 split.
2.   For the full set of files, we employ a 80-20 split. i.e., the full first 2 files and a portion of the first half of the last file is used for training and remaining is used for validation.



In [7]:
#Too many questions around what to keep, what to drop...
#Hence, currently default behaviour of drop
#Using a control variable dictionary to modify behaviour for relevant column groups
control_variable = {'marketing': True, 'subject':True, 'campaign_type':False,
                    'aq': True,'site': True,'last':True, 'promo_ids': True,
                    'email_cnts':True, 'email_times':True, 'opt_out':True, 
                    'offer_cat':True}

#Another control variable to control whether dataset has to be exported OR not
save_control_variable = True

#Appended gzipped filename
myfile = "/content/GDrive/MyDrive/Bandit_Project/dataset_without_time_ohe.csv.gz"
month_files_base_folder = "/content/GDrive/MyDrive/Bandit_Project/"

In [8]:
def rew(row):
    sl = -1 * (1+max(row['sends_since_last_open'], 0))
    #op = 5  * max(row['opened'], 0)
    if row['rev_3dv2'] > 0:
        cl = 10
    else:
        cl = 0
    un = -50 * max(row['unsub'], 0)
    fu = -50 * row['frequency_score'] * max(row['unsub'],0)
    return sl+cl+un+fu

In [9]:
#Create a dictionary that updates values for min, max, mean & std for scaling later
moving_col_names = ["rev_3dv2", "sends_since_last_open", "message_size", "retention_score", "frequency_score",
                    "sent_hm_numeric", "sent_dayofweek", "sent_week", "sent_month", "discount"] 
#moving_dict = {name: defaultdict(int) for name in moving_col_names}
moving_dict = {name: {} for name in moving_col_names}

#Have a variable that also keeps track of the number of elements of the previous chunk
previous_n_so_far = 0

In [10]:
for i in range(0, len(file_name_l)):

  #For printing status updates
  cnt = 0

  for chunk in pd.read_csv(file_name_l[i], sep='\t', header=None, names=col_names, chunksize=1e6):

    df_sample=chunk.copy()
    
    #if ((cnt+1)%5 == 0):
    print(f"[INFO] Processing the Chunk #{cnt+1} of Dataset #{i+1}")

    """
    Drop less-frequent AND/OR redundant columns
    all team members voted to eliminate
    """

    #Columns that are finalised to be dropped
    col_drop_names = ["send_dt", "sub_sourcedev", "rev_dec", "rev_raw", "snds_dec", "snds_opens_dec", 
                "dynamic_content_signature_id"]

    df_sample.drop(columns=col_drop_names, inplace=True)
    del col_drop_names

    """
    Compute the composite recency, frequency (dropped) & retention scores
    Dropping the constituent base columns

    NOTE 
    ----
    1) Both recency and frequency are strongly correlated - Remove recency score
    3) sends_since_last_open used for retention score AS WELL AS reward
    """

    #Retention Score
    df_sample['retention_score'] = df_sample.sends_since_last_open.apply(lambda x: 28/max(1,x)) 

    #Recency Score
    #df_sample["recency_score"] = 16/7*(df_sample['wk1_opens'] + df_sample['wk1_clicks']) 
    #+ 8/28*(df_sample['wk4_opens'] + df_sample['wk4_clicks']) 
    #+ 4/64*(df_sample['mo3_opens'] + df_sample['mo3_clicks']) 
    #+ 2/92*(df_sample['mo6_opens'] + df_sample['mo6_clicks']) 
    #+ 1/184*df_sample['yr1_opens']

    #Frequency Score
    df_sample["frequency_score"] = df_sample['wk1_opens'] + df_sample['wk4_opens'] + \
              df_sample['mo3_opens'] + df_sample['mo6_opens']

    #Drop the base columns
    col_drop_scores = ["wk1_opens", "wk4_opens", "mo3_opens","mo6_opens","yr1_opens",
                      "wk1_clicks", "wk4_clicks", "mo3_clicks", "mo6_clicks"]

    df_sample.drop(columns=col_drop_scores, inplace=True)
    del col_drop_scores

    """
    Convert sent_time column to datetime object
    Extract useful features like 
    """
    datetimes = pd.to_datetime(df_sample.sent_time)
    df_sample.drop(columns=["sent_time"], inplace=True)
    df_sample["sent_hm_numeric"] = datetimes.dt.hour + (datetimes.dt.minute/60)
    df_sample["sent_dayofweek"] = datetimes.dt.dayofweek
    df_sample["sent_week"] = datetimes.dt.day//7
    df_sample["sent_month"] = datetimes.dt.month
    del datetimes

    """
    Column Name. - marketing_strategy

    NOTE
    ----
    1) AdHoc (1) or Program (0)? Might be useful to keep track of this
    2) Whether Promo or not is useful to check perhaps
    3) Whether Sale or not is useful to check perhaps
    """

    if control_variable["marketing"]:
      #Create a new column for AdHoc OR Program: Seems to be the type of email
      #df_sample["email_type"] = df_sample.marketing_strategy.str.contains("_Adhoc_").fillna(0).astype(int) #1 for AdHoc, 0 for Program

      #Create columns for Promotional and Sale: Other values do not seem useful - Aleksey to get back about correlations
      #df_sample["promo"] = df_sample.marketing_strategy.str.contains("Promotional").fillna(0).astype(int)
      #df_sample["sale"] = df_sample.marketing_strategy.str.contains("Sale").fillna(0).astype(int)
      
      #Drop the column
      df_sample.drop(columns=["marketing_strategy"], inplace=True)

    """
    #Initialize the BERT tokenizer and the model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
    model.eval()

    def bert_text_preparation(text, tokenizer):
        
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = tokenizer.tokenize(marked_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [1]*len(indexed_tokens)

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        return tokenized_text, tokens_tensor, segments_tensors

    def get_bert_embeddings(tokens_tensor, segments_tensors, model):
        
        # Gradient calculation id disabled
        # Model is in inference mode
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensors)
            # Removing the first hidden state
            # The first state is the input state
            hidden_states = outputs[2][1:]
        
        #Stack tensors up to give us a proper Pytorch tensor to work with
        token_embeddings = torch.stack(hidden_states, dim=0)
        
        #Remove the batch-dimension because we have only a single sentence
        token_embeddings = torch.squeeze(token_embeddings, dim=1)

        #Swapping the layer dimension with the token dimension
        token_embeddings = token_embeddings.permute(1,0,2)

        #For each token, compute average across 1st dimension. Then, average across 0th dimension.
        sentence_embedding = torch.mean(token_embeddings, dim=(0, 1))

        return sentence_embedding
    #Try 3 sentences:
    #Sentence 1) "We bet we have the fit for you"
    #Sentence 2) "We’ve got your size"
    #Sentence 3) "Long-weekend savings all week long"
    #Sentence 4) "Get 50% off— ahead of everyone else"

    s1 = "Find pants that fit perfectly"
    s2 = "We bet we have the fit for you"
    s3 = "50% off absolutely everything + free shipping"
    s4 = "Get 50% off— ahead of everyone else"

    _, token_ids_s1, segment_ids_s1 = bert_text_preparation(s1, tokenizer)
    _, token_ids_s2, segment_ids_s2 = bert_text_preparation(s2, tokenizer)
    _, token_ids_s3, segment_ids_s3 = bert_text_preparation(s3, tokenizer)
    _, token_ids_s4, segment_ids_s4 = bert_text_preparation(s4, tokenizer)

    s1_embedding = get_bert_embeddings(token_ids_s1, segment_ids_s1, model)
    s2_embedding = get_bert_embeddings(token_ids_s2, segment_ids_s2, model)
    s3_embedding = get_bert_embeddings(token_ids_s3, segment_ids_s3, model)
    s4_embedding = get_bert_embeddings(token_ids_s4, segment_ids_s4, model)
    from scipy.spatial.distance import cosine

    # Calculate the cosine similarity between the S1 and S2 (similar - fit-based)
    s1s2sim = 1 - cosine(s1_embedding, s2_embedding)
    print(f"[INFO] Computing cosing similarity between {s1} and {s2}: {s1s2sim}")

    # Calculate the cosine similarity between the S3 and S4 (similar - savings-based)
    s3s4sim = 1 - cosine(s3_embedding, s4_embedding)
    print(f"[INFO] Computing cosing similarity between {s3} and {s4}: {s3s4sim}")

    # Calculate the cosine similarity between the S1 and S3 (dissimilar - fit vs savings)
    s1s3sim = 1 - cosine(s1_embedding, s3_embedding)
    print(f"[INFO] Computing cosing similarity between {s1} and {s3}: {s1s3sim}")

    # Calculate the cosine similarity between the S2 and S4 (dissimilar - fit vs savings)
    s2s4sim = 1 - cosine(s2_embedding, s4_embedding)
    print(f"[INFO] Computing cosing similarity between {s2} and {s4}: {s2s4sim}")
    #Explore the hidden_states more clearly
    print ("Number of layers:", len(hidden_states))

    #Pick any 1 layer
    layer_i = 0
    print ("Number of batches:", len(hidden_states[layer_i]))

    #Pick any 1 batch
    batch_i = 0
    print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))

    #Pick any 1 token
    token_i = 0
    print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

    # Concatenate the tensors for all layers. We use `stack` here to
    # create a new dimension in the tensor.
    token_embeddings = torch.stack(hidden_states, dim=0)
    print(f"Stacking all the hidden units...Pytorch tensor of shape {token_embeddings.size()} is created")

    #Remove the batch-dimension because we have only a single sentence
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    print(f"Removing batch dimension...Pytorch tensor of shape {token_embeddings.size()} is created")

    #Swapping the layer dimension with the token dimension
    token_embeddings = token_embeddings.permute(1,0,2)
    print(f"Swapping layer with token...Pytorch tensor of shape {token_embeddings.size()} is created")

    #For each token, compute average across 1st dimension. Then, average across 0th dimension.
    token_vecs = torch.mean(token_embeddings, dim=(0, 1))
    print(f"The embedding for - Sample text - is of size {token_vecs.size()}")

    Column - subject

    1) Use BERT to convert this into a sentence vector?
    2) Check for specific words of interest in the subject?
    3) Other handcrafted features like LogReg or NB?
    """
    if control_variable["subject"]:

      #Check for Aleksey's words
      feature_words = ['these', 'what', 'that', 'your', 'have', 'holidays', 'flex', 'know', 'fits', '&', 'styles', 'holiday', 'may', 'wear', 'best', 'with', 'gifts']
      col_names = ["word_" + word for word in feature_words]
      for (feature_word, col_name) in zip(feature_words, col_names):
        df_sample[col_name] = df_sample.subject.str.contains(feature_word, case=False)
      df_sample.drop(columns=["subject"], inplace=True)

    """
    Column Name - campaign_type

    1) Can get the old campaign_type categorical columns from this
    2) Can eliminate EU entries.
    3) Discounts can be extracted.
    4) Sale and Promotional can be extracted.
    """

    #Find out how many different ways words like US show up in campaign_type
    #country_mask = df_sample["campaign_type"].str.contains(r"_[A-Z][A-Z]_", regex=True)
    #print(f"There are {len(df_sample[country_mask])} entries that satisfy _[A-Z][A-Z]_ regex match\n")

    if control_variable["campaign_type"]:
      #Drop the campaign_type column 
      df_sample.drop(columns=["campaign_type"], inplace=True)

    else:
      #Do stops and then drop the column
      #Eliminate non-US rows
      US_mask = df_sample.campaign_type.str.contains('_US_')
      df_sample = df_sample[US_mask]
      
      #Convert a single column into one-hot encoded campaign_type columns
      df_sample["campaign_Brand"] = df_sample.campaign_type.str.contains("Brand").astype(int)
      df_sample["campaign_Collection"] = df_sample.campaign_type.str.contains("Collection").astype(int)
      df_sample["campaign_Core"] = df_sample.campaign_type.str.contains("Core").astype(int)
      df_sample["campaign_Dedicated"] = df_sample.campaign_type.str.contains("Dedicated").astype(int)
      df_sample["campaign_InnovationSpotlight"] = df_sample.campaign_type.str.contains("InnovationSpotlight").astype(int)
      df_sample["campaign_NewArrivals"] = df_sample.campaign_type.str.contains("NewArrivals").astype(int)
      df_sample["campaign_ProductSpotlight"] = df_sample.campaign_type.str.contains("ProductSpotlight").astype(int)
      df_sample["campaign_Replen"] = df_sample.campaign_type.str.contains("Replen").astype(int)
      df_sample["campaign_Tops"] = df_sample.campaign_type.str.contains("Tops").astype(int)
      df_sample["campaign_Trend"] = df_sample.campaign_type.str.contains("Trend").astype(int)
      
      #Sometimes, there are multiple columns that fire for the same row —> -1 value for Other
      campaign_type_col_names = ["campaign_Collection", "campaign_Core", "campaign_Dedicated", "campaign_InnovationSpotlight",
                                "campaign_NewArrivals", "campaign_ProductSpotlight", "campaign_Replen", "campaign_Tops",
                                "campaign_Brand", "campaign_Trend"]  
      df_sample["campaign_Other"] = 1-df_sample[campaign_type_col_names].any(axis=1).astype(int)

      #Create discount columns
      df_sample["discount"] = 0
      
      #Display how the discounts are presented
      discount_pattern = re.compile(r"_[1-9][0-9]off_", flags = re.I)
      discount_mask = df_sample["campaign_type"].str.contains(discount_pattern, regex=True)
      
      #print(f"There are {len(df_sample[discount_mask])} entries that satisfy discount regex match\n")
      #print("Some of them include:")
      #print((df_sample[discount_mask]).campaign_type.unique()[:5])

      df_sample.loc[discount_mask, "discount"] = (df_sample[discount_mask])["campaign_type"].str.findall(r"(?<=_)\d{2}").str.get(0).astype(int)

      #Check whether the promot 
      df_sample["promo"] = df_sample.campaign_type.str.contains("Promotional", case=False).fillna(0).astype(int)
      df_sample["sale"] = df_sample.campaign_type.str.contains("Sale", case=False).fillna(0).astype(int)

      #Drop the campaign_type column 
      df_sample.drop(columns=["campaign_type"], inplace=True)

    if control_variable['offer_cat']:

      """
      Offer_category Columns - "offer_category"

      NOTE
      ----
      1) This contains additional info about whether it's promotion, sale, Tops etc.
      that are somehow missed by the other columns. We can leverage this?

      e.g. "Mens_Pants...", "Mens_ShirtTops...", "Mens_BigAndTall...", "Mens_Shorts...",
      "Mens_SweatersJackets...", "Mens_Accessories...", "Promotion", "Sale", "Brand"

      2) HORRIBLY BIASED (at least this column is) DATASET because (A) not even a single mention
      of womens products along the lines of Mens_Shorts etc. and (B) very few entries
      3) But at the end of the day - only 8.3% are filled. Hence, not using.


      #Extract the column as a Series + convert to str
      offer_cat_series = df_sample["offer_category"].astype(str)

      #Print the unique values corresponding to Mens clothing
      mens_clothing_mask = offer_cat_series.str.contains("Mens")
      off_cat_mens = offer_cat_series[mens_clothing_mask]
      print(f"[INFO] The number of entries with Mens: {len(off_cat_mens)}")
      print(f"[INFO] Printing out a few of them: {off_cat_mens.unique()[:10]}")

      womens_clothing_mask = offer_cat_series.str.contains("Wom")
      off_cat_womens = offer_cat_series[womens_clothing_mask]
      print(f"\n[INFO] The number of entries with Womens: {len(off_cat_womens)}")
      print(f"[INFO] Printing out a few of them: {off_cat_womens.unique()[:-1]}")
      """
      #Drop this column as well
      df_sample.drop(columns=["offer_category"], inplace=True)

    """
    Acquisiton Columns - "aq_dt", "aq_mo", "aq_dow", "aq_period"

    NOTE
    ----
    1) Older customers who have not subscribed — exploit strategy should work better
    2) aq_dt has negative numbers - Why?
    """
    if control_variable["aq"]:
      col_drop_aq = ["aq_dt", "aq_mo", "aq_dow", "aq_period"]
      df_sample.drop(columns=col_drop_aq, inplace=True)
    """
    Website Data - "wk1_vis", "wk4_vis", "mo3_vis", "mo6_vis", "wk1_pv",
    "wk4_pv", "mo3_pv", "mo6_pv", "wk1_secs", "wk4_secs", "mo3_secs", "mo6_secs"

    NOTE
    ----
    1) Extremely sparse - mostly filled with 0s
    2) Marko had said website and email integration has not taken place
    3) Naturally, this integration will happen in the future. Let the future team take care of it then.
    """
    if control_variable["site"]:
      col_drop_site = ["wk1_vis", "wk4_vis", "mo3_vis", "mo6_vis", "wk1_pv",
    "wk4_pv", "mo3_pv", "mo6_pv", "wk1_secs", "wk4_secs", "mo3_secs", "mo6_secs"]
      df_sample.drop(columns=col_drop_site, inplace=True)

    """
    The Last Time - "last_order", "last_click", "last_cart","last_visit","last_ded_open"

    NOTE
    ----
    1) Moooostly NaN values currently
    2) However, when it is not NaN - especially for last_order - it could be a really useful signal
    """
    if control_variable["last"]:
      col_drop_last = ["last_order", "last_click", "last_cart","last_visit","last_ded_open"]
      df_sample.drop(columns=col_drop_last, inplace=True)

    """
    Promotion IDs - "offer_signature_id", "launch_id", "campaign_id"

    NOTE
    ----
    1) A bunch of strings - might be useful for either cross-feature interactions or 
    campaign/promotion embeddings
    2) I feel it is a nice-to-have thing, as opposed to a need-to-have-thing
    """
    if control_variable["promo_ids"]:
      col_drop_promo_ids = ["offer_signature_id", "launch_id", "campaign_id"]
      df_sample.drop(columns=col_drop_promo_ids, inplace=True)

    """
    Detailed user-email interaction COUNTS - "open_cnt", "click_cnt" 
    ""
    NOTE
    ----
    1) We already have the "opened" signal - can it be boosted further with open_cnt and click_cnt?
    2) But only 28.2% and 8.3% of these counts are available
    """
    if control_variable["email_cnts"]:
      col_drop_email_cnts = ["open_cnt", "click_cnt"]
      df_sample.drop(columns=col_drop_email_cnts, inplace=True)

    """
    Detailed user-email interaction TIMES - "first_open_time", "last_open_time", "hrs_to_first_open", 
    "hrs_to_last_open", "first_click_time", "last_click_time", "hrs_to_first_click", "hrs_to_last_click"

    NOTE
    ----
    1) Hmmmm...
    """
    if control_variable["email_times"]:
      col_drop_email_times = ["first_open_time", "last_open_time", "hrs_to_first_open", 
    "hrs_to_last_open", "first_click_time", "last_click_time", "hrs_to_first_click", "hrs_to_last_click"]
      df_sample.drop(columns=col_drop_email_times, inplace=True)

    """
    Opt Out - "first_opt_out_time", "opt_out_cnt", "last_opt_out_time", "opt_out_source", 
    "opt_out_reason", "hrs_to_first_opt_out", "hrs_to_last_opt_out"

    NOTE
    -----
    1) Marko's response: so I think there's a couple of different levels you can opt out on, just promotional emails, all emails etc.
    so they probably resubscribed if they got added in later
    2) I think the emailer system has checks and balances to see whether people who opt out,
    should be sent mails to. But they don't keep track of the past - i.e., when someone opts out,
    then resubscribes - he/she does not get counted as an opt out
    3) Only 0.3% of entries have opt out filled - prev_opt_outs is 0 for 94% of entries
    """
    if control_variable["opt_out"]:
      col_drop_opt_out = ["prev_optouts", "first_opt_out_time", "opt_out_cnt", "last_opt_out_time", "opt_out_source", 
    "opt_out_reason", "hrs_to_first_opt_out", "hrs_to_last_opt_out"]
      df_sample.drop(columns=col_drop_opt_out, inplace=True)

    #Add the reward column as per Marko's definition
    df_sample['reward'] = df_sample.apply(lambda row: rew(row), axis=1)

    #Add an optimal action: If unsub OR not opened, 0. If revenue OR open, then 1.
    df_sample["optimal_action"] = 1
    df_sample[(df_sample["unsub"] == 1) | (df_sample["opened"] == 0)]["optimal_action"] = 0

    #Update the moving dict
    current_n = len(df_sample)

    #We have updated the dict with values from 1 chunk
    if (cnt + i) > 0:
      
      for col_name in moving_dict.keys():
        
        #Update min & max of col_name if required
        min_val = df_sample[col_name].min()
        max_val = df_sample[col_name].max()

        if min_val < moving_dict[col_name]["min"]:
          moving_dict[col_name]["min"] = min_val
        
        if max_val > moving_dict[col_name]["max"]:
          moving_dict[col_name]["max"] = max_val

        #Update the mean to incorporate the new population
        current_mean = df_sample[col_name].mean()
        previous_mean = moving_dict[col_name]["mean"]
        moving_dict[col_name]["mean"] = (previous_n_so_far * previous_mean + current_n * current_mean)/(previous_n_so_far + current_n)
        
        #Keeping track of std is a real doozy - https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
        current_std = df_sample[col_name].std()
        previous_std = moving_dict[col_name]["std"]
        correction_term = (previous_n_so_far * current_n * (current_mean - previous_mean)**2)/((previous_n_so_far + current_n) * (previous_n_so_far + current_n-1))
        combined_variance = ((previous_n_so_far-1)* (previous_std**2) + (current_n - 1)*(current_std**2))/(previous_n_so_far + current_n - 1) + correction_term
        moving_dict[col_name]["std"] = np.sqrt(combined_variance)

        #Update the number of points
        previous_n_so_far += current_n

        #Append chunk to our file
        if save_control_variable:
          df_sample.to_csv(myfile, index=False, compression="gzip", mode="a", header=False)

          #Create monthly save files
          #Year string is accessed by year_l[i]
          month_l = list(df_sample["sent_month"].unique()).sort()
          month_filename_l = [month_files_base_folder + "df_" +  year_l[i] + "_" + str(month) + ".csv.gz" 
                              for month in month_l]
          for counter, month_filename in enumerate(month_filename_l):
            df_sample[df_sample["sent_month"] == month_l[counter]].to_csv(month_filename, index=False, compression='gzip', mode='a', header=False)
    
    else:

      #First chunk
      #Set current chunk size to the variable that tracks datapoints
      previous_n_so_far = current_n
      
      for col_name in moving_dict.keys():
        moving_dict[col_name]["min"] = df_sample[col_name].min() 
        moving_dict[col_name]["max"] = df_sample[col_name].max()
        moving_dict[col_name]["mean"] = df_sample[col_name].mean()
        moving_dict[col_name]["std"] = df_sample[col_name].std()

      #Delete gzip file it exists
      if os.path.isfile(myfile) and save_control_variable:
        os.remove(myfile)

      #Append chunk to our file
      if save_control_variable:
        df_sample.to_csv(myfile, index=False, compression="gzip", mode="a")

        #Create monthly save files
        #Year string is accessed by year_l[i]
        month_l = list(df_sample["sent_month"].unique()).sort()
        month_filename_l = [month_files_base_folder + "df_" +  year_l[i] + "_" + str(month) + ".csv.gz" 
                            for month in month_l]
        for counter, month_filename in enumerate(month_filename_l):
          df_sample[df_sample["sent_month"] == month_l[counter]].to_csv(month_filename, index=False, compression='gzip', mode='a', header=False)
    
    #Delete the dataframe to save space
    del df_sample

    #Increment display counter
    cnt += 1

#Dump the moving_dict dictionary to a pickled file for later use
if save_control_variable:
  pickle_filename = "/content/GDrive/MyDrive/Bandit_Project/rolling_statistics.pkl"
  with open(pickle_filename, mode='wb') as file:
    pkl.dump(moving_dict, file)

[INFO] Processing the Chunk #1 of Dataset #1
[INFO] Processing the Chunk #2 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #3 of Dataset #1
[INFO] Processing the Chunk #4 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #5 of Dataset #1
[INFO] Processing the Chunk #6 of Dataset #1
[INFO] Processing the Chunk #7 of Dataset #1
[INFO] Processing the Chunk #8 of Dataset #1
[INFO] Processing the Chunk #9 of Dataset #1
[INFO] Processing the Chunk #10 of Dataset #1
[INFO] Processing the Chunk #11 of Dataset #1
[INFO] Processing the Chunk #12 of Dataset #1
[INFO] Processing the Chunk #13 of Dataset #1
[INFO] Processing the Chunk #14 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #15 of Dataset #1
[INFO] Processing the Chunk #16 of Dataset #1
[INFO] Processing the Chunk #17 of Dataset #1
[INFO] Processing the Chunk #18 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #19 of Dataset #1
[INFO] Processing the Chunk #20 of Dataset #1
[INFO] Processing the Chunk #21 of Dataset #1
[INFO] Processing the Chunk #22 of Dataset #1
[INFO] Processing the Chunk #23 of Dataset #1
[INFO] Processing the Chunk #24 of Dataset #1
[INFO] Processing the Chunk #25 of Dataset #1
[INFO] Processing the Chunk #26 of Dataset #1
[INFO] Processing the Chunk #27 of Dataset #1
[INFO] Processing the Chunk #28 of Dataset #1
[INFO] Processing the Chunk #29 of Dataset #1
[INFO] Processing the Chunk #30 of Dataset #1
[INFO] Processing the Chunk #31 of Dataset #1
[INFO] Processing the Chunk #32 of Dataset #1
[INFO] Processing the Chunk #33 of Dataset #1
[INFO] Processing the Chunk #34 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #35 of Dataset #1
[INFO] Processing the Chunk #36 of Dataset #1
[INFO] Processing the Chunk #37 of Dataset #1
[INFO] Processing the Chunk #38 of Dataset #1


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #1 of Dataset #2
[INFO] Processing the Chunk #2 of Dataset #2
[INFO] Processing the Chunk #3 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #4 of Dataset #2
[INFO] Processing the Chunk #5 of Dataset #2
[INFO] Processing the Chunk #6 of Dataset #2
[INFO] Processing the Chunk #7 of Dataset #2
[INFO] Processing the Chunk #8 of Dataset #2
[INFO] Processing the Chunk #9 of Dataset #2
[INFO] Processing the Chunk #10 of Dataset #2
[INFO] Processing the Chunk #11 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #12 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #13 of Dataset #2
[INFO] Processing the Chunk #14 of Dataset #2
[INFO] Processing the Chunk #15 of Dataset #2
[INFO] Processing the Chunk #16 of Dataset #2
[INFO] Processing the Chunk #17 of Dataset #2
[INFO] Processing the Chunk #18 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #19 of Dataset #2
[INFO] Processing the Chunk #20 of Dataset #2
[INFO] Processing the Chunk #21 of Dataset #2
[INFO] Processing the Chunk #22 of Dataset #2
[INFO] Processing the Chunk #23 of Dataset #2
[INFO] Processing the Chunk #24 of Dataset #2
[INFO] Processing the Chunk #25 of Dataset #2
[INFO] Processing the Chunk #26 of Dataset #2
[INFO] Processing the Chunk #27 of Dataset #2
[INFO] Processing the Chunk #28 of Dataset #2
[INFO] Processing the Chunk #29 of Dataset #2
[INFO] Processing the Chunk #30 of Dataset #2
[INFO] Processing the Chunk #31 of Dataset #2
[INFO] Processing the Chunk #32 of Dataset #2
[INFO] Processing the Chunk #33 of Dataset #2
[INFO] Processing the Chunk #34 of Dataset #2
[INFO] Processing the Chunk #1 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #2 of Dataset #3
[INFO] Processing the Chunk #3 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #4 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #5 of Dataset #3
[INFO] Processing the Chunk #6 of Dataset #3
[INFO] Processing the Chunk #7 of Dataset #3
[INFO] Processing the Chunk #8 of Dataset #3
[INFO] Processing the Chunk #9 of Dataset #3
[INFO] Processing the Chunk #10 of Dataset #3
[INFO] Processing the Chunk #11 of Dataset #3
[INFO] Processing the Chunk #12 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #13 of Dataset #3
[INFO] Processing the Chunk #14 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #15 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing the Chunk #16 of Dataset #3
[INFO] Processing the Chunk #17 of Dataset #3
[INFO] Processing the Chunk #18 of Dataset #3
[INFO] Processing the Chunk #19 of Dataset #3


In [11]:
for key, value in moving_dict.items():
  print(f"{key} : {value}\n")

rev_3dv2 : {'min': -14.0, 'max': 3526.84, 'mean': 0.8281484860246667, 'std': 10.519036680643051}

sends_since_last_open : {'min': 0, 'max': 182, 'mean': 11.614258710460234, 'std': 15.658222596850884}

message_size : {'min': 80267, 'max': 354737, 'mean': 153750.9570297518, 'std': 40514.58888464764}

retention_score : {'min': 0.15384615384615385, 'max': 28.0, 'mean': 12.025238347455499, 'std': 11.479056910150538}

frequency_score : {'min': 0, 'max': 163, 'mean': 16.37320307943908, 'std': 21.918304730832393}

sent_hm_numeric : {'min': 0.0, 'max': 23.183333333333334, 'mean': 16.926774430085377, 'std': 4.666712228630996}

sent_dayofweek : {'min': 0, 'max': 6, 'mean': 2.8199950563185165, 'std': 2.1693128901977192}

sent_week : {'min': 0, 'max': 4, 'mean': 1.9418937306569122, 'std': 1.4402637267866456}

sent_month : {'min': 1, 'max': 12, 'mean': 9.476696887296637, 'std': 2.2838700061014556}

discount : {'min': 0, 'max': 75, 'mean': 14.385221859553527, 'std': 21.095893062976085}

