<a href="https://colab.research.google.com/github/fellowship/deep-and-wide-bandit/blob/dev/EDA_LargeSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/GDrive; every data
# path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount("/content/GDrive")

Drive already mounted at /content/GDrive; to attempt to forcibly remount, call drive.mount("/content/GDrive", force_remount=True).


In [None]:
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
import json
import pickle as pkl

#Render floats with five fixed decimals (no scientific notation) in pandas output such as .describe()
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
os.chdir(os.getcwd())

#One zip archive per data period
zip_path_l = [
    "/content/GDrive/MyDrive/Bandit_Project/July_Dec_2019/July_Dec_2019.zip",
    "/content/GDrive/MyDrive/Bandit_Project/Jan_June_2020/Jan_June_2020.zip",
    "/content/GDrive/MyDrive/Bandit_Project/Jul_Sep_2020/Jul_Sep_2020.zip",
]

#Data file per period; an archive is only extracted when its data file does not exist yet
file_name_l = [
    '/content/GDrive/MyDrive/Bandit_Project/July_Dec_2019/sends_july_dec_2019_2000',
    '/content/GDrive/MyDrive/Bandit_Project/Jan_June_2020/sends_jan_june_2020_2000',
    '/content/GDrive/MyDrive/Bandit_Project/Jul_Sep_2020/sends_july_sept_2020_2000',
]

#Year tag per dataset, used when building the names of saved files
year_l = ["2019", "2020", "2020"]

#Every archive is unpacked into the folder that contains it
folder_name_l = [os.path.dirname(zip_path) for zip_path in zip_path_l]

for zip_path, file_name, folder_name in zip(zip_path_l, file_name_l, folder_name_l):

  #Nothing to do when the data file is already present
  if os.path.exists(file_name):
    continue

  with ZipFile(zip_path, 'r') as zip_obj:
    zip_obj.extractall(folder_name)

In [None]:
# Column names for the raw data files. The files are read without a header row
# (header=None) and this list is passed as `names=` to pd.read_csv, so the order
# here must match the column order of the tab-separated input.
orig_col_names = ['riid','send_dt','launch_id','sub_sourcedev','opened','unsub',
             'rev_3dv2','aq_dt','aq_mo','aq_dow','aq_period','sends_since_last_open',
             'wk1_opens','wk4_opens','mo3_opens','mo6_opens','yr1_opens',
             'wk1_clicks','wk4_clicks','mo3_clicks','mo6_clicks',
             'rev_dec','rev_raw','snds_dec','snds_opens_dec','wk1_vis',
             'wk4_vis','mo3_vis','mo6_vis','wk1_pv','wk4_pv','mo3_pv','mo6_pv',
             'wk1_secs','wk4_secs','mo3_secs','mo6_secs','prev_optouts',
             'last_order','last_click','last_cart','last_visit','last_ded_open',
             'sent_time','offer_signature_id','dynamic_content_signature_id',
             'message_size','open_cnt','first_open_time','last_open_time',
             'hrs_to_first_open','hrs_to_last_open','click_cnt','first_click_time',
             'last_click_time','offer_category','hrs_to_first_click','hrs_to_last_click',
             'opt_out_cnt','first_opt_out_time','last_opt_out_time','opt_out_source',
             'opt_out_reason','hrs_to_first_opt_out','hrs_to_last_opt_out','campaign_id',
             'subject','marketing_strategy','campaign_type']

# Finalizing data preprocessing steps

**QUESTION:** How do we split the train-valid-test set?

1.   For each of the provided files, we employ an 80-20 split.
2.   For the full set of files, we employ an 80-20 split, i.e., the first 2 files in full and a portion of the first half of the last file are used for training, and the remainder is used for validation.



In [None]:
#Main control variable: one flag per column group, each gating an
#`if control_variable[...]` step in the chunk-processing loop.
#  - Most groups: True drops the group's raw columns, False leaves them untouched.
#  - 'subject': True derives two indicator features and then drops the column.
#  - 'campaign_type' and 'aq': True drops the raw columns, False runs the
#    feature-derivation branch instead.
control_variable = {'marketing': True, 'subject':True, 'campaign_type':False,
                    'aq': False,'site': True,'last':True, 'promo_ids': True,
                    'email_cnts':True, 'email_times':True, 'opt_out':True, 
                    'offer_cat':True}

#Control variable: whether each processed output has to be exported or not.
#If True, it is saved. Otherwise, it is skipped.
#'weekly' holds one flag per input dataset (indexed by the dataset loop index);
#'rolling_stats' controls whether the running-statistics dictionary is pickled.
save_control_variable = {
    'full': True,
    'weekly': [True, True, True],
    'rolling_stats': True,
}

#Control variable: whether files need to be overwritten or appended to.
#If True, the file is to be overwritten. Otherwise, appended.
#'weekly' is a 3-element list, where each element corresponds to 1 of 3 Marko's files.
overwrite_control_variable = {
    'full': True,
    'weekly': [True, True, True],
}

#Gzipped CSV that processed chunks of the full dataset are appended to
myfile = "/content/GDrive/MyDrive/Bandit_Project/dataset_without_time_ohe.csv.gz"
#Folder in which the per-week gzipped CSV files are created
weekly_files_base_folder = "/content/GDrive/MyDrive/Bandit_Project/"

In [None]:
def rew(row):
    """Return the reward for a single send record.

    ``row`` is indexed by key and must provide 'sends_since_last_open',
    'rev_3dv2', 'unsub' and 'frequency_score'. The reward is the sum of:

    * send penalty: -(1 + sends_since_last_open), negative counts treated as 0
    * revenue bonus: +10 when rev_3dv2 is positive, otherwise 0
    * unsubscribe penalty: -50 * unsub, negative unsub treated as 0
    * frequency-weighted unsubscribe penalty: -50 * frequency_score * unsub

    (A bonus of 5 * opened was considered but is currently disabled.)
    """
    send_penalty = -1 * (1 + max(row['sends_since_last_open'], 0))
    revenue_bonus = 10 if row['rev_3dv2'] > 0 else 0
    unsub_penalty = -50 * max(row['unsub'], 0)
    freq_unsub_penalty = -50 * row['frequency_score'] * max(row['unsub'], 0)
    return send_penalty + revenue_bonus + unsub_penalty + freq_unsub_penalty

In [None]:
#Columns whose running min / max / mean / std are tracked across chunks (for scaling later)
moving_col_names = [
    "rev_3dv2", "sends_since_last_open", "message_size",
    "retention_score", "recency_score", "frequency_score",
    "sent_week", "sent_dayofweek", "sent_hr",
    "discount", "days_subscr",
]

#One (initially empty) statistics dict per tracked column
moving_dict = dict((tracked_col, {}) for tracked_col in moving_col_names)

#Extra entry: number of rows accumulated over the chunks processed so far
moving_dict.update(previous_n_so_far=0)

In [None]:
# Process the datasets one after another, reading each in chunks.
# NOTE(review): the range starts at 1, so the first entry of file_name_l is
# skipped — presumably it was processed in an earlier run; confirm before
# running on a fresh environment.
for i in range(1, len(file_name_l)):

  #For printing status updates
  cnt = 0

  #If the rolling-statistics pickle from a previous run is present, load it so the
  #statistics continue from where that run stopped. A missing file is not an error.
  #NOTE(review): pickle.load executes arbitrary code from the file — only use it
  #on files this notebook wrote itself.
  pkl_filename_rolling_stats = "/content/GDrive/MyDrive/Bandit_Project/rolling_statistics.pkl"
  try:
    with open(pkl_filename_rolling_stats, 'rb') as file:
      moving_dict = pkl.load(file)
  except OSError:
    pass    

  #The input is tab-separated without a header row; chunksize bounds the number of rows per chunk
  for df_sample in pd.read_csv(file_name_l[i], sep='\t', header=None, names=orig_col_names, chunksize=1e6):
    
    print(f"[INFO] Processing Chunk #{cnt+1} of Dataset #{i+1}")

    """
    Drop less-frequent AND/OR redundant columns
    all team members voted to eliminate
    """

    #Columns that are finalised to be dropped
    col_drop_names = ["send_dt", "sub_sourcedev", "rev_dec", "rev_raw", "snds_dec", "snds_opens_dec", 
                "dynamic_content_signature_id"]

    df_sample.drop(columns=col_drop_names, inplace=True)
    del col_drop_names

    """
    Compute the composite recency, frequency (dropped) & retention scores
    Dropping the constituent base columns

    NOTE 
    ----
    1) Both recency and frequency are strongly correlated - Remove recency score
    3) sends_since_last_open used for retention score AS WELL AS reward
    """

    #Retention Score: 28 divided by sends_since_last_open, with the divisor floored at 1
    df_sample['retention_score'] = df_sample.sends_since_last_open.apply(lambda x: 28/max(1,x))

    #Recency Score: weighted sum of opens + clicks over progressively longer windows
    #(plus yearly opens), with smaller weights for the longer windows.
    #The parentheses are required: without them every line that starts with "+" is
    #parsed as its own (discarded) expression statement and only the wk1 term
    #would be stored in the column.
    df_sample["recency_score"] = (
        16/7*(df_sample['wk1_opens'] + df_sample['wk1_clicks'])
        + 8/28*(df_sample['wk4_opens'] + df_sample['wk4_clicks'])
        + 4/64*(df_sample['mo3_opens'] + df_sample['mo3_clicks'])
        + 2/92*(df_sample['mo6_opens'] + df_sample['mo6_clicks'])
        + 1/184*df_sample['yr1_opens']
    )

    #Frequency Score: total opens over the wk1, wk4, mo3 and mo6 windows
    df_sample["frequency_score"] = (
        df_sample['wk1_opens'] + df_sample['wk4_opens']
        + df_sample['mo3_opens'] + df_sample['mo6_opens']
    )

    #Drop the base columns
    col_drop_scores = ["wk1_opens", "wk4_opens", "mo3_opens","mo6_opens","yr1_opens",
                      "wk1_clicks", "wk4_clicks", "mo3_clicks", "mo6_clicks"]

    df_sample.drop(columns=col_drop_scores, inplace=True)
    del col_drop_scores

    """
    Convert sent_time column to datetime object
    Extract useful features like 
    """
    datetimes = pd.to_datetime(df_sample.sent_time)
    df_sample.drop(columns=["sent_time"], inplace=True)
    df_sample["sent_week"] = datetimes.dt.isocalendar().week
    df_sample["sent_dayofweek"] = datetimes.dt.dayofweek
    df_sample["sent_hr"] = datetimes.dt.hour
    #df_sample["sent_hm_numeric"] = datetimes.dt.hour + (datetimes.dt.minute/60)
    #df_sample["sent_week"] = datetimes.dt.day//7
    #df_sample["sent_month"] = datetimes.dt.month

    """
    Column Name. - marketing_strategy

    NOTE
    ----
    1) AdHoc (1) or Program (0)? Might be useful to keep track of this
    2) Whether Promo or not is useful to check perhaps
    3) Whether Sale or not is useful to check perhaps
    """

    if control_variable["marketing"]:
      #Create a new column for AdHoc OR Program: Seems to be the type of email
      #df_sample["email_type"] = df_sample.marketing_strategy.str.contains("_Adhoc_").fillna(0).astype(int) #1 for AdHoc, 0 for Program

      #Create columns for Promotional and Sale: Other values do not seem useful - Aleksey to get back about correlations
      #df_sample["promo"] = df_sample.marketing_strategy.str.contains("Promotional").fillna(0).astype(int)
      #df_sample["sale"] = df_sample.marketing_strategy.str.contains("Sale").fillna(0).astype(int)
      
      #Drop the column
      df_sample.drop(columns=["marketing_strategy"], inplace=True)

    """
    Column - subject

    1) Use BERT to convert this into a sentence vector?
    2) Check for specific words of interest in the subject?
    3) Other handcrafted features like LogReg or NB?
    """
    if control_variable["subject"]:

      #Check for Aleksey's words
      #feature_words = ['these', 'what', 'that', 'your', 'have', 'holidays', 'flex', 'know', 'fits', '&', 'styles', 'holiday', 'may', 'wear', 'best', 'with', 'gifts']
      #col_names = ["word_" + word for word in feature_words]
      #for (feature_word, col_name) in zip(feature_words, col_names):
      #  df_sample[col_name] = df_sample.subject.str.contains(feature_word, case=False)

      #Marko's features: 'sl_contains_price', 'is_discount_mentioned'
      #regex=False: "$" has to be matched literally. Interpreted as a regular
      #expression it is the end-of-string anchor, which matches every subject and
      #would make this flag 1 for all rows.
      #na=False: a missing subject counts as "no match" instead of yielding NaN,
      #which astype(int) cannot convert.
      df_sample["sl_contains_price"] = df_sample.subject.str.contains("$", regex=False, na=False).astype(int)

      ##'is_discount_mentioned' — 'off' or 'up to' (case-sensitive regex alternation)
      #NOTE(review): 'off' is matched as a bare substring, so words such as
      #"offer" also set this flag — confirm whether ' off ' (with spaces) was intended.
      is_discount_list = ['off', 'up to']
      is_discount_pat = "|".join(is_discount_list)
      df_sample["is_discount_mentioned"] = df_sample.subject.str.contains(is_discount_pat, na=False).astype(int)
      df_sample.drop(columns=["subject"], inplace=True)

    """
    Column Name - campaign_type

    1) Can get the old campaign_type categorical columns from this
    2) Can eliminate EU entries.
    3) Discounts can be extracted.
    4) Sale and Promotional can be extracted.
    """

    if control_variable["campaign_type"]:

      #Drop the campaign_type column
      df_sample.drop(columns=["campaign_type"], inplace=True)

    else:
      #Derive features from campaign_type, then drop the column

      #Keep US rows only. na=False treats a missing campaign_type as non-US, so the
      #mask is strictly boolean and campaign_type has no missing values afterwards.
      #.copy() turns the filtered chunk into an independent frame, so the column
      #assignments below do not write into a slice of the original chunk.
      US_mask = df_sample.campaign_type.str.contains('_US_', na=False)
      df_sample = df_sample[US_mask].copy()

      #One 0/1 indicator column per campaign kind (substring match on campaign_type).
      #The columns are created in this order; keep it stable, because chunks are
      #appended to the output files without a header row.
      campaign_kinds = ["Brand", "Collection", "Core", "Dedicated", "InnovationSpotlight",
                        "NewArrivals", "ProductSpotlight", "Replen", "Tops", "Trend"]
      campaign_type_col_names = ["campaign_" + kind for kind in campaign_kinds]
      for kind, campaign_col in zip(campaign_kinds, campaign_type_col_names):
        df_sample[campaign_col] = df_sample.campaign_type.str.contains(kind).astype(int)

      #Several indicators can fire for the same row; campaign_Other is 1 only
      #when none of them did.
      df_sample["campaign_Other"] = 1-df_sample[campaign_type_col_names].any(axis=1).astype(int)

      #Discount percentage: the two digits directly in front of "off" in an
      #"_NNoff_" token (case-insensitive), 0 when there is no such token.
      #Reading the digits from the matched token itself avoids picking up two
      #digits that merely follow an underscore somewhere else in the name.
      discount_digits = df_sample["campaign_type"].str.extract(r"_([1-9][0-9])off_", flags=re.I, expand=False)
      df_sample["discount"] = pd.to_numeric(discount_digits).fillna(0).astype(int)

      #Promotional / Sale indicators (case-insensitive substring match)
      df_sample["promo"] = df_sample.campaign_type.str.contains("Promotional", case=False).fillna(0).astype(int)
      df_sample["sale"] = df_sample.campaign_type.str.contains("Sale", case=False).fillna(0).astype(int)

      #Check for Marko's columns: [1]'is_one_for_free', [2]'is_exclusive', [3]'free_shipping', [4] 'has_urgency'
      #NOTE(review): both patterns below contain spaces; confirm that campaign_type
      #values actually contain spaces, otherwise these two flags stay 0.
      df_sample['is_one_for_free'] = df_sample.campaign_type.str.contains(" for ", case=False).astype(int)
      df_sample['free_shipping'] = df_sample.campaign_type.str.contains("free shipping", case=False).astype(int)

      ##'is_exclusive': 'vip' or 'premium' or 'exclusive' or 'earlyaccess'
      is_exclusive_list = ['vip', 'premium', 'exclusive', 'earlyaccess']
      is_exclusive_pat = '|'.join(is_exclusive_list)
      df_sample["is_exclusive"] = df_sample.campaign_type.str.contains(is_exclusive_pat, case=False).astype(int)

      ##'has_urgency': 'soon' or 'starts' or 'ends' or 'lastmin' or 'lastday' or 'lastchance' or 'final' or 'tonight'
      has_urgency_list = ['soon', 'starts','ends','lastmin', 'lastday', 'lastchance', 'final', 'tonight']
      has_urgency_pat = '|'.join(has_urgency_list)
      df_sample["has_urgency"] = df_sample.campaign_type.str.contains(has_urgency_pat, case=False).astype(int)

      #Drop the campaign_type column
      df_sample.drop(columns=["campaign_type"], inplace=True)

    if control_variable['offer_cat']:

      """
      Offer_category Columns - "offer_category"

      NOTE
      ----
      1) This contains additional info about whether it's promotion, sale, Tops etc.
      that are somehow missed by the other columns. We can leverage this?

      e.g. "Mens_Pants...", "Mens_ShirtTops...", "Mens_BigAndTall...", "Mens_Shorts...",
      "Mens_SweatersJackets...", "Mens_Accessories...", "Promotion", "Sale", "Brand"

      2) HORRIBLY BIASED (at least this column is) DATASET because (A) not even a single mention
      of womens products along the lines of Mens_Shorts etc. and (B) very few entries
      3) But at the end of the day - only 8.3% are filled. Hence, not using.
      """

      #Drop this column as well
      df_sample.drop(columns=["offer_category"], inplace=True)

    """
    Acquisiton Columns - "aq_dt", "aq_mo", "aq_dow", "aq_period"

    NOTE
    ----
    1) Older customers who have not subscribed — exploit strategy should work better
    2) aq_dt has negative numbers - Why?
    """
    
    if control_variable["aq"]:
    
      col_drop_aq = ["aq_dt", "aq_mo", "aq_dow", "aq_period"]
      df_sample.drop(columns=col_drop_aq, inplace=True)
    
    else:
      
      #One-hot encode Holiday and the Rest
      df_sample.loc[:, ["aq_period"]] = df_sample.aq_period.str.contains(r"^Holiday", case=False)
      
      #Rename aq_dt column name to days_subscr
      df_sample.rename(columns = {'aq_dt':'days_subscr'}, inplace = True)

      #Drop aq_mo and aq_dow —> Currently not dropping aq_mo and aq_dow 
      #col_drop_aq = ["aq_mo", "aq_dow"]
      #df_sample.drop(columns=col_drop_aq, inplace=True)
      del datetimes
      
    """
    Website Data - "wk1_vis", "wk4_vis", "mo3_vis", "mo6_vis", "wk1_pv",
    "wk4_pv", "mo3_pv", "mo6_pv", "wk1_secs", "wk4_secs", "mo3_secs", "mo6_secs"

    NOTE
    ----
    1) Extremely sparse - mostly filled with 0s
    2) Marko had said website and email integration has not taken place
    3) Naturally, this integration will happen in the future. Let the future team take care of it then.
    """
    
    if control_variable["site"]:
      col_drop_site = ["wk1_vis", "wk4_vis", "mo3_vis", "mo6_vis", "wk1_pv",
    "wk4_pv", "mo3_pv", "mo6_pv", "wk1_secs", "wk4_secs", "mo3_secs", "mo6_secs"]
      df_sample.drop(columns=col_drop_site, inplace=True)

    """
    The Last Time - "last_order", "last_click", "last_cart","last_visit","last_ded_open"

    NOTE
    ----
    1) Moooostly NaN values currently
    2) However, when it is not NaN - especially for last_order - it could be a really useful signal
    """
    if control_variable["last"]:
      col_drop_last = ["last_order", "last_click", "last_cart","last_visit","last_ded_open"]
      df_sample.drop(columns=col_drop_last, inplace=True)

    """
    Promotion IDs - "offer_signature_id", "launch_id", "campaign_id"

    NOTE
    ----
    1) A bunch of strings - might be useful for either cross-feature interactions or 
    campaign/promotion embeddings
    2) I feel it is a nice-to-have thing, as opposed to a need-to-have-thing
    3) Campaign ID is not being dropped
    """
    if control_variable["promo_ids"]:
      col_drop_promo_ids = ["offer_signature_id", "launch_id"]
      df_sample.drop(columns=col_drop_promo_ids, inplace=True)

    """
    Detailed user-email interaction COUNTS - "open_cnt", "click_cnt" 
    ""
    NOTE
    ----
    1) We already have the "opened" signal - can it be boosted further with open_cnt and click_cnt?
    2) But only 28.2% and 8.3% of these counts are available
    """
    if control_variable["email_cnts"]:
      col_drop_email_cnts = ["open_cnt", "click_cnt"]
      df_sample.drop(columns=col_drop_email_cnts, inplace=True)

    """
    Detailed user-email interaction TIMES - "first_open_time", "last_open_time", "hrs_to_first_open", 
    "hrs_to_last_open", "first_click_time", "last_click_time", "hrs_to_first_click", "hrs_to_last_click"

    NOTE
    ----
    1) Hmmmm...
    """
    if control_variable["email_times"]:
      col_drop_email_times = ["first_open_time", "last_open_time", "hrs_to_first_open", 
    "hrs_to_last_open", "first_click_time", "last_click_time", "hrs_to_first_click", "hrs_to_last_click"]
      df_sample.drop(columns=col_drop_email_times, inplace=True)

    """
    Opt Out - "first_opt_out_time", "opt_out_cnt", "last_opt_out_time", "opt_out_source", 
    "opt_out_reason", "hrs_to_first_opt_out", "hrs_to_last_opt_out"

    NOTE
    -----
    1) Marko's response: so I think there's a couple of different levels you can opt out on, just promotional emails, all emails etc.
    so they probably resubscribed if they got added in later
    2) I think the emailer system has checks and balances to see whether people who opt out,
    should be sent mails to. But they don't keep track of the past - i.e., when someone opts out,
    then resubscribes - he/she does not get counted as an opt out
    3) Only 0.3% of entries have opt out filled - prev_opt_outs is 0 for 94% of entries
    """
    if control_variable["opt_out"]:
      col_drop_opt_out = ["prev_optouts", "first_opt_out_time", "opt_out_cnt", "last_opt_out_time", "opt_out_source", 
    "opt_out_reason", "hrs_to_first_opt_out", "hrs_to_last_opt_out"]
      df_sample.drop(columns=col_drop_opt_out, inplace=True)

    #Add an optimal action: 0 when the recipient unsubscribed (unsub == 1) or did
    #not open (opened == 0); 1 for every other row.
    df_sample["optimal_action"] = 1
    dont_send_mask = (df_sample["unsub"] == 1) | (df_sample["opened"] == 0)
    df_sample.loc[dont_send_mask, ["optimal_action"]] = 0

    #Add the reward column as per Marko's definition (rew is applied row by row)
    df_sample['reward'] = df_sample.apply(lambda row: rew(row), axis=1)

    #Aleksey: Look Here
    #Update the moving dict with the statistics of this chunk
    current_n = len(df_sample)

    #Merge branch: fold this chunk into statistics that already exist.
    #NOTE(review): the dataset loop starts at i = 1, so (cnt + i) > 0 is always
    #true and the first-chunk `else` branch further down never runs. The
    #"min"/"max"/"mean"/"std" entries read here must therefore already be present
    #in moving_dict (e.g. loaded from the rolling-statistics pickle); otherwise
    #the lookups below raise KeyError.
    if (cnt + i) > 0:
      
      for col_name in moving_dict.keys():
        
        #Skip the row counter; every other key is a tracked column
        if col_name != "previous_n_so_far":
        
          #Update min & max of col_name if required
          min_val = df_sample[col_name].min()
          max_val = df_sample[col_name].max()

          if min_val < moving_dict[col_name]["min"]:
            moving_dict[col_name]["min"] = min_val
          
          if max_val > moving_dict[col_name]["max"]:
            moving_dict[col_name]["max"] = max_val

          #Update the mean to incorporate the new population: row-count-weighted
          #average of the previous mean and this chunk's mean.
          #previous_mean is captured before the stored mean is overwritten because
          #the variance update below needs the old value.
          current_mean = df_sample[col_name].mean()
          previous_mean = moving_dict[col_name]["mean"]
          moving_dict[col_name]["mean"] = (moving_dict["previous_n_so_far"] * previous_mean + current_n * current_mean)/(moving_dict["previous_n_so_far"] + current_n)
          
          #Keeping track of std is a real doozy - https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
          #With n1 = rows so far, n2 = rows in this chunk, s = std, m = mean:
          #  var = ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-1) + n1*n2*(m1-m2)^2/((n1+n2)*(n1+n2-1))
          current_std = df_sample[col_name].std()
          previous_std = moving_dict[col_name]["std"]
          correction_term = (moving_dict["previous_n_so_far"] * current_n * (current_mean - previous_mean)**2)/((moving_dict["previous_n_so_far"] + current_n) * (moving_dict["previous_n_so_far"] + current_n-1))
          combined_variance = ((moving_dict["previous_n_so_far"]-1)* (previous_std**2) + (current_n - 1)*(current_std**2))/(moving_dict["previous_n_so_far"] + current_n - 1) + correction_term
          moving_dict[col_name]["std"] = np.sqrt(combined_variance)

      #Update the number of points (only after all columns used the old count)
      moving_dict["previous_n_so_far"] += current_n

      #Should we overwrite the file or append to the file?
      if save_control_variable["full"]:

        #For the first chunk of 2nd and 3rd Dset, check whether we need to overwrite?
        #If yes, we need to add it to whatever is there. If no, don't do anything. 
        #NOTE(review): the chunk is appended (mode="a", no header) only when the
        #overwrite flag is True, and the existing file is not removed in this
        #branch; with the flag False nothing is written here — confirm intended.
        if overwrite_control_variable["full"]:
          df_sample.to_csv(myfile, index=False, compression="gzip", mode="a", header=False)
        
      if save_control_variable["weekly"][i]:
        
        #Create weekly save files: one gzipped CSV per distinct sent_week value in
        #this chunk, named df_<year>_<week>.csv.gz
        #Year string is accessed by year_l[i]
        weekly_l = sorted(list(df_sample["sent_week"].unique()))
        weekly_filename_l = [weekly_files_base_folder + "df_" +  year_l[i] + "_" + str(week) + ".csv.gz" 
                            for week in weekly_l]
        for counter, week_filename in enumerate(weekly_filename_l):

          #Is it the first chunk of the 2nd and 3rd Dset? If yes, check for the
          #overwrite flag. If yes, delete existing file. 
          #NOTE(review): only files for weeks present in the first chunk are
          #removed; a week that first appears in a later chunk is appended to
          #whatever file already exists.
          if not(cnt):
            
            #Check whether the overwrite flag for this Data Subset is set to True or False.
            if overwrite_control_variable["weekly"][i]:
          
              #If yes, delete the file so that contents will be overwritten.
              #A missing file is not an error.
              try:
                os.remove(week_filename)
              except OSError:
                pass

          #Append this week's rows of the chunk (written without a header row)
          df_sample.loc[df_sample["sent_week"] == weekly_l[counter], :].to_csv(week_filename, index=False, compression='gzip', mode='a', header=False)
  
    else:

      #First chunk
      #Set current chunk size to the variable that tracks datapoints
      moving_dict["previous_n_so_far"] = current_n
      
      for col_name in moving_dict.keys():

        #As long as col_name is not "previous_n_so_far"
        if col_name != "previous_n_so_far":

          moving_dict[col_name]["min"] = df_sample[col_name].min() 
          moving_dict[col_name]["max"] = df_sample[col_name].max()
          moving_dict[col_name]["mean"] = df_sample[col_name].mean()
          moving_dict[col_name]["std"] = df_sample[col_name].std()

      #Delete gzip file it exists
      if save_control_variable["full"]:
        if os.path.isfile(myfile) and overwrite_control_variable["full"]:
          os.remove(myfile)
      
      #Append chunk to our file
      if save_control_variable["full"]:
        df_sample.to_csv(myfile, index=False, compression="gzip", mode="a")
        
      #Create weekly save files
      #Year string is accessed by year_l[i]
      if save_control_variable["weekly"][i]:

        week_l = sorted(list(df_sample["sent_week"].unique()))
        week_filename_l = [weekly_files_base_folder + "df_" +  year_l[i] + "_" + str(week) + ".csv.gz" 
                            for week in week_l]
        for counter, week_filename in enumerate(week_filename_l):

          #Check whether the overwrite flag for this Data Subset is set to True or False.
          if overwrite_control_variable["weekly"][i]:
            
            #If yes, delete the file so that contents will be overwritten.
            try:
              os.remove(week_filename)
            except OSError:
              pass

          #Append contents of the file
          df_sample.loc[df_sample["sent_week"] == week_l[counter], :].to_csv(week_filename, index=False, compression='gzip', mode='a', header=False)

      #Save dtypes as a json file once, which can be used by pandas to load the file
      df_sample_dtypes = df_sample.dtypes.to_frame('dtypes').reset_index() #First, convert to df
      d = df_sample_dtypes.set_index('index')['dtypes'].astype(str).to_dict() #Then, convert to dict with str
      json_filename = "/content/GDrive/MyDrive/Bandit_Project/dtypes.json"
      with open(json_filename, 'w') as f:
        json.dump(d, f)

    #Delete the dataframe to save space
    del df_sample

    #Increment display counter
    cnt += 1

  #Dump the moving_dict dictionary to a pickle file for later use; this runs once
  #per dataset, after all of its chunks have been processed
  if save_control_variable["rolling_stats"]:
    with open(pkl_filename_rolling_stats, 'wb') as file:
      pkl.dump(moving_dict, file)

  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #1 of Dataset #2
[INFO] Processing Chunk #2 of Dataset #2
[INFO] Processing Chunk #3 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #4 of Dataset #2
[INFO] Processing Chunk #5 of Dataset #2
[INFO] Processing Chunk #6 of Dataset #2
[INFO] Processing Chunk #7 of Dataset #2
[INFO] Processing Chunk #8 of Dataset #2
[INFO] Processing Chunk #9 of Dataset #2
[INFO] Processing Chunk #10 of Dataset #2
[INFO] Processing Chunk #11 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #12 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #13 of Dataset #2
[INFO] Processing Chunk #14 of Dataset #2
[INFO] Processing Chunk #15 of Dataset #2
[INFO] Processing Chunk #16 of Dataset #2
[INFO] Processing Chunk #17 of Dataset #2
[INFO] Processing Chunk #18 of Dataset #2


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #19 of Dataset #2
[INFO] Processing Chunk #20 of Dataset #2
[INFO] Processing Chunk #21 of Dataset #2
[INFO] Processing Chunk #22 of Dataset #2
[INFO] Processing Chunk #23 of Dataset #2
[INFO] Processing Chunk #24 of Dataset #2
[INFO] Processing Chunk #25 of Dataset #2
[INFO] Processing Chunk #26 of Dataset #2
[INFO] Processing Chunk #27 of Dataset #2
[INFO] Processing Chunk #28 of Dataset #2
[INFO] Processing Chunk #29 of Dataset #2
[INFO] Processing Chunk #30 of Dataset #2
[INFO] Processing Chunk #31 of Dataset #2
[INFO] Processing Chunk #32 of Dataset #2
[INFO] Processing Chunk #33 of Dataset #2
[INFO] Processing Chunk #34 of Dataset #2
[INFO] Processing Chunk #1 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #2 of Dataset #3
[INFO] Processing Chunk #3 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #4 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #5 of Dataset #3
[INFO] Processing Chunk #6 of Dataset #3
[INFO] Processing Chunk #7 of Dataset #3
[INFO] Processing Chunk #8 of Dataset #3
[INFO] Processing Chunk #9 of Dataset #3
[INFO] Processing Chunk #10 of Dataset #3
[INFO] Processing Chunk #11 of Dataset #3
[INFO] Processing Chunk #12 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #13 of Dataset #3
[INFO] Processing Chunk #14 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #15 of Dataset #3


  interactivity=interactivity, compiler=compiler, result=result)


[INFO] Processing Chunk #16 of Dataset #3
[INFO] Processing Chunk #17 of Dataset #3
[INFO] Processing Chunk #18 of Dataset #3
[INFO] Processing Chunk #19 of Dataset #3


# Analysing Aleksey's Exported Files

Checking both Weekly + Full Dataset

In [None]:
"""
aleksey_full_l = ['/content/GDrive/MyDrive/Bandit_Project/BanditsData/Jul_Dec_2019.zip',
               '/content/GDrive/MyDrive/Bandit_Project/BanditsData/Jan_Jun_2020.zip',
               '/content/GDrive/MyDrive/Bandit_Project/BanditsData/Jul_Sep_2020.zip',]

aleksey_weekly_l = ['/content/GDrive/MyDrive/Bandit_Project/BanditsData/2019_wks_26_52.zip', 
                    '/content/GDrive/MyDrive/Bandit_Project/BanditsData/2020_wks_00_25.zip', 
                    '/content/GDrive/MyDrive/Bandit_Project/BanditsData/2020_wks_26_38.zip',]


#Extract one file from each of the 2 lists
zip_path_l = ['/content/GDrive/MyDrive/Bandit_Project/BanditsData/Jul_Dec_2019.zip',
              '/content/GDrive/MyDrive/Bandit_Project/BanditsData/2019_wks_26_52.zip',] 

folder_name_l = ['/content/GDrive/MyDrive/Bandit_Project/Aleksey/Jul_Dec_2019',
              '/content/GDrive/MyDrive/Bandit_Project/Aleksey/2019_wks_26_52',] 

for (zip_path, folder_name) in zip(zip_path_l, folder_name_l):
    
  if not(os.path.exists(folder_name)):    
    
    with ZipFile(zip_path, 'r') as zip_obj:

      zip_obj.extractall(folder_name)

"""

In [None]:
"""
#Extract 1e5 rows from a full and weekly file
full_sample = '/content/GDrive/MyDrive/Bandit_Project/Aleksey/Jul_Dec_2019/sends_july_dec_2019_2000_01.csv'
weekly_sample = '/content/GDrive/MyDrive/Bandit_Project/Aleksey/2019_wks_26_52/sends_2019_wk26.csv'

df_aleksey_full = pd.read_csv(full_sample)
df_aleksey_weekly = pd.read_csv(weekly_sample)
"""

In [None]:
#df_aleksey_full.head(10)

In [None]:
#df_aleksey_weekly.head(10)