In [1]:
import pandas as pd
import re
from src.nielsen_helpers import get_s3_files, process_files, get_regex_domains
from src.helpers import  plot_top, set_size
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random  as rnd
import swifter

# Overall weights

## Desktop

In [2]:
def get_users(df, reg):
    """Aggregate regex-matched desktop page views per user, platform and day.

    Parameters
    ----------
    df : pandas.DataFrame
        Browsing records; the columns ``url``, ``nol_id``,
        ``activitydatetime`` and ``viewduration`` are read.
    reg : compiled regular expression
        Applied with ``reg.match`` (anchored at the start) to every ``url``.

    Returns
    -------
    pandas.DataFrame
        One row per (``nol_id``, ``platform``, day) with the summed
        ``viewduration`` and, in ``url``, the number of matching rows.
        A column-less empty frame is returned when no URL matches.
    """
    def _platform_of(address):
        # The label is the second-to-last capture group that took part in the
        # match (presumably the site name right before the TLD group -- the
        # pattern itself comes from get_regex_domains, confirm there).
        captured = [grp for grp in reg.match(address).groups() if grp is not None]
        return captured[-2]

    is_hit = df.url.apply(lambda address: reg.match(address) is not None)
    hits = df[is_hit].copy(deep=True)
    if len(hits) == 0:
        return pd.DataFrame()

    hits["platform"] = hits.url.apply(_platform_of)
    by_day = pd.Grouper(key="activitydatetime", freq="D")
    daily = hits.groupby(["nol_id", "platform", by_day]).agg({"viewduration": sum, "url": len})
    daily = daily.reset_index()
    daily.activitydatetime = pd.to_datetime(daily.activitydatetime)
    return daily

# Daily per-user activity on "fringe" domains, cached as a CSV at `path`.
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, dem = get_s3_files(start_date, end_date)
path = "/data/deplatforming/data/tmp_wall/df_fringe_to_weight_{}_to_{}.csv".format(start_date, end_date)
regex_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)

if not recompute:
    # Cached result: only the timestamp column needs re-parsing.
    df_fringe = pd.read_csv(path)
    df_fringe["activitydatetime"] = pd.to_datetime(df_fringe["activitydatetime"])
else:
    df_fringe = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users, args_mf=regex_fringe
    )
    # Collapse rows that share user, platform and day into a single row.
    df_fringe = df_fringe.groupby(["nol_id", "platform", "activitydatetime"]).sum().reset_index()
    df_fringe.to_csv(path, index=False)

In [3]:
def get_users(df, reg):
    """Summarise desktop visits to regex-matched sites by user, platform and day.

    ``df`` must provide ``url``, ``nol_id``, ``activitydatetime`` and
    ``viewduration``; ``reg`` is a compiled pattern tested with ``reg.match``.

    Returns a frame with one row per (``nol_id``, ``platform``, day) carrying
    the summed ``viewduration`` and the row count (stored in ``url``), or a
    column-less empty frame when nothing matches.
    """
    def _label(address):
        # Second-to-last capture group that participated in the match.
        participating = [g for g in reg.match(address).groups() if g is not None]
        return participating[-2]

    keep = df.url.apply(lambda address: reg.match(address) is not None)
    selected = df[keep].copy(deep=True)
    if len(selected) == 0:
        return pd.DataFrame()

    selected["platform"] = selected.url.apply(_label)
    keys = ["nol_id", "platform", pd.Grouper(key="activitydatetime", freq="D")]
    summary = selected.groupby(keys).agg({"viewduration": sum, "url": len}).reset_index()
    summary.activitydatetime = pd.to_datetime(summary.activitydatetime)
    return summary

# Daily per-user activity on mainstream social networks (category 'osn'),
# cached as a CSV at `path`.
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, dem = get_s3_files(start_date, end_date)
path = "/data/deplatforming/data/tmp_wall/df_mainstream_to_weight_{}_to_{}.csv".format(start_date, end_date)
regex_mainstream = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='osn',
        domain_keys='domains',
    )
)

if not recompute:
    df_mainstream = pd.read_csv(path)
    df_mainstream["activitydatetime"] = pd.to_datetime(df_mainstream["activitydatetime"])
else:
    df_mainstream = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users, args_mf=regex_mainstream
    )
    # Collapse rows that share user, platform and day into a single row.
    df_mainstream = df_mainstream.groupby(["nol_id", "platform", "activitydatetime"]).sum().reset_index()
    df_mainstream.to_csv(path, index=False)

In [4]:
def get_all(df):
    """Total desktop activity per user and day, without any URL filtering.

    Reads ``nol_id``, ``activitydatetime``, ``viewduration`` and ``url`` from
    ``df``.  Returns one row per (``nol_id``, day) with the summed
    ``viewduration`` and the number of rows (stored in ``url``); an empty
    input yields a column-less empty frame.
    """
    if len(df) == 0:
        return pd.DataFrame()

    by_day = pd.Grouper(key="activitydatetime", freq="D")
    daily = df.groupby(["nol_id", by_day]).agg({"viewduration": sum, "url": len})
    daily = daily.reset_index()
    daily.activitydatetime = pd.to_datetime(daily.activitydatetime)
    return daily

# Daily per-user totals over all desktop activity, cached as a CSV at `path`.
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, dem = get_s3_files(start_date, end_date)
path = "/data/deplatforming/data/tmp_wall/df_all_to_weight_{}_to_{}.csv".format(start_date, end_date)

if not recompute:
    df_all = pd.read_csv(path)
    df_all["activitydatetime"] = pd.to_datetime(df_all["activitydatetime"])
else:
    df_all = process_files(filesn, start_date, end_date, num_workers=15, mf=get_all)
    # Collapse rows that share user and day into a single row.
    df_all = df_all.groupby(["nol_id", "activitydatetime"]).sum().reset_index()
    df_all.to_csv(path, index=False)

## Mobile

In [5]:
def get_users(df, reg):
    """Aggregate mobile fringe-platform usage per user, platform, day and source.

    A row is kept when its ``domain_name`` matches ``reg`` (``reg.match``) or
    when its ``app_name`` is one of the apps listed in ``app_domains``.  App
    rows get the corresponding website domain written into ``domain_name`` so
    the platform label can be read off the same regex for both sources.

    Parameters
    ----------
    df : pandas.DataFrame
        Mobile records; the columns ``mobile_id``, ``activitydatetime``,
        ``duration``, ``app_name`` and ``domain_name`` are read.
    reg : compiled regular expression
        Pattern tested against ``domain_name``.

    Returns
    -------
    pandas.DataFrame
        One row per (``mobile_id``, ``platform``, day, ``is_app``) with the
        summed ``duration`` and the number of rows (stored in ``app_name``).
        A column-less empty frame is returned when nothing matches, the same
        convention the desktop ``get_users`` uses.
    """
    # Single source of truth: app name -> domain used to label its rows.
    app_domains = {
        "Rumble - Video Battles": "rumble.com",
        "MeWe Network": "mewe.com",
        "Parler": "parler.com",
        "DLive · Stream on Blockchain": "dlive.tv",
        "Telegram Messenger": "telegram.org",
    }
    apps = list(app_domains)

    domain_hit = df.domain_name.apply(lambda x: not pd.isna(x) and reg.match(x) is not None)
    app_hit = df.app_name.apply(lambda x: not pd.isna(x) and x in apps)
    tmp = df[domain_hit | app_hit].copy(deep=True)
    if len(tmp) == 0:
        # Nothing to aggregate; avoids grouping an empty frame.
        return pd.DataFrame()

    for app, domain in app_domains.items():
        tmp.loc[tmp.app_name == app, "domain_name"] = domain
    tmp["is_app"] = tmp.app_name.isin(apps)

    # Platform label = second-to-last capture group that took part in the
    # match.  NOTE(review): this raises if a mapped app domain is not covered
    # by `reg` -- confirm every domain above is present in domains.csv.
    tmp["platform"] = tmp.domain_name.apply(
        lambda x: [y for y in reg.match(x).groups() if y is not None][-2])

    keys = ["mobile_id", "platform", pd.Grouper(key="activitydatetime", freq="D"), "is_app"]
    # "sum" (string) keeps pandas' own group sum; the builtin is deprecated here.
    tmp = tmp.groupby(keys).agg({"duration": "sum", "app_name": len}).reset_index()
    tmp.activitydatetime = pd.to_datetime(tmp.activitydatetime)
    return tmp
    

# Daily per-user activity on "fringe" platforms in the mobile data, cached as
# a CSV at `path`.
regex_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)
path = "/data/deplatforming/data/tmp_wall/df_fringe_to_weight_{}_to_{}_mob.csv".format(start_date, end_date)

if not recompute:
    df_fringe_mob = pd.read_csv(path)
    df_fringe_mob["activitydatetime"] = pd.to_datetime(df_fringe_mob["activitydatetime"])
else:
    df_fringe_mob = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users, args_mf=regex_fringe, is_mobile=True
    )
    # Collapse rows that share user, platform, day and source into one row.
    mob_keys = ["mobile_id", "platform", "activitydatetime", "is_app"]
    df_fringe_mob = df_fringe_mob.groupby(mob_keys).sum().reset_index()
    df_fringe_mob.to_csv(path, index=False)


In [6]:
def get_users(df, reg):
    """Aggregate mobile mainstream-platform usage per user, platform, day and source.

    A row is kept when its ``domain_name`` matches ``reg`` (``reg.match``) or
    when its ``app_name`` is one of the apps listed in ``app_domains``.  App
    rows get the corresponding website domain written into ``domain_name`` so
    the platform label can be read off the same regex for both sources.

    Parameters
    ----------
    df : pandas.DataFrame
        Mobile records; the columns ``mobile_id``, ``activitydatetime``,
        ``duration``, ``app_name`` and ``domain_name`` are read.
    reg : compiled regular expression
        Pattern tested against ``domain_name``.

    Returns
    -------
    pandas.DataFrame
        One row per (``mobile_id``, ``platform``, day, ``is_app``) with the
        summed ``duration`` and the number of rows (stored in ``app_name``).
        A column-less empty frame is returned when nothing matches, the same
        convention the desktop ``get_users`` uses.
    """
    # Single source of truth: app name -> domain used to label its rows.
    app_domains = {
        "Facebook": "facebook.com",
        "Facebook Lite": "facebook.com",
        "Twitter": "twitter.com",
        "Pinterest": "pinterest.com",
        "Reddit: Trending News & Tips": "reddit.com",
        "Instagram": "instagram.com",
        "YouTube: Watch, Listen, Stream": "youtube.com",
        "TikTok": "tiktok.com",
        "TikTok - Make Your Day": "tiktok.com",
        # Was "tiktok.com" (copy-paste slip), which counted LinkedIn as TikTok.
        # NOTE(review): assumes the 'osn' regex covers linkedin.com -- confirm
        # in domains.csv, otherwise the platform lookup below raises.
        "LinkedIn: Network & Job Finder": "linkedin.com",
        "Snapchat": "snapchat.com",
        'WhatsApp Messenger': "whatsapp.com",
        "Nextdoor: Local Neighborhood": "nextdoor.com",
    }
    apps = list(app_domains)

    domain_hit = df.domain_name.apply(lambda x: not pd.isna(x) and reg.match(x) is not None)
    app_hit = df.app_name.apply(lambda x: not pd.isna(x) and x in apps)
    tmp = df[domain_hit | app_hit].copy(deep=True)
    if len(tmp) == 0:
        # Nothing to aggregate; avoids grouping an empty frame.
        return pd.DataFrame()

    for app, domain in app_domains.items():
        tmp.loc[tmp.app_name == app, "domain_name"] = domain
    tmp["is_app"] = tmp.app_name.isin(apps)

    # Platform label = second-to-last capture group that took part in the match.
    tmp["platform"] = tmp.domain_name.apply(
        lambda x: [y for y in reg.match(x).groups() if y is not None][-2])

    keys = ["mobile_id", "platform", pd.Grouper(key="activitydatetime", freq="D"), "is_app"]
    # "sum" (string) keeps pandas' own group sum; the builtin is deprecated here.
    tmp = tmp.groupby(keys).agg({"duration": "sum", "app_name": len}).reset_index()
    tmp.activitydatetime = pd.to_datetime(tmp.activitydatetime)
    return tmp
    

# Daily per-user activity on mainstream social networks (category 'osn') in
# the mobile data, cached as a CSV at `path`.
regex_mainstream = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='osn',
        domain_keys='domains',
    )
)
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)

path = "/data/deplatforming/data/tmp_wall/df_mainstream_to_weight_{}_to_{}_mob.csv".format(start_date, end_date)

if not recompute:
    df_mainstream_mob = pd.read_csv(path)
    df_mainstream_mob["activitydatetime"] = pd.to_datetime(df_mainstream_mob["activitydatetime"])
else:
    df_mainstream_mob = process_files(
        filesn, start_date, end_date, num_workers=8, mf=get_users, args_mf=regex_mainstream, is_mobile=True
    )
    # Collapse rows that share user, platform, day and source into one row.
    mob_keys = ["mobile_id", "platform", "activitydatetime", "is_app"]
    df_mainstream_mob = df_mainstream_mob.groupby(mob_keys).sum().reset_index()
    df_mainstream_mob.to_csv(path, index=False)


In [7]:
def get_users(df):
    """Total mobile activity per user, day and source (app vs. web).

    Rows with a missing ``domain_name`` are flagged ``is_app=True``.  The
    caller's frame is left untouched: the flag is added to a copy rather than
    written into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Mobile records; the columns ``mobile_id``, ``activitydatetime``,
        ``duration``, ``app_name`` and ``domain_name`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per (``mobile_id``, day, ``is_app``) with the summed
        ``duration`` and the number of rows (stored in ``app_name``).  An
        empty input yields a column-less empty frame, like ``get_all``.
    """
    if len(df) == 0:
        return pd.DataFrame()

    flagged = df.assign(is_app=df.domain_name.isna())
    keys = ["mobile_id", pd.Grouper(key="activitydatetime", freq="D"), "is_app"]
    # "sum" (string) keeps pandas' own group sum; the builtin is deprecated here.
    daily = flagged.groupby(keys).agg({"duration": "sum", "app_name": len}).reset_index()
    daily.activitydatetime = pd.to_datetime(daily.activitydatetime)
    return daily
    
# Daily per-user totals over all mobile activity, cached as a CSV at `path`.
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-05-01", "2021-07-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)
path = "/data/deplatforming/data/tmp_wall/df_all_to_weight_{}_to_{}_mob.csv".format(start_date, end_date)

if not recompute:
    df_all_mob = pd.read_csv(path)
    df_all_mob["activitydatetime"] = pd.to_datetime(df_all_mob["activitydatetime"])
else:
    df_all_mob = process_files(filesn, start_date, end_date, num_workers=8, mf=get_users, is_mobile=True)
    # Collapse rows that share user, day and source into a single row.
    df_all_mob = df_all_mob.groupby(["mobile_id", "activitydatetime", "is_app"]).sum().reset_index()
    df_all_mob.to_csv(path, index=False)

# Experiment


In [8]:
def get_users_parler(df, reg):
    """Return the rows of ``df`` whose ``url`` matches ``reg`` from its start.

    ``reg`` is a compiled pattern used through ``reg.match``; the selected
    rows keep their original index and all of their columns.
    """
    is_match = df.url.apply(lambda address: reg.match(address) is not None)
    return df[is_match]

# Desktop records whose URL points at parler.com, cached as a CSV at `path`.
#
# Raw string: the pattern contains `\.` and `\-`, which are invalid escape
# sequences in a normal string literal (they warn on current Python versions).
# The compiled pattern is unchanged: optional scheme, optional "www.",
# optional one extra sub-domain label, then "parler.com".
# NOTE(review): the pattern is only anchored at the start, so hosts that merely
# begin with "parler.com" (e.g. "parler.community") match as well.
regex_parler = re.compile(r"(http(s)?://(www\.)?([a-zA-Z0-9\-_]+\.)?)?(parler)\.(com)")
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-12-01", "2021-01-01"
filesn, dem = get_s3_files(start_date, end_date)
path = "/data/deplatforming/data/tmp_parler/df_parler2_{}_to_{}.csv".format(start_date, end_date)

if recompute:
    df_parler = process_files(filesn, start_date, end_date, num_workers=4,
                              mf=get_users_parler, args_mf=regex_parler)
    df_parler.to_csv(path, index=False)

else:
    df_parler = pd.read_csv(path)
    df_parler.activitydatetime = pd.to_datetime(df_parler.activitydatetime)

In [9]:
def get_users_parler_all(df, u):
    """Return every desktop record that belongs to one of the users in ``u``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``nol_id``, ``activitydatetime``,
        ``viewduration`` and ``url``.
    u : set or list-like
        User ids (``nol_id`` values) to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows restricted to the four columns above, original
        index preserved.
    """
    wanted = ["nol_id", "activitydatetime", "viewduration", "url"]
    # Vectorised membership test instead of a Python-level lambda per row;
    # it also gives a proper boolean mask when `df` is empty.
    return df.loc[df.nol_id.isin(u), wanted]

recompute = False  # True: rebuild from the raw files and overwrite the cache

# Treated group: users whose summed Parler `viewduration` in `df_parler`
# exceeds 180 (presumably seconds -- TODO confirm the unit).
tmp = df_parler.groupby("nol_id")["viewduration"].sum().sort_values()
tmp = tmp.loc[tmp > 180].index.values
treated_users = set(tmp)
print(len(treated_users))

# Full desktop history of the treated users over a longer window.
start_date, end_date = "2020-12-01", "2021-06-01"
filesn, dem = get_s3_files(start_date, end_date)

path = "/data/deplatforming/data/tmp_parler/df_parler2_all_{}_to_{}.csv".format(start_date, end_date)
if not recompute:
    df_parler_all = pd.read_csv(path)
    df_parler_all["activitydatetime"] = pd.to_datetime(df_parler_all["activitydatetime"])
else:
    df_parler_all = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users_parler_all, args_mf=treated_users
    )
    df_parler_all.to_csv(path, index=False)
    

172


In [10]:
def get_users_control(df, reg):
    """Return the rows of ``df`` whose ``url`` matches ``reg`` from its start.

    ``reg`` is a compiled pattern used through ``reg.match``; the selected
    rows keep their original index and all of their columns.
    """
    is_match = df.url.apply(lambda address: reg.match(address) is not None)
    return df[is_match]

# Desktop records whose URL matches the 'fringe' regex, cached as a CSV at
# `path`; used to pick the control users.
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-12-01", "2021-01-01"
filesn, dem = get_s3_files(start_date, end_date)
path = "/data/deplatforming/data/tmp_parler/df_control2_{}_to_{}.csv".format(start_date, end_date)
regex_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)

if not recompute:
    df_control = pd.read_csv(path)
    df_control["activitydatetime"] = pd.to_datetime(df_control["activitydatetime"])
else:
    df_control = process_files(
        filesn, start_date, end_date, num_workers=4, mf=get_users_control, args_mf=regex_fringe
    )
    df_control.to_csv(path, index=False)

In [11]:
def get_users_control_all(df, u):
    """Return every desktop record that belongs to one of the users in ``u``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``nol_id``, ``activitydatetime``,
        ``viewduration`` and ``url``.
    u : set or list-like
        User ids (``nol_id`` values) to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows restricted to the four columns above, original
        index preserved.
    """
    wanted = ["nol_id", "activitydatetime", "viewduration", "url"]
    # Vectorised membership test instead of a Python-level lambda per row;
    # it also gives a proper boolean mask when `df` is empty.
    return df.loc[df.nol_id.isin(u), wanted]

recompute = False  # True: rebuild from the raw files and overwrite the cache

# Control group: users whose summed `viewduration` in `df_control` exceeds 180
# (presumably seconds -- TODO confirm the unit), minus the treated users.
tmp = df_control.groupby("nol_id")["viewduration"].sum().sort_values()
tmp = tmp.loc[tmp > 180].index.values
control_users = set(tmp).difference(treated_users)

# Full desktop history of the control users over a longer window.
start_date, end_date = "2020-12-01", "2021-06-01"
filesn, dem = get_s3_files(start_date, end_date)

path = "/data/deplatforming/data/tmp_parler/df_control2_all_{}_to_{}.csv".format(start_date, end_date)
if not recompute:
    df_control_all = pd.read_csv(path)
    df_control_all["activitydatetime"] = pd.to_datetime(df_control_all["activitydatetime"])
else:
    df_control_all = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users_control_all, args_mf=control_users
    )
    df_control_all.to_csv(path, index=False)

## Mobile

In [12]:
def get_users(df, reg):
    """Select mobile Parler records (web or app) and label them.

    A row is kept when its ``domain_name`` matches ``reg`` (``reg.match``) or
    its ``app_name`` is "Parler".  App rows get ``domain_name`` set to
    "parler.com" and ``is_app=True``; every kept row gets a ``platform``
    label taken from the regex match.  No aggregation is performed.

    Returns the selected rows (a copy) with the extra ``is_app`` and
    ``platform`` columns and ``activitydatetime`` parsed as datetimes.
    """
    apps = ["Parler"]

    def _platform_of(domain):
        # Second-to-last capture group that took part in the match.
        captured = [grp for grp in reg.match(domain).groups() if grp is not None]
        return captured[-2]

    web_hit = df.domain_name.apply(lambda x: not pd.isna(x) and reg.match(x) is not None)
    app_hit = df.app_name.apply(lambda x: not pd.isna(x) and x in apps)
    rows = df[web_hit | app_hit].copy(deep=True)

    rows.loc[rows.app_name == "Parler", "domain_name"] = "parler.com"

    rows["is_app"] = False
    rows.loc[rows.app_name.isin(apps), "is_app"] = True

    rows["platform"] = rows.domain_name.apply(_platform_of)
    rows.activitydatetime = pd.to_datetime(rows.activitydatetime)
    return rows
    
# Mobile Parler records (web or app), cached as a CSV at `path`.
#
# Raw string: the pattern contains `\.` and `\-`, which are invalid escape
# sequences in a normal string literal (they warn on current Python versions).
# The compiled pattern is unchanged: optional scheme, optional "www.",
# optional one extra sub-domain label, then "parler.com".
regex_parler = re.compile(r"(http(s)?://(www\.)?([a-zA-Z0-9\-_]+\.)?)?(parler)\.(com)")
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-12-01", "2021-01-01"
filesn, ap, dem = get_s3_files(start_date, end_date, is_mobile=True,
                               pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/")

path = "/data/deplatforming/data/tmp_parler/df_parler2_{}_to_{}_mob.csv".format(start_date, end_date)

if recompute:
    df_parler_mob = process_files(filesn, start_date, end_date, num_workers=3, mf=get_users,
                                  args_mf=regex_parler, is_mobile=True)
    df_parler_mob.to_csv(path, index=False)

else:
    df_parler_mob = pd.read_csv(path)
    df_parler_mob.activitydatetime = pd.to_datetime(df_parler_mob.activitydatetime)

In [13]:
def get_users(df, u):
    """Return every mobile record that belongs to one of the users in ``u``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``mobile_id``, ``activitydatetime``,
        ``duration``, ``app_name`` and ``domain_name``.
    u : set or list-like
        User ids (``mobile_id`` values) to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows restricted to the five columns above, original
        index preserved.
    """
    wanted = ["mobile_id", "activitydatetime", "duration", "app_name", "domain_name"]
    # Vectorised membership test instead of a Python-level lambda per row;
    # it also gives a proper boolean mask when `df` is empty.
    return df.loc[df.mobile_id.isin(u), wanted]

recompute = False  # True: rebuild from the raw files and overwrite the cache

# Treated mobile group: users whose summed Parler `duration` in
# `df_parler_mob` exceeds 180 (presumably seconds -- TODO confirm the unit).
tmp = df_parler_mob.groupby("mobile_id")["duration"].sum().sort_values()
tmp = tmp.loc[tmp > 180].index.values
treated_users_mob = set(tmp)
print(len(treated_users_mob))

# Full mobile history of the treated users over a longer window.
start_date, end_date = "2020-12-01", "2021-06-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)

path = "/data/deplatforming/data/tmp_parler/df_parler2_all_{}_to_{}_mob.csv".format(start_date, end_date)
if not recompute:
    df_parler_all_mob = pd.read_csv(path)
    df_parler_all_mob["activitydatetime"] = pd.to_datetime(df_parler_all_mob["activitydatetime"])
else:
    df_parler_all_mob = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users, args_mf=treated_users_mob, is_mobile=True
    )
    df_parler_all_mob.to_csv(path, index=False)

283


In [14]:
def get_users(df, reg):
    """Select mobile fringe-platform records (web or app) and label them.

    A row is kept when its ``domain_name`` matches ``reg`` (``reg.match``) or
    its ``app_name`` is one of the apps below.  App rows get the matching
    website domain written into ``domain_name`` and ``is_app=True``; every
    kept row gets a ``platform`` label taken from the regex match.  No
    aggregation is performed.

    Returns the selected rows (a copy) with the extra ``is_app`` and
    ``platform`` columns and ``activitydatetime`` parsed as datetimes.
    """
    app_domains = {
        "Rumble - Video Battles": "rumble.com",
        "MeWe Network": "mewe.com",
        "Parler": "parler.com",
        "DLive · Stream on Blockchain": "dlive.tv",
        "Telegram Messenger": "telegram.org",
    }
    apps = list(app_domains)

    def _platform_of(domain):
        # Second-to-last capture group that took part in the match.
        captured = [grp for grp in reg.match(domain).groups() if grp is not None]
        return captured[-2]

    web_hit = df.domain_name.apply(lambda x: not pd.isna(x) and reg.match(x) is not None)
    app_hit = df.app_name.apply(lambda x: not pd.isna(x) and x in apps)
    rows = df[web_hit | app_hit].copy(deep=True)

    for app, domain in app_domains.items():
        rows.loc[rows.app_name == app, "domain_name"] = domain

    rows["is_app"] = False
    rows.loc[rows.app_name.isin(apps), "is_app"] = True

    rows["platform"] = rows.domain_name.apply(_platform_of)
    rows.activitydatetime = pd.to_datetime(rows.activitydatetime)
    return rows
    
# Mobile records on 'fringe' platforms (web or app), cached as a CSV at
# `path`; used to pick the mobile control users.
regex_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)
recompute = False  # True: rebuild from the raw files and overwrite the cache
start_date, end_date = "2020-12-01", "2021-01-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)
path = "/data/deplatforming/data/tmp_parler/df_control2_{}_to_{}_mob.csv".format(start_date, end_date)

if not recompute:
    df_control_mob = pd.read_csv(path)
    df_control_mob["activitydatetime"] = pd.to_datetime(df_control_mob["activitydatetime"])
else:
    df_control_mob = process_files(
        filesn, start_date, end_date, num_workers=4, mf=get_users, args_mf=regex_fringe, is_mobile=True
    )
    df_control_mob.to_csv(path, index=False)

In [15]:
def get_users(df, u):
    """Return every mobile record that belongs to one of the users in ``u``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least ``mobile_id``, ``activitydatetime``,
        ``duration``, ``app_name`` and ``domain_name``.
    u : set or list-like
        User ids (``mobile_id`` values) to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows restricted to the five columns above, original
        index preserved.
    """
    wanted = ["mobile_id", "activitydatetime", "duration", "app_name", "domain_name"]
    # Vectorised membership test instead of a Python-level lambda per row;
    # it also gives a proper boolean mask when `df` is empty.
    return df.loc[df.mobile_id.isin(u), wanted]

recompute = False  # True: rebuild from the raw files and overwrite the cache

# Mobile control group: users whose summed `duration` in `df_control_mob`
# exceeds 180 (presumably seconds -- TODO confirm the unit), minus the
# treated mobile users.
tmp = df_control_mob.groupby("mobile_id")["duration"].sum().sort_values()
tmp = tmp.loc[tmp > 180].index.values
control_users_mob = set(tmp).difference(treated_users_mob)
print(len(control_users_mob))

# Full mobile history of the control users over a longer window.
start_date, end_date = "2020-12-01", "2021-06-01"
filesn, ap, dem = get_s3_files(
    start_date,
    end_date,
    is_mobile=True,
    pathv="s3://epfl-collaboration-paspkaoe1nx9ptad1k8rjh5maact6use1a-s3alias/tmp/",
)

path = "/data/deplatforming/data/tmp_parler/df_control2_all_{}_to_{}_mob.csv".format(start_date, end_date)
if not recompute:
    df_control_all_mob = pd.read_csv(path)
    df_control_all_mob["activitydatetime"] = pd.to_datetime(df_control_all_mob["activitydatetime"])
else:
    df_control_all_mob = process_files(
        filesn, start_date, end_date, num_workers=15, mf=get_users, args_mf=control_users_mob, is_mobile=True
    )
    df_control_all_mob.to_csv(path, index=False)

659


# Final prep

In [16]:
# Desktop

import regex
import multiprocessing as mp

# Tag each record with its experimental arm, then stack both arms.
# NOTE(review): this rebinds `df_all`, which an earlier cell used for the
# all-desktop daily totals -- that frame is no longer reachable afterwards.
df_parler_all["is_treat"] = 1
df_control_all["is_treat"] = 0

df_all = pd.concat([df_parler_all, df_control_all], ignore_index=True)

def reg_group(series, reg):
    """Return the platform label ``reg`` extracts from one string, or ``None``.

    Parameters
    ----------
    series : str
        A single value to test (despite the name, not a pandas Series: it is
        handed straight to ``reg.match``).
    reg : compiled regular expression
        Pattern applied with ``reg.match`` (anchored at the start).

    Returns
    -------
    str or None
        The second-to-last capture group that took part in the match, or
        ``None`` when ``reg`` does not match.

    Raises
    ------
    IndexError
        If fewer than two capture groups took part in the match.
    """
    found = reg.match(series)
    if found is None:
        return None

    # Reuse the match object instead of running the regex a second time.
    return [y for y in found.groups() if y is not None][-2]
    
reg_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)
reg_osn = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='osn',
        domain_keys='domains',
    )
)

df_all["platform"] = None

# Pass 1: label every row whose URL matches the 'fringe' regex.
df_all.loc[df_all["platform"].isna(), "platform"] = (
    df_all.loc[df_all["platform"].isna(), "url"]
    .swifter.apply(lambda x: reg_group(x, reg_fringe))
)

# Pass 2: rows still unlabelled are tried against the 'osn' regex.
df_all.loc[df_all["platform"].isna(), "platform"] = (
    df_all.loc[df_all["platform"].isna(), "url"]
    .swifter.apply(lambda x: reg_group(x, reg_osn))
)

path = "/data/deplatforming/data/df_parler2_final.csv.gz"
df_all.to_csv(path, index=False, compression="infer")

Pandas Apply:   0%|          | 0/5354709 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/5265568 [00:00<?, ?it/s]

In [17]:
# Mobile

# Tag each record with its experimental arm, then stack both arms.
# NOTE(review): this rebinds `df_all_mob`, which an earlier cell used for the
# all-mobile daily totals.
df_control_all_mob["is_treat"] = 0
df_parler_all_mob["is_treat"] = 1

df_all_mob = pd.concat([df_control_all_mob, df_parler_all_mob], ignore_index=True)

# App usage carries no domain; write the matching website domain so the
# platform regexes can label app rows as well.
# NOTE(review): Pinterest, LinkedIn, Snapchat, WhatsApp and Nextdoor apps are
# mapped in the mainstream mobile extractor but not here, so their app rows
# keep a missing domain -- confirm this omission is intended.
df_all_mob.loc[df_all_mob["app_name"] == "Rumble - Video Battles", "domain_name"] = "rumble.com"
df_all_mob.loc[df_all_mob["app_name"] == "MeWe Network", "domain_name"] = "mewe.com"
df_all_mob.loc[df_all_mob["app_name"] == "Parler", "domain_name"] = "parler.com"
df_all_mob.loc[df_all_mob["app_name"] == "DLive · Stream on Blockchain", "domain_name"] = "dlive.tv"
df_all_mob.loc[df_all_mob["app_name"] == "Telegram Messenger", "domain_name"] = "telegram.org"
df_all_mob.loc[df_all_mob["app_name"] == "Facebook", "domain_name"] = "facebook.com"
df_all_mob.loc[df_all_mob["app_name"] == "Facebook Lite", "domain_name"] = "facebook.com"
df_all_mob.loc[df_all_mob["app_name"] == "Twitter", "domain_name"] = "twitter.com"
df_all_mob.loc[df_all_mob["app_name"] == "Reddit: Trending News & Tips", "domain_name"] = "reddit.com"
df_all_mob.loc[df_all_mob["app_name"] == "Instagram", "domain_name"] = "instagram.com"
df_all_mob.loc[df_all_mob["app_name"] == "YouTube: Watch, Listen, Stream", "domain_name"] = "youtube.com"
df_all_mob.loc[df_all_mob["app_name"] == "TikTok", "domain_name"] = "tiktok.com"
df_all_mob.loc[df_all_mob["app_name"] == "TikTok - Make Your Day", "domain_name"] = "tiktok.com"

def reg_group(series, reg):
    """Return the platform label ``reg`` extracts from one string, or ``None``.

    Parameters
    ----------
    series : str
        A single value to test (despite the name, not a pandas Series: it is
        handed straight to ``reg.match``).
    reg : compiled regular expression
        Pattern applied with ``reg.match`` (anchored at the start).

    Returns
    -------
    str or None
        The second-to-last capture group that took part in the match, or
        ``None`` when ``reg`` does not match.

    Raises
    ------
    IndexError
        If fewer than two capture groups took part in the match.
    """
    found = reg.match(series)
    if found is None:
        return None

    # Reuse the match object instead of running the regex a second time.
    return [y for y in found.groups() if y is not None][-2]
    
reg_fringe = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='fringe',
        domain_keys='domains',
    )
)
reg_osn = re.compile(
    get_regex_domains(
        path_to_regexes='./data/domains.csv',
        category_key='category',
        category='osn',
        domain_keys='domains',
    )
)

df_all_mob["platform"] = None

# Pass 1: rows that have a domain are tried against the 'fringe' regex.
bool_filter = df_all_mob["platform"].isna() & df_all_mob["domain_name"].notna()
df_all_mob.loc[bool_filter, "platform"] = (
    df_all_mob.loc[bool_filter, "domain_name"]
    .swifter.apply(lambda x: reg_group(x, reg_fringe))
)

# Pass 2: rows still unlabelled (and with a domain) are tried against 'osn'.
bool_filter = df_all_mob["platform"].isna() & df_all_mob["domain_name"].notna()
df_all_mob.loc[bool_filter, "platform"] = (
    df_all_mob.loc[bool_filter, "domain_name"]
    .swifter.apply(lambda x: reg_group(x, reg_osn))
)

path = "/data/deplatforming/data/df_parler2_final_mob.csv.gz"
df_all_mob.to_csv(path, index=False, compression="infer")

Pandas Apply:   0%|          | 0/6829657 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/6610412 [00:00<?, ?it/s]

## Additional normalization



In [18]:
# Bring the desktop and mobile record-level files to one shared schema and
# write per-user / arm / platform / day totals for each.
df_desktop = pd.read_csv("/data/deplatforming/data/df_parler2_final.csv.gz")
df_mobile = pd.read_csv("/data/deplatforming/data/df_parler2_final_mob.csv.gz")
df_desktop["activitydatetime"] = pd.to_datetime(df_desktop["activitydatetime"])
df_mobile["activitydatetime"] = pd.to_datetime(df_mobile["activitydatetime"])

df_desktop["app_name"] = None
df_desktop["is_mobile"] = False
df_desktop = df_desktop.loc[:, ["nol_id", "activitydatetime", "viewduration", 
              "app_name", "url", "is_treat", "platform", "is_mobile"]]
df_mobile["is_mobile"] = True
# Rename by name, not by position: assigning `df_desktop.columns` wholesale
# would silently mislabel data if the mobile column order ever changed.  A
# missing column now fails loudly in the `.loc` selection instead.
df_mobile = df_mobile.rename(columns={"mobile_id": "nol_id",
                                      "duration": "viewduration",
                                      "domain_name": "url"})\
        .loc[:, df_desktop.columns]

# Anything neither regex labelled is grouped under "other".
df_desktop.loc[df_desktop.platform.isna(), "platform"] = "other"
df_mobile.loc[df_mobile.platform.isna(), "platform"] = "other"

# "sum" (string) keeps pandas' own group sum; the builtin is deprecated here.
df_mobile_prep = df_mobile.groupby(["nol_id", "is_treat", "platform", 
                                    pd.Grouper(key="activitydatetime", freq="D")])\
        .agg({"viewduration": "sum"}).reset_index()

df_desktop_prep = df_desktop.groupby(["nol_id", "is_treat", "platform", 
                                      pd.Grouper(key="activitydatetime", freq="D")])\
        .agg({"viewduration": "sum"}).reset_index()

path = "/data/deplatforming/data/df_parler2_final_prep.csv.gz"
df_desktop_prep.to_csv(path, index=False, compression="infer")

path = "/data/deplatforming/data/df_parler2_final_mob_prep.csv.gz"
df_mobile_prep.to_csv(path, index=False, compression="infer")