## Load and test the raw dataset

#### **Option 1:** Load the raw data manually

In [2]:
# Load the raw, tab-separated dataset by hand into DATA, a list with one
# {field name: string value} dictionary per data line of the file.
f_name = "raw_data.csv"

DATA = []

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so this loader reads the same text as the pandas-based loader
# (pd.read_csv decodes as UTF-8 by default).
with open(f_name, "r", encoding="utf-8") as fp:

    # Get the field names from the first line of the csv
    headers = fp.readline().rstrip("\n").split("\t")
    print("Available fields:", headers)

    # Loop through the remaining lines of the csv file to load the raw data
    for line in fp:
        line = line.rstrip("\n")  # cleanup step: drop the line terminator

        # A blank line would otherwise become a bogus one-field row
        # ({first header: ""}); pd.read_csv skips blank lines by default too.
        if not line:
            continue

        # Join headers with data. zip() stops at the shorter sequence, so a
        # short line yields a row with fewer keys and surplus fields on a
        # long line are ignored.
        DATA.append(dict(zip(headers, line.split("\t"))))

print("Total number of rows: {}".format(len(DATA)))

Available fields: ['Timestamp', 'FirstPartyDomain', 'Country', 'RequestedDomain', 'ReferrerDomain', 'RequestType', 'ServerIP']
Total number of rows: 3413570


In [3]:
import pandas as pd

# Build the DataFrame from the manually parsed rows. Every value in DATA is a
# string, so an empty field is stored as "" and would be counted as present
# by the groupby(...).count() statistics further down. Normalise the frame so
# it behaves like the one loaded with pd.read_csv:
#   * empty strings become missing values (NaN), and
#   * Timestamp is converted from text to a numeric dtype.
# NOTE(review): pd.read_csv also treats markers such as "NA" or "NULL" as
# missing by default; only empty fields are handled here.
df = pd.DataFrame(DATA)
df = df.mask(df.eq(""))

# Guard against an empty DATA list, which yields a frame without columns.
if "Timestamp" in df.columns:
    df["Timestamp"] = pd.to_numeric(df["Timestamp"])

df

Unnamed: 0,Timestamp,FirstPartyDomain,Country,RequestedDomain,ReferrerDomain,RequestType,ServerIP
0,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,bs.serving-sys.com,script,82.199.68.72
1,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,securepubads.g.doubleclick.net,image,216.58.208.34
2,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,ds.serving-sys.com,script,195.14.151.154
3,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,bs.serving-sys.com,script,82.199.68.72
4,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,ds.serving-sys.com,script,195.14.151.147
...,...,...,...,...,...,...,...
3413565,1577654581,reporter.com.cy,United States,tpc.googlesyndication.com,securepubads.g.doubleclick.net,image,172.217.203.154
3413566,1577654581,reporter.com.cy,United States,ads.tbwa-entelia.com,s0.2mdn.net,script,173.194.216.149
3413567,1577654581,reporter.com.cy,United States,reporter-com-cy.os.tc,cdn.onesignal.com,script,104.18.226.52
3413568,1577654581,reporter.com.cy,United States,reporter-com-cy.os.tc,cdn.onesignal.com,script,104.18.226.52


#### **Option 2:** Load the raw data using pandas

In [4]:
# Alternatively, you can load the raw data directly into a pandas DataFrame
import pandas as pd

# Path of the raw dataset; despite the .csv extension the fields are
# separated by tabs.
f_name = "raw_data.csv"

# read_table is read_csv with a tab as the default separator, so no explicit
# sep argument is needed. The first line of the file supplies the column names.
df = pd.read_table(f_name)

# Leave the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,Timestamp,FirstPartyDomain,Country,RequestedDomain,ReferrerDomain,RequestType,ServerIP
0,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,bs.serving-sys.com,script,82.199.68.72
1,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,securepubads.g.doubleclick.net,image,216.58.208.34
2,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,ds.serving-sys.com,script,195.14.151.154
3,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,bs.serving-sys.com,script,82.199.68.72
4,1505229096,ant1iwo.com,Cyprus,tpc.googlesyndication.com,ds.serving-sys.com,script,195.14.151.147
...,...,...,...,...,...,...,...
3413565,1577654581,reporter.com.cy,United States,tpc.googlesyndication.com,securepubads.g.doubleclick.net,image,172.217.203.154
3413566,1577654581,reporter.com.cy,United States,ads.tbwa-entelia.com,s0.2mdn.net,script,173.194.216.149
3413567,1577654581,reporter.com.cy,United States,reporter-com-cy.os.tc,cdn.onesignal.com,script,104.18.226.52
3413568,1577654581,reporter.com.cy,United States,reporter-com-cy.os.tc,cdn.onesignal.com,script,104.18.226.52


#### Print some raw data statistics, e.g., the number of rows (logged requests) per user country

In [5]:
# Bucket the rows by the value of their Country column, then count the
# non-missing entries of every other column inside each bucket.
rows_by_country = df.groupby("Country")
df_country = rows_by_country.count()

# Render the per-country table as the cell output.
df_country

Unnamed: 0_level_0,Timestamp,FirstPartyDomain,RequestedDomain,ReferrerDomain,RequestType,ServerIP
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Argentina,1683,1683,1683,1683,1683,1683
Austria,2027,2027,2027,2027,2027,2027
Bangladesh,1825,1825,1825,1825,1825,1825
Belgium,2307,2307,2307,2307,2307,2307
Bolivia,4325,4325,4325,4325,4325,4325
Bosnia and Herzegovina,9450,9450,9450,9450,9450,9450
Brazil,76127,76127,76127,76127,76127,76127
Canada,22618,22618,22618,22618,22618,22618
Chile,428,428,428,428,428,428
Colombia,4113,4113,4113,4113,4113,4113


In [6]:
# Bucket the rows by the value of their RequestType column, then count the
# non-missing entries of every other column inside each bucket.
rows_by_request_type = df.groupby("RequestType")
df_types = rows_by_request_type.count()

# Render the per-request-type table as the cell output.
df_types

Unnamed: 0_level_0,Timestamp,FirstPartyDomain,Country,RequestedDomain,ReferrerDomain,ServerIP
RequestType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
csp_report,5,5,5,5,5,5
font,173763,173749,172929,173763,173763,173763
image,1645136,1645092,1642393,1645136,1645136,1645136
media,2009,2009,2009,2009,2009,2009
object,10,10,10,10,10,10
other,1343,1343,1343,1343,1343,1343
ping,35745,35745,35725,35745,35745,35745
script,946142,946119,944770,946142,946142,946142
stylesheet,142163,142160,141789,142163,142163,142163
sub_frame,305682,305679,305215,305682,305682,305682
