# WHOIS requests on the first URL of each quote

## 0 Needed packages and functions

### 0.1 Importing needed packages

In [1]:
import numpy as np
import pandas as pd
import bz2
import json
from whois import whois
from tqdm import tqdm
tqdm.pandas()

### 0.2 Function to perform WHOIS request

In [2]:
def tryandget(url):
    try:
        who= whois(url)
        return [who.org, who.resgistrar, who.country, who.state, who.city] #[who.org, who.country, who.city]
    except:
        pass
    return [None, None, None, None, None]

## 1 Data preparation for request

### 1.1 Reading sample data

In [3]:
df= pd.read_json("/Users/nicolasantacroce/Desktop/Desktop/EPFL/EPFL MA1/Applied Data Analysis/Sample.json.bz2",compression="bz2",lines=True)

### 1.2 Extracting domains from urls

In [4]:
df["domain"]= df['urls'].apply(lambda x: "/".join(x[0].split('/',3)[:3]))

### 1.3 Creating a dataframe of unique domains

In [5]:
df_domain= pd.DataFrame(df.value_counts("domain")).reset_index()
df_domain.describe()

Unnamed: 0,0
count,26582.0
mean,47.543074
std,163.114553
min,1.0
25%,1.0
50%,7.0
75%,34.0
max,5220.0


## 2 Getting WHOIS informations

### 2.1 Performing WHOIS requests on domains (takes a lot of time)

In [None]:
df_domain["origin"]= df_domain["domain"].apply(lambda x: tryandget(x))

### 2.2 Saving domain dataframe to pickle format

In [None]:
df_domain.to_pickle("./whois_result.pkl")
df_domain.head(10)

## 3 Joining WHOIS info to sample data

### 3.1 Loading domain dataframe from pickle

In [6]:
df_domain= pd.read_pickle("./whois_result.pkl")
df_domain.head(10)

Unnamed: 0,domain,0,origin
0,http://www.breitbart.com,5220,"[Domains By Proxy, LLC, None, US, Arizona, Tempe]"
1,http://www.msn.com,4505,"[Microsoft Corporation, None, US, WA, Redmond]"
2,http://home.nzcity.co.nz,4372,"[None, None, None, None, None]"
3,http://www.stuff.co.nz,3766,"[None, None, None, None, None]"
4,https://www.thesun.co.uk,3726,"[None, None, None, None, None]"
5,http://msn.com,3649,"[Microsoft Corporation, None, US, WA, Redmond]"
6,http://express.co.uk,3487,"[None, None, None, None, None]"
7,http://mlb.mlb.com,3189,"[MLB Advanced Media, LP, None, US, NY, New York]"
8,http://thehill.com,3142,"[None, None, US, FL, Jacksonville]"
9,https://www.seattletimes.com,3012,"[SEATTLE TIMES COMPANY, None, US, WA, SEATTLE]"


In [23]:
df_domain[["org","registar","country","state","city"]]= df_domain["origin"].apply(lambda x : pd.Series(x))
df_domain.head(10)

  df_domain[["org","registar","country","state","city"]]= df_domain["origin"].apply(lambda x : pd.Series(x))


Unnamed: 0,domain,0,origin,org,registar,country,state,city
0,http://www.breitbart.com,5220,"[Domains By Proxy, LLC, None, US, Arizona, Tempe]","Domains By Proxy, LLC",,US,Arizona,Tempe
1,http://www.msn.com,4505,"[Microsoft Corporation, None, US, WA, Redmond]",Microsoft Corporation,,US,WA,Redmond
2,http://home.nzcity.co.nz,4372,"[None, None, None, None, None]",,,,,
3,http://www.stuff.co.nz,3766,"[None, None, None, None, None]",,,,,
4,https://www.thesun.co.uk,3726,"[None, None, None, None, None]",,,,,
5,http://msn.com,3649,"[Microsoft Corporation, None, US, WA, Redmond]",Microsoft Corporation,,US,WA,Redmond
6,http://express.co.uk,3487,"[None, None, None, None, None]",,,,,
7,http://mlb.mlb.com,3189,"[MLB Advanced Media, LP, None, US, NY, New York]","MLB Advanced Media, LP",,US,NY,New York
8,http://thehill.com,3142,"[None, None, US, FL, Jacksonville]",,,US,FL,Jacksonville
9,https://www.seattletimes.com,3012,"[SEATTLE TIMES COMPANY, None, US, WA, SEATTLE]",SEATTLE TIMES COMPANY,,US,WA,SEATTLE


In [24]:
df_domain.describe()

Unnamed: 0,0,registar
count,26582.0,0.0
mean,47.543074,
std,163.114553,
min,1.0,
25%,1.0,
50%,7.0,
75%,34.0,
max,5220.0,
