# Top sites

In [115]:
import pandas as pd
from os.path import join
from os import listdir
from src.shared.utils import get_country
from datetime import datetime

## Parsing the dataframes

In [143]:
def parse_df (df: pd.DataFrame) -> pd.DataFrame:
    '''
    Returns a df with columns
    name, marketshare
    '''
    def percentify (x):
        try:
            n = x.split('%')[0]
            return float(n)/100
        except:
            return 0
    # name of columns where percentages are
    perc_col_name = [c for c in df.columns if c.startswith('Percentage')][0]
    df['marketshare'] = df[perc_col_name].apply(percentify)
    # if this is a heirarchical csv,
    # get top-level entries only
    if 'Rank' in df.columns:
        df['top-level'] = df['Rank'].apply(lambda x: str(x).endswith('.0'))
        df = df[df['top-level']==True]
        # get names from 1st column
        n = df.columns[1]
    else:
        # get names from 0th column
        n = df.columns[0]
    # get jurisdictions
    df['name'] = df[n]
    # remove 'and territories' for server locations
    df['name'] = df['name'].apply(lambda x: x.split(' and territories')[0])
    df['jurisdiction_alpha2'] = df['name'].apply(get_country)
    return df[['name', 'marketshare', 'jurisdiction_alpha2']]

In [144]:
ex_fn = listdir('top-sites')[1]
ex_df = pd.read_csv(join('top-sites', ex_fn))
parse_df(ex_df)

Unnamed: 0,name,marketshare,jurisdiction_alpha2
0,United States,0.4323,US
1,Germany,0.1899,DE
2,France,0.1329,FR
3,China,0.1147,CN
4,Japan,0.0576,JP
...,...,...,...
78,Armenia,0.0002,AM
79,Panama,0.0002,PA
80,Kuwait,0.0002,KW
81,Cambodia,0.0002,KH


## Extracting market/top-n from filenames

In [162]:
dfs = []
for my_dir in listdir('top-sites'):
    fn = my_dir.split('.csv')[0]
    if fn.split('-')[1]=='hierarchy':
        market, h, top_n, date_str  = fn.split('-')
        date = datetime.strptime(date_str, '%Y%M')
        print(market, top_n, date)
        df = pd.read_csv(join('top-sites', my_dir))
        df = parse_df(df)
        df['market'] = market
        df['top_n'] = top_n
        df['date'] = date
        dfs.append(df)

dns_servers top_10k 2021-01-01 00:06:00
server_locations top_10k 2021-01-01 00:06:00
ssl_certificate top_10k 2021-01-01 00:06:00
data_center top_1k 2021-01-01 00:06:00
dns_servers top_1k 2021-01-01 00:06:00
web_hosting top_10k 2021-01-01 00:06:00
server_locations top_1k 2021-01-01 00:06:00
web_hosting top_1k 2021-01-01 00:06:00
ssl_certificate top_1k 2021-01-01 00:06:00
data_center top_10k 2021-01-01 00:06:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df[n]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['name'] = df['name'].apply(lambda x: x.split(' and territories')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['jurisdiction_alpha2'] = df['name'].apply(get_country)
A value is trying to be set on a copy of a slice from a 

In [163]:
pd.concat(dfs).to_csv('out/top-sites-combined.csv')

# Simple analyseshh,,  hh,,  

In [164]:
df = pd.read_csv('out/top-sites-combined.csv').drop('Unnamed: 0', axis=1)

In [165]:
df.groupby(['market', 'jurisdiction_alpha2', 'top_n']).sum().sort_values(by='marketshare', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,marketshare
market,jurisdiction_alpha2,top_n,Unnamed: 3_level_1
ssl_certificate,US,top_10k,1.0089
ssl_certificate,US,top_1k,0.9610
dns_servers,US,top_1k,0.7990
dns_servers,US,top_10k,0.7276
server_locations,US,top_1k,0.5240
...,...,...,...
data_center,ZA,top_10k,0.0001
data_center,LT,top_10k,0.0001
data_center,EE,top_10k,0.0001
data_center,CH,top_10k,0.0001
