This notebook loads the parsed news data and inspects it (column analysis, missing data, etc.).
The data is located at: <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/us_part1 <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/us_part2 <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/uk <br/>
And the html files are at http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/tmp/Netquest/downloads/us/2019-04-08/

In [None]:
import pandas as pd
import numpy as np
import glob
import tldextract
from bs4 import BeautifulSoup
from statistics import mean 
import matplotlib.pyplot as plt

In [None]:
# Root folder that holds one sub-folder of parsed news pages per data set.
PATH_TO_DATA_FOLDER = '../../iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/'
# Every relative path starts with the two-letter country code used as the label below.
DATA_FILES = ['us_part1/newspaper.json', 'us_part2/newspaper.json', 'uk/newspaper.json']

# Read each JSON-lines file once and concatenate in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every iteration.
country_frames = []
for filepath in DATA_FILES:
    current_news = pd.read_json(PATH_TO_DATA_FOLDER + filepath, lines=True)
    current_news['country'] = filepath[:2]  # 'us' or 'uk'
    country_frames.append(current_news)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
news = pd.concat(country_frames, ignore_index=True)
news.head()

In [None]:
# Keep only articles whose meta_lang is 'en' or missing.
# .copy() turns the filtered result into an independent frame, so adding columns to
# `news` afterwards does not raise SettingWithCopyWarning.
english_or_unknown = (news['meta_lang'] == 'en') | news['meta_lang'].isnull()
news = news[english_or_unknown].copy()

In [None]:
# Table of known media domains; its 'domain' column is used to filter the news pages.
domains_path = '../../iboeckmann/webtracking_analysis/parsing/data/domaincodes/domaincodes.csv'
news_domains = pd.read_csv(filepath_or_buffer=domains_path)
news_domains

In [None]:
def extract_domain(row):
    """Return the host part of a URL without a leading 'www.'.

    Parameters
    ----------
    row : str
        A URL, with or without a scheme (e.g. 'www.bbc.co.uk/news' or
        'https://www.bbc.co.uk/news').

    Returns
    -------
    str
        Everything before the first '/' of the scheme-less URL, with a leading
        'www.' removed.
    """
    # Drop a leading scheme such as 'https://'. Only a purely alphabetic prefix counts as
    # a scheme, so a '://' that appears later in the path or query string is left alone.
    scheme, separator, remainder = row.partition('://')
    if separator and scheme.isalpha():
        row = remainder
    domain = row.split('/')[0]
    # Strip only a leading 'www.'; str.replace would also cut it out of the middle of a
    # host name (e.g. 'awww.com' -> 'acom').
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

# Derive the media domain of every page from its URL.
url_column = news['url']
news['domain'] = url_column.apply(extract_domain)
news['domain']

In [None]:
# Keep only the pages whose domain appears in the list of known media domains.
domain_mask = news['domain'].isin(news_domains['domain'].values)
media_portals = news[domain_mask]
media_portals.shape

In [None]:
# Show the 'htmlfile' column of the media-portal pages.
media_portals.loc[:, 'htmlfile']

In [None]:
# Distinct URLs and distinct domains, counted separately for each country.
print('The number of unique URLs and domains  per country:')
unique_counts = {"url": "nunique", "domain": "nunique"}
news_by_countries = media_portals.groupby('country')
news_by_countries.agg(unique_counts)

In [None]:
# Number of distinct URLs per (country, domain), largest first, as a flat table.
countries_by_domains = media_portals.groupby(['country', 'domain'])
urls_by_domains = (
    countries_by_domains
    .agg({"url": "nunique"})
    .sort_values('url', ascending=False)
    .reset_index()
)

In [None]:
# Share of each domain in the unique URLs of its own country.
# The domain is derived from the URL, so a URL belongs to exactly one domain; summing the
# per-domain counts within a country therefore gives that country's number of unique URLs.
# This replaces a hand-copied total that had to be edited whenever the data or country changed.
urls_per_country = urls_by_domains.groupby('country')['url'].transform('sum')
urls_by_domains['fraction'] = urls_by_domains['url'] / urls_per_country
us_domains = urls_by_domains[urls_by_domains['country'] == 'us'][:30] #change a country to either 'uk' or 'us'
us_domains

In [None]:
# Cumulative share of URLs covered by the k largest US domains.
k = 30
us_rows = urls_by_domains.loc[urls_by_domains['country'] == 'us']
k_top = us_rows.head(k)
print(f'For top {k} the fraction is ', k_top['fraction'].sum())

## Page text analysis

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in `text`, or NaN when there is no text."""
    # A float NaN is truthy, so a plain truthiness test would let it through and
    # str(nan) == 'nan' would be counted as a one-word text.
    if isinstance(text, float) and np.isnan(text):
        return np.nan
    if not text:
        return np.nan
    return len(str(text).split())


# assign() builds a new frame instead of writing a column into the slice taken from
# `news`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
media_portals = media_portals.assign(text_len=media_portals['text'].apply(_count_words))
media_portals_clean = media_portals[~media_portals['text_len'].isnull()].copy() #omitting those rows where text is empty
media_portals_clean

In [None]:
# Occurrences of every title, to see which titles repeat.
title_list = media_portals_clean.loc[:, 'title'].value_counts()
title_list

In [None]:
# Titles that occur fewer than five times: are there unique titles that repeat?
title_list.loc[title_list.lt(5)]

In [None]:
# Inspect all rows that carry one specific repeated title.
govan_title = 'Schoolboy finds lost piece of Glasgow\'s Govan Stones'
media_portals_clean.loc[media_portals_clean['title'] == govan_title]

In [None]:
# keep=False flags every row whose text occurs more than once, so all copies of a
# repeated text are removed (not only the second and later ones).
repeated_text = media_portals_clean.duplicated(subset="text", keep=False)
media_portals_unique = media_portals_clean[~repeated_text]
media_portals_unique

In [None]:
# Recount the titles on the rows that remain after removing repeated texts.
title_list = media_portals_unique.loc[:, 'title'].value_counts()
title_list

In [None]:
# Pages whose title occurs more than seven times, and how long their text is.
frequent_titles = title_list[title_list > 7].index
has_frequent_title = media_portals_unique['title'].isin(frequent_titles)
frequent_portals = media_portals_unique[has_frequent_title]
frequent_portals['text_len'].describe()

In [None]:
# Counterpart of the frequent-title pages: titles that occur at most seven times.
rare_titles = title_list[title_list <= 7].index
has_rare_title = media_portals_unique['title'].isin(rare_titles)
df_1 = media_portals_unique[has_rare_title]
df_1['text_len'].describe()

In [None]:
# URLs of frequent-title pages with more than 132 words, then one specific row by label.
over_132_words = frequent_portals['text_len'] > 132
display(frequent_portals.loc[over_132_words, 'url'])
frequent_portals['url'].loc[767166] #has text_len over 1300

In [None]:
# Frequent-title pages that nevertheless have a long text (more than 2000 words).
over_2000_words = frequent_portals['text_len'] > 2000
display(frequent_portals.loc[over_2000_words]) #those that have the same title but still long text
# Index labels of all frequent-title pages except a hand-picked set of labels to keep.
labels_to_keep = (883485, 887186, 899170, 932244, 988452)
fr_portals = [label for label in frequent_portals.index.values if label not in labels_to_keep]

In [None]:
# Number of frequent-title pages with more than 500 words.
frequent_portals.loc[frequent_portals['text_len'].gt(500)].shape[0]

In [None]:
# Remove the pages listed in fr_portals (pages with frequently repeated titles).
# errors='ignore' keeps the cell re-runnable: once the rows are gone, a second run would
# otherwise fail with KeyError because the labels no longer exist in the frame.
media_portals_unique = media_portals_unique.drop(fr_portals, errors='ignore') #dropping urls with frequent titles
media_portals_unique

In [None]:
# Title frequencies after removing the frequent-title pages.
media_portals_unique.loc[:, 'title'].value_counts()

In [None]:
# Summary statistics of the word counts of the remaining pages.
media_portals_unique.loc[:, 'text_len'].describe()

In [None]:
# Word-count histograms, one panel per country, stacked vertically.
media_portals_unique.hist(
    column='text_len',
    by='country',
    layout=(2, 1),
    bins=15,
    grid=False,
    figsize=(12, 8),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
)

In [None]:
# Remaining pages with more than 2000 words.
media_portals_unique.loc[media_portals_unique['text_len'].gt(2000)]

## Analyzing columns

In [None]:
# For every column: its name, the number of missing entries and a few non-missing examples.
for column in news.columns:
    values = news[column]
    missing = values.isnull()
    print(column)
    print('Number of null elements: ', len(news[missing])) #number of null entries
    print(values[~missing][0:10]) #example of non-empty elements

In [None]:
# meta_keywords holds lists, so .isnull() alone does not reveal empty entries;
# additionally require more than one element per list.
keywords = news[news['meta_keywords'].notnull()]
keyword_counts = keywords['meta_keywords'].map(len)
keywords.loc[keyword_counts > 1, 'meta_keywords']

In [None]:
# Distribution of the declared page languages.
news['meta_lang'].value_counts()

In [None]:
# Title of the row with index label 10029.
news['title'].loc[10029]

In [None]:
# Title frequencies over all news rows; repeated titles point to non-existing or service pages.
news['title'].value_counts()

In [None]:
# URL of the row with index label 222222.
news.loc[222222, 'url']