This notebook loads the data and takes a first look at it (analysis of columns, missing data, etc.).
The data are located at: <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/us_part1 <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/us_part2 <br/>
http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/uk <br/>
The HTML files are located at http://194.95.75.11/home/iboeckmann/webtracking_analysis/parsing/tmp/Netquest/downloads/us/2019-04-08/

In [None]:
import pandas as pd
import glob
import tldextract
from bs4 import BeautifulSoup
from statistics import mean 

In [None]:
# Load every parsed-news JSON-lines file into one DataFrame called `news`.
PATH_TO_DATA_FOLDER = '../../iboeckmann/webtracking_analysis/parsing/data/archive/parsed_news-websites/'
DATA_FILES = ['us_part1/newspaper.json', 'us_part2/newspaper.json', 'uk/newspaper.json']

# Collect the per-file frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
news_parts = []
for filepath in DATA_FILES:
    current_news = pd.read_json(PATH_TO_DATA_FOLDER + filepath, lines=True)
    # The first two characters of the relative path are the country code ('us' / 'uk').
    current_news['country'] = filepath[:2]
    news_parts.append(current_news)

# ignore_index=True yields a fresh 0..n-1 index (same as reset_index(drop=True)).
news = pd.concat(news_parts, ignore_index=True)
news.head()

In [None]:
# Keep only articles whose meta_lang is 'en' or missing ...
is_english_or_unknown = news['meta_lang'].eq('en') | news['meta_lang'].isnull()
# ... and whose html file name contains the date 2019-04-08.
# regex=False: the date is a literal string; na=False: rows without an
# htmlfile are dropped instead of putting NaN into the boolean mask.
is_from_target_day = news['htmlfile'].str.contains("2019-04-08", regex=False, na=False)
# .copy() gives an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
news = news[is_english_or_unknown & is_from_target_day].copy()

In [None]:
# Split every URL into its parts once (previously tldextract ran three times per URL).
url_parts = [tldextract.extract(url) for url in news['url']]
# assign() returns a new frame, so re-running this cell is safe and no
# chained-assignment warning is raised on the filtered frame.
news = news.assign(
    domain=[parts.domain for parts in url_parts],
    subdomain=[parts.subdomain for parts in url_parts],
    suffix=[parts.suffix for parts in url_parts],
)

In [None]:
# Count distinct URLs, domains and subdomains for each country.
print('The number of unique URLs and domains  per country:')
news_by_countries = news.groupby('country')
unique_counts = dict.fromkeys(["url", "domain", "subdomain"], "nunique")
news_by_countries.agg(unique_counts)

In [None]:
# Number of distinct URLs per (country, domain), largest first, with
# 'country' and 'domain' turned back into ordinary columns.
countries_by_domains = news.groupby(['country', 'domain'])
urls_by_domains = (
    countries_by_domains
    .agg({"url": "nunique"})
    .sort_values(by='url', ascending=False)
    .reset_index()
)

In [None]:
# Share of a country's unique URLs that belongs to each domain.
# The denominator is computed instead of typed in by hand: the domain is derived
# from the URL, so each URL belongs to exactly one domain and the per-domain
# unique-URL counts of a country add up to that country's total unique URLs.
# NOTE(review): this replaces the manually maintained constant 11803; confirm that
# the per-country total (rather than the total over all countries) is the intended denominator.
urls_per_country = urls_by_domains.groupby('country')['url'].transform('sum')
urls_by_domains['fraction'] = urls_by_domains['url'] / urls_per_country

# Top 30 domains; change the country to either 'uk' or 'us'.
us_domains = urls_by_domains[urls_by_domains['country'] == 'us'].head(30)
us_domains

In [None]:
# Cumulative URL fraction covered by the k largest US domains
# (urls_by_domains is already sorted by URL count, descending).
k = 30
is_us_row = urls_by_domains['country'] == 'us'
k_top = urls_by_domains.loc[is_us_row].head(k)
print(f'For top {k} the fraction is ', k_top['fraction'].sum())

## Extracting html files for US domains

In [None]:
def get_text(soup):
    """Return the visible text of a parsed page, one non-empty fragment per line.

    All <script> and <style> elements are removed from ``soup`` (the object is
    modified in place). The remaining text is split into lines, each line is
    further split on double spaces, every fragment is stripped of surrounding
    whitespace, and empty fragments are dropped.
    """
    # Remove elements whose content is code/styling rather than page text.
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    fragments = []
    for raw_line in soup.get_text().splitlines():
        # A double space separates several headlines that share one line.
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)
    return '\n'.join(fragments)

# For each of the top US domains, sample stored HTML pages and record how many
# files exist on disk and how many of them contain a substantial amount of text.
PATH_TO_HTMLS = "../../iboeckmann/webtracking_analysis/parsing/tmp/Netquest/downloads/us/2019-04-08/web_page/"
SAMPLE_SIZE = 15        # pages drawn per round for domains with at least this many URLs
SAMPLE_ROUNDS = 3       # number of rounds that are averaged for such domains
MIN_TEXT_LENGTH = 2000  # a page "contains text" if its extracted text is longer than this
RANDOM_SEED = 42        # makes the sampling reproducible across runs

htmls_dict = {}
# Read the URL count straight from k_top; this avoids int(<Series>) and the
# lookup in us_domains, which has no row for a domain when k exceeds 30.
for domain, n_urls in zip(k_top['domain'], k_top['url']):
    n_urls = int(n_urls)
    # Only US rows: PATH_TO_HTMLS points at the US download folder, so files of
    # other countries would always be reported as "not found".
    df = news.loc[(news['domain'] == domain) & (news['country'] == 'us')]
    if n_urls < SAMPLE_SIZE:
        # Small domain: a single round of as many pages as it has unique URLs.
        n = n_urls
        max_count = 1
    else:
        n = SAMPLE_SIZE
        max_count = SAMPLE_ROUNDS

    stats = {'count_not_found': [], 'count_found': [], 'count_contain_text': [], 'text': []}
    htmls_dict[domain] = stats
    for count in range(max_count):
        for counter in ('count_found', 'count_contain_text', 'count_not_found'):
            stats[counter].append(0)
        # A different seed per round keeps the rounds from drawing identical samples.
        sample_df = df['htmlfile'].sample(n=n, random_state=RANDOM_SEED + count)
        for sample in sample_df:
            try:
                # Read bytes and let BeautifulSoup detect the encoding; the
                # context manager closes the file handle.
                with open(PATH_TO_HTMLS + sample, "rb") as html_file:
                    raw_html = html_file.read()
            except OSError:
                # Missing or unreadable file.
                stats['count_not_found'][count] += 1
                continue

            stats['count_found'][count] += 1
            try:
                # Name the parser explicitly so results do not depend on which
                # optional parser libraries happen to be installed.
                text = get_text(BeautifulSoup(raw_html, "html.parser"))
            except Exception as error:
                # Best effort: the file exists but could not be parsed; report
                # it and treat it as a page without text.
                print(f'Could not parse {sample}: {error!r}')
                continue

            if len(text) > MIN_TEXT_LENGTH:
                stats['count_contain_text'][count] += 1
                stats['text'].append(text)

    # Replace the per-round counters by their average over all rounds.
    stats['count_found'] = mean(stats['count_found'])
    stats['count_not_found'] = mean(stats['count_not_found'])
    stats['count_contain_text'] = mean(stats['count_contain_text'])

In [None]:
# One row per domain with the averaged counters collected in htmls_dict.
htmls_df = pd.DataFrame(htmls_dict).transpose()


def _share(part, whole):
    """Return part / whole, or 0 when whole is 0."""
    return part / whole if whole != 0 else 0


# Fraction of sampled pages whose HTML file was found. The denominator is the
# number of files actually attempted (found + not found) instead of a fixed 15,
# which was wrong for domains sampled with fewer than 15 pages.
htmls_df['found_pages'] = htmls_df.apply(
    lambda row: _share(row['count_found'], row['count_found'] + row['count_not_found']),
    axis=1,
)
# Fraction of the found pages that contain a substantial amount of text.
htmls_df['text_pages'] = htmls_df.apply(
    lambda row: _share(row['count_contain_text'], row['count_found']),
    axis=1,
)
htmls_df

## Analyzing columns

In [None]:
# For every column: its name, how many entries are null, and up to ten non-null examples.
for column in news.columns:
    is_missing = news[column].isnull()
    print(column)
    print('Number of null elements: ', int(is_missing.sum()))
    print(news.loc[~is_missing, column].iloc[:10])

In [None]:
#meta_keywords are lists and just .isnull() does not capture empty elements
keywords = news[~news['meta_keywords'].isnull()]
keywords[keywords.meta_keywords.map(len)>1]['meta_keywords']

In [None]:
#get the distribution of the languages
news['meta_lang'].value_counts()

In [None]:
# Title of the row with index label 10029 (a label, not a position).
news.at[10029, 'title']

In [None]:
# Repeated titles point to non-existing or service pages.
news['title'].value_counts()

In [None]:
# URL of the row with index label 222222 (a label, not a position).
news.loc[222222, 'url']