# Cleaning code for EuVsDisinfo scrape

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

# Read the two CSV inputs for this notebook, then display dfpages.
df = pd.read_csv(filepath_or_buffer='../datasets/euvsdisinfoheadings.csv')
dfpages = pd.read_csv(filepath_or_buffer='dfpages.csv')
dfpages

## Pages that didn't pull data

Fixed: running the scraper code above should no longer produce any missed pages.

In [None]:
# Pages that failed to scrape: rows of dfpages whose 'Id' is null.
# The matching rows of df are picked out by index label.
id_missing = dfpages['Id'].isna()
failed_labels = dfpages.loc[id_missing].index.to_list()
noscrapes = df[df.index.isin(failed_labels)]
noscrapes

In [None]:
# Re-scrape each URL in noscrapes and build one row per page, in the order
# NATO_url, Id, PageTitle, Summary, Disproof, Media, Details.
page_columns = ['NATO_url', 'Id', 'PageTitle', 'Summary', 'Disproof', 'Media', 'Details']
newpages = []
for i, url in enumerate(noscrapes['NATO_url'].to_list()):
    # Progress marker on every tenth URL.
    if i % 10 == 0:
        print('{} {}'.format(i, url))
    try:
        # A timeout stops one unresponsive page from hanging the whole loop.
        html = requests.get(url, timeout=30).text
        soup = bs(html, 'lxml')
        bodyclasses = soup.find('body').get('class')
        pdata = [url,
                 # The first <body> class starting with 'postid-' is kept as the Id.
                 next(c for c in bodyclasses if c.startswith('postid-')),
                 soup.find('h1', 'b-catalog__report-title').text,
                 soup.find('div', 'b-report__summary-text').text.strip(),
                 soup.find('div', 'b-report__disproof-text').text.strip()]
        linkbox = soup.find('div', 'b-catalog__link')
        if linkbox is not None:
            # Media is a list of {link text: href} dicts.
            media = [{a.text: a.get('href')} for a in linkbox.find_all('a')]
        else:
            # Media is a plain list of hrefs; find_all() returns [] when the
            # source box has no anchors, so no separate emptiness check is needed.
            source_box = soup.find('div', 'b-catalog__repwidget-source')
            media = [a.get('href') for a in source_box.find_all('a')]
        details = [li.text.strip()
                   for li in soup.find('ul', 'b-catalog__repwidget-list').find_all('li')]
        pdata += [media, details]
    except Exception as err:
        # Catch Exception rather than using a bare except, so Ctrl-C still
        # interrupts the loop. Failed rows are padded to full width so the
        # DataFrame below can be built even if every scrape fails.
        print('Scrape failed for {}'.format(url))
        print('    {!r}'.format(err))
        pdata = [url] + [None] * (len(page_columns) - 1)
    newpages.append(pdata)

dfnewpages = pd.DataFrame(newpages, columns=page_columns)
dfnewpages

In [None]:
# Rebuild df from the headings CSV with dfpages placed side by side, then
# attach the re-scraped values as '<column>_new' via a left merge on NATO_url.
df = pd.read_csv('euvsdisinfoheadings.csv')
df = pd.concat([df, dfpages], axis=1, sort=False)
df = df.merge(dfnewpages, how='left', on=['NATO_url'], suffixes=('', '_new'))

# Rows with no Id take their values from the '_new' columns. 'Id' is filled
# last, so the null-Id mask is the same for every column and is computed once.
fill_columns = ['PageTitle', 'Summary', 'Disproof', 'Media', 'Details', 'Id']
id_missing = df['Id'].isnull()
for col in fill_columns:
    df.loc[id_missing, col] = df.loc[id_missing, col + '_new']

# The helper '_new' columns are no longer needed.
df.drop(columns=[col + '_new' for col in fill_columns], inplace=True)
df

## Varying details entries

In [None]:
# Expand each row's Details list of 'KEY: value' strings into one
# 'detail_<KEY>' column per key.
deets = {}
for i, deet in enumerate(df['Details'].to_list()):
    parsed = {}
    # A row whose Details is not a list (e.g. a missing value) gets no detail
    # columns instead of raising on iteration.
    if isinstance(deet, (list, tuple)):
        for dd in deet:
            # Split on the FIRST colon only, so values that themselves contain
            # ':' are kept whole instead of being cut at the second colon.
            key, sep, value = dd.partition(':')
            if sep:  # entries with no colon carry no key/value pair; skip them
                parsed[key] = value.strip()
    deets[i] = parsed

dfdeets = pd.DataFrame.from_dict(deets).transpose()
dfdeets = dfdeets.add_prefix('detail_')
df = pd.concat([df, dfdeets], axis=1)
df

### Check data columns - remove anything redundant

No idea where column 'Index' came from

These pairs hold the same information, so one of each pair is redundant:
* df[df['detail_DATE OF PUBLICATION'] != df['Date'].str.replace('.', '/')]
* df[(df['detail_Country'] != df['Country']) & (df['Country'].notnull())]
* Title and the second half (after the :) of PageTitle
* Outlets is better-formed than detail_Outlets (which has text truncated)


In [None]:
# Drop the redundant columns and give the remaining detail columns plain names.
df.drop(['Index', 'Details', 'detail_DATE OF PUBLICATION', 'detail_Country', 'Title',
         'detail_Outlet'], axis=1, inplace=True)
df.rename(columns={'Date': 'Date of publication', 'detail_Keywords': 'Keywords',
                   'detail_Language/target audience': 'Language/target audience',
                   'detail_Reported in': 'Reported in'}, inplace=True)

# Split PageTitle at its first colon into Type (before) and Title (after).
# expand=True keeps df's own index and yields missing values for rows whose
# PageTitle is missing or has no colon, so those rows no longer break the split.
title_parts = df['PageTitle'].str.split(':', n=1, expand=True)
df['Type'] = title_parts[0]
df['Title'] = title_parts[1].str.strip()
df.drop(['PageTitle'], axis=1, inplace=True)

# Save the cleaned table.
df.to_csv('euvsdisinfodata.csv', index=False)
df

## URLs that didn't get pulled

In [None]:
# Rows with no Media value.
df.loc[df['Media'].isna()]

In [None]:
# Spread each row's Media list across two columns, Url1 and Url2, and append
# them to df.
media_entries = df['Media'].to_list()
urlsplit = pd.DataFrame(media_entries, columns=['Url1', 'Url2'])
df = pd.concat([df, urlsplit], axis=1)
df

In [None]:
# Only 52 of these - none of these have an archive link on the page.
# Leave as-is.
url2_missing = df['Url2'].isnull()
url1_is_dict = df['Url1'].apply(type) == dict
df[url2_missing & url1_is_dict]

In [None]:
# Most of these have 1 or more original links, but no archive link.
url2_missing = df['Url2'].isnull()
url1_not_dict = df['Url1'].apply(type) != dict
df[url2_missing & url1_not_dict]

In [None]:
# Media values for rows with no Url2 whose Url1 is not a dict.
url2_missing = df['Url2'].isnull()
url1_not_dict = df['Url1'].apply(type) != dict
df.loc[url2_missing & url1_not_dict, 'Media'].to_list()

In [None]:
# Every Url1 value as a plain Python list.
df['Url1'].tolist()

In [None]:
# Look up the row(s) for one specific report URL.
df.loc[df['NATO_url'].eq('https://euvsdisinfo.eu/report/strange-coincidence-that-40-areas-are-mostly-affected-by-coronavirus/')]

## Test data pulls

In [None]:
# Latest version of page pull code
# NOTE: `url` is not assigned in this cell; it must already be defined in the
# notebook (for example, left over from the re-scrape loop) before running it.
# A timeout stops an unresponsive page from hanging the cell.
html = requests.get(url, timeout=30).text
soup = bs(html, 'lxml')
bodyclasses = soup.find('body').get('class')
pdata = [url,
         # The first <body> class starting with 'postid-' is kept as the Id.
         next(c for c in bodyclasses if c.startswith('postid-')),
         soup.find('h1', 'b-catalog__report-title').text,
         soup.find('div', 'b-report__summary-text').text.strip(),
         soup.find('div', 'b-report__disproof-text').text.strip()]
linkbox = soup.find('div', 'b-catalog__link')
if linkbox is not None:
    # Media is a list of {link text: href} dicts.
    media = [{a.text: a.get('href')} for a in linkbox.find_all('a')]
else:
    # Media is a plain list of hrefs; find_all() returns [] when the source
    # box has no anchors, so no separate emptiness check is needed.
    source_box = soup.find('div', 'b-catalog__repwidget-source')
    media = [a.get('href') for a in source_box.find_all('a')]
details = [li.text.strip()
           for li in soup.find('ul', 'b-catalog__repwidget-list').find_all('li')]
pdata += [media, details]
pdata