In [3]:
import pickle
import pandas as pd
from pathlib import Path

with open(Path.cwd().parent / 'content_analysis' / 'sp_df_tokenized.pkl', 'rb') as f:
    df = pickle.load(f)

In [8]:
from tqdm import tqdm
import os
from bs4 import BeautifulSoup
from pathlib import Path
import numpy as np

def create_dir(target_directory):
    '''Creates directory target_directory if the directory doesn't already exist'''
    if not os.path.isdir(str(target_directory)):
        os.mkdir(target_directory)
        
def populate_meta_tags(source_dir, target_dir):
    '''
    Populates target dir with .txt files of scraped meta tags from source dir
    Each .txt file contains all visible text

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    - source_dir
        - source of HTML files, should be raw_sources/<real or fake>/html
    - target_dir
        - directory to save parsed files
    '''
    for idx, dir_ in tqdm(enumerate([source_dir / s for s in os.listdir(source_dir)])):
        files = [dir_ / s for s in os.listdir(dir_)]
        dest_dir = target_dir / dir_.stem
        EXISTS = create_dir(dest_dir)
        if EXISTS: # Skip directoriers that already exist
            continue
        for index, html in enumerate(files):
            try:
                # Open HTML
                with open(html, encoding='utf8') as f:
                    soup = BeautifulSoup(f, 'html.parser')
                f.close()
                
                # ---------------------------------------------------------------------------- #
                #                             open graph meta tags                             #
                # ---------------------------------------------------------------------------- #
                
                # og:title
                title = soup.find("meta", property="og:title")
                try:
                    og_title = title['content'] if title else ''
                except KeyError:
                    og_title = ''
                    print('KeyError at index {} title'.format(index))
                    print('continuing...')
                
                # og:keywords
                og_keywords = soup.find('meta', property='og:keywords')
                try:
                    og_keywords = og_keywords['content'] if og_keywords else ''
                except KeyError:
                    og_keywords = ''
                    print('KeyError at index {} og:keywords'.format(index))
                    print('continuing...')
                
                # og:description
                og_description = soup.find('meta', property='og:description')
                try:
                    og_description = og_description['content'] if og_description else ''
                except KeyError:
                    og_description = ''
                    print('KeyError at index {} og:description'.format(index))
                    print('continuing...')


                # ---------------------------------------------------------------------------- #
                #                               twitter meta tags                              #
                # ---------------------------------------------------------------------------- #
                
                # twitter:title
                tw_title = soup.find("meta", attrs={'name':"twitter:title"})
                try:
                    tw_title = tw_title['content'] if tw_title else ''
                except KeyError:
                    tw_title = ''
                    print('KeyError at index {} twitter:title'.format(index))
                    print('continuing...')
                
                # twitter:keywords
                tw_keywords = soup.find('meta', attrs={'name':"twitter:keywords"})
                try:
                    tw_keywords = tw_keywords['content'] if tw_keywords else ''
                except KeyError:
                    tw_keywords = ''
                    print('KeyError at index {} twitter:keywords'.format(index))
                    print('continuing...')
                
                # twitter:description
                tw_description = soup.find('meta', attrs={'name':"twitter:description"})
                try:
                    tw_description = tw_description['content'] if tw_description else ''
                except KeyError:
                    tw_description = ''
                    print('KeyError at index {} twitter:description'.format(index))
                    print('continuing...')

                # ---------------------------------------------------------------------------- #
                #                               regular meta tags                              #
                # ---------------------------------------------------------------------------- #

                # keywords, description
                description = soup.find('meta', attrs={'name':"description"})
                try:
                    description = description['content'] if description else ''
                except KeyError:
                    description = ''
                    print('KeyError at index {} description'.format(index))
                    print('continuing...')
                
                keywords = soup.find('meta', attrs={'name':"keywords"})
                try:
                    keywords = keywords['content'] if keywords else ''
                except KeyError:
                    keywords = ''
                    print('KeyError at index {} keywords'.format(index))
                    print('continuing...')


                # print(f'\nkeywords {keywords}')
                # print(f'description {description}')
                # print(f'og_title {og_title}')
                # print(f'og_description {og_description}')
                # print(f'og_keywords {og_keywords}')
                # print(f'tw_description {tw_description}')
                # print(f'tw_title {tw_title}')
                # print(f'tw_keywords {tw_keywords}')

                file_name = html.stem.split('.')[0] + '.txt'
                file_path = dest_dir / file_name
                text = [keywords, description, og_title, og_description, og_keywords, tw_description, tw_title, tw_keywords]
                if any(x != '' for x in text):
                    with open(file_path, 'a', encoding='utf8') as f:
                        f.writelines(text)
                    f.close()
            except:
                print(f'ERROR processing {dest_dir}, continuing...')
                continue
    return

real_source_dir = Path.cwd().parent / 'sp_sources'
real_target_dir = Path.cwd() / 'meta_tags_parsed'

populate_meta_tags(real_source_dir, real_target_dir)

5it [00:32,  7.57s/it]

KeyError at index 1 keywords
continuing...
KeyError at index 2 keywords
continuing...
KeyError at index 4 keywords
continuing...
KeyError at index 6 keywords
continuing...
KeyError at index 7 keywords
continuing...
KeyError at index 8 keywords
continuing...
KeyError at index 9 keywords
continuing...
KeyError at index 11 keywords
continuing...
KeyError at index 13 keywords
continuing...
KeyError at index 14 keywords
continuing...
KeyError at index 15 keywords
continuing...
KeyError at index 17 keywords
continuing...
KeyError at index 18 keywords
continuing...
KeyError at index 19 keywords
continuing...
KeyError at index 20 keywords
continuing...
KeyError at index 21 keywords
continuing...
KeyError at index 24 keywords
continuing...
KeyError at index 25 keywords
continuing...
KeyError at index 26 keywords
continuing...
KeyError at index 27 keywords
continuing...
KeyError at index 30 keywords
continuing...
KeyError at index 32 keywords
continuing...
KeyError at index 34 keywords
continuin

11it [01:16,  4.53s/it]

KeyError at index 0 keywords
continuing...
KeyError at index 1 keywords
continuing...
KeyError at index 2 description
continuing...
KeyError at index 2 keywords
continuing...
KeyError at index 3 keywords
continuing...
KeyError at index 4 description
continuing...
KeyError at index 4 keywords
continuing...
KeyError at index 6 keywords
continuing...
KeyError at index 8 description
continuing...
KeyError at index 8 keywords
continuing...
KeyError at index 10 description
continuing...
KeyError at index 10 keywords
continuing...
KeyError at index 11 description
continuing...
KeyError at index 12 description
continuing...
KeyError at index 12 keywords
continuing...
KeyError at index 13 keywords
continuing...
KeyError at index 15 description
continuing...
KeyError at index 15 keywords
continuing...
KeyError at index 16 description
continuing...
KeyError at index 16 keywords
continuing...
KeyError at index 17 description
continuing...
KeyError at index 17 keywords
continuing...
KeyError at ind

12it [01:21,  4.68s/it]


continuing...


22it [03:03, 11.01s/it]

KeyError at index 33 og:description
continuing...
KeyError at index 33 twitter:description
continuing...
KeyError at index 60 og:description
continuing...
KeyError at index 60 twitter:description
continuing...
KeyError at index 73 og:description
continuing...
KeyError at index 73 twitter:description
continuing...


28it [03:19,  2.83s/it]

KeyError at index 0 keywords
continuing...
KeyError at index 1 keywords
continuing...
KeyError at index 3 description
continuing...
KeyError at index 3 keywords
continuing...
KeyError at index 4 keywords
continuing...
KeyError at index 5 description
continuing...
KeyError at index 5 keywords
continuing...
KeyError at index 8 keywords
continuing...
KeyError at index 9 description
continuing...
KeyError at index 9 keywords
continuing...
KeyError at index 12 description
continuing...
KeyError at index 12 keywords
continuing...
KeyError at index 13 description
continuing...
KeyError at index 13 keywords
continuing...
KeyError at index 14 description
continuing...
KeyError at index 15 keywords
continuing...
KeyError at index 16 description
continuing...
KeyError at index 16 keywords
continuing...
KeyError at index 17 description
continuing...
KeyError at index 17 keywords
continuing...
KeyError at index 18 description
continuing...
KeyError at index 18 keywords
continuing...
KeyError at ind

29it [03:25,  3.54s/it]

KeyError at index 56 description
continuing...
KeyError at index 56 keywords
continuing...
KeyError at index 57 keywords
continuing...


61it [07:01,  6.91s/it]


In [10]:
content_list = []

for idx, row in df.iterrows():
    dir_ = Path.cwd() / 'meta_tags_parsed' / str(row['index'])
    
    content = ''
    for text in [dir_ / x for x in os.listdir(dir_)]:
        with open(text, 'r', encoding='utf8') as f:
            lines = f.readlines()
        f.close()

        page_content = ' '.join(lines)

        content = content + ' ' + page_content
    if content == '':
        print(f'no content at {dir_}')
        content += 'none'
    content_list.append(content)

df['meta_text'] = content_list
df.head()

no content at c:\Users\ewais\Documents\GitHub\misinfo_detection\sydney_powell_analysis\meta tag analysis\meta_tags_parsed\77


Unnamed: 0,index,site,text,tokenized,meta_text
0,0,dailyexpose.uk,[…] Does the Covid-19 Virus contain Genet...,"[…, doe, covid, viru, contain, genet, sequenc,...",Read all of the posts by Rhoda Wilson on The ...
1,1,rumble.com,Note\n \t\t\tthat this Policy may be modified ...,"[note, thi, polici, may, modifi, time, time, s...",The 2022 NBA Playoffs are here as the NBA Pla...
2,2,harpers.org,"It wasn’t just about the PowerPoint, though; i...","[’, powerpoint, though, retrospect, powerpoint...","Harper's Magazine, the oldest general-interes..."
3,3,kanekoa.substack.com,"The pathologist cited “rare, severe side effec...","[pathologist, cite, “, rare, sever, side, effe...","True the Vote used two petabytes of data, ten..."
4,6,pattyporter.net,Support local candidates running for office wi...,"[support, local, candid, run, offic, boot, gro...",Border CrisisBorder Crisis - HomeBorder Crisi...


In [11]:
with open('sp_df_tokenized.pkl', 'wb') as f:
    pickle.dump(df, f)