In [1]:
import codecs
import csv
import glob
import io
import os
from collections import Counter
from pprint import pprint as pp

from bs4 import BeautifulSoup
from HeBERT.src.HebEMO import *
from newspaper.nlp import keywords, load_stopwords

# Build the HebEMO instance once at module level; `get_emotion_from_string`
# reuses this shared object for every prediction.
hebemo = HebEMO()
# Load the 'he' stopword list through newspaper.nlp.
# NOTE(review): presumably this makes `keywords` ignore Hebrew stopwords —
# confirm against the newspaper documentation.
load_stopwords('he')

def get_all_htmls(directory_path):
    """Lazily yield the paths of every ``*.html`` file directly inside a directory.

    Args:
        directory_path: Directory to search (not searched recursively).

    Returns:
        An iterator of matching paths, each prefixed with ``directory_path``.
    """
    html_pattern = os.path.join(directory_path, '*.html')
    return glob.iglob(html_pattern)

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [01:44<00:00, 13.08s/it]


In [2]:
def get_is_sponsered(document):
    """Tell whether the page carries a sponsored-content marker.

    The marker is an ``<a target="_blank">`` nested inside the first
    ``<li class="custom-txt">`` of the document.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        True when both the list item and the link exist, otherwise False.
    """
    custom_item = document.find("li", attrs={"class": "custom-txt"})
    if custom_item is not None:
        outbound_link = custom_item.find("a", target="_blank")
        return outbound_link is not None
    return False

In [3]:
def get_title(document):
    """Return the text of the document's ``title`` element.

    Args:
        document: Parsed page exposing a ``title`` attribute.

    Returns:
        The ``text`` of ``document.title``.

    Raises:
        AttributeError: If ``document.title`` is None (no title element).
    """
    title_tag = document.title
    return title_tag.text

In [4]:
def get_bold_count(document):
    """Count the ``<strong>`` elements in the document.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        Number of ``strong`` tags found.
    """
    strong_tags = document.find_all("strong")
    return len(strong_tags)

In [5]:
def get_is_advertisement(document):
    """Tell whether the page is flagged as an advertorial.

    Looks for a ``<span>`` matched with the keyword filter
    ``content="כתבה פרסומית"``.
    NOTE(review): in BeautifulSoup a ``content=`` keyword filters on the
    tag's ``content`` attribute, not on its text — confirm this is intended.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        True when such a span exists, otherwise False.
    """
    advert_marker = document.find("span", content="כתבה פרסומית")
    if advert_marker is None:
        return False
    return True

In [6]:
def get_links_count(document):
    """Count the ``<a>`` elements that carry an ``href`` attribute.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        Number of anchors with an ``href``.
    """
    anchors_with_href = document.find_all("a", href=True)
    return len(anchors_with_href)

In [7]:
def get_input_count(document):
    """Count the ``<input>`` elements in the document.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        Number of ``input`` tags found.
    """
    input_tags = document.find_all("input")
    return len(input_tags)

In [8]:
def get_textarea_count(document):
    """Count the ``<textarea>`` elements in the document.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        Number of ``textarea`` tags found.
    """
    textarea_tags = document.find_all("textarea")
    return len(textarea_tags)

In [9]:
def get_media_count(document):
    """Count media elements: every ``<img>`` plus every ``<source>`` tag.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find_all``.

    Returns:
        Combined number of ``img`` and ``source`` tags.
    """
    return sum(len(document.find_all(tag_name)) for tag_name in ("img", "source"))

In [10]:
def get_emotion_from_string(string):
    """Run HebEMO on a piece of text and collect the flagged columns.

    Calls the module-level ``hebemo`` instance and keeps the name of every
    result column whose first-row value equals 1.
    NOTE(review): the result is presumably a pandas DataFrame with one row
    per input text — confirm against the HebEMO documentation.

    Args:
        string: Text to analyse.

    Returns:
        Set of column names whose value in row 0 is 1.
    """
    predictions = hebemo.hebemo(text=string)
    flagged = set()
    for column in predictions.columns:
        if predictions[column][0] == 1:
            flagged.add(column)
    return flagged

In [11]:
def get_title_emotion(document):
    """Return the emotions HebEMO flags for the document title.

    Args:
        document: Parsed page accepted by ``get_title``.

    Returns:
        The set produced by ``get_emotion_from_string`` for the title text.
    """
    title_text = get_title(document)
    return get_emotion_from_string(title_text)

In [12]:
def get_text(document):
    """Return the article body as a single space-joined string.

    Takes every ``<p>`` inside the first ``<section itemprop="articleBody">``
    and joins their ``text`` with single spaces.

    Args:
        document: Parsed page exposing a BeautifulSoup-style ``find``.

    Returns:
        The concatenated paragraph text ("" when there are no paragraphs).

    Raises:
        AttributeError: If the article-body section is not found.
    """
    article_body = document.find("section", itemprop="articleBody")
    paragraphs = article_body.find_all("p")
    return " ".join(paragraph.text for paragraph in paragraphs)

In [13]:
def get_text_emotion(document):
    """Return the emotions HebEMO flags for the start of the article body.

    Only the first 512 characters of the body text are analysed.

    Args:
        document: Parsed page accepted by ``get_text``.

    Returns:
        The set produced by ``get_emotion_from_string`` for the truncated text.
    """
    # The input is capped because HebEMO has a bounded input size (the figure
    # used here is 512). Note the slice counts characters, not words.
    # NOTE(review): confirm the exact unit of HebEMO's limit.
    leading_text = get_text(document)[:512]
    return get_emotion_from_string(leading_text)

In [14]:
def get_keywords_from_string(string):
    """Extract the set of keywords newspaper's ``keywords`` finds in a text.

    Args:
        string: Text to analyse.

    Returns:
        Set built from the keys of the mapping returned by ``keywords``.
    """
    keyword_scores = keywords(string)
    return set(keyword_scores.keys())

In [15]:
def get_title_keywords(document):
    """Return the keyword set extracted from the document title.

    Args:
        document: Parsed page accepted by ``get_title``.

    Returns:
        The set produced by ``get_keywords_from_string`` for the title text.
    """
    title_text = get_title(document)
    return get_keywords_from_string(title_text)

In [16]:
def get_text_keywords(document):
    """Return the keyword set extracted from the article body.

    Args:
        document: Parsed page accepted by ``get_text``.

    Returns:
        The set produced by ``get_keywords_from_string`` for the body text.
    """
    body_text = get_text(document)
    return get_keywords_from_string(body_text)

In [17]:
# Adapted from https://stackoverflow.com/a/48653758
def getLCS(s1, s2):
    """Find the longest common subsequence between two strings.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A ``(length, subsequence)`` tuple. When either input is empty the
        result is ``(0, "")``.
    """
    # Nothing can be shared with an empty string; returning early also avoids
    # indexing into an empty table.
    if not s1 or not s2:
        return 0, ""

    rows, cols = len(s1), len(s2)
    # table[i][j] holds the LCS of s1[:i] and s2[:j]. The extra row and column
    # of empty strings stand for "no characters consumed", so the i-1 / j-1
    # lookups never wrap around to the end of the table.
    table = [[""] * (cols + 1) for _ in range(rows + 1)]
    for i, left_char in enumerate(s1, start=1):
        for j, right_char in enumerate(s2, start=1):
            if left_char == right_char:
                table[i][j] = table[i - 1][j - 1] + left_char
            else:
                # On equal lengths max() keeps the first argument (the cell above).
                table[i][j] = max(table[i - 1][j], table[i][j - 1], key=len)

    cs = table[rows][cols]

    return len(cs), cs

def string_length_min_max(first, second):
    """Order two strings by length, shortest first.

    Args:
        first: First string.
        second: Second string.

    Returns:
        A ``(shorter, longer)`` tuple; when the lengths are equal the
        arguments are returned in their original order.
    """
    if len(first) <= len(second):
        return first, second
    return second, first

def compare_words(first, second):
    """Heuristically decide whether two words are similar enough to match.

    Two words match when the shorter one has at least 4 characters and all of
    its characters appear, in order, inside the longer one. This covers exact
    matches and substrings, and is the same condition as "the longest common
    subsequence is as long as the shorter word".

    Args:
        first: First word.
        second: Second word.

    Returns:
        True when the words are considered the same, otherwise False.
    """
    # Order by length; on equal lengths the argument order is kept.
    if len(first) <= len(second):
        shorter, longer = first, second
    else:
        shorter, longer = second, first

    # If the length of the shorter word is less than 4, don't continue
    # (because this function is very heuristic-based)
    if len(shorter) < 4:
        return False

    # Single in-order scan: each `in` consumes the iterator up to the matched
    # character, so later characters can only match further along `longer`.
    remaining = iter(longer)
    return all(char in remaining for char in shorter)

def get_mutual_title_text_keywords(document):
    """Return the title keywords that also appear (approximately) in the body.

    A title keyword is kept when ``compare_words`` accepts it against at least
    one body keyword, so exact matches are included as well.

    Args:
        document: Parsed page accepted by ``get_title`` and ``get_text``.

    Returns:
        Set of matching title keywords.
    """
    title_keywords = get_title_keywords(document)
    # With no title keywords there is nothing to compare, so the body is not
    # analysed at all.
    if not title_keywords:
        return set()

    # Extract the body keywords once instead of once per title keyword.
    text_keywords = get_text_keywords(document)
    return {
        title_word
        for title_word in title_keywords
        if any(compare_words(title_word, text_word) for text_word in text_keywords)
    }

In [18]:
def get_emotions_for_csv(emotions):
    """Render an emotion collection as eight comma-separated boolean flags.

    The flags follow a fixed order: anticipation, joy, trust, fear, surprise,
    anger, sadness, disgust.

    Args:
        emotions: Container supporting ``in`` with the detected emotion names.

    Returns:
        A string such as ``"False,True,False,..."`` with one flag per emotion.
    """
    ordered_names = ("anticipation", "joy", "trust", "fear", "surprise", "anger", "sadness", "disgust")
    flags = []
    for name in ordered_names:
        flags.append(str(name in emotions))
    return ",".join(flags)

def parse(link, document):
    """Build the CSV row describing one article.

    The fields are, in order: the file path, the sponsored flag, bold count,
    advertisement flag, link count, input count, textarea count, media count,
    title length, body length, number of mutual title/body keywords, the eight
    title emotion flags and the eight body emotion flags.

    Args:
        link: Path of the HTML file the document was read from.
        document: Parsed page accepted by the ``get_*`` helpers.

    Returns:
        One CSV line terminated by ``\\r\\n``. Fields containing a comma,
        quote or line break (for example an unusual file name) are quoted so
        the row always has the same number of columns.
    """
    fields = [
        link,
        get_is_sponsered(document),
        get_bold_count(document),
        get_is_advertisement(document),
        get_links_count(document),
        get_input_count(document),
        get_textarea_count(document),
        get_media_count(document),
        len(get_title(document)),
        len(get_text(document)),
        len(get_mutual_title_text_keywords(document)),
    ]
    # The emotion helpers return already-joined flags; split them back into
    # individual fields so the writer does not quote them as a single column.
    fields.extend(get_emotions_for_csv(get_title_emotion(document)).split(","))
    fields.extend(get_emotions_for_csv(get_text_emotion(document)).split(","))

    row_buffer = io.StringIO()
    csv.writer(row_buffer, lineterminator="\r\n").writerow(fields)
    return row_buffer.getvalue()

def to_csv(output_path, directory=None):
    """Write one CSV row of features for every HTML article in a directory.

    The header is written first, then each file is parsed and its row is
    appended and flushed immediately. A file that raises any exception while
    being read or parsed is reported on stdout and skipped.

    Args:
        output_path: Destination CSV file (overwritten, UTF-8).
        directory: Directory containing the ``*.html`` files. When omitted,
            the module-level ``directory_path`` variable is used, which keeps
            existing ``to_csv(path)`` calls working.

    Raises:
        NameError: If ``directory`` is omitted and ``directory_path`` has not
            been defined.
    """
    # Resolve the input directory before touching the output file, so a
    # missing global does not leave a truncated CSV behind.
    source_directory = directory_path if directory is None else directory

    header = 'file_name,is_sponsered,bold_count,is_advertisement,links_count,input_count,textarea_count,media_count,title_length,text_length,mutual_title_text_keywords_count,title_anticipation,title_joy,title_trust,title_fear,title_surprise,title_anger,title_sadness,title_disgust,text_anticipation,text_joy,text_trust,text_fear,text_surprise,text_anger,text_sadness,text_disgust\r\n'

    with codecs.open(output_path, 'w', 'utf-8') as of:
        # Write header
        of.write(header)

        # Write data for every article
        for link in get_all_htmls(source_directory):
            try:
                with codecs.open(link, 'r', 'utf-8') as f:
                    # NOTE(review): no parser is named, so BeautifulSoup picks
                    # whichever is installed; pin one if results must be
                    # reproducible across machines.
                    document = BeautifulSoup(f)
                    print(f'Writing data for link: {link}, title: {get_title(document)}')
                    of.write(parse(link, document))
                    # Flush per article so finished rows survive an interrupted run.
                    of.flush()
            except Exception as e:
                # Best effort: one bad article must not abort the whole export.
                print(f'Skipping {link} due to "{e}"')

In [None]:
# Directory holding the downloaded article HTML files. `to_csv` looks this
# name up as a global, so it must be assigned before the call below.
directory_path=r'Articles'
# Extract the features of every article and write them to data.csv.
to_csv('data.csv')
print("finished")