# Question 1 
fetch the content of a news article, using BeautifulSoup to interact with the HTML.

In [2]:
#!pip install bs4
import requests
from bs4 import BeautifulSoup
def get_article_content(url='https://www.ynet.co.il/laisha/article/b1al4tzyp'):
    # Given a URL, this function will return the article content in a string form
    # as well as saving the content to a file.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Fetch the text via the class name (might need to change it for other websites)
    # This class name is applicable for some articles on ynet.co.il
    class_name = 'public-DraftEditor-content' 
    article_content = soup.find('div', {'class': class_name})
    
    if article_content:
        article_text = article_content.get_text()
        print("The content was saved to a file")
        with open('article_content.txt', 'w', encoding='utf-8') as file:
            file.write(article_text)
        # return article_text
    else:
        print('Article content wasnt fetched')


# Question 2
## Idea
I approached the problem in following way:
1. Use regex in order to identify the words that represent a number in hebrew within a text, using a set of indicative words
2. Translate those words into English
3. Convert the English words into integers
4. Replace the Hebrew words in the original text with their corresponding integers

In [3]:
heb_numbers_keywords = ['אחת', 'שתיים', 'שלוש', 'ארבע', 'חמש', 'שש', 'שבע', 'שמונה', 'תשע', 'עשר', 'עשרה', 'עשרים',
                        'שלושים', 'ארבעים', 'חמישים', 'שישים', 'שבעים', 'שמונים', 'תשעים', 'מאה', 'מאתיים', 'שלוש מאות',
                        'ארבע מאות', 'חמש מאות', 'שש מאות', 'שבע מאות', 'שמונה מאות', 'תשע מאות', 'אלף', 'אלפיים',
                        'שלושת אלפים', 'ארבעת אלפים', 'חמשת אלפים', 'ששת אלפים', 'שבעת אלפים', 'שמונת אלפים',
                        'תשעת אלפים', 'עשרת אלפים', 'מיליון', 'אחת', 'אחד',  'שתי', 'שתיים', 'שניים', 'שלוש',
                        'שלושה', 'ארבע', 'ארבעה', 'חמש', 'חמישה', 'שש', 'שישה', 'שבע', 'שבעה', 'שמונה', 'תשע', 'תשעה',
                        'עשר', 'עשרה']

In [4]:
#!pip install word2number
from word2number import w2n
import re

## Implementation
The function find_sequences recieves two arguments: the text to search in and the list of words to search for.
This regex was designed with hebrew linguistics in mind, capturing single words that represent a number of a "string" of words that together form a number i.e "מאה חמישים ושלוש"



In [5]:
def find_sequences(sentence, words):
    pattern = r'\b(?:[א-ת]?' + '|[א-ת]?'.join(words) + r')(?:\s(?:ו?' + '|ו?'.join(words) + r'))*\b'
    matches = re.findall(pattern, sentence)
    return matches


In [6]:
# Test the find_sequence function
s1= "מאה חמישים ושלוש סתם בדיקה"
s2= " בדיקה מאתיים ארבעים ושתיים"
s3= "שבעים ושבע כגדכג"
res = []
for s in [s1,s2,s3]:
    res+= find_sequences(s, heb_numbers_keywords )
res

['מאה חמישים ושלוש', 'מאתיים ארבעים ושתיים', 'שבעים ושבע']

The translation function from hebrew to english.

In [7]:
#!pip install translate
from translate import Translator

def translate_hebrew_to_english(text):
    translator = Translator(to_lang="en", from_lang="he")
    translation = translator.translate(text)
    return translation

Putting everything together:

In [8]:
def question2(text):
    # Contains the hebrew words representing numbers
    words_to_replace = find_sequences(text, heb_numbers_keywords)
    # Translation of each of the hebrew words representing numbers
    eng_version = [translate_hebrew_to_english(w) for w in words_to_replace]
    # List that will contain the corresponding numbers in integer form
    nums = []
    for w in eng_version:
        try:
            nums.append(w2n.word_to_num(w))
        except ValueError:
            # One of the values wasn't translated correctly, or the string is not a number
            nums.append('')
            pass

    eng_heb_dict = list(zip(eng_version, words_to_replace, nums))
    sorted_words = sorted(eng_heb_dict, key=lambda x: len(x[1]), reverse=True)
    modified_text = ''
    for t in sorted_words:
        old_text_heb = t[1]
        if t[0] == '':
            # Something went wrong with translation, keep the original word
            new_text_num = t[1]
        else:
            # Translation was successful
            new_text_num = str(t[2])
        modified_text = text.replace(old_text_heb, new_text_num)
    return modified_text

In [9]:
def example2():
    get_article_content()
    with open('article_content.txt', 'r') as file:
        file_content = file.read()
        modified_content = question2(file_content)
        with open('modified_content.txt', 'w') as mod_file:
            mod_file.write(modified_content)
        print('The modified file has been created')
# example2()

The content was saved to a file
The modified file has been created


# Question 3


## Assumptions:
Numeric value describing the quantity of an object appears after the object.

## Flow
1. parse the text, identify numeric values.
2. whenever encountering a numeric value, that follows a word save it as a tuple within a list of results.
3. save all of the saved results into a dataframe

In [10]:
def find_num_word_tuple(text):
    # a regex that searches for a number followed by a space and another word.
    # it excepts also prefix for the number
    pattern = r'\b[א-ת]*(\d+)\s*([a-zA-Zא-ת]+)\b'
    matches = re.findall(pattern, text)
    results = []
    for match in matches:
        number, word = match
        results.append((number,word))
        print(f"Number: {number}, Word: {word}")
    return results

In [15]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def is_noun(heb_word):
    translated = translate_hebrew_to_english(heb_word)
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    words = word_tokenize(word)
    tagged_words = pos_tag(words)
    
    # Check if the first word is tagged as a noun (NN or NNS)
    if tagged_words and tagged_words[0][1] in ['NN', 'NNS']:
        return True
    else:
        return False


In [16]:
import pandas as pd

def question3(text):
    # the given text is after the processing done in question2, therefore we can look for numeric values
    results = find_num_word_tuple(text)
    filter_nouns_func = lambda x: True if is_noun(x[1]) else False
    results = list(filter(filter_nouns_func,results))                
    n = len(results)
    data = {"Number": [int(results[i][0]) for i in range(n)], 
            "Word": [results[i][1] for i in range (n)]}
    df = pd.DataFrame(data)
    print(df)
    return df

with open('modified_content.txt', 'r') as modified_file:
        content = modified_file.read()
        question3(content)

Number: 7, Word: באוקטובר
Number: 2, Word: שאר
Number: 26, Word: באוקטובר
Number: 5, Word: דקות
Number: 2, Word: אמניות
Number: 2, Word: שצריך
Number: 7, Word: באוקטובר
Number: 14, Word: מחזיקה
Number: 84, Word: מנחל
Number: 25, Word: שנה
Number: 2, Word: בנות
Number: 20, Word: ו
Number: 12, Word: ברמת
Number: 2, Word: את
Number: 3, Word: צפייה
Number: 2, Word: שאני
Number: 2, Word: את
Number: 2, Word: שכל
Number: 22, Word: הקמתי
Number: 27, Word: עזבה
Number: 2, Word: קלטות
Number: 2, Word: שמתייחסים
Number: 2, Word: אותו
Number: 15, Word: שנה
Number: 70, Word: לאבא
Number: 3, Word: צפייה
Number: 2015, Word: יחד
Number: 2, Word: הצגות


[nltk_data] Downloading package punkt to /home/dean/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dean/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


NameError: name 'word' is not defined

# Question 4
In order to sort all the articles by their level "closeness" to a single article chosen, we will have to define some metrics.

## Phase 0 - Preprocessing
For each article:
1. Convert each word to its original form, such as "רוה"מ" -> "ראש הממשלה" and "כלבים" -> "כלב"
2. Remove 
## Phase 1 - embedding