## Header

Author : Amina Matt  
Date created : 07.11.2021  
Date last mofidied : 07.11.2021  
Description : Extracting most frequent words from a newspaper website  

### Libraries

In [279]:
import nltk #natural language processing library
nltk.download('stopwords')
import requests #http library
from bs4 import BeautifulSoup #extraction from HTML and XML files

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aminamatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Functions

In [305]:
#From an html file sitting at a specific url extract all text inside a specific tag
def text_from_url(url,tag):
    response = requests.get(url) #http request
    headlines_text = '' #empty string
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    headlines = soup.find_all(tag) #get all the elements within a specified tag
    for headline in headlines:
        headlines_text = headlines_text + (headline.get_text()) #concatenating the text in a single string
    return headlines_text 


def links_from_url(url):
    response = requests.get(url) #http request
    headlines_text = '' #empty string
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    list_of_links = []
    for link in soup.find_all('a'): #get all elements within the a tag
        list_of_links.append(link.get('href')) #extract the url from the elements
    return list_of_links 

In [297]:
#Counting the frequency of single words in the text
#https://stackoverflow.com/questions/28392860/print-10-most-frequently-occurring-words-of-a-text-that-including-and-excluding

def words_frequency(text):
    #separate the text into words 
    allWords = nltk.tokenize.word_tokenize(text) 
    #gets rid on 1-gram and 2-gram
    allLongWords = []
    for word in allWords:
        if len(word) > 2: 
            allLongWords.append(word)   
    #print(allWordsH3_3grams)
    #allWordDist = nltk.FreqDist(w.lower() for w in allWordsH3_3grams)
    stopwords = nltk.corpus.stopwords.words('english') #get rids of words such as a, the, and etc..
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in allLongWords if w not in stopwords) 
    return allWordExceptStopDist
    

## Most frequent words from headlines

In [301]:
URL = 'https://www.euronews.com/tag/gender-equality' 
TAG = 'h3'
headlines_text = text_from_url(URL,TAG)
print(headlines_text)


                      Malaysian gynaecologist creates 'world's first unisex condom'
                  
                      COVID causing 'big losses' for gender equality in Europe, says report
                  
                      Will 'bikini bottoms' to 'tight pants' uniform change satisfy critics?
                  
                      Women dominate top table in Albania’s new government
                  
                      Summer School for Female Leadership aims to close gender gap
                  
                      Here’s why we should raise boys like girls | View
                  
                      Irish Football Association introduces equal pay for men and women
                  
                      Polish region wants to remain 'LGBT-free' despite risking EU funds
                  
                      Photo gallery: Burundi's women's status is being rethought
                  
                      Liberia’s FaithVonic: making music to overcome tr

### Counting words

In [299]:
headlines_freq = words_frequency(headlines_text)
MAX = 10

for word, frequency in headlines_freq.most_common(MAX):
    print('%s;%d' % (word, frequency))

women;8
podcast;5
gender;3
europe;3
change;3
men;3
the;3
back;3
covid;2
says;2


## Most frequent words from articles cited in the *Gender Equality* page

In [311]:
URL = 'https://www.euronews.com/tag/gender-equality' 
list_of_links = links_from_url(URL)
list_of_links = list_of_links[139:-1] #remove first irrelevant links

In [312]:
#cleaning links
clean_url = []
for link in list_of_links:
    if '2021' in link:
        url = link
        clean_url.append('https://euronews.com'+url)

TypeError: argument of type 'NoneType' is not iterable

In [272]:
clean_url

['https://euronews.com/next/2021/10/28/malaysian-gynaecologist-creates-world-s-first-unisex-condom',
 'https://euronews.com/next/2021/10/28/malaysian-gynaecologist-creates-world-s-first-unisex-condom',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/09/17/women-dominate-top-tabl

Now we can parse each all the HTMLs of the URL we have gathered.

In [317]:
all_articles = ''
for url in clean_url:
    all_articles = all_articles + text_from_url(url,'p')

In [318]:
len(all_articles)

404007

In [320]:
all_articles



In [321]:
articles_freq = words_frequency(all_articles)
MAX = 100

for word, frequency in articles_freq.most_common(MAX):
    print('%s;%d' % (word, frequency))

women;423
men;412
people;237
also;224
like;212
and;184
one;181
said;157
gender;152
man;150
would;149
country;134
the;128
know;114
n't;110
young;106
they;106
many;102
podcast;102
european;98
find;95
family;92
africa;92
home;91
this;90
boy;88
but;87
work;84
even;84
zama;84
years;83
world;82
time;80
think;79
want;78
female;77
olavario;76
see;75
get;75
come;72
euronews;71
way;71
live;70
need;69
make;68
back;68
two;67
around;65
help;64
programme;64
lesotho;64
part;63
lot;63
pressure;63
abortion;63
take;62
violence;62
change;62
school;62
masculinity;62
cry;62
different;61
life;61
south;60
families;60
hiv;60
told;59
going;58
leave;57
new;56
journalism;56
're;56
episode;56
means;56
often;54
says;53
first;53
pandemic;53
there;53
could;53
able;53
story;53
europe;51
things;51
woman;50
group;50
started;50
polish;50
guinea;50
mamadou;50
went;48
boys;48
you;48
adventure;48
care;46
kind;46
used;45
government;45
she;45
become;45


## Most frequent n-grams

In [278]:
#bigrams
from collections import Counter

bigrams = zip(allLongWords, allLongWords[1:])
counts = Counter(bigrams)
print(counts.most_common(100))

[(('that', 'they'), 69), (('they', 'have'), 45), (('South', 'Africa'), 42), (('have', 'been'), 38), (('gender', 'equality'), 36), (('zama', 'zama'), 36), (('because', 'they'), 35), (('Rosalind', 'Morris'), 34), (('that', 'this'), 33), (('programme', 'funded'), 30), (('funded', 'European'), 30), (('European', 'Journalism'), 30), (('Journalism', 'Centre'), 30), (('Centre', 'through'), 30), (('through', 'European'), 30), (('European', 'Development'), 30), (('Development', 'Journalism'), 30), (('Journalism', 'Grants'), 30), (('Grants', 'programme'), 30), (('programme', 'This'), 30), (('This', 'fund'), 30), (('fund', 'supported'), 30), (('supported', 'Bill'), 30), (('Bill', 'Melinda'), 30), (('Melinda', 'Gates'), 30), (('find', 'that'), 30), (('Gates', 'Foundation.Share'), 28), (('Foundation.Share', 'this'), 28), (('know', 'that'), 28), (('Peya', 'Diaw'), 28), (('this', 'podcast'), 28), (('when', 'they'), 27), (('their', 'families'), 26), (('think', 'that'), 25), (('that', 'there'), 25), ((