## Header

Author : Amina Matt  
Date created : 07.11.2021  
Date last mofidied : 08.11.2021  
Description : Extracting most frequent words from a newspaper website  

### Libraries

In [7]:
import nltk #natural language processing library
nltk.download('stopwords')
import requests #http library
from bs4 import BeautifulSoup #extraction from HTML and XML files
from collections import Counter #dictionary subclass for counting hashable objects

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aminamatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Librairies for debugging
#import http.client 
#import logging

## Functions

In [37]:
#From an html file sitting at a specific url extract all text inside a specific tag
def text_from_url(url,tag,LIMIT):
    headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
    response = requests.get(url,headers=headers) #http request with a user-agent string to avoid blocking from server
    headlines_text = '' #empty string
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    headlines = soup.find_all(tag,limit=LIMIT) #get all the elements within a specified tag
    for headline in headlines:
        headlines_text = headlines_text +' '+ (headline.get_text()) #concatenating the text in a single string
    return headlines_text 


def links_from_url(url,LIMIT):
    headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
    response = requests.get(url,headers = headers) #http request with headers to avoid blocking from server
    headlines_text = '' #empty string
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    list_of_links = []
    for link in soup.find_all('a',limit=LIMIT): #get all elements within the a tag
        list_of_links.append(link.get('href')) #extract the url from the elements
    list_of_links = list(set(list_of_links)) #remove duplicates
    return list_of_links 

In [38]:
#Counting the frequency of single words in the text
#https://stackoverflow.com/questions/28392860/print-10-most-frequently-occurring-words-of-a-text-that-including-and-excluding

def words_frequency(text):
    #separate the text into words 
    allWords = nltk.tokenize.word_tokenize(text) 
    #gets rid on 1-gram and 2-gram
    allLongWords = []
    for word in allWords:
        if len(word) > 2: 
            allLongWords.append(word)   
    #print(allWordsH3_3grams)
    #allWordDist = nltk.FreqDist(w.lower() for w in allWordsH3_3grams)
    stopwords = nltk.corpus.stopwords.words('english') #get rids of words such as a, the, and etc..
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in allLongWords if w not in stopwords) 
    return allWordExceptStopDist
    

In [43]:
#Counting the frequency of n-grams in the text
#Input: take the text as a single string of text
#Output: 
#Requirement: Counter library
def ngram_frequency(text):
    #separate the text into words 
    allWords = nltk.tokenize.word_tokenize(text) 
    #gets rid on 1-gram and 2-gram
    allLongWords = []
    for word in allWords:
        if len(word) > 2: 
            allLongWords.append(word)   
    stopwords = nltk.corpus.stopwords.words('english') #list of words such as a, the, and etc..
    allWordExceptStop =[]
    #get rid of stopwords
    for w in allLongWords:
        if w.lower() not in stopwords:
            allWordExceptStop.append(w)
    bigrams = zip(allWordExceptStop, allWordExceptStop[1:])
    bigramsFreq = nltk.FreqDist(bigrams) 
    return bigramsFreq

In [234]:
def article_from_url(url):
    
    response = requests.get(url,headers=headers) #http request with a user-agent string to avoid blocking from server
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    soup.find(type="application/ld+json")
    jsonArticle = json.loads(soup.find(type="application/ld+json").string)
    text=jsonArticle['articleBody']
    clean_text = text.replace('.Copyright 2021 The&nbsp;Associated Press. All rights reserved. This material may not be published, broadcast, rewritten or redistributed.',' ').replace('Associated Press',' ').replace('quot',' ')
    
    return clean_text

In [213]:
def get_urls_usnews(URL):
    
    response = requests.get(URL,headers=headers) #http request with a user-agent string to avoid blocking from server
    soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
    latest = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"}) #get all the elements within a specified tag
    list_of_urls = []
    for a in latest.find_all('a'):
        list_of_urls.append(a['href'])
    usnews_urls = list(set(list_of_urls))
    
    return usnews_urls


def get_all_articles(usnews_urls):
    all_articles = ''
    for url in usnews_urls:
        all_articles = all_articles +' '+article_from_url(url)
    
    return all_articles



# Euronews

## Most frequent words from headlines

In [30]:
URL = 'https://www.euronews.com/tag/gender-equality' 
TAG = 'h3'
headlines_text = text_from_url(URL,TAG)
#print(headlines_text)

### Counting words

In [31]:
headlines_freq = words_frequency(headlines_text)
MAX = 10

for word, frequency in headlines_freq.most_common(MAX):
    print('%s;%d' % (word, frequency))

women;8
podcast;5
gender;3
europe;3
change;3
men;3
the;3
back;3
covid;2
says;2


### Counting Bigrams

In [33]:
#bigrams
headlines_bigram_freq = ngram_frequency(headlines_text)
headlines_bigram_freq

FreqDist({('gender', 'gap'): 2, ('Malaysian', 'gynaecologist'): 1, ('gynaecologist', 'creates'): 1, ('creates', "'world"): 1, ("'world", 'first'): 1, ('first', 'unisex'): 1, ('unisex', "condom'"): 1, ("condom'", 'COVID'): 1, ('COVID', 'causing'): 1, ('causing', "'big"): 1, ...})

## Most frequent words from articles cited in the *Gender Equality* page

In [34]:
URL = 'https://www.euronews.com/tag/gender-equality' 
list_of_links = links_from_url(URL)
list_of_links = list_of_links[139:-1] #remove first irrelevant links

In [35]:
#cleaning links
clean_url = []
for link in list_of_links:
    if '2021' in link:
        url = link
        clean_url.append('https://euronews.com'+url)

TypeError: argument of type 'NoneType' is not iterable

In [272]:
clean_url

['https://euronews.com/next/2021/10/28/malaysian-gynaecologist-creates-world-s-first-unisex-condom',
 'https://euronews.com/next/2021/10/28/malaysian-gynaecologist-creates-world-s-first-unisex-condom',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/28/gender-equality-index-2021-covid-19-caused-big-losses-for-gender-equality-in-europe',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/10/21/norway-beach-handball-sexism-row-will-bikini-bottoms-to-tight-pants-kit-change-satisfy-cri',
 'https://euronews.com/2021/09/17/women-dominate-top-tabl

Now we can parse each all the HTMLs of the URL we have gathered.

In [63]:
all_articles = ''
for url in clean_url:
    all_articles = all_articles + text_from_url(url,'p',10000)

NameError: name 'clean_url' is not defined

In [318]:
len(all_articles)

404007

In [320]:
all_articles



In [321]:
articles_freq = words_frequency(all_articles)
MAX = 100

for word, frequency in articles_freq.most_common(MAX):
    print('%s;%d' % (word, frequency))

women;423
men;412
people;237
also;224
like;212
and;184
one;181
said;157
gender;152
man;150
would;149
country;134
the;128
know;114
n't;110
young;106
they;106
many;102
podcast;102
european;98
find;95
family;92
africa;92
home;91
this;90
boy;88
but;87
work;84
even;84
zama;84
years;83
world;82
time;80
think;79
want;78
female;77
olavario;76
see;75
get;75
come;72
euronews;71
way;71
live;70
need;69
make;68
back;68
two;67
around;65
help;64
programme;64
lesotho;64
part;63
lot;63
pressure;63
abortion;63
take;62
violence;62
change;62
school;62
masculinity;62
cry;62
different;61
life;61
south;60
families;60
hiv;60
told;59
going;58
leave;57
new;56
journalism;56
're;56
episode;56
means;56
often;54
says;53
first;53
pandemic;53
there;53
could;53
able;53
story;53
europe;51
things;51
woman;50
group;50
started;50
polish;50
guinea;50
mamadou;50
went;48
boys;48
you;48
adventure;48
care;46
kind;46
used;45
government;45
she;45
become;45


## Most frequent n-grams

In [46]:
#bigrams
articles_bigram_freq = ngram_frequency(all_articles)

#    print('%s;%d' % (word, frequency))

0

In [56]:
articles_bigram_freq.most_common(100)

[(('Like', 'Boy'), 60),
 (('Cry', 'Like'), 52),
 (('South', 'Africa'), 42),
 (('zama', 'zama'), 36),
 (('young', 'men'), 34),
 (('Rosalind', 'Morris'), 34),
 (('gender', 'equality'), 31),
 (('programme', 'funded'), 28),
 (('funded', 'European'), 28),
 (('European', 'Journalism'), 28),
 (('Journalism', 'Centre'), 28),
 (('Centre', 'European'), 28),
 (('European', 'Development'), 28),
 (('Development', 'Journalism'), 28),
 (('Journalism', 'Grants'), 28),
 (('Grants', 'programme'), 28),
 (('programme', 'fund'), 28),
 (('fund', 'supported'), 28),
 (('supported', 'Bill'), 28),
 (('Bill', 'Melinda'), 28),
 (('Melinda', 'Gates'), 28),
 (('Peya', 'Diaw'), 28),
 (('Gates', 'Foundation.Share'), 26),
 (('men', 'women'), 25),
 (('Mpiwa', 'Mangwiro'), 24),
 (('Summer', 'School'), 21),
 (('Castbox', 'Spotify'), 20),
 (('Spotify', 'Apple'), 20),
 (('women', 'men'), 18),
 (('female', 'players'), 18),
 (('available', 'French'), 18),
 (('series', 'podcast'), 16),
 (('gender', 'roles'), 16),
 (('Cecelia'

# Multiple newspaper

## US news

### Headlines : Gender tag articles 

In [15]:
URL = 'https://www.usnews.com/topics/subjects/gender'
LEVEL = 'h3'
headlines_text = text_from_url(URL,LEVEL,100)
headlines_text

'How Student Loan Debt Affects WomenThe Best States for Income Equality10 Things You Didn’t Know About The Top-Ranking CountriesThe Politics of Gender EqualityCulture Matters in Managing a PandemicWomen Remain Underrepresented in Local PoliticsUnsupported State Budgets May Hit Women’s Wallets HardestOpinion: Virus May Harm Gender EqualityBest States for Women Gender Diversity in California’s Companies\r\n'

In [19]:
articles_bigram_freq = ngram_frequency(headlines_text)
articles_bigram_freq

FreqDist({('Student', 'Loan'): 1, ('Loan', 'Debt'): 1, ('Debt', 'Affects'): 1, ('Affects', 'WomenThe'): 1, ('WomenThe', 'Best'): 1, ('Best', 'States'): 1, ('States', 'Income'): 1, ('Income', 'Equality10'): 1, ('Equality10', 'Things'): 1, ('Things', 'Know'): 1, ...})

## Getting only the part with latest 

In [84]:
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
response = requests.get(URL,headers=headers) #http request with a user-agent string to avoid blocking from server
soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
latest = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"}) #get all the elements within a specified tag

In [131]:
list_of_urls = []
for a in latest.find_all('a', href=True):
    list_of_urls.append(a['href'])
usnews_urls = list(set(list_of_urls))
usnews_urls

['https://www.usnews.com/news/best-states/slideshows/states-with-the-most-equal-pay-by-gender/',
 'https://www.usnews.com/news/best-countries/articles/2021-04-13/the-greatest-gender-inequality-in-the-world-is-in-politics',
 'https://www.usnews.com/news/best-states/articles/2020-03-04/many-california-corporations-refuse-to-follow-gender-diversity-law-report-finds',
 'https://www.usnews.com/news/cities/articles/2020-11-18/despite-recent-election-gains-women-remain-underrepresented-in-local-politics',
 'https://www.usnews.com/news/best-states/articles/2020-03-05/the-best-states-in-america-for-women',
 'https://www.usnews.com/news/best-countries/articles/2021-03-25/countries-culture-matters-when-fighting-the-covid-19-pandemic',
 'https://www.usnews.com/news/best-countries/articles/2020-04-06/commentary-coronavirus-pandemic-may-set-women-back-decades-on-equality',
 'https://www.usnews.com/news/best-states/articles/2020-09-29/failure-to-shore-up-state-budgets-may-hit-womens-wallets-especiall

### URLs and text of Gender tag articles

In [None]:
# def article_from_url(url):
#     url_test = 'https://www.usnews.com/news/best-states/slideshows/states-with-the-most-equal-pay-by-gender/'
# #text_test = text_from_url(url_test,'p',10000)
# response = requests.get(url_test,headers=headers) #http request with a user-agent string to avoid blocking from server
# soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
# soup.find(type="application/ld+json")
# jsonArticle = json.loads(soup.find(type="application/ld+json").string)
# text=jsonArticle['articleBody']

In [41]:
import json

In [219]:
article_from_url(url)

'Gender Discrimination in U.S. Workplaces\nWomen were paid about 84% of what men were in 2020, and this gap has remained relatively steady for the past 15 years, according to a May 2021 survey by the Pew Research Center. Pew notes that women would have had to work an extra 42 days to earn what men did in 2020, based on this estimate.For its 2021 Best States rankings, U.S. News determined which states have the smallest income gap by gender by analyzing 2019 American Community Survey data from the U.S. Census Bureau.These states made the top 10 for income equality by gender:\n\n10. Connecticut\nWomen&#39;s pay as a percentage of men&#39;s pay: 83.7%Connecticut ranks in the bottom 10 for the Opportunity category overall but is buoyed by its better-than-average income equality percentage and poverty rate.Learn more about Connecticut.\n\n9. Alaska\nWomen&#39;s pay as a percentage of men&#39;s pay: 84.5%Alaska ranks in the top 10 for the economic opportunity subcategory, with a median househ

In [220]:
all_articles = ''
for url in usnews_urls:
    all_articles = all_articles + article_from_url(url)

In [222]:
usNewsGEbigramFreq = ngram_frequency(all_articles)

In [223]:
usNewsGEbigramFreq = ngram_frequency(all_articles)
MAX = 100

for word, frequency in usNewsGEbigramFreq.most_common(MAX):
    print('%s;%d' % (word, frequency))

('Women', 'pay');90
('pay', 'percentage');90
('percentage', 'men');90
('men', 'pay');90
('Rhode', 'Island');36
('North', 'Carolina');36
('New', 'York');36
('income', 'equality');27
('ranks', 'top');27
('top', 'economic');27
('economic', 'opportunity');27
('median', 'household');27
('household', 'income');27
('national', 'average.Learn');27
('men', '2020');18
('Best', 'States');18
('Opportunity', 'category');18
('opportunity', 'equality');18
('equality', 'subcategories.Learn');18
('equality', 'subcategory');18
('Gender', 'Discrimination');9
('Discrimination', 'U.S.');9
('U.S.', 'Workplaces');9
('Workplaces', 'Women');9
('Women', 'paid');9
('paid', 'men');9
('2020', 'gap');9
('gap', 'remained');9
('remained', 'relatively');9
('relatively', 'steady');9
('steady', 'past');9
('past', 'years');9
('years', 'according');9
('according', 'May');9
('May', '2021');9
('2021', 'survey');9
('survey', 'Pew');9
('Pew', 'Research');9
('Research', 'Center');9
('Center', 'Pew');9
('Pew', 'notes');9
('note

## *Women's rights* tag

In [31]:
URL = 'https://www.usnews.com/topics/subjects/womens-rights'

In [59]:
#No scroll
headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
response = requests.get(URL,headers=headers) #http request with a user-agent string to avoid blocking from server
response.headers
#soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
#soup
#latest = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"}) #get all the elements within a specified tag
#latest

{'Content-Encoding': 'gzip', 'Content-Type': 'text/html;charset=UTF-8', 'Server': 'N/A', 'X-Powered-By': 'Brightspot', 'X-Akamai-Transformed': '9 - 0 pmb=mTOE,3', 'Date': 'Tue, 09 Nov 2021 10:37:28 GMT', 'Content-Length': '36929', 'Connection': 'keep-alive', 'Vary': 'Accept-Encoding', 'Set-Cookie': 'usn_visitor_id=94641102fe070000684f8a617c01000063840100; expires=Mon, 31-Dec-2038 23:59:59 GMT; path=/; domain=.usnews.com, akacd_www=2177452799~rv=15~id=2903e07e0ea81fc9fca84ae4ba35caa7; path=/; Expires=Mon, 31 Dec 2038 23:59:59 GMT; Secure; SameSite=None, bm_mi=C5995F96F6939EA53FA3BA0137CE3BE9~+MZKpZknulh7XfkAxyOmO1p+Z3TKZajshsRA/9Nyxt/CH7qtUQ6Tanoh6IOyUJA2/TLfXP2PT7PgO/zg/sRjex5VTiV8ZsclRZ2PpgmOAcQUB/ELTjZGyuDvT298lb3DUQ+Y9P0O/I5rPfFwvVI1rX79xrIZ6nQ6KbTFdV9W4n0v7wG9NNumuX5on7C5+P17atjnjr628ToMEGi+4MVxzblvoWtVyKbG2RA7ZTdZNA8PoDQYKLOY6pGy19jyploR; Domain=.usnews.com; Path=/; Max-Age=0; HttpOnly, _abck=FA24640C7B84B2DC7F035A0389AF5E4D~-1~YAAQlGQRAmJQ/fx8AQAAvC9GBAZP67fhozI3zISgEPNnncgFvY0B7

In [34]:
list_of_urls = []
for a in latest.find_all('a'):
    list_of_urls.append(a['href'])
usnews_urls = list(set(list_of_urls))
usnews_urls

['https://www.usnews.com/news/best-states/south-carolina/articles/2021-10-06/editorial-roundup-south-carolina',
 'https://www.usnews.com/news/world/articles/2021-10-19/malta-vows-better-journalist-protections-reforms',
 'https://www.usnews.com/news/health-news/articles/2021-10-08/texas-judge-says-abortions-can-resume-but-future-uncertain',
 'https://www.usnews.com/news/best-states/montana/articles/2021-10-06/montana-man-committed-over-allegations-of-child-molestation',
 'https://www.usnews.com/news/business/articles/2021-10-12/eu-pledges-1-billion-euros-for-afghan-people-at-virtual-g-20',
 'https://www.usnews.com/news/best-states/kansas/articles/2021-11-01/kansas-lawmaker-charged-with-battery-mental-exam-ordered',
 'https://www.usnews.com/news/politics/articles/2021-10-28/rival-violent-past-should-bar-herschel-walker-from-senate',
 'https://www.usnews.com/news/health-news/articles/2021-10-07/texas-judge-says-abortions-can-resume-but-future-uncertain',
 'https://www.usnews.com/news/worl

In [235]:
all_articles = ''
for url in usnews_urls:
    all_articles = all_articles + article_from_url(url)

In [238]:
usNewsWRbigramFreq = ngram_frequency(all_articles)
MAX = 100


for word, frequency in usNewsWRbigramFreq.most_common(MAX):
    print('%s;%d' % (word, frequency))

('Associated', 'Press');14
('2021', 'nbsp');9
('nbsp', 'Associated');9
('Press', 'rights');9
('rights', 'reserved');9
('reserved', 'material');9
('material', 'may');9
('may', 'published');9
('published', 'broadcast');9
('broadcast', 'rewritten');9
('rewritten', 'redistributed.By');8
('Supreme', 'Court');8
('Texas', 'law');8
('court', 'records');7
('White', 'mother');6
('abortion', 'rights');6
('federal', 'government');6
('law', 'professor');6
('Abdi', 'said');5
('federal', 'law');5
('South', 'Carolina');5
('told', 'reporters');4
('civil', 'commitment');4
('law', 'enforcement');4
('United', 'States');4
('woman', 'right');4
('abortion', 'providers');4
('federal', 'court');4
('enforce', 'law');4
('question', 'whether');4
('providers', 'may');4
('Texas', 'abortion');4
('abortion', 'clinics');4
('clinics', 'Texas');4
('per', '100,000');4
('important', 'enough');4
('European', 'Union');3
('Afghan', 'people');3
('300', 'million');3
('sexually', 'dangerous');3
('federal', 'psychiatric');3
('ac

In [48]:
all_articles

'By MARÍA VERZA, Associated PressMEXICO CITY (AP) — Mexico’s Supreme Court ruled Tuesday that it is unconstitutional to punish abortion, unanimously annulling several provisions of a law from Coahuila — a state on the Texas border — that had made abortion a criminal act.The decision will immediately affect only the northern border state, but it establishes a historic precedent and “obligatory criteria for all of the country’s judges,” compelling them to act the same way in similar cases, said court President Arturo Zaldívar. “From now on you will not be able to, without violating the court\'s criteria and the constitution, charge any woman who aborts under the circumstances this court has ruled as valid.”Those circumstances will be clarified when the decision is published, but everything points to that referring to abortions carried out within the first 12 weeks of a pregnancy, the period allowed in the four states where abortion is already legal.The decision comes one week after a Tex

## *Other tags* tag

In [246]:
URL_TOPIC_LIST = ['https://www.usnews.com/topics/subjects/feminism',
             'https://www.usnews.com/topics/subjects/gender',
              'https://www.usnews.com/topics/subjects/gender_bias',
             'https://www.usnews.com/topics/subjects/sexism',
            # 'https://www.usnews.com/topics/subjects/womens-history',
            #  'https://www.usnews.com/topics/subjects/working_women',
            #'https://www.usnews.com/topics/subjects/womens-health',
            ]

all_articles = ''
for url_topic in URL_TOPIC_LIST:
    #Retrieve all urls for latest articles in the specific feminism subject page
    usnews_topic_urls = get_urls_usnews(url_topic)
    
    #Retrieve all the articles contents for the latest articles
    all_articles_topic =  get_all_articles(usnews_topic_urls)
    
    #append articles to create one text
    all_articles = all_articles +' '+all_articles_topic

#Export all the articles of interest in a singel text file
text_file = open("Articles-Contens.txt", "w")
text_file.write(all_articles)
text_file.close()

### Bigrams frequency for selected articles from USNews.com on topic of women

In [247]:
#Couting bigram frequencies for all articles of interest

usNewsFEMbigramFreq = ngram_frequency(all_articles)
MAX = 150

In [255]:
#Visualize the most common bigrams
for word, frequency in usNewsFEMbigramFreq.most_common(MAX):
    if (word[0][0].isupper() or word[1][0].isupper()):
        print('')
    else:
        print('%s;%d' % (word, frequency))


('gender', 'equality');22


('child', 'care');12

('health', 'care');11
('men', 'pay');11
('percentage', 'men');11


('pay', 'percentage');10



('vice', 'president');8
('sexual', 'harassment');8
('women', 'girls');7
('electoral', 'system');7
('girls', 'women');7



('first', 'time');6



('coronavirus', 'pandemic');6


('rates', 'women');6
('women', 'according');6

('female', 'mayors');6
('share', 'women');6
('one', 'highest');6


('women', 'movement');5
('see', 'women');5
('good', 'job');5

('social', 'media');5
('gender', 'stereotypes');5
('five', 'years');5


('gender', 'gap');5
('death', 'rate');5
('public', 'schools');5
('state', 'budgets');5
('top', 'states');5
('women', 'representation');5

('became', 'first');5
('sex', 'discrimination');5
('states', 'women');5

('lose', 'weight');5

('women', 'rights');4


('first', 'nonfiction');4
('nonfiction', 'book');4
('book', 'decade');4
('year', 'pandemic');4

('really', 'began');4
('woman', 'time');4




('100', 'million');4
('six', '

### Final List

In [261]:
#Create the final list of bigrams

bigram_final_list = []
MAX = 100
for word, frequency in usNewsFEMbigramFreq.most_common(MAX):
    if (word[0][0].isupper()==False and word[1][0].isupper()==False):
        bigram_final_list.append(word[0]+' '+word[1])

bigram_final_list

['gender equality',
 'child care',
 'health care',
 'men pay',
 'percentage men',
 'pay percentage',
 'vice president',
 'sexual harassment',
 'women girls',
 'electoral system',
 'girls women',
 'first time',
 'coronavirus pandemic',
 'rates women',
 'women according',
 'female mayors',
 'share women',
 'one highest',
 'women movement',
 'see women',
 'good job',
 'social media',
 'gender stereotypes',
 'five years',
 'gender gap',
 'death rate',
 'public schools',
 'state budgets',
 'top states',
 'women representation',
 'became first',
 'sex discrimination',
 'states women',
 'lose weight',
 'women rights',
 'first nonfiction',
 'nonfiction book',
 'book decade',
 'year pandemic',
 'really began',
 'woman time',
 '100 million',
 'six years',
 'regions say',
 'young people',
 'based gender',
 'public school',
 'federal government',
 'top economic',
 'education health',
 'proportional electoral',
 'female candidates',
 'top five',
 'gender-based violence',
 'metro area',
 'entirely f

In [264]:
#Add my own vocabulary
my_bigrams = ['women\'s right','Equal opportunities','Equal rights','Equal status',
           'equal pay','gender gap','Gender discrimination','Gender equality','Sexual harrasment','Women empowerment',
            'women victim','women immigration','Women emancipation','women\'s participation','Western women','non-western woman',
              'Muslim women', 'Equal wages','Gender equality',
             'gender equity','Men and women', 'women and men', 'women oppression', 'niqab ban'
           'struggle of girls','struggle of women', 'war against women','oppression of girls','oppression of women',
           'women oppression','women\'s opression','liberate women','religious oppresion',
           'abuse of women','Male oppression','Female oppression','Exploitation of women',
           'Indigenous women','Patriarchal culture']

all_bigrams = my_bigrams + bigram_final_list
all_bigrams

["women's right",
 'Equal opportunities',
 'Equal rights',
 'Equal status',
 'equal pay',
 'gender gap',
 'Gender discrimination',
 'Gender equality',
 'Sexual harrasment',
 'Women empowerment',
 'women victim',
 'women immigration',
 'Women emancipation',
 "women's participation",
 'Western women',
 'non-western woman',
 'Muslim women',
 'Equal wages',
 'Gender equality',
 'gender equity',
 'Men and women',
 'women and men',
 'women oppression',
 'niqab banstruggle of girls',
 'struggle of women',
 'war against women',
 'oppression of girls',
 'oppression of women',
 'women oppression',
 "women's opression",
 'liberate women',
 'religious oppresion',
 'abuse of women',
 'Male oppression',
 'Female oppression',
 'Exploitation of women',
 'Indigenous women',
 'Patriarchal culture',
 'gender equality',
 'child care',
 'health care',
 'men pay',
 'percentage men',
 'pay percentage',
 'vice president',
 'sexual harassment',
 'women girls',
 'electoral system',
 'girls women',
 'first time'

In [51]:
# #EXAMPLE of ONE subject
# URL = 'https://www.usnews.com/topics/subjects/gender_bias'
# #No scroll
# headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
# response = requests.get(URL,headers=headers) #http request with a user-agent string to avoid blocking from server
# soup = BeautifulSoup(response.text, 'html.parser') #parse the document with html format
# latest = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"}) #get all the elements within a specified tag

# list_of_urls = []
# for a in latest.find_all('a'):
#     list_of_urls.append(a['href'])
# usnews_urls = list(set(list_of_urls))
# all_articles = ''
# for url in usnews_urls:
#     all_articles = all_articles + article_from_url(url)

# usNewsFEMbigramFreq = ngram_frequency(all_articles)
# MAX = 100

# for word, frequency in usNewsFEMbigramFreq.most_common(MAX):
#     print('%s;%d' % (word, frequency))

('child', 'care');11
('Los', 'Angeles');11
('quot', 'says');9
('says', 'quot');7
('health', 'care');6
('quot', 'Warren');6
('Angeles', 'County');6
('quot', 'quot');5
('vice', 'president');5
('quot', 'think');5
('Civil', 'Rights');5
('Washington', 'D.C.');4
('Elizabeth', 'Warren');4
('Joe', 'Biden');4
('EMILY', 'List');4
('became', 'first');4
('sex', 'discrimination');4
('civil', 'rights');4
('Biden', 'administration');4
('racist', 'attacks');3
('center', 'stage');3
('said', 'quot');3
('READ', 'Democracy');3
('Democracy', 'Demographics');3
('Demographics', 'data');3
('data', 'behind');3
('behind', 'votes');3
('presidential', 'nominee');3
('New', 'York');3
('Head', 'Start');3
('Child', 'Care');3
('across', 'country');3
('White', 'House');3
('women', 'men');3
('top', 'states');3
('states', 'women');3
('women', 'office');3
('major', 'party');3
('Donald', 'Trump');3
('quot', 'Trump');3
('Supreme', 'Court');3
('women', 'quot');3
('Trump', 'administration');3
('Title', 'rule');3
('sexual', 'a

# Debugging tools

## Requests to servers with whitelist

In [31]:
###Some servers, as usnews, blocks `requests` that aren't coming from browser or a their whitelist. 
###In this case we need to add information to make it looks like we are a browser
###From https://stackoverflow.com/questions/62599036/python-requests-is-slow-and-takes-very-long-to-complete-http-or-https-request

#http.client.HTTPConnection.debuglevel = 0

###You must initialize logging, otherwise you'll not see debug output.
#logging.basicConfig()
#logging.getLogger().setLevel(logging.DEBUG)
#requests_log = logging.getLogger("requests.packages.urllib3")
#requests_log.setLevel(logging.DEBUG)
#requests_log.propagate = True
#requests.get(URL, headers=headers)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.usnews.com:443
DEBUG:urllib3.connectionpool:https://www.usnews.com:443 "GET /topics/subjects/gender HTTP/1.1" 200 37665


<Response [200]>

# Scratch 

## Scrolling test

In [None]:
#With scroll
theLatestFromScroll = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"})
theLatestFromScroll
#latest = theLatestFromScroll.select("a[class*=Anchor-byh49a-0]") # with *= means: contains
#latest

In [205]:
#function to automate the scroll down 
#https://stackoverflow.com/questions/54499193/python-scraping-web-page-for-information-that-only-appears-after-scrolling
import time
import selenium
from selenium import webdriver #website automation library
#from selenium.webdriver.common.touch_actions import TouchActions
from webdriver_manager.chrome import ChromeDriverManager #driver installation library
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# def ignore_sign_up_popover():
#     sign_up = driver.find_elements_by_css_selector('#signup')
#     if len(sign_up) > 0 and sign_up[0].is_displayed():
#         TouchActions(driver).tap_and_hold(0, 0).release(0, 0).perform()

    
def scroll(url):
#driver settings
#     options = webdriver.ChromeOptions()
#     options.add_experimental_option('w3c', False)
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-notifications")
    driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
    driver.implicitly_wait(30) #avoid error due to time lag
    print('Im before try ')
    headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
    #cookies = {'usn_session_id':'3645584794271639'}
    #r = requests.get(URL, cookies=cookies,headers=headers)
    try:
        SCROLL_PAUSE_TIME = 1
        driver.get(url)
        print('current window')
        #t = driver.find_element(By.CSS_SELECTOR, '[id="gdpr-modal-agree"]')
        t = driver.find_element(By.ID, 'gdpr-modal-agree').click()
        print('click done')
        #Execute the JavaScript for scrolling down until the end of the page and return the page source
        last_height = driver.execute_script("return document.body.scrollHeight")
        print(last_height)
        i = 0 
        while True:
            print('One more scroll')
            time.sleep(SCROLL_PAUSE_TIME)
            #scrolling down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5) #to avoid lag in loading
            button = driver.find_element(By.CLASS_NAME, 'button-content').click()
            #button = driver.find_element(By.CLASS_NAME, "Cell-sc-1abjmm4-0 jiILQT Hide-kg09cx-0 kRfYFr").click()
            #WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'button-content'))).click()
            #$driver.findElement(By.xpath("//span[@class='button-content']")).click()
            #button.click()
            print('This is the loading element found')
            print(button)
            #waiting for loading 
            time.sleep(5) #to avoid lag in loading
           
            #calculate new scroll bar height
            new_height = driver.execute_script("return document.body.scrollHeight")
            
            #print('new half scroll')
            #driver.execute_script("window.scrollTo(0, document.body.scrollHeight-1000);")
            #load = driver.find_element(By.CLASS_NAME, 'button-content').click()
            i = i+1
            
        
            if new_height == last_height:
                break
            last_height = new_height
    
        soup = BeautifulSoup(driver.page_source, "html.parser")
        
    finally:
        driver.quit()
    
    return soup

In [206]:
URL = 'https://www.usnews.com/topics/subjects/womens-rights'
soup = scroll(URL)



Current google-chrome version is 95.0.4638
Get LATEST chromedriver version for 95.0.4638 google-chrome
Driver [/Users/aminamatt/.wdm/drivers/chromedriver/mac64/95.0.4638.69/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)


Im before try 
current window
click done
3737
One more scroll
This is the loading element found
None
One more scroll
This is the loading element found
None


In [207]:
latest = soup.find('div',{'class':"LoadMoreWrapper__Container-zwyk5c-0 himujt"}) #get all the elements within a specified tag
latest
list_of_urls = []
for a in latest.find_all('a'):
     list_of_urls.append(a['href'])
usnews_urls = list(set(list_of_urls))
usnews_urls

['https://www.usnews.com/news/best-states/south-carolina/articles/2021-10-06/editorial-roundup-south-carolina',
 'https://www.usnews.com/news/world/articles/2021-10-19/malta-vows-better-journalist-protections-reforms',
 'https://www.usnews.com/news/health-news/articles/2021-10-08/texas-judge-says-abortions-can-resume-but-future-uncertain',
 'https://www.usnews.com/news/best-states/montana/articles/2021-10-06/montana-man-committed-over-allegations-of-child-molestation',
 'https://www.usnews.com/news/business/articles/2021-10-12/eu-pledges-1-billion-euros-for-afghan-people-at-virtual-g-20',
 'https://www.usnews.com/news/best-states/kansas/articles/2021-11-01/kansas-lawmaker-charged-with-battery-mental-exam-ordered',
 'https://www.usnews.com/news/politics/articles/2021-10-28/rival-violent-past-should-bar-herschel-walker-from-senate',
 'https://www.usnews.com/news/health-news/articles/2021-10-07/texas-judge-says-abortions-can-resume-but-future-uncertain',
 'https://www.usnews.com/news/worl

In [70]:
## NO SCROLL
# headers = {"User-Agent": "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"}
# cookies = {'usn_session_id':'3645584794271639'}
# response = requests.get(URL, cookies=cookies, headers=headers)
# soup = BeautifulSoup(response.text, 'html.parser')
# soup


<!DOCTYPE html>

<!-- component="containers/pages/brightspot/tag.js" -->
<html class="Html__HtmlStyled-sc-1np0mfo-0 gjWElA" lang="en" style="scroll-behavior:smooth"><head>
<link as="font" crossorigin="" href="https://fonts.gstatic.com/s/montserrat/v12/IQHow_FEYlDC4Gzy_m8fcmaVI6zN22yiurzcBKxPjFE.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="" href="https://fonts.gstatic.com/s/roboto/v18/oMMgfZMQthOryQo9n22dcuvvDin1pK8aKteLpeZ5c0A.woff2" rel="preload" type="font/woff2"/>
<link as="font" crossorigin="" href="https://fonts.gstatic.com/s/roboto/v18/d-6IYplOFocCacKzxwXSOJBw1xU1rKptJj_0jans920.woff2" rel="preload" type="font/woff2"/>
<title data-rh="true">Sexism | The Latest News on Sexism</title><meta charset="utf-8" data-rh="true"/><meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" data-rh="true" name="viewport"/><meta content="Read the latest articles and commentary on sexism at US News." data-rh="true" name="description"/><me

KeyboardInterrupt: 

## With meta tags and newspaper

In [44]:
import newspaper
from newspaper import Article
usnews = newspaper.Source('https://www.usnews.com/topics/subjects/womens-rights',memoize_articles=False)
paper = usnews.build()
paper 
#for category in usnews.category_urls():
  #  print(category)

In [45]:
for article in paper.articles:
      print(article)

AttributeError: 'NoneType' object has no attribute 'articles'