In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Identifying Keywords from News Articles

Idea:
- Webscrape text from article links
- Use TFIDF/NER to idenitfy keywords from the text for tagging purposes
- Use Keywords to create a summary


## Import Data

In [3]:
date_range = pd.date_range('01-01-2006', datetime.today().date())
df = pd.read_csv("../actions.csv")
df["date"] = pd.to_datetime(df["date"])
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,sources,actions,struggles,employment_types,description,online,locations,companies,workers,tags,author,latlngs,addresses
0,0,1,1979-09-01,['https://twitter.com/clancynewyork/status/117...,['protest'],['ethics'],['white collar workers'],"IBM workers formed an organization, IBM: Speak...",,['worldwide'],['ibm'],,['international_solidarity'],['nataliyaned'],,
1,1,2,1986-09-16,['https://www.nytimes.com/1986/09/16/science/s...,['open letter'],['ethics'],['white collar workers'],Fourteen employees at AT&T sign a letter warni...,,['usa'],['at&t'],14.0,,['nataliyaned'],"[(45.5733162, -122.5587111463347)]","['AT&T, 10025, Northeast Cascades Parkway, Por..."
2,2,3,1986-09-16,['https://www.nytimes.com/1986/09/16/science/s...,['open letter'],['ethics'],['white collar workers'],"A group of computer professionals, led by Comp...",,['usa'],,30.0,,['nataliyaned'],"[(39.7837304, -100.4458825)]",['United States']
3,3,4,1994-01-24,['https://www.nytimes.com/1994/06/12/magazine/...,['open letter'],['ethics'],['white collar workers'],Computer Professionals for Social Responsibili...,True,,,50000.0,,['nataliyaned'],,
4,4,5,1998-11-23,['http://www.cnn.com/tech/computing/9811/23/ms...,['legal action'],"['unfair labor practices', 'pay and benefits']",['contract workers'],A law firm representing 10 current and former ...,,['usa'],['microsoft'],10.0,,['organizejs'],"[(35.139622349999996, -80.92306993327955)]","['Microsoft, Yorkwood, Charlotte, Mecklenburg ..."


## Links to Explore

In [4]:
df["sources"]

0      ['https://twitter.com/clancynewyork/status/117...
1      ['https://www.nytimes.com/1986/09/16/science/s...
2      ['https://www.nytimes.com/1986/09/16/science/s...
3      ['https://www.nytimes.com/1994/06/12/magazine/...
4      ['http://www.cnn.com/tech/computing/9811/23/ms...
                             ...                        
292    ['https://www.msn.com/en-us/news/us/boycott-co...
293    ['https://www.vice.com/en_au/article/ep4qdz/am...
294    ['https://www.news.com.au/finance/work/at-work...
295    ['https://www.cnbc.com/2020/08/13/german-digit...
296    ['https://www.nytimes.com/2020/08/14/technolog...
Name: sources, Length: 297, dtype: object

In [5]:
df["sources"][0]

"['https://twitter.com/clancynewyork/status/1175872040814993408?s=19', 'http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html']"

## Webscrape for source text

In [6]:
url = "http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html"
response = requests.get(url)

In [7]:
soup = BeautifulSoup(response.text, "html.parser")

In [8]:
print(soup)

<html>
<title>The Use of Computers to Support Oppression</title>
<body alink="#FFFF00" background="pictures/olive_paper.gif" link="#F2F295" text="#FFFFFF" vlink="#C9B720">
<center>
<h1><b>The Use of Computers to Support Oppression</b></h1><p>
</p></center>

Computer technology enabled the government to organize and enforce such an atrocious system of segregation and control.

<blockquote>
More than any other single technological advancement, the computer fostered the concentration of administrative power in the hands of Africa's white elite. <a href="apartheid.bib.html"><b>[NAR82]</b></a>
</blockquote>

Despite the U.N. arms embargoes, American computers were in widespread use throughout South Africa.  The United States was the largest supplier of computers used in South Africa. American computers were in use in virtually every governmental agency, the police system, and the military, all of which contributed to the control system known as apartheid.  Computer technology did not merely

## Analyze Sample Source Text

In [9]:
soup.title

<title>The Use of Computers to Support Oppression</title>

In [10]:
soup.body

<body alink="#FFFF00" background="pictures/olive_paper.gif" link="#F2F295" text="#FFFFFF" vlink="#C9B720">
<center>
<h1><b>The Use of Computers to Support Oppression</b></h1><p>
</p></center>

Computer technology enabled the government to organize and enforce such an atrocious system of segregation and control.

<blockquote>
More than any other single technological advancement, the computer fostered the concentration of administrative power in the hands of Africa's white elite. <a href="apartheid.bib.html"><b>[NAR82]</b></a>
</blockquote>

Despite the U.N. arms embargoes, American computers were in widespread use throughout South Africa.  The United States was the largest supplier of computers used in South Africa. American computers were in use in virtually every governmental agency, the police system, and the military, all of which contributed to the control system known as apartheid.  Computer technology did not merely support the system of oppression, rather the entire country was 

In [11]:
if soup.find("img"):
    img_tag = soup.img
    img_tag.decompose()
paragraphs = soup.body.find_all("p")
texts = []
for para in paragraphs:
    texts.extend(para.find_all(text = True))
texts = set(texts)
article = " ".join(texts)

article



### Does the article mention companies that we are already tracking? Which ones?

In [12]:
df["companies"] = df["companies"].astype(str).str.strip("[]\'")
df["companies"] = df["companies"].astype(str).str.replace("'", "")

companies = []
for i in df["companies"].unique():
    companies.extend(i.split(", "))
companies = list(set(companies))

companies.remove("None")
companies.remove("na")

print(companies)

['etown', 'chinese literature', 'cisco', 'wayfair', 'sony', 'hp', 'google', 'at&t', 'N26', 'caviar', 'spin', 'accenture', 'tableau', 'oracle', 'naver', 'foodora', 'ibm', 'salesforce', 'hilfr', 'adobe', 'npm', 'kickstarter', 'compass_group', 'doordash', 'mit', 'iFood', 'mozilla', 'yahoo', 'shutterstock', 'whole_foods', 'rappi', 'ele.me', 'github', 'loop transportation', 'foxconn', 'foodera', 'wikipedia', 'samsung', 'carnegie mellon university', 'lionbridge', 'wework', 'tencent', 'bolt', 'microsoft', 'pinterest', 'instacart', 'huawei', 'meituan', 'walmart', 'glitch', 'okcupid', 'glovo', 'korea_advanced_institute_of_science_and_technology', 'ebay', 'sindelantal', 'alibaba', 'amazon', 'deliveroo', 'postmates', 'internet_archive', 'uber', 'loggi', 'lg', 'lyft', 'broadcom', 'youtube', 'intel', 'universal protection service', 'slack', 'square', 'target', 'h3c', 'instagram', 'little_cab', 'didi_food', 'baidu', 'lanetix', 'shipt', 'reddit', 'facebook', 'palantir', 'daemo', 'mechanical_turk', 'b

In [13]:
article = article.lower()

mentioned = []
for c in companies:
    if c in article:
        mentioned.append(c)

print(mentioned)

['ibm', 'mit', 'loggi', 'target']


- this has possiblity of false positive. ex: target is mentioned but not with reference to the company

### Keyword Extraction

In [14]:
import re

article = article.lower()
article = re.sub("\n", "", article)
article = re.sub("(\\d|\\W)+", " ", article)
article = re.sub("[^a-zA-Z]", " ", article)
article = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", article)

article



In [22]:
import nltk
nltk.download("brown")

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import brown

cv=CountVectorizer(max_df=0.8,stop_words=stopwords, max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(" ".join(brown.words()))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\tygar\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


ValueError: Iterable over raw text documents expected, string object received.

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# get feature names
feature_names=cv.get_feature_names()
 
# fetch document for which keywords needs to be extracted
doc = article
 
#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

NameError: name 'X' is not defined

In [None]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results
#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())
#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,5)
 
# now print the results
print("\nAbstract:")
print(doc)
print("\nKeywords:")
for k in keywords:
    print(k,keywords[k])

### Text Summarizer

Source Code/Reference: https://glowingpython.blogspot.com/2014/09/text-summarization-with-nltk.html

In [16]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.05, max_cut=0.9):
        """
         Initilize the text summarizer.
         Words that have a frequency term lower than min_cut 
         or higer than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut 
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """ 
          Compute the frequency of each of word.
          Input: 
           word_sent, a list of sentences already tokenized.
          Output: 
           freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # frequencies normalization and fitering
        m = float(max(freq.values()))
        keys = freq.copy().keys()
        for w in keys:
            freq[w] = freq[w]/m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
          Return a list of n sentences 
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)    
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tygar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tygar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Add URLs of articles to summarize to `to_summarize`
to_summarize = ["http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html",
               "https://www.vice.com/en_us/article/m7jpgy/open-source-community-changing-github-avatars-drop-ice",
               "https://nypost.com/2020/07/06/protesters-boycott-whole-foods-over-black-lives-matter-mask-policy/"]

fs = FrequencySummarizer()
for article_url in to_summarize:
    
    page = requests.get(article_url)
    soup = BeautifulSoup(page.text, "html.parser")
    
    if soup.find("img"):
        img_tag = soup.img
        img_tag.decompose()
    paragraphs = soup.body.find_all("p")
    texts = []
    for para in paragraphs:
        texts.extend(para.find_all(text = True))
    texts = set(texts)
    article = " ".join(texts)
    
    title = soup.title.text
    
    print('----------------------------------')
    print(title + " (" + article_url + ")")
    summary = fs.summarize(article, 3)
    if summary:
        for s in summary:
            print('*',s)
    else:
        print("* No summary available.")

----------------------------------
The Use of Computers to Support Oppression (http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html)
* Not only did IBM contribute tremendous computing power to the South African apartheid system, but of the fifteen hundred South African workers employed by IBM in 1982, less than 20% were classified as coloured, black or Asian.
* One South African described the population register at work as,
 Where to go from here:

 

The main purpose of the population registry was administration of the influx control system, a system which channeled needed black workers into the labor force to be exploited, and confined others to the desolate homelands.
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Department.
----------------------------------
The Open Source Community Is Calling on Github to ‘Drop ICE' (https://www.vice.com/en_us/article/m

## Create Sources & Text DataFrame

In [18]:
df["sources"] = df["sources"] = df["sources"].astype(str).str.strip("[]\'")

In [19]:
sources = []
for i in df["sources"].unique():
    sources.extend(i.split(", "))

sources = list(set(sources))

In [20]:
sources_df = pd.DataFrame({"Source": sources})
sources_df.head()

Unnamed: 0,Source
0,'https://fortune.com/2019/09/16/global-climate...
1,http://www.theinvestor.co.kr/view.php?ud=20180...
2,'https://twitter.com/IfeomaOzoma/status/127254...
3,'https://www.vice.com/en_us/article/g5ppqq/ube...
4,https://medium.com/@fbcontentmods/this-is-a-me...


In [21]:
from tqdm import tqdm

texts = []
for url in tqdm(sources_df["Source"].to_numpy()):
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, "html.parser")
    except ConnectionError or ProtocolError or OSError:
        soup = "None"
        
    texts.append(soup)
    
print(len(sources), len(texts))

  0%|                                                                                          | 0/521 [00:00<?, ?it/s]


InvalidSchema: No connection adapters were found for ''https://fortune.com/2019/09/16/global-climate-strike-protest-google-amazon-microsoft-walkout'

In [16]:
# Problem URL
sources[175]

# Figure out how to define a time-out/connection fail situation (Text will be "Null")

'http://wiki.wearedynamo.org/index.php/guidelines_for_academic_requesters'

In [40]:
sources_df["Text"] = texts

ConnectionError: ('Connection aborted.', OSError("(10060, 'WSAETIMEDOUT')"))

In [None]:
sources_df.head()