In [3]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Identifying Keywords from News Articles

Idea:
- Webscrape text from article links
- Use TFIDF/NER to idenitfy keywords from the text for tagging purposes
- Use Keywords to create a summary


## Import Data

In [3]:
date_range = pd.date_range('01-01-2006', datetime.today().date())
df = pd.read_csv("../actions.csv")
df["date"] = pd.to_datetime(df["date"])
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,sources,actions,struggles,employment_types,description,online,locations,companies,workers,tags,author,latlngs,addresses
0,0,1,1979-09-01,['https://twitter.com/clancynewyork/status/117...,['protest'],['ethics'],['white_collar_workers'],"IBM workers formed an organization, IBM: Speak...",,['worldwide'],['ibm'],,['international_solidarity'],['nataliyaned'],,
1,1,2,1986-09-16,['https://www.nytimes.com/1986/09/16/science/s...,['open_letter'],['ethics'],['white_collar_workers'],Fourteen employees at AT&T sign a letter warni...,,['usa'],['at&t'],14.0,,['nataliyaned'],"[(45.5733162, -122.5587111463347)]","['10025, Northeast Cascades Parkway, Portland,..."
2,2,3,1986-09-16,['https://www.nytimes.com/1986/09/16/science/s...,['open_letter'],['ethics'],['white_collar_workers'],"A group of computer professionals, led by Comp...",,['usa'],,30.0,,['nataliyaned'],"[(39.7837304, -100.4458825)]",['United States']
3,3,4,1994-01-24,['https://www.nytimes.com/1994/06/12/magazine/...,['open_letter'],['ethics'],['white_collar_workers'],Computer Professionals for Social Responsibili...,True,,,50000.0,,['nataliyaned'],,
4,4,5,1998-11-23,['http://www.cnn.com/tech/computing/9811/23/ms...,['legal_action'],"['unfair_labor_practices', 'pay_and_benefits']",['contract_workers'],A law firm representing 10 current and former ...,,['usa'],['microsoft'],10.0,,['organizejs'],"[(35.139622349999996, -80.92306993327955)]","['Microsoft, Yorkwood, Charlotte, Mecklenburg ..."


## Links to Explore

In [4]:
df["sources"]

0      ['https://twitter.com/clancynewyork/status/117...
1      ['https://www.nytimes.com/1986/09/16/science/s...
2      ['https://www.nytimes.com/1986/09/16/science/s...
3      ['https://www.nytimes.com/1994/06/12/magazine/...
4      ['http://www.cnn.com/tech/computing/9811/23/ms...
                             ...                        
279    ['https://medium.com/@catalina.brennan.gatica/...
280    ['https://www.theverge.com/2020/6/22/21299736/...
281    ['https://medium.com/@CoalitionForCriticalTech...
282    ['https://www.npr.org/2020/06/23/881624553/pin...
283    ['https://www.wsws.org/en/articles/2020/07/04/...
Name: sources, Length: 284, dtype: object

In [5]:
df["sources"][0]

"['https://twitter.com/clancynewyork/status/1175872040814993408?s=19, http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html']"

## Webscrape for source text

In [6]:
url = "http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html"
response = requests.get(url)

In [7]:
soup = BeautifulSoup(response.text, "html.parser")

In [8]:
print(soup)

<html>
<title>The Use of Computers to Support Oppression</title>
<body alink="#FFFF00" background="pictures/olive_paper.gif" link="#F2F295" text="#FFFFFF" vlink="#C9B720">
<center>
<h1><b>The Use of Computers to Support Oppression</b></h1><p>
</p></center>

Computer technology enabled the government to organize and enforce such an atrocious system of segregation and control.

<blockquote>
More than any other single technological advancement, the computer fostered the concentration of administrative power in the hands of Africa's white elite. <a href="apartheid.bib.html"><b>[NAR82]</b></a>
</blockquote>

Despite the U.N. arms embargoes, American computers were in widespread use throughout South Africa.  The United States was the largest supplier of computers used in South Africa. American computers were in use in virtually every governmental agency, the police system, and the military, all of which contributed to the control system known as apartheid.  Computer technology did not merely

## Analyze Sample Source Text

In [9]:
soup.title

<title>The Use of Computers to Support Oppression</title>

In [10]:
img_tag = soup.img
img_tag.decompose()

soup.body

<body alink="#FFFF00" background="pictures/olive_paper.gif" link="#F2F295" text="#FFFFFF" vlink="#C9B720">
<center>
<h1><b>The Use of Computers to Support Oppression</b></h1><p>
</p></center>

Computer technology enabled the government to organize and enforce such an atrocious system of segregation and control.

<blockquote>
More than any other single technological advancement, the computer fostered the concentration of administrative power in the hands of Africa's white elite. <a href="apartheid.bib.html"><b>[NAR82]</b></a>
</blockquote>

Despite the U.N. arms embargoes, American computers were in widespread use throughout South Africa.  The United States was the largest supplier of computers used in South Africa. American computers were in use in virtually every governmental agency, the police system, and the military, all of which contributed to the control system known as apartheid.  Computer technology did not merely support the system of oppression, rather the entire country was 

# Text Summarizer

Source Code/Reference: https://glowingpython.blogspot.com/2014/09/text-summarization-with-nltk.html

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

class FrequencySummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
         Initilize the text summarizer.
         Words that have a frequency term lower than min_cut 
         or higer than max_cut will be ignored.
        """
        self._min_cut = min_cut
        self._max_cut = max_cut 
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _compute_frequencies(self, word_sent):
        """ 
          Compute the frequency of each of word.
          Input: 
           word_sent, a list of sentences already tokenized.
          Output: 
           freq, a dictionary where freq[w] is the frequency of w.
        """
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        # frequencies normalization and fitering
        m = float(max(freq.values()))
        keys = freq.copy().keys()
        for w in keys:
            freq[w] = freq[w]/m
            if freq[w] >= self._max_cut or freq[w] <= self._min_cut:
                del freq[w]
        return freq

    def summarize(self, text, n):
        """
          Return a list of n sentences 
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)    
        return [sents[j] for j in sents_idx]

    def _rank(self, ranking, n):
        """ return the first n sentences with highest ranking """
        return nlargest(n, ranking, key=ranking.get)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tygar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tygar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# feed_xml = requests.get('http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html')
# feed = BeautifulSoup(feed_xml.text, "html.parser")
# to_summarize = list(map(lambda p: p.text, feed.find_all('guid')))

to_summarize = ["http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html"]

fs = FrequencySummarizer()
for article_url in to_summarize:
    
    print(article_url)
    
    page = requests.get(article_url)
    soup = BeautifulSoup(page.text, "html.parser")
    text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
    title = soup.title.text
    
    print('----------------------------------')
    print(title)
    for s in fs.summarize(text, 20):
        print('*',s)

http://www-cs-students.stanford.edu/~cale/cs201/apartheid.comp.html
----------------------------------
The Use of Computers to Support Oppression
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Department.
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Department.
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Department.
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Department.
* Computer equipment was also used in the Department of the Prime Minister, the South African Reserve Bank, South Africa's electrical utility, and the Treasury Depart

In [17]:
list(to_summarize)

[]

## Create Sources & Text DataFrame

In [11]:
df["sources"] = df["sources"] = df["sources"].astype(str).str.strip("[]\'")

In [12]:
sources = []
for i in df["sources"].unique():
    sources.extend(i.split(", "))

sources = list(set(sources))

In [13]:
sources_df = pd.DataFrame({"Source": sources})
sources_df.head()

Unnamed: 0,Source
0,https://www.nytimes.com/2020/04/15/business/am...
1,https://www.sfgate.com/business/article/etown-...
2,https://www.theguardian.com/australia-news/201...
3,https://www.cnet.com/news/uber-lyft-drivers-de...
4,https://www.questia.com/newspaper/1p2-32575557...


In [16]:
from tqdm import tqdm

texts = []
for url in tqdm(sources_df["Source"].to_numpy()):
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text, "html.parser")
    except ConnectionError or ProtocolError or OSError:
        soup = "None"
        
    texts.append(soup)
    
print(len(sources), len(texts))



  0%|                                                                                                                                                                                                                                                 | 0/485 [00:00<?, ?it/s][A[A

  0%|▍                                                                                                                                                                                                                                        | 1/485 [00:00<02:26,  3.31it/s][A[A

  0%|▉                                                                                                                                                                                                                                        | 2/485 [00:00<02:46,  2.90it/s][A[A

  1%|█▍                                                                                                                                                             

 12%|███████████████████████████▋                                                                                                                                                                                                            | 58/485 [01:20<05:51,  1.21it/s][A[A

 12%|████████████████████████████▏                                                                                                                                                                                                           | 59/485 [01:21<06:46,  1.05it/s][A[A

 12%|████████████████████████████▋                                                                                                                                                                                                           | 60/485 [01:22<06:28,  1.09it/s][A[A

ConnectionError: ('Connection aborted.', OSError("(10060, 'WSAETIMEDOUT')"))

In [16]:
# Problem URL
sources[175]

# Figure out how to define a time-out/connection fail situation (Text will be "Null")

'http://wiki.wearedynamo.org/index.php/guidelines_for_academic_requesters'

In [40]:
sources_df["Text"] = texts

ConnectionError: ('Connection aborted.', OSError("(10060, 'WSAETIMEDOUT')"))

In [None]:
sources_df.head()