In [12]:
import requests
import re
import scrapy
import pandas as pd
import pymongo
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

## Scraping

Various URLs from Earther with different formatting - test cases for the Earther scraper.


In [282]:
url = 'http://earther.gizmodo.com/exxonmobil-wishes-australia-a-happy-new-year-as-country-1840758432'
url = 'https://earther.gizmodo.com/theres-several-orders-of-magnitude-more-plastic-in-rive-1840538231'
url = 'https://deadspin.com/how-did-no-one-notice-this-inspirational-hiker-on-the-p-1818647235'
url = 'https://jalopnik.com/its-time-to-let-go-of-commuter-culture-1840630621'
url = 'https://earther.gizmodo.com/climate-change-stole-the-show-at-the-golden-globes-1840830510'
response = requests.get(url)
scrap = scrapy.http.HtmlResponse(url=url, 
                                 request=scrapy.http.Request(url), body=response.text, encoding='utf-8')

The Earther webcrawler added 3015 articles to the MongoDB (articles collection in items).

In [284]:
url = 'https://www.dailykos.com/news/Climate'
response = requests.get(url)
scrap = scrapy.http.HtmlResponse(url=url, 
                                 request=scrapy.http.Request(url), body=response.text, encoding='utf-8')

In [300]:
scrap.css(".nav-wrapper a::attr(href)").get()

'/part/story/table/by_tag?tag_id=35567&page=1'

## Load data from Mongo

In [5]:
myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
mydb = myclient['items']

In [6]:
mycollection = mydb['articles']

In [7]:
df = pd.DataFrame(list(mycollection.find({})))
df.created_at = pd.to_datetime(df.created_at, infer_datetime_format=True)
df.head()

Unnamed: 0,_id,url,title,twitter_url,image,keywords,description,num_like,num_reply,author,author_link,created_at,body_text,body_links
0,5e14bee3f106fd495904e5c5,https://earther.gizmodo.com/the-worlds-largest...,The World's Largest Floating Wind Farm Is Here,https://earther.gizmodo.com/the-worlds-largest...,https://i.kinja-img.com/gawker-media/image/upl...,"[Portugal, Earther, Wind Energy, Wind for the ...","This is the second day of the new decade, and ...",13000,34,Yessenia Funes,https://kinja.com/yessfun,2020-01-02 17:00:00-05:00,"This is the second day of the new decade, and ...",[http://www.principlepowerinc.com/en/news-pres...
1,5e14bee3f106fd495904e5c6,https://earther.gizmodo.com/trumps-tremendous-...,Trump's 'Tremendous' Superfund Work Includes N...,https://earther.gizmodo.com/trumps-tremendous-...,https://i.kinja-img.com/gawker-media/image/upl...,"[EPA, Donald Trump, Earther, Toxic Legacy, Env...",The Environmental Protection Agency under Dona...,9200,16,Yessenia Funes,https://kinja.com/yessfun,2020-01-03 15:30:00-05:00,The Environmental Protection Agency under Dona...,[https://earther.gizmodo.com/the-epas-superfun...
2,5e14bee3f106fd495904e5c7,https://lifehacker.com/how-to-help-those-affec...,How to Help Those Affected By the Australian W...,https://lifehacker.com/how-to-help-those-affec...,https://i.kinja-img.com/gawker-media/image/upl...,"[Lifehacker, AUSTRALIA, call for help, Fire]","As of Thursday morning, ongoing bushfires have...",15100,12,Josh Ocampo,https://kinja.com/joshocampo,2020-01-02 14:45:00-05:00,"As of Thursday morning, ongoing bushfires have...",[https://www.theguardian.com/australia-news/li...
3,5e14bee3f106fd495904e5c8,https://earther.gizmodo.com/smoke-from-austral...,"Smoke From Australia’s Horrific Wildfires, as ...",https://earther.gizmodo.com/smoke-from-austral...,https://i.kinja-img.com/gawker-media/image/upl...,"[fires, global warming, Satellite Images, heat...",Significant portions of Australia’s southeast ...,38700,15,George Dvorsky,https://kinja.com/georgedvorsky,2020-01-03 10:10:00-05:00,Significant portions of Australia’s southeast ...,[https://earthobservatory.nasa.gov/images/1460...
4,5e14bee3f106fd495904e5c9,https://earther.gizmodo.com/earthquake-shows-h...,Earthquake Shows Puerto Rico's Electric Grid I...,https://earther.gizmodo.com/earthquake-shows-h...,https://i.kinja-img.com/gawker-media/image/upl...,"[Puerto Rico, Hurricane Maria, Earther, Energy...",Puerto Rico just can’t seem to catch a break. ...,8100,6,Yessenia Funes,https://kinja.com/yessfun,2020-01-06 15:55:00-05:00,Puerto Rico just can’t seem to catch a break. ...,[http://ds.iris.edu/seismon/quakes_today_in_N_...


In [8]:
keywords = df.keywords.sum()

In [366]:
keyword_count = pd.Series(keywords).str.lower().str.replace(" ","").value_counts()
keyword_count[keyword_count > 2]

earther                    2536
climatechange               668
science                     527
gizmodo                     294
conservation                268
                           ... 
lymedisease                   3
nuclearwaste                  3
fiatchrysler                  3
hurricanemariacontinues       3
explainers                    3
Length: 856, dtype: int64

In [340]:
pd.set_option('display.max_rows', 100)
pd.Series(keywords).str.lower().str.replace(" ","").value_counts().head(100)

earther                              2536
climatechange                         668
science                               527
gizmodo                               294
conservation                          268
environmentaljustice                  237
weather                               173
climate                               160
wildfires                             128
california                            117
epa                                   105
environment                           103
wildlife                              103
trumpadministration                    96
weatherishappening                     77
ecology                                77
antarctica                             76
naturaldisasters                       75
airpollution                           73
globalwarming                          72
animals                                72
hurricanemaria                         71
puertorico                             69
pollution                         

In [341]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist


In [355]:
stopwords_list = stopwords.words('english') + list(string.punctuation) + ["''", '""', '...', '``','’','“','”']

def process_article(article):
    tokens = nltk.word_tokenize(article)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed    

In [356]:
df['tokens'] = df.body_text.map(process_article)

In [348]:
df.head()

Unnamed: 0,_id,url,title,twitter_url,image,keywords,description,num_like,num_reply,author,author_link,created_at,body_text,body_links,tokens
0,5e14bee3f106fd495904e5c5,https://earther.gizmodo.com/the-worlds-largest...,The World's Largest Floating Wind Farm Is Here,https://earther.gizmodo.com/the-worlds-largest...,https://i.kinja-img.com/gawker-media/image/upl...,"[Portugal, Earther, Wind Energy, Wind for the ...","This is the second day of the new decade, and ...",13000,34,Yessenia Funes,https://kinja.com/yessfun,2020-01-02T17:00:00-05:00,"This is the second day of the new decade, and ...",[http://www.principlepowerinc.com/en/news-pres...,"[second, day, new, decade, world, largest, flo..."
1,5e14bee3f106fd495904e5c6,https://earther.gizmodo.com/trumps-tremendous-...,Trump's 'Tremendous' Superfund Work Includes N...,https://earther.gizmodo.com/trumps-tremendous-...,https://i.kinja-img.com/gawker-media/image/upl...,"[EPA, Donald Trump, Earther, Toxic Legacy, Env...",The Environmental Protection Agency under Dona...,9200,16,Yessenia Funes,https://kinja.com/yessfun,2020-01-03T15:30:00-05:00,The Environmental Protection Agency under Dona...,[https://earther.gizmodo.com/the-epas-superfun...,"[environmental, protection, agency, donald, tr..."
2,5e14bee3f106fd495904e5c7,https://lifehacker.com/how-to-help-those-affec...,How to Help Those Affected By the Australian W...,https://lifehacker.com/how-to-help-those-affec...,https://i.kinja-img.com/gawker-media/image/upl...,"[Lifehacker, AUSTRALIA, call for help, Fire]","As of Thursday morning, ongoing bushfires have...",15100,12,Josh Ocampo,https://kinja.com/joshocampo,2020-01-02T14:45:00-05:00,"As of Thursday morning, ongoing bushfires have...",[https://www.theguardian.com/australia-news/li...,"[thursday, morning, ongoing, bushfires, contin..."
3,5e14bee3f106fd495904e5c8,https://earther.gizmodo.com/smoke-from-austral...,"Smoke From Australia’s Horrific Wildfires, as ...",https://earther.gizmodo.com/smoke-from-austral...,https://i.kinja-img.com/gawker-media/image/upl...,"[fires, global warming, Satellite Images, heat...",Significant portions of Australia’s southeast ...,38700,15,George Dvorsky,https://kinja.com/georgedvorsky,2020-01-03T10:10:00-05:00,Significant portions of Australia’s southeast ...,[https://earthobservatory.nasa.gov/images/1460...,"[significant, portions, australia, southeast, ..."
4,5e14bee3f106fd495904e5c9,https://earther.gizmodo.com/earthquake-shows-h...,Earthquake Shows Puerto Rico's Electric Grid I...,https://earther.gizmodo.com/earthquake-shows-h...,https://i.kinja-img.com/gawker-media/image/upl...,"[Puerto Rico, Hurricane Maria, Earther, Energy...",Puerto Rico just can’t seem to catch a break. ...,8100,6,Yessenia Funes,https://kinja.com/yessfun,2020-01-06T15:55:00-05:00,Puerto Rico just can’t seem to catch a break. ...,[http://ds.iris.edu/seismon/quakes_today_in_N_...,"[puerto, rico, seem, catch, break, people, isl..."


In [357]:
total_vocab = set()
for article in df.tokens:
    total_vocab.update(article)
len(total_vocab)

63789

In [358]:
articles_concat = []
for article in df.tokens:
    articles_concat += article

In [359]:
articles_freqdist = FreqDist(articles_concat)

In [376]:
articles_freqdist.most_common(50)

[('climate', 8477),
 ('said', 5923),
 ('new', 5678),
 ('also', 5127),
 ('change', 4960),
 ('like', 4806),
 ('one', 4644),
 ('people', 4628),
 ('could', 4502),
 ('would', 3840),
 ('water', 3700),
 ('year', 3541),
 ('told', 3468),
 ('world', 3309),
 ('earther', 3115),
 ('even', 2993),
 ('time', 2872),
 ('years', 2870),
 ('last', 2676),
 ('according', 2633),
 ('still', 2546),
 ('ice', 2472),
 ('state', 2434),
 ('much', 2361),
 ('study', 2351),
 ('around', 2307),
 ('environmental', 2281),
 ('percent', 2244),
 ('first', 2222),
 ('may', 2176),
 ('species', 2170),
 ('energy', 2164),
 ('many', 2146),
 ('get', 2134),
 ('u.s.', 2133),
 ('carbon', 2133),
 ('way', 2126),
 ('national', 2045),
 ('research', 1962),
 ('scientists', 1959),
 ('make', 1891),
 ('sea', 1868),
 ('air', 1835),
 ('well', 1804),
 ('back', 1791),
 ('since', 1778),
 ('two', 1769),
 ('see', 1737),
 ('emissions', 1725),
 ('need', 1686)]

In [383]:
df.keywords.map(lambda lst : ' '.join([k.replace(' ','-') for k in lst]))

0       Portugal Earther Wind-Energy Wind-for-the-Win ...
1       EPA Donald-Trump Earther Toxic-Legacy Environm...
2                 Lifehacker AUSTRALIA call-for-help Fire
3       fires global-warming Satellite-Images heat-wav...
4       Puerto-Rico Hurricane-Maria Earther Energy Ear...
                              ...                        
3010                                              Earther
3011    Puerto-Rico El-Yunque hurricanes science disas...
3012                     Lifehacker Pets Environment Dogs
3013    polar science Earther art sound-of-climate-sil...
3014    whales science Earther conservation marine-bio...
Name: keywords, Length: 3015, dtype: object

## K-Means Clustering on keywords

First, we need to make a sparse array of keywords (1 column per keyword, 1 row per article).

In [10]:
key_df = pd.DataFrame(df.keywords.copy())

In [11]:
key_df

Unnamed: 0,keywords
0,"[Portugal, Earther, Wind Energy, Wind for the ..."
1,"[EPA, Donald Trump, Earther, Toxic Legacy, Env..."
2,"[Lifehacker, AUSTRALIA, call for help, Fire]"
3,"[fires, global warming, Satellite Images, heat..."
4,"[Puerto Rico, Hurricane Maria, Earther, Energy..."
...,...
3010,[Earther]
3011,"[Puerto Rico, El Yunque, hurricanes, science, ..."
3012,"[Lifehacker, Pets, Environment, Dogs]"
3013,"[polar, science, Earther, art, sound of climat..."


In [13]:
vect = CountVectorizer()

key_df = pd.DataFrame(df.keywords.copy())

X = vect.fit_transform(key_df.pop('keywords').map(lambda lst : ' '.join([k.replace(' ','_') for k in lst])))

for i, col in enumerate(vect.get_feature_names()):
    key_df[col] = pd.Series(X[:, i].toarray().ravel())


In [14]:
key_df.loc[1481][key_df.loc[1481] > 0]

anniversary             1
best_of_earther         1
best_of_gizmodo         1
earther                 1
earther_is_best_blog    1
Name: 1481, dtype: int64

In [15]:
df.loc[1481].keywords

['Earther',
 'anniversary',
 'best of gizmodo',
 'earther is best blog',
 'best of earther']

NOTE: We may want to drop all "Earther", "Gizmodo", "Jalopnik", etc. keywords.

Next, we perform k-means clustering!

In [16]:
k_means = KMeans(n_clusters = 10)
k_means.fit(key_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [17]:
k_means.predict(key_df)

array([8, 8, 0, ..., 0, 6, 5], dtype=int32)

In [18]:
df['cluster'] = k_means.predict(key_df)

In [22]:
pd.Series(df[df['cluster'] == 1].keywords.sum()).value_counts().head(40)

Earther                         227
Environmental Justice           195
environmental justice            33
Climate Change                   23
Flint Water Crisis               22
Hurricane Maria                  22
Puerto Rico                      20
EPA                              19
Lead                             18
Air Pollution                    16
Trump Administration             13
Bayou Bridge Pipeline            10
Keystone XL                       8
Health                            8
Goddamn Pollution                 8
Indigenous Rights                 8
Oil and Gas                       8
Hurricane Harvey                  7
Pollution                         7
Louisiana                         7
Standing Rock                     7
climate change                    6
Bears Ears National Monument      6
Art                               6
Dakota Access Pipeline            6
Oil Pipelines                     5
Environment                       5
Natural Gas                 

In [23]:
pd.Series(df[df['cluster'] == 0].keywords.sum()).value_counts().head(40)

Gizmodo                      70
Jalopnik                     43
Lifehacker                   41
climate change               27
Jezebel                      26
Climate Change               17
The Root                     17
environment                  16
Environment                  13
The Slot                     12
Deadspin                     12
donald trump                  9
car policy                    8
News                          8
animals                       8
recycling                     8
recycle                       7
technology                    7
EPA                           7
Flint water crisis            7
Vitals                        6
puerto rico                   5
Puerto Rico                   5
automaton                     5
epa                           5
environmental racism          5
natural disasters             5
Flint Michigan                5
Donald Trump                  5
Hurricane Maria               5
Flint water contamination     5
Technolo

In [40]:
pd.Series(df[df['cluster'] == 1].keywords.sum()).value_counts().head(40)

Earther                         227
Environmental Justice           195
environmental justice            33
Climate Change                   23
Flint Water Crisis               22
Hurricane Maria                  22
Puerto Rico                      20
EPA                              19
Lead                             18
Air Pollution                    16
Trump Administration             13
Bayou Bridge Pipeline            10
Keystone XL                       8
Health                            8
Goddamn Pollution                 8
Indigenous Rights                 8
Oil and Gas                       8
Hurricane Harvey                  7
Pollution                         7
Louisiana                         7
Standing Rock                     7
climate change                    6
Bears Ears National Monument      6
Art                               6
Dakota Access Pipeline            6
Oil Pipelines                     5
Environment                       5
Natural Gas                 