In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Path to the scraped-articles JSON file, given relative to the notebook's
# working directory; it is opened and parsed in the next cell.
fname = '../saved_files/scraped_articles.json'

In [3]:
# Parse the scraped articles into a nested dict `d`.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's locale default; the article text contains
# non-ASCII characters such as curly apostrophes.
with open(fname, encoding='utf-8') as json_file:
    d = json.load(json_file)

Below are the first five of the sources that we specified in **NewsPapers.json** to download articles from.

In [4]:
# Show the first five source names, numbered from 0.
preview = list(d['newspapers'])[:5]
for position in range(len(preview)):
    print(position, preview[position])

0 newyorktimes_business
1 newyorktimes_science
2 newyorktimes_technology
3 cnnmoney
4 marketwatch_newsletters


In [5]:
# Create a dataframe with all of the articles, one row per article.
#
# One frame is built per source and everything is concatenated in a single
# call. Concatenating inside the loop re-copies all accumulated rows on every
# iteration, and bootstrapping `df` on the first iteration leaves it undefined
# when there are no sources at all.
site_frames = [
    pd.DataFrame(list(site_data['articles']))
    for site_data in d['newspapers'].values()
]

# pd.concat raises on an empty list, so fall back to an empty frame.
if site_frames:
    df = pd.concat(site_frames, ignore_index=True)
else:
    df = pd.DataFrame()

In [6]:
# Sanity check: (number of articles, number of columns) in the combined frame
df.shape

(631, 5)

In [7]:
# Peek at the first five rows of the combined frame
df.head()

Unnamed: 0,author,link,published,text,title
0,"[David Streitfeld, Natasha Singer, Steven Erla...",https://www.nytimes.com/2018/03/24/technology/...,2018-03-24T20:43:21,"There are other avenues still, said Jascha Kay...",How Calls for Privacy May Upend Business for F...
1,[Emily Flitter],https://www.nytimes.com/2018/03/24/business/we...,2018-03-24T17:16:53,"And so Mr. Tran, who is in his mid-50s and spe...",The Former Khmer Rouge Slave Who Blew the Whis...
2,[Keith Bradsher],https://www.nytimes.com/2018/03/25/business/ch...,2018-03-25T15:21:56,"In that analogy, Mr. Guo will essentially beco...","China Splits Top Jobs at Central Bank, Adding ..."
3,[Ben Casselman],https://www.nytimes.com/2018/03/24/business/jo...,2018-03-25T03:00:18,News of Mr. Williams’s likely selection was fi...,A Top Candidate for New York Fed’s Leader: San...
4,[Jim Tankersley],https://www.nytimes.com/2018/03/24/business/tr...,2018-03-24T14:34:10,What is increasingly clear to many on Capitol ...,"Trump’s Tariffs Keep Allies, Markets and Indus..."


In [8]:
# Checkpoint the raw (uncleaned) article frame.
# A project-relative path is used, matching the '../saved_files/' directory
# the input JSON is read from, instead of a machine-specific absolute path
# that only exists on one user's computer.
df.to_pickle('../saved_files/dirty_df.pkl')

In [9]:
# Spam sources: the final 11 names in the newspapers mapping
spam = [*d['newspapers']][-11:]

In [10]:
# Create a column for the label of the article: 0 is ham, 1 is spam.
# Every row starts out as ham (0); rows from spam sources are set to 1 in the
# following cell.
df['label'] = 0

In [11]:
# Mark as spam (label = 1) every article whose link contains a spam site name.
# regex=False matches the name literally (otherwise characters such as '.'
# would act as regex wildcards), and na=False treats a missing link as
# "no match" instead of yielding NaN, which cannot be used as a boolean mask.
for site in spam:
    df.loc[df['link'].str.contains(site, regex=False, na=False), 'label'] = 1

In [12]:
# Number of titles per label (0 = ham, 1 = spam).
# At the time of this run: 463 non-spam articles and 168 spam articles.
df.groupby('label')[['title']].count()

Unnamed: 0_level_0,title
label,Unnamed: 1_level_1
0,463
1,168


Prepare the article texts and titles for modeling by cleaning them. The cleaning process consists of the following steps:
- remove stopwords 
- convert to lowercase 
- remove punctuation 
- remove numbers 

In [13]:
# Run the cleaning helper script with %run; the cells below call
# clean_column and filtration, which their comments describe as functions
# imported from this Python module.
%run ../modules/cleaning_helper1.py 

In [14]:
# Do the cleaning on a separate copy (df1) of the labelled frame
df1 = df.copy()

In [15]:
# clean_column is a function imported from the Python helper module.
# Called with source column 'title' and target column 'clean_title';
# presumably it applies the cleaning steps listed above — confirm in
# cleaning_helper1.py.
df1 = clean_column(df1, 'title', 'clean_title')

In [16]:
# Same helper for the article body: source column 'text', target column
# 'clean_text'.
df1 = clean_column(df1, 'text', 'clean_text')

In [17]:
# filtration is a function imported from the Python helper module.
# NOTE(review): the return value is not captured, so this presumably modifies
# df1['clean_title'] in place — confirm in cleaning_helper1.py.
filtration(df1, 'clean_title')

In [18]:
# Same helper applied to the 'clean_text' column.
# NOTE(review): return value is not captured; presumably works in place — confirm.
filtration(df1, 'clean_text')

In [19]:
# Inspect the first five rows, now including label, clean_title and clean_text
df1.head()

Unnamed: 0,author,link,published,text,title,label,clean_title,clean_text
0,"[David Streitfeld, Natasha Singer, Steven Erla...",https://www.nytimes.com/2018/03/24/technology/...,2018-03-24T20:43:21,"There are other avenues still, said Jascha Kay...",How Calls for Privacy May Upend Business for F...,0,calls privacy may upend business facebook google,avenues still said jascha kaykas wolff chief m...
1,[Emily Flitter],https://www.nytimes.com/2018/03/24/business/we...,2018-03-24T17:16:53,"And so Mr. Tran, who is in his mid-50s and spe...",The Former Khmer Rouge Slave Who Blew the Whis...,0,former khmer rouge slave blew whistle wells fargo,mr tran mid 50s speaks english heavy vietnames...
2,[Keith Bradsher],https://www.nytimes.com/2018/03/25/business/ch...,2018-03-25T15:21:56,"In that analogy, Mr. Guo will essentially beco...","China Splits Top Jobs at Central Bank, Adding ...",0,china splits top jobs central bank adding anot...,analogy mr guo essentially become chairman chi...
3,[Ben Casselman],https://www.nytimes.com/2018/03/24/business/jo...,2018-03-25T03:00:18,News of Mr. Williams’s likely selection was fi...,A Top Candidate for New York Fed’s Leader: San...,0,top candidate new york feds leader san franciscos,news mr williamss likely selection first repor...
4,[Jim Tankersley],https://www.nytimes.com/2018/03/24/business/tr...,2018-03-24T14:34:10,What is increasingly clear to many on Capitol ...,"Trump’s Tariffs Keep Allies, Markets and Indus...",0,trumps tariffs keep allies markets industry gu...,increasingly clear many capitol hill business ...


In [20]:
# Save the labelled and cleaned frame as a pickle in the saved_files directory
df1.to_pickle('../saved_files/cleaned_df.pkl')