# Python Process Book for CS109 Project


In [None]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Widen pandas output (500 characters, up to 100 columns) and keep the HTML table repr in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Default seaborn look for the figures: "whitegrid" style with the "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

## Overview and Motivation:

This project looks at trends in health topics over time.  The members of this project team are all public health students.  We were interested in how topics in health in a popular newspaper, The New York Times, have changed over time.  We decided to gain a deeper understanding of the Latent Dirichlet Allocation (LDA) method and use this topic modeling method to find the major topics in health over the past 5 decades (the period of time we were able to collect data for).

## Initial Questions:

 Which topics are persistent over time?  
 Which topics have a spike, when do they occur, and why did it happen?     

## Data:

Data was pulled from the New York Times article API.  Using the API console (http://developer.nytimes.com/io-docs), we were able to inspect the type of results for a given query.  Originally, we decided to look at results using the 'fq=newsdesk:Health' option which would pull results under the Health topic section of the Times (approximately 680,000 documents).  However, after looking at the first 1000 results, we found that the documents pulled mainly consisted of videos, slideshows, and interactive features instead of articles.  We remedied this issue by instead using a query for the keyword 'Health' which searched all articles and their headlines for the word health.  Looking at the results from the API console showed that overall, using 'Health' as our query term instead of newsdesk:Health produced approximately 40,000 more documents (720,000 total).

The Times API has several limiting factors when pulling data: 10,000 calls per day and a maximum of 100 pages per query.  To handle these limitations, our code pulled data by year and split each month into 3 parts (based on testing date ranges for number of pages which would be pulled so the number of pages would be less than 100).  Year and count were entered manually to keep track of how many calls were being made and to break up the data calls for when errors occurred (such as timeouts, key errors, and date errors - when these errors were encountered the solution was appended to the code which resulted in the final version below).

In this code, we requested the json dictionaries, used the relevant information to create a dataframe with the date, id, document type (article, blog, video), newsdesk section and subsection, and text from the title, abstract, and first paragraph.  The data was saved in a csv file for that section and tracked using an excel file - DateTracker.

In [None]:
%%time
count = 1460 #enter new count starting number
year =  #enter year
months = ['01', '01', '01', '02', '02', '02', '03', '03', '03', '04', '04', '04', '05', '05', '05', '06', '06', '06', '07', '07', '07', '08', '08', '08', '09', '09', '09', '10', '10', '10', '11', '11', '11', '12', '12', '12']
startdays = ['01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21', '01', '11', '21']
enddays = ['10', '20', '31', '10', '20', '27', '10', '20', '31', '10', '20', '30', '10', '20', '31', '10', '20', '30', '10', '20', '31', '10', '20', '31', '10', '20', '30', '10', '20', '31', '10', '20', '30', '10', '20', '31']

pcount = 0

for d in range(36):
    sdate = year + months[d] + startdays[d]
    edate = year + months[d] + enddays[d]

    docs=[]
    #get 1st page and number of documents
    url1 = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=Health&page=1&begin_date={}&end_date={}&api-key=5cdab36b05348a4da2e74046dfb16a03:17:73541790".format(sdate, edate) 
    lpage = requests.get(url1).json()['response']['meta']['hits']
    
    #calculate number of pages for call; if there are no hits skip to end
    if lpage is not 0:
        numpages = int(lpage/10 + 2) 
        pcount += numpages
    
        #get json files for first page
        pagedoc1 = requests.get(url1).json()['response']['docs']
        for j in range(0,len(pagedoc1)):
            docs.append(pagedoc1[j])    
    
        #get json dictionaries for rest of the pages
        for i in range(2, numpages): 
            url = "http://api.nytimes.com/svc/search/v2/articlesearch.json?q=Health&page={}&begin_date={}&end_date={}&api-key=5cdab36b05348a4da2e74046dfb16a03:17:73541790".format(i, sdate, edate)
            pagedocs = requests.get(url).json()['response']['docs']
            time.sleep(1)
        
            for j in range(0,len(pagedocs)):
                docs.append(pagedocs[j])

    #pull information from json file into dictionary        
        docsinfo = []
        for d in docs:
            obs = {}
            obs['id'] = d['_id']
            obs['type'] = d['type_of_material']
            obs['doctype'] = d['document_type']
            obs['date'] = d['pub_date']
            obs['news_desk'] = d['news_desk']
            obs['section'] = d['section_name']
            obs['subsection'] = d['subsection_name']
            obs['abstract'] = d['abstract']
            obs['paragraph'] = d['lead_paragraph']
        
            if d['headline'].get('main') is not None:
                obs['headline'] = d['headline']['main']
            elif d['headline'].get('name') is not None:
                obs['headline'] = d['headline']['name']
            else:
                obs['headline'] = ' '
    
            if obs['date'] is not None:
                obs['date'] = obs['date'][0:10]
    
            #Take out abstracts and lead paragraphs with none to join text
            if obs['abstract'] is None:
                a = ' '
            else: 
                a = obs['abstract']
            if obs['paragraph'] == 'TK TK TK' or obs['paragraph'] is None:
                p = ' '
            else:
                p = obs['paragraph']
    
            text = [obs['headline'], p, a]
            obs['text'] = " ".join(text)
    
            docsinfo.append(obs)

    #create dataframe from dictionary, make date column date type, and store in csv
        docsdf = pd.DataFrame(docsinfo)
        docsdf['date'] = pd.to_datetime(docsdf['date'])
        docsdf.to_csv("data/docsdf-{}.csv".format(count), encoding = 'utf-8') 

    count += 1 
    

Next the files were concatenated into a single dataframe and saved to a csv file of our data which is located in our dropbox: https://www.dropbox.com/s/eedgwugamiw0zwd/total.csv?dl=0 

In [None]:
#1966-2015
#There were no documents for 08/11/1978 - 10/31/1978 (leading to gap in csv files)
# NOTE(review): files are read from the working directory, while the download
# cell writes under "data/" - confirm where the notebook is run from.
frames = []
# Inclusive (first, last) file numbers on either side of the 1978 gap
for first_id, last_id in [(1, 1337), (1346, 1783)]:
    for i in range(first_id, last_id + 1):
        dfs = pd.read_csv("docsdf-"+str(i)+".csv")
        frames.append(dfs)
# Stack every per-window frame and parse the date column
totaldf = pd.concat(frames)
totaldf['date'] = pd.to_datetime(totaldf['date'])

In [None]:
# Write the combined frame to total.csv; index=False leaves the row index out of the file.
totaldf.to_csv("total.csv", index=False)

We looked at the types of documents to see if there were any trends and found that the majority of the documents were articles.  Since we are mainly interested in articles, and two of the document types (multimedia and blogpost) have only appeared more recently, we decided to only include articles in our corpus.

In [None]:
# Bar chart of how many documents fall under each document type
# NOTE(review): type_counts is not defined in the cells shown here; it must
# already hold one count per label below, in the same order.
sns.set(style="white", context="talk")
doc_type_labels = ('Article', 'Blogpost', 'Column', 'Multimedia', 'Recipe')
ax = sns.barplot(x=doc_type_labels, y=type_counts)
ax.set(
    title="Document Type Frequencies",
    ylim=(0,400000),
    yticks=[100000,200000,300000,400000],
)
# Print each bar's height just above the bar
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.text(bar.get_x()+0.2, bar_top+10000, '%d'%bar_top, fontsize=14)
#ax.text(-0.2,380000, "369,890", fontsize=14)
#ax.text(0.82,40000, "31,369", fontsize=14)
sns.despine(bottom=True)

In [None]:
#Keep only the documents whose doctype is 'article'
is_article = totaldf['doctype'] == 'article'
df = totaldf[is_article]

## Exploratory Data Analysis:

We used spark to clean and analyze our data since these processes are easily parallelized.  Spark was implemented using homebrew on a Mac. 

In [None]:
import os
# Point PYSPARK_PYTHON at the Anaconda interpreter.
# NOTE(review): this path is specific to one machine; adjust it for your environment.
os.environ['PYSPARK_PYTHON'] = '/Applications/anaconda/bin/python'

In [None]:
import findspark
# Initialise findspark, then show the location it reports
findspark.init()
spark_home = findspark.find()
print(spark_home)

In [None]:
import pyspark
# Local-mode Spark configuration with 2g of executor memory
conf = pyspark.SparkConf()
conf.setMaster('local')
conf.setAppName('pyspark')
conf.set("spark.executor.memory", "2g")
sc = pyspark.SparkContext(conf=conf)

In [None]:
import sys
# RDD of the numbers 0-9 requested in 10 partitions; it is not referenced
# again in the cells shown here - presumably a smoke test of the SparkContext.
rdd = sc.parallelize(xrange(10),10)

In [None]:
from pyspark.sql import SQLContext
# SQL context built on top of the SparkContext created above
sqlsc=SQLContext(sc)

We began by cleaning our data in order to perform LDA.  We decided to use NLTK to process our text since it is the leading platform for natural language processing.  The text was tokenized, tagged for part of speech, made all lowercase, lemmatized, and the nouns were extracted.

*We considered issues with making words all lowercase due to some nouns such as AIDS and WHO but decided that not using .lower would potentially cause more problems.

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer #token
from nltk.corpus import stopwords #stopwords
from nltk.stem import WordNetLemmatizer #lemma

In [None]:
#The necessary NLTK packages need to be downloaded to implement some of the functions
#Called without arguments, so NLTK's downloader asks which packages to fetch
nltk.download()

In [None]:
#The method information needed to clean the data

wnl = WordNetLemmatizer()
stops = stopwords.words('english')
stops.append(u'health') #Include health in our stopwords since it will be in most if not all of our documents
# Characters treated as punctuation when cleaning tokens. The single quote is
# escaped explicitly: written as '' inside a single-quoted literal it merely
# ends one string and starts the next, which left it out of the list.
punctuation = list('.,;:!?()[]{}`\'"@#$^&*+-|=~_')

In [None]:
#Clean the text for each document to extract the nouns
#Cleaning includes lemmatization and removal of stopwords

def get_parts(thetext):
    """Return the cleaned nouns found in ``thetext``.

    The text is tokenized with ``TreebankWordTokenizer`` and tagged with
    ``nltk.pos_tag``. Tokens tagged NN, NNS, NNP or NNPS are lower-cased and
    lemmatized with the module-level ``wnl``; one trailing punctuation
    character is stripped. Words that are stopwords (``stops``), punctuation
    marks (``punctuation``), or shorter than two characters are dropped.

    Parameters
    ----------
    thetext : str
        Raw text of one document.

    Returns
    -------
    list
        The remaining nouns, in order of appearance (duplicates kept).
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    tokens = TreebankWordTokenizer().tokenize(thetext)
    nouns = []
    for w, tag in nltk.pos_tag(tokens):  # pos_tag yields (token, tag) tuples
        if tag not in noun_tags:
            continue
        word = wnl.lemmatize(w.lower())
        # Strip a single trailing punctuation mark, e.g. "flu." -> "flu"
        if word and word[-1] in punctuation:
            word = word[:-1]
        # Stripping can leave an empty or one-character remainder; drop those
        # together with stopwords and bare punctuation
        if len(word) <= 1 or word in stops or word in punctuation:
            continue
        nouns.append(word)
    return nouns

In [None]:
%%time

parseout = []
for index, row in df.iterrows() : 
    parseout.append(get_parts(row.text))

In [None]:
import itertools
# For each entry of parseout, concatenate the items of its elements.
# NOTE(review): if each entry is a list of word strings, this yields a list of
# single characters; `document` is not used in the cells shown - confirm intent.
document = [[item for element in p for item in element] for p in parseout]

In [None]:
#Build an RDD with one element per document, each element being that document's list of words
#(no flatMap here: flattening at this point would turn every single word into its own
# "document", so the later word count and get_count would iterate over characters)
ldadatardd=sc.parallelize(parseout)
ldadatardd.cache()
ldadatardd.take(5)

In [None]:
#Count how often each item occurs across the elements of ldadatardd,
#then pair every distinct item with an index

flattened = ldadatardd.flatMap(lambda seq: seq)
ones = flattened.map(lambda item: (item, 1))
wordcount = ones.reduceByKey(lambda left, right: left + right)

#Drop the counts, keep the keys, and number them
vocabtups = wordcount.map(lambda pair: pair[0]).zipWithIndex().cache()

In [None]:
# item -> index mapping collected to the driver, and its inverse (index -> item)
vocab = vocabtups.collectAsMap()
id2word = dict((idx, term) for term, idx in vocab.items())

In [None]:
#Get a count of the words in a document
from collections import defaultdict
def get_count(s) :
    """Count how often each vocabulary entry occurs in ``s``.

    Parameters
    ----------
    s : iterable
        Items to look up in the module-level ``vocab`` mapping; items that
        are not keys of ``vocab`` are ignored.

    Returns
    -------
    list of (int, int)
        ``(vocab[item], occurrences)`` pairs, one per distinct item found.
    """
    d = defaultdict(int)
    for w in s :
        # `in` replaces dict.has_key, which no longer exists in Python 3
        if w in vocab :
            d[vocab[w]] += 1
    # Return a real list (not a dict view) so the result is the same on
    # Python 2 and 3
    return list(d.items())

In [None]:
#Collect the words from each document in our data into a corpus
# get_count turns each RDD element into (vocabulary index, count) pairs;
# collect() then brings all of the results back to the driver.
documents = ldadatardd.map(get_count)
corpus=documents.collect()

In [None]:
import gensim

In [None]:
# Fit a 50-topic gensim LDA model on the collected corpus
lda2015 = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=50,
    update_every=1,
    chunksize=10000,
    passes=1,
)

 ### Decisions when performing LDA 
 
 We decided to consider topics by decade since we know health topics will change over time and a decade is a reasonable time frame **Need better justification**.  *(If we had more time for this project, we would like to consider other time frames).
 
 We used a document as our cluster size when performing LDA (instead of sentence) under the assumption that each article has a main topic.
 
 

## Final Analysis:

## Presentation: