<a href="https://colab.research.google.com/github/dannyjimenez98/Module-2-Exercise/blob/main/DataCurationProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math, time
import os
import pprint

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# NYT API credential. Set the NYT_API_KEY environment variable to supply your own key
# without editing the notebook.
# NOTE(review): the fallback literal is the key that was already committed with this
# notebook; it is kept only so existing runs keep working and should be revoked and
# removed once every environment sets NYT_API_KEY.
API_KEY = os.environ.get('NYT_API_KEY', 'VOyJLAyvGqAyAHJot0I05y868muPksiG')

# Endpoint of the NYT Article Search API that all search requests are sent to
BASE_URL = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'

In [None]:
def getArticleText(article_url):
    ''' Downloads an article's page and extracts its body text
        Input: article's url
        Output: the text of every p tag found in the section tag inside the element
            with id "story", each paragraph followed by one space; an empty string
            when the page has no such element or section
        Raises: requests.RequestException if the page cannot be downloaded
            (including when the request times out)
    '''
    # request response from article's url; the timeout keeps a single unresponsive
    # page from hanging the whole crawl
    article_response = requests.get(article_url, timeout=30)
    soup = BeautifulSoup(article_response.text, 'html.parser')

    # get the section tag inside the element with id of story; find() returns None
    # when nothing matches, so check each step instead of chaining the calls
    story = soup.find(id='story')
    section = story.find('section') if story is not None else None
    if section is None:
        # page does not have the expected layout, so there is no body text to extract
        return ''

    # list of every p tag (paragraph) in the story
    paragraphs = section.find_all('p')

    # join the paragraphs' text, keeping a space after each one (including the last)
    return ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)

In [None]:
def getCompanyArticlesData(query, max_pages=1, delay=7):
    '''
    Searches the NYT Article Search API for articles about a company, downloads the
    body text of each article found and collects one row of data per article

    Input:
        query - company we are analyzing; used as the search term and in the
            organizations filter
        max_pages - maximum number of result pages to fetch (default 1, i.e. only the
            first page of results); pass None to fetch every page the API reports
        delay - seconds to pause after each page of results (default 7)
            NOTE(review): presumably chosen to stay under the API's rate limit;
            confirm against the current NYT limits before lowering it
    Output: Returns list of [publishing date, headline, body text, url] lists, one for
        each article whose text could be downloaded
    Raises: any error from the first search request (HTTP error status, timeout,
        unexpected response shape) propagates to the caller; errors on later pages
        or on a single article are printed and skipped
    '''
    # request parameters
    params = {'api-key': API_KEY,
        'q': query,
        'begin_date': '20190101',
        'end_date': '20211231',
        'fq': f'document_type:("article") AND section_name:("Business" "Technology") AND organizations:({query})',
        'page': 0,
        'fl': ('web_url', 'pub_date', 'headline', 'meta'),
        'sort': 'oldest'}

    # request the first page; it also tells us how many articles matched in total
    response = requests.get(BASE_URL, params=params, timeout=30)
    response.raise_for_status()

    # show the request that was made, with the API key masked so the credential
    # does not end up in the saved notebook output
    shown_url = response.url.replace(API_KEY, '<api-key>') if API_KEY else response.url
    print(shown_url)

    data = response.json()

    # number of articles returned
    hits = data['response']['meta']['hits']

    # pages of articles to iterate through (10 articles per page)
    total_pages = math.ceil(hits / 10)
    pages_to_fetch = total_pages if max_pages is None else min(total_pages, max_pages)

    article_data = []  # rows of each article's data

    for page in range(pages_to_fetch):
        try:
            # page 0 was already downloaded above; only later pages need a request
            if page > 0:
                params['page'] = page
                response = requests.get(BASE_URL, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()

            # iterate through each article's raw data on current page
            for doc in data['response']['docs']:
                headline = doc['headline']['main']
                pub_date = doc['pub_date']
                article_url = doc['web_url']

                try:
                    article_content = getArticleText(article_url)
                except Exception as error:
                    # one unreadable article should not discard the rest of the page
                    print(f"There's an issue here (article {article_url}): {error!r}")
                    continue

                # adds new row of data for this article
                article_data.append([pub_date, headline, article_content, article_url])
        except Exception as error:
            # report which page failed and why, then carry on with the next page;
            # KeyboardInterrupt is not caught, so the crawl can still be stopped
            print(f"There's an issue here (page {page}): {error!r}")

        # pause after every page, successful or not, before the next API request
        time.sleep(delay)

    return article_data

In [None]:
# Collect the article rows for the company under analysis
data = getCompanyArticlesData(query='Facebook Inc')

https://api.nytimes.com/svc/search/v2/articlesearch.json?api-key=VOyJLAyvGqAyAHJot0I05y868muPksiG&q=Facebook+Inc&begin_date=20190101&end_date=20211231&fq=document_type%3A%28%22article%22%29+AND+section_name%3A%28%22Business%22+%22Technology%22%29+AND+organizations%3A%28Facebook+Inc%29&page=0&fl=web_url&fl=pub_date&fl=headline&fl=meta&sort=oldest


In [None]:
# Build the table of articles and reduce each publishing timestamp to its calendar date
df = (
    pd.DataFrame(data, columns=['Date', 'Headline', 'Article Text', 'Article URL'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.date)
)
df

Unnamed: 0,Date,Headline,Article Text,Article URL
0,2019-01-01,"Big Tech May Look Troubled, but It’s Just Gett...","SAN JOSE, Calif. — Silicon Valley ended 2018 s...",https://www.nytimes.com/2019/01/01/technology/...
1,2019-01-18,The Week in Tech: How Google and Facebook Spaw...,"Each week, technology reporters and columnists...",https://www.nytimes.com/2019/01/18/technology/...
2,2019-01-19,F.T.C. Is Said to Be Considering Large Faceboo...,WASHINGTON — The Federal Trade Commission is i...,https://www.nytimes.com/2019/01/18/technology/...
3,2019-01-19,Are ‘10-Year Challenge’ Photos a Boon to Faceb...,The #10YearChallenge was all fun and memes unt...,https://www.nytimes.com/2019/01/19/technology/...
4,2019-01-22,Did Australia Hurt Phone Security Around the W...,"SYDNEY, Australia — A new law in Australia giv...",https://www.nytimes.com/2019/01/22/technology/...
5,2019-01-23,He Reported on Facebook. Now He Approaches It ...,How do New York Times journalists use technolo...,https://www.nytimes.com/2019/01/23/technology/...
6,2019-01-25,The Week in Tech: Silicon Valley Hobnobs in Davos,"SAN FRANCISCO — Each week, technology reporter...",https://www.nytimes.com/2019/01/25/technology/...
7,2019-01-25,"Zuckerberg Plans to Integrate WhatsApp, Instag...","SAN FRANCISCO — Mark Zuckerberg, Facebook’s ch...",https://www.nytimes.com/2019/01/25/technology/...
8,2019-01-30,Does Facebook Really Know How Many Fake Accoun...,Facebook sells advertisers on its access to re...,https://www.nytimes.com/2019/01/30/technology/...
9,2019-01-30,Facebook’s Profits and Revenue Climb as It Gai...,SAN FRANCISCO — Facebook’s worst year ever was...,https://www.nytimes.com/2019/01/30/technology/...


### Template code
`df.to_csv('./{foldername}/{filename}.csv', index=False)`

In [None]:
# Save the articles table to a CSV file for later analysis.
# Path template for the real export: './{foldername}/{filename}.csv'
output_path = 'testfile.csv'
df.to_csv(output_path, index=False)

/content
