# Stock Data Scraper Function

In [13]:
import pandas as pd
import os
#%pip install pytrends
import pytrends
from pytrends.request import TrendReq
#%pip install pageviewapi
import pageviewapi
#%pip install yfinance
import yfinance as yf

In [22]:
def Big_scraper(kw_list_1, kw_list_2, ticker, start,end):
    
    """
    Description:
    ------------
    
    The function initially grabs historical, indexed, hourly data for when the keyword 
    was searched most as shown on Google Trends' Interest Over Time section.
    It then cleans the data to show daily hits on the keyword in Google news.

    
    input:
    -----
    kw_list_1: List of up to 5 key words that will be scraped from google trends for the dates given.
             Here, the scraping will pull the total posted items in google news that contains
             one of the key words.
    
    kw_list_2: List of wikipedia article titles (unlimited length) that will pull the amount of
            views the article recieved each day. 

    ticker: the ticker abriviation of the desired stock. Must be netered in as an all capitalized string 
    example Apple Inc. woud be "AAPL"
             
    start: the start of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
    
    end: the end of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
             
    return:
    -------
    
    combined: a dataframe containing the sum of the daily keyword hits in google news (key words labeled _x),
    
    data frame cointaing stock info including open, close, high, low prices of the stock,
    as well as the stocks daily trading volume and the amount if there was a split or dividend 
    preformed on the stock that day,
    
    and the sum of how many times key word wikipedia pages were viewed in a day (key words labeled _y)
    """
    
    year_s = int(start[0:4])
    month_s = int(start[4:6])
    day_s = int(start[6:8])
    year_e = int(end[0:4])
    month_e = int(end[4:6])
    day_e = int(end[6:8])
    
    starter = pd.to_datetime(f"'{year_s}-{month_s}-{day_s}'")
    ender = pd.to_datetime(f"'{year_e}-{month_e}-{day_e}'")
    
    
    pytrends = TrendReq(hl='en-US', tz=360, retries=10)
    jeff = pytrends.get_historical_interest(kw_list_1, \
                                 year_start = year_s, month_start = month_s, day_start = day_s, hour_start = 1, \
                                 year_end = year_e, month_end = month_e, day_end = day_e, hour_end = 23, \
                                 cat = 0, geo = '', gprop = 'news', sleep = 60)
    
    jeff = jeff.iloc[:, 0:-1] # eliminates the isPartial Column
    jeff = jeff.reset_index().drop_duplicates(subset = "date") #removing duplicates from the index
    jeff = jeff.groupby(pd.Grouper(key="date", freq="D")).mean() # coverts to the mean of daily scores

    dow = yf.Ticker("^DJI")
    dow_h = dow.history(start='2021-01-01', end="2021-01-31")
    dow_h = pd.DataFrame(dow_h)
    dow_names = {"Open":"dow_open","Close":"dow_close","Low": "dow_low",
    'High':'dow_high','Volume':'dow_vol'}
    dow_h=dow_h.rename(dow_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)

    nas = yf.Ticker("^IXIC")
    nas_h = nas.history(start='2021-01-01', end="2021-01-31")
    nas_h = pd.DataFrame(nas_h)
    nas_names = {"Open":"nas_open", "Close":"nas_close", "Low": "nas_low",
    'High':'nas_high','Volume':'nas_vol'}

    nas_h=nas_h.rename(nas_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)

    market = dow_h.merge(nas_h,left_index=True, right_index=True, how="left")
    
    tick = yf.Ticker(ticker)
    hist = tick.history(start=starter, end=ender)
    hist = pd.DataFrame(hist)
    
    combined = jeff.merge(hist, left_index=True, right_index=True, how="left")
    combined = combined.merge(market, left_index=True, right_index=True, how="left")  
    
    d = pd.DataFrame()
    for key_word in kw_list_2:
        geoff = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                    access='all-access', agent='all-agents', granularity='daily')
        dicty = dict(geoff)
        views = pd.DataFrame(dicty["items"])
        views["timestamp"] = pd.to_datetime((views["timestamp"]), format="%Y%m%d%H")
        views = views.set_index("timestamp")
        page = pd.Series(views["views"])
        d[key_word] = page
        
    combined = combined.merge(d, left_index=True, right_index=True, how="left") 
    
    return combined

In [23]:
#example pull
kw_list_1 = ["Apple", "Apple Inc.", "IPhone", "MacBook", "MacOS"]
kw_list_2 = ["Apple Inc.", "IPhone"]
Big_scraper(kw_list_1,kw_list_2,"AAPL","20211201","20211214")

Unnamed: 0_level_0,Apple,Apple Inc._x,IPhone_x,MacBook,MacOS,Open,High,Low,Close,Volume,...,dow_low,dow_close,dow_vol,nas_open,nas_high,nas_low,nas_close,nas_vol,Apple Inc._y,IPhone_y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01,16.608696,0.0,19.434783,2.0,0.26087,167.266892,170.083311,164.320648,164.560349,152052500.0,...,,,,,,,,,14391,8014
2021-12-02,20.916667,0.0,24.958333,1.166667,0.0,158.538019,163.991063,157.599213,163.55162,136739200.0,...,,,,,,,,,14364,8072
2021-12-03,15.708333,0.0,16.541667,2.625,0.25,163.811298,164.750104,159.516766,161.634064,118023100.0,...,,,,,,,,,13359,7546
2021-12-04,15.208333,0.0,23.875,1.375,0.0,,,,,,...,,,,,,,,,11606,7311
2021-12-05,17.375,0.0,20.625,1.041667,0.291667,,,,,,...,,,,,,,,,12245,7586
2021-12-06,14.291667,0.0,18.416667,3.708333,0.0,164.080946,167.66639,164.070964,165.10965,107497000.0,...,,,,,,,,,13862,8381
2021-12-07,16.541667,0.0,17.875,2.583333,0.0,168.864855,171.361674,168.125791,170.962173,120405400.0,...,,,,,,,,,14563,8188
2021-12-08,15.125,0.0,23.791667,2.708333,0.0,171.91098,175.736109,170.482792,174.857224,116998900.0,...,,,,,,,,,14027,7697
2021-12-09,17.5,0.0,28.041667,2.375,0.0,174.687436,176.525091,173.69869,174.337875,108923700.0,...,,,,,,,,,13975,7590
2021-12-10,22.208333,0.0,24.458333,0.666667,0.0,174.987069,179.401443,174.467727,179.221664,115402700.0,...,,,,,,,,,12530,7226


## Individual Functions (with examples)

In [20]:
def google_trends(kw_list, year_start, month_start, year_end, month_end, day_end, day_start=1, hour_start=0, hour_end=23):
    
    """
    Description:
    ------------
    
    The function initially grabs historical, indexed, hourly data for when the keyword 
    was searched most as shown on Google Trends' Interest Over Time section.
    It then cleans the data to show daily hits on the keyword in Google news.

    
    input:
    -----
    kw_list: List of up to 5 key words that will be scraped from the timeline given to the function.
             Here, the scraping will pull the total posted items in google news. the contains
             one of the key words.
             
    Rest of the varibles are self-explatory and used to set the timeline to scrape the key words. 
    
    Plug in as decribed 
    year_start / end: YYYY int
    month_start / end: M int
    day_start /  end: D int
    hour_start / end: H int
    
             
    return:
    -------
    
    jeff: a dataframe containing the sum of the daily keyword hits in google news
    """
    
    pytrends = TrendReq(hl='en-US', tz=360, retries=10)
    jeff = pytrends.get_historical_interest(kw_list, \
                                 year_start = year_start, month_start = month_start, day_start = day_start, hour_start = hour_start, \
                                 year_end = year_end, month_end = month_end, day_end = day_end, hour_end = hour_end, \
                                 cat = 0, geo = '', gprop = 'news', sleep = 60)
    
    jeff = jeff.iloc[:, 0:-1] # eliminates the isPartial Column
    jeff = jeff.reset_index().drop_duplicates(subset = "date") #removing duplicates from the index
    jeff = jeff.groupby(pd.Grouper(key="date", freq="D")).sum() # coverts to the sum of daily posts
    
    return jeff

In [21]:
jeff1= google_trends(kw_list, 2021, 12, 2021, 12, 31)

In [22]:
kw_list = ["Apple", "Apple Inc.", "IPhone", "MacBook", "MacOS"]

In [23]:
def stock_stats(ticker: str, start_date: str, end_date: str):
    """
    Description: Scrapes historial daily stock data from the Yahoo Fince sight
    and returns a dataframe containing daily open, close, high, low prices of the stock,
    as well as the stocks daily trading volume and the amount if there was a split or dividend 
    preformed on the stock that day.
    
    inputs:
    ------
    ticker: the ticker abriviation of the desired stock. Must be netered in as an all capitalized string 
    example Apple Inc. woud be "AAPL"
    
    start_date: the start of the desired timeline you want scrape. Date Must be entered in as "YYYY-MM-DD"
    
    end_date: Self explanetory, Date Must be entered in as "YYYY-MM-DD"
    
    return:
    ------
    hist: dataframe containing open, close, high, low prices of the stock,
    as well as the stocks daily trading volume and the amount if there was a split or dividend 
    preformed on the stock that day. 
    
    """
    
    tick = yf.Ticker(ticker)
    hist = tick.history(start=start_date, end=end_date)
    hist = pd.DataFrame(hist)
    return hist

In [26]:
def wiki_scraper(kw_list: list, start_date: str, end_date: str):
    '''
    Description: Pulls the sum of how many times a wikipedia page was viewed that day
    
    inputs:
    ------
    
    kw_list: list of wikipedia page names to be scrpapped, can be of unlimted length
    
    start_date: the start of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
    
    end_date: the end of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
    
    '''
    d = pd.DataFrame()
    for key_word in kw_list:
        geoff = pageviewapi.per_article('en.wikipedia', key_word, start_date, end_date,
                                    access='all-access', agent='all-agents', granularity='daily')
        dicty = dict(geoff)
        views = pd.DataFrame(dicty["items"])
        views["timestamp"] = pd.to_datetime((views["timestamp"]), format="%Y%m%d%H")
        views = views.set_index("timestamp")
        page = pd.Series(views["views"])
        d[key_word] = page
        
    return d

In [27]:
jeff3 = wiki_scraper(kw_list, '20211201', '20211231')
jeff3.head()

Unnamed: 0_level_0,Apple,Apple Inc.,IPhone,MacBook,MacOS
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-01,8237,14391,8014,704,133006
2021-12-02,8299,14364,8072,619,42741
2021-12-03,8040,13359,7546,559,37979
2021-12-04,7673,11606,7311,604,26680
2021-12-05,7810,12245,7586,588,26376


In [28]:
def joiner(google_trends, yahoo_finace, wiki_pagecount):
    """
    Description: joins all stock data sets into one dataframe
    
    input:
    ------
    google_trends: data frame counting daily hit counts for google news stories on specific key words
    
    yahoo_finace: data frame cointaing stock info including open, close, high, low prices of the stock,
    as well as the stocks daily trading volume and the amount if there was a split or dividend 
    preformed on the stock that day.
    
    wiki_pagecount: the sum of how many times key wikipedia pages were viewed in a day
    """
    
    combined = google_trends.merge(yahoo_finace, left_index=True, right_index=True, how="left")
    combined = combined.merge(wiki_pagecount, left_index=True, right_index=True, how="left")
    return combined

In [None]:
# Data Scraper Without Google Trends
## includes Dow Jones and NASDAQ aggragates

In [7]:
dow = yf.Ticker("^DJI")
dow_h = dow.history(start='2021-01-01', end="2021-01-31")
dow_h = pd.DataFrame(dow_h)
dow_names = {"Open":"dow_open",
"Close":"dow_close",
"Low": "dow_low",
'High':'dow_high',
'Volume':'dow_vol'
}
dow_h=dow_h.rename(dow_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)


Unnamed: 0_level_0,dow_open,dow_high,dow_low,dow_close,dow_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-04,30627.470703,30674.279297,29881.820312,30223.890625,475080000
2021-01-05,30204.25,30504.890625,30141.779297,30391.599609,350910000
2021-01-06,30362.779297,31022.650391,30313.070312,30829.400391,500430000
2021-01-07,30901.179688,31193.400391,30897.859375,31041.130859,427810000
2021-01-08,31069.580078,31140.669922,30793.269531,31097.970703,381150000


In [11]:
nas = yf.Ticker("^IXIC")
nas_h = nas.history(start='2021-01-01', end="2021-01-31")
nas_h = pd.DataFrame(nas_h)
nas_names = {"Open":"nas_open",
"Close":"nas_close",
"Low": "nas_low",
'High':'nas_high',
'Volume':'nas_vol'
}
nas_h=nas_h.rename(nas_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)

dow_h.merge(nas_h,left_index=True, right_index=True, how="left")

Unnamed: 0_level_0,dow_open,dow_high,dow_low,dow_close,dow_vol,nas_open,nas_high,nas_low,nas_close,nas_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-01-04,30627.470703,30674.279297,29881.820312,30223.890625,475080000,12958.519531,12958.719727,12543.240234,12698.450195,6546740000
2021-01-05,30204.25,30504.890625,30141.779297,30391.599609,350910000,12665.650391,12828.269531,12665.650391,12818.959961,6904420000
2021-01-06,30362.779297,31022.650391,30313.070312,30829.400391,500430000,12666.150391,12909.629883,12649.990234,12740.790039,7648340000
2021-01-07,30901.179688,31193.400391,30897.859375,31041.130859,427810000,12867.339844,13090.910156,12867.339844,13067.480469,6777010000
2021-01-08,31069.580078,31140.669922,30793.269531,31097.970703,381150000,13160.219727,13208.089844,13036.549805,13201.980469,7223660000
2021-01-11,31015.369141,31096.980469,30832.060547,31008.689453,356540000,13048.780273,13138.269531,12999.509766,13036.429688,6876420000
2021-01-12,31015.009766,31114.560547,30888.759766,31068.689453,362620000,13062.05957,13105.040039,12963.919922,13072.429688,7181380000
2021-01-13,31084.880859,31153.369141,30992.050781,31060.470703,413250000,13088.009766,13171.150391,13051.05957,13128.950195,7072920000
2021-01-14,31085.669922,31223.779297,30982.240234,30991.519531,427810000,13174.75,13220.160156,13098.410156,13112.639648,6671090000
2021-01-15,30926.769531,30941.980469,30612.669922,30814.259766,433000000,13099.900391,13139.830078,12949.759766,12998.5,6402970000


In [34]:
def Market_scraper(kw_list_2, ticker, start,end):
    
    """
    Description:
    ------------
    
    The function initially grabs historical, indexed, hourly data for when the keyword 
    was searched most as shown on Google Trends' Interest Over Time section.
    It then cleans the data to show daily hits on the keyword in Google news.

    
    input:
    -----
    ### kw_list_1: List of up to 5 key words that will be scraped from google trends for the dates given.
             Here, the scraping will pull the total posted items in google news that contains
             one of the key words ### removed to be edited and replaced later .
    
    kw_list_2: List of wikipedia article titles (unlimited length) that will pull the amount of
            views the article recieved each day. 

    ticker: the ticker abriviation of the desired stock. Must be netered in as an all capitalized string 
    example Apple Inc. woud be "AAPL"
             
    start: the start of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
    
    end: the end of the desired timeline you want scrape. Date Must be entered in as "YYYYMMDD"
             
    return:
    -------
    
    ### combined: a dataframe containing the sum of the daily keyword hits in google news (key words labeled _x) ### to be edited,
    
    the individual stock info including open, close, high, low prices of the stock,
    as well as the stocks daily trading volume and the amount if there was a split or dividend 
    preformed on the stock that day,

    as well as the open close, high, low, volume, of the NASDAQ and DOW Jones Indudtiral Average
    
    and the sum of how many times key word wikipedia pages were viewed in a day (key words labeled _y)
    """
    
    year_s = int(start[0:4])
    month_s = int(start[4:6])
    day_s = int(start[6:8])
    year_e = int(end[0:4])
    month_e = int(end[4:6])
    day_e = int(end[6:8])
    
    starter = pd.to_datetime(f"'{year_s}-{month_s}-{day_s}'")
    ender = pd.to_datetime(f"'{year_e}-{month_e}-{day_e}'")
    
    dow = yf.Ticker("^DJI")
    dow_h = dow.history(start='2021-01-01', end="2021-01-31")
    dow_h = pd.DataFrame(dow_h)
    dow_names = {"Open":"dow_open","Close":"dow_close","Low": "dow_low",
    'High':'dow_high','Volume':'dow_vol'}
    dow_h=dow_h.rename(dow_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)

    nas = yf.Ticker("^IXIC")
    nas_h = nas.history(start='2021-01-01', end="2021-01-31")
    nas_h = pd.DataFrame(nas_h)
    nas_names = {"Open":"nas_open", "Close":"nas_close", "Low": "nas_low",
    'High':'nas_high','Volume':'nas_vol'}

    nas_h=nas_h.rename(nas_names, axis=1).drop(["Dividends","Stock Splits"], axis=1)

    market = dow_h.merge(nas_h,left_index=True, right_index=True, how="left")
    
    
    tick = yf.Ticker(ticker)
    hist = tick.history(start=starter, end=ender)
    hist = pd.DataFrame(hist)
    
    combined = hist.merge(market, left_index=True, right_index=True, how="left") 

    d = pd.DataFrame()
    for key_word in kw_list_2:
        geoff = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                    access='all-access', agent='all-agents', granularity='daily')
        dicty = dict(geoff)
        views = pd.DataFrame(dicty["items"])
        views["timestamp"] = pd.to_datetime((views["timestamp"]), format="%Y%m%d%H")
        views = views.set_index("timestamp")
        page = pd.Series(views["views"])
        d[key_word] = page
        
    combined = combined.merge(d, left_index=True, right_index=True, how="right") 
    
    return combined

In [35]:
apple = Market_scraper(kw_list_1,"AAPL","20210101","20211231")

In [36]:
apple.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', name='timestamp', length=365, freq=None)