# Stock Data Scraper Function

In [1]:
import pandas as pd
import os
#%pip install pytrends
import pytrends
from pytrends.request import TrendReq
#%pip install pageviewapi
import pageviewapi
#%pip install yfinance
import yfinance as yf

In [9]:
def Big_scraper(kw_list_1, kw_list_2, ticker, start, end):
    """
    Description:
    ------------

    Builds one daily dataframe for a stock from three sources:

    1. Google Trends (via pytrends): hourly "interest over time" scores for the
       Google News property, averaged per calendar day.
    2. Yahoo Finance (via yfinance): daily history of ``ticker`` plus the
       Dow Jones Industrial Average (^DJI) and the NASDAQ Composite (^IXIC).
    3. Wikipedia (via pageviewapi): daily page views per article.

    The Google Trends days form the index; the other sources are left-joined
    onto it, so days without stock data (e.g. weekends) hold NaN in the stock
    columns.

    input:
    -----
    kw_list_1: List of up to 5 key words whose Google News trend scores are
             pulled from Google Trends for the dates given.

    kw_list_2: List of wikipedia article titles (unlimited length) whose daily
            view counts are pulled.

    ticker: the ticker abbreviation of the desired stock as an all-capitalized
    string, for example Apple Inc. would be "AAPL"

    start: first day of the timeline to scrape. Must be entered as "YYYYMMDD"

    end: last day of the timeline to scrape (inclusive). Must be entered as "YYYYMMDD"

    return:
    -------

    combined: a dataframe indexed by day containing

    the mean of the hourly Google Trends scores of each kw_list_1 key word,

    Open, High, Low, Close, Volume, Dividends and Stock Splits of the stock,

    the dow_* and nas_* open/high/low/close/volume columns of the two indexes,

    and the daily view count of each kw_list_2 wikipedia page.

    When the same name appears in both keyword lists the merge labels the
    Google column with _x and the wikipedia column with _y.

    raises:
    -------
    ValueError if start or end is not a valid "YYYYMMDD" date.
    """
    # An explicit format validates the input instead of relying on free-form
    # date guessing.
    first_day = pd.to_datetime(start, format="%Y%m%d")
    last_day = pd.to_datetime(end, format="%Y%m%d")
    # yfinance documents `end` as exclusive; ask for one extra day so the
    # requested last day is covered like it is for the other two sources.
    day_after_last = last_day + pd.Timedelta(days=1)

    # ---- Google Trends -------------------------------------------------
    trend_client = TrendReq(hl='en-US', tz=360, retries=10)
    trends = trend_client.get_historical_interest(
        kw_list_1,
        year_start=first_day.year, month_start=first_day.month,
        day_start=first_day.day, hour_start=1,
        year_end=last_day.year, month_end=last_day.month,
        day_end=last_day.day, hour_end=23,
        cat=0, geo='', gprop='news', sleep=60)

    # Drop the isPartial flag by name so a keyword column is never removed by
    # accident when the flag is absent.
    trends = trends.drop(columns="isPartial", errors="ignore")
    trends = trends.reset_index().drop_duplicates(subset="date")  # duplicated hours
    trends = trends.groupby(pd.Grouper(key="date", freq="D")).mean()  # hourly -> daily mean

    # ---- Yahoo Finance -------------------------------------------------
    def _daily_history(symbol):
        """Daily yfinance history of one symbol with a timezone-naive index."""
        frame = pd.DataFrame(yf.Ticker(symbol).history(start=first_day, end=day_after_last))
        # NOTE(review): some yfinance releases return a tz-aware index, which
        # cannot be joined to the tz-naive trend/wiki dates - confirm against
        # the installed version. Stripping is a no-op for a naive index.
        if getattr(frame.index, "tz", None) is not None:
            frame.index = frame.index.tz_localize(None)
        return frame

    index_frames = {}
    for symbol, prefix in (("^DJI", "dow"), ("^IXIC", "nas")):
        renames = {"Open": f"{prefix}_open", "High": f"{prefix}_high",
                   "Low": f"{prefix}_low", "Close": f"{prefix}_close",
                   "Volume": f"{prefix}_vol"}
        index_frames[prefix] = (
            _daily_history(symbol)
            .rename(columns=renames)
            .drop(columns=["Dividends", "Stock Splits"])
        )
    market = index_frames["dow"].merge(index_frames["nas"],
                                       left_index=True, right_index=True, how="left")

    hist = _daily_history(ticker)

    combined = trends.merge(hist, left_index=True, right_index=True, how="left")
    combined = combined.merge(market, left_index=True, right_index=True, how="left")

    # ---- Wikipedia page views -------------------------------------------
    # Collect one Series per article and build the frame in one go so every
    # article keeps all of its own dates (column-by-column assignment would
    # align everything to the first article's dates).
    wiki_views = {}
    for key_word in kw_list_2:
        response = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        wiki_views[key_word] = views.set_index("timestamp")["views"]
    wiki = pd.DataFrame(wiki_views)
    wiki.index.name = "timestamp"

    combined = combined.merge(wiki, left_index=True, right_index=True, how="left")

    return combined

In [10]:
# Example pull: Google News trend scores for five Apple-related key words,
# Wikipedia views for two articles, and AAPL prices for 1-14 Dec 2021.
# kw_list_1 is reused by later example cells.
kw_list_1 = ["Apple", "Apple Inc.", "IPhone", "MacBook", "MacOS"]
kw_list_2 = ["Apple Inc.", "IPhone"]
Big_scraper(kw_list_1,kw_list_2,"AAPL","20211201","20211214")

Unnamed: 0_level_0,Apple,Apple Inc._x,IPhone_x,MacBook,MacOS,Open,High,Low,Close,Volume,...,dow_low,dow_close,dow_vol,nas_open,nas_high,nas_low,nas_close,nas_vol,Apple Inc._y,IPhone_y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01,17.0,0.0,19.434783,2.0,0.26087,167.266892,170.083311,164.320648,164.560349,152052500.0,...,34006.980469,34022.039062,496000000.0,15752.269531,15816.820312,15243.929688,15254.049805,6266020000.0,14391,8014
2021-12-02,20.916667,0.0,24.958333,1.166667,0.0,158.538019,163.991063,157.599213,163.55162,136739200.0,...,34076.25,34639.789062,466900000.0,15181.820312,15444.540039,15150.120117,15381.320312,5390100000.0,14364,8072
2021-12-03,16.0,0.0,16.541667,2.625,0.25,163.811298,164.750104,159.516766,161.634064,118023100.0,...,34264.570312,34580.078125,439550000.0,15428.709961,15470.360352,14931.05957,15085.469727,5859520000.0,13359,7546
2021-12-04,15.666667,0.0,23.875,1.375,0.0,,,,,,...,,,,,,,,,11606,7311
2021-12-05,17.375,0.0,20.625,1.041667,0.291667,,,,,,...,,,,,,,,,12245,7586
2021-12-06,14.291667,0.0,18.416667,3.708333,0.0,164.080946,167.66639,164.070964,165.10965,107497000.0,...,34633.429688,35227.03125,416720000.0,15117.629883,15281.990234,14931.610352,15225.150391,5095960000.0,13862,8381
2021-12-07,16.5,0.0,17.875,2.583333,0.0,168.864855,171.361674,168.125791,170.962173,120405400.0,...,35423.988281,35719.429688,474940000.0,15510.910156,15720.089844,15507.660156,15686.919922,5091220000.0,14563,8188
2021-12-08,15.083333,0.0,23.875,2.791667,0.0,171.91098,175.736109,170.482792,174.857224,116998900.0,...,35602.648438,35754.75,387650000.0,15690.650391,15792.639648,15618.879883,15786.990234,4600800000.0,14027,7697
2021-12-09,18.041667,0.0,28.208333,2.375,0.0,174.687436,176.525091,173.69869,174.337875,108923700.0,...,35577.140625,35754.691406,353020000.0,15720.540039,15796.049805,15511.120117,15517.370117,4484230000.0,13975,7590
2021-12-10,22.208333,0.0,24.208333,0.666667,0.0,174.987069,179.401443,174.467727,179.221664,115402700.0,...,35710.429688,35970.988281,361200000.0,15629.589844,15677.599609,15477.849609,15630.599609,4395460000.0,12530,7226


## Individual Functions (with examples)

In [16]:
def google_trends(kw_list, start, end):
    """
    Description:
    ------------

    Pulls the historical, indexed, hourly "interest over time" scores that
    Google Trends reports for each key word in the Google News property, then
    collapses them to one row per day by summing the hourly scores.

    input:
    -----
    kw_list: List of up to 5 key words to look up for the timeline given to
             the function.

    start: first day of the timeline to scrape. Must be entered as "YYYYMMDD"

    end: last day of the timeline to scrape. Must be entered as "YYYYMMDD"

    return:
    -------

    jeff: a dataframe indexed by day with one column per key word holding the
    sum of that day's hourly Google Trends scores

    raises:
    -------
    ValueError if start or end is not a valid "YYYYMMDD" date.
    """
    # An explicit format rejects malformed dates up front instead of slicing
    # whatever characters happen to be in the string.
    first_day = pd.to_datetime(start, format="%Y%m%d")
    last_day = pd.to_datetime(end, format="%Y%m%d")

    trend_client = TrendReq(hl='en-US', tz=360, retries=10)
    jeff = trend_client.get_historical_interest(
        kw_list,
        year_start=first_day.year, month_start=first_day.month,
        day_start=first_day.day, hour_start=1,
        year_end=last_day.year, month_end=last_day.month,
        day_end=last_day.day, hour_end=23,
        cat=0, geo='', gprop='news', sleep=60)

    # Drop the isPartial flag by name so a keyword column is never removed by
    # accident when the flag is absent.
    jeff = jeff.drop(columns="isPartial", errors="ignore")
    jeff = jeff.reset_index().drop_duplicates(subset="date")  # duplicated hours
    jeff = jeff.groupby(pd.Grouper(key="date", freq="D")).sum()  # hourly -> daily sum

    return jeff

In [22]:
# Example: daily Google News trend sums for the kw_list_1 key words, 1-14 Jan 2021.
# Relies on kw_list_1 from the earlier example-pull cell.
jeff1= google_trends(kw_list_1, "20210101","20210114")
jeff1.head()

Unnamed: 0_level_0,Apple,Apple Inc.,IPhone,MacBook,MacOS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01,347,0,518,26,0
2021-01-02,451,0,433,45,0
2021-01-03,412,0,467,25,0
2021-01-04,424,0,382,49,0
2021-01-05,456,0,445,32,0


In [19]:
def stock_stats(ticker: str, start: str, end: str):
    """
    Description: Scrapes historical daily stock data from Yahoo Finance and
    returns a dataframe containing the daily open, close, high and low prices
    of the stock, its daily trading volume, and the amount of any split or
    dividend performed on the stock that day, joined with the same day's
    Dow Jones Industrial Average and NASDAQ Composite figures.

    inputs:
    ------
    ticker: the ticker abbreviation of the desired stock as an all-capitalized
    string, for example Apple Inc. would be "AAPL"

    start: first day of the timeline to scrape. Must be entered as "YYYYMMDD"

    end: last day of the timeline to scrape (inclusive). Must be entered as "YYYYMMDD"

    return:
    ------
    hist: dataframe indexed by trading day containing Open, High, Low, Close,
    Volume, Dividends and Stock Splits of the stock,

    as well as the open, high, low, close and volume of the Dow Jones
    Industrial Average (dow_* columns) and the NASDAQ (nas_* columns)

    raises:
    ------
    ValueError if start or end is not a valid "YYYYMMDD" date.
    """
    # An explicit format validates the input instead of relying on free-form
    # date guessing.
    first_day = pd.to_datetime(start, format="%Y%m%d")
    # yfinance documents `end` as exclusive; ask for one extra day so the
    # requested last day is part of the result.
    day_after_last = pd.to_datetime(end, format="%Y%m%d") + pd.Timedelta(days=1)

    def _daily_history(symbol):
        """Daily yfinance history of one symbol with a timezone-naive index."""
        frame = pd.DataFrame(yf.Ticker(symbol).history(start=first_day, end=day_after_last))
        # NOTE(review): some yfinance releases return a tz-aware index, which
        # cannot be joined to tz-naive frames from the other scrapers -
        # confirm against the installed version. No-op for a naive index.
        if getattr(frame.index, "tz", None) is not None:
            frame.index = frame.index.tz_localize(None)
        return frame

    # Same treatment for both indexes: prefix the price/volume columns and
    # drop the dividend/split columns.
    index_frames = {}
    for symbol, prefix in (("^DJI", "dow"), ("^IXIC", "nas")):
        renames = {"Open": f"{prefix}_open", "High": f"{prefix}_high",
                   "Low": f"{prefix}_low", "Close": f"{prefix}_close",
                   "Volume": f"{prefix}_vol"}
        index_frames[prefix] = (
            _daily_history(symbol)
            .rename(columns=renames)
            .drop(columns=["Dividends", "Stock Splits"])
        )
    market = index_frames["dow"].merge(index_frames["nas"],
                                       left_index=True, right_index=True, how="left")

    hist = _daily_history(ticker)
    hist = hist.merge(market, left_index=True, right_index=True, how="left")

    return hist

In [21]:
# Example: AAPL daily history joined with the Dow Jones and NASDAQ index columns.
jeff2= stock_stats("AAPL", "20210101","20210114")
jeff2.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,dow_open,dow_high,dow_low,dow_close,dow_vol,nas_open,nas_high,nas_low,nas_close,nas_vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-01-04,132.533082,132.622413,125.823047,128.453461,143301900,0,0,30627.470703,30674.279297,29881.820312,30223.890625,475080000,12958.519531,12958.719727,12543.240234,12698.450195,6546740000
2021-01-05,127.937286,130.766226,127.480679,130.041611,97664900,0,0,30204.25,30504.890625,30141.779297,30391.599609,350910000,12665.650391,12828.269531,12665.650391,12818.959961,6904420000
2021-01-06,126.775939,130.081327,125.44584,125.664215,155088000,0,0,30362.779297,31022.650391,30313.070312,30829.400391,500430000,12666.150391,12909.629883,12649.990234,12740.790039,7648340000
2021-01-07,127.411196,130.657029,126.914892,129.952271,109578200,0,0,30901.179688,31193.400391,30897.859375,31041.130859,427810000,12867.339844,13090.910156,12867.339844,13067.480469,6777010000
2021-01-08,131.45111,131.649643,129.267374,131.073929,105158200,0,0,31069.580078,31140.669922,30793.269531,31097.970703,381150000,13160.219727,13208.089844,13036.549805,13201.980469,7223660000


In [23]:
def wiki_scraper(kw_list: list, start_date: str, end_date: str):
    '''
    Description: Pulls how many times each wikipedia page was viewed per day

    inputs:
    ------

    kw_list: list of wikipedia page names to be scraped, can be of unlimited length

    start_date: the start of the desired timeline you want to scrape. Must be entered as "YYYYMMDD"

    end_date: the end of the desired timeline you want to scrape. Must be entered as "YYYYMMDD"

    return:
    ------
    d: dataframe indexed by day ("timestamp") with one column of daily view
    counts per page name in kw_list. A day that one page has no data for is
    NaN in that page's column.

    '''
    # Collect one Series per page and build the frame in one go so every page
    # keeps all of its own dates (assigning columns one by one into an empty
    # frame would align everything to the first page's dates).
    views_by_page = {}
    for key_word in kw_list:
        response = pageviewapi.per_article('en.wikipedia', key_word, start_date, end_date,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        # Timestamps are parsed as YYYYMMDDHH.
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        views_by_page[key_word] = views.set_index("timestamp")["views"]

    d = pd.DataFrame(views_by_page)
    d.index.name = "timestamp"

    return d

In [25]:
# Wikipedia article titles for the wiki_scraper example.
kw_list = ["Apple", "Apple Inc.", "IPhone", "MacBook", "MacOS"]

In [26]:
# Example: daily Wikipedia page views for each title in kw_list during December 2021.
jeff3 = wiki_scraper(kw_list, '20211201', '20211231')
jeff3.head()

Unnamed: 0_level_0,Apple,Apple Inc.,IPhone,MacBook,MacOS
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-12-01,8237,14391,8014,704,133006
2021-12-02,8299,14364,8072,619,42741
2021-12-03,8040,13359,7546,559,37979
2021-12-04,7673,11606,7311,604,26680
2021-12-05,7810,12245,7586,588,26376


In [28]:
def joiner(google_trends, yahoo_finace, wiki_pagecount):
    """
    Description: combines the three per-source dataframes into one, keeping
    the rows (index) of the google trends frame and left-joining the other
    two onto it by index.

    input:
    ------
    google_trends: dataframe of daily google news figures for specific key
    words; its index decides which rows appear in the result

    yahoo_finace: dataframe containing stock info (open, close, high, low,
    volume, splits and dividends) indexed by day

    wiki_pagecount: dataframe of daily wikipedia page views indexed by day

    return:
    ------
    the joined dataframe; columns sharing a name across frames receive the
    default merge suffixes (_x / _y)
    """
    on_index_left = dict(left_index=True, right_index=True, how="left")
    return (
        google_trends
        .merge(yahoo_finace, **on_index_left)
        .merge(wiki_pagecount, **on_index_left)
    )

### Data Scraper Without Google Trends

In [32]:
def Market_scraper(kw_list_2, ticker, start, end):
    """
    Description:
    ------------

    Variant of the combined scraper without the Google Trends step. It pulls
    the daily Yahoo Finance history of ``ticker`` together with the Dow Jones
    Industrial Average (^DJI) and NASDAQ Composite (^IXIC), then joins it
    onto the daily Wikipedia page views of the given articles. The Wikipedia
    days form the index, so days without stock data (e.g. weekends) hold NaN
    in the stock columns.

    input:
    -----
    kw_list_2: List of wikipedia article titles (unlimited length) whose daily
            view counts are pulled.

    ticker: the ticker abbreviation of the desired stock as an all-capitalized
    string, for example Apple Inc. would be "AAPL"

    start: first day of the timeline to scrape. Must be entered as "YYYYMMDD"

    end: last day of the timeline to scrape (inclusive). Must be entered as "YYYYMMDD"

    return:
    -------

    combined: a dataframe indexed by day ("timestamp") containing

    the individual stock info: Open, High, Low, Close, Volume, Dividends and
    Stock Splits,

    the open, high, low, close and volume of the Dow Jones Industrial Average
    (dow_* columns) and the NASDAQ (nas_* columns),

    and the daily view count of each kw_list_2 wikipedia page.

    raises:
    -------
    ValueError if start or end is not a valid "YYYYMMDD" date.
    """
    # An explicit format validates the input instead of relying on free-form
    # date guessing.
    first_day = pd.to_datetime(start, format="%Y%m%d")
    # yfinance documents `end` as exclusive; ask for one extra day so the
    # requested last day is covered like it is for the Wikipedia data.
    day_after_last = pd.to_datetime(end, format="%Y%m%d") + pd.Timedelta(days=1)

    def _daily_history(symbol):
        """Daily yfinance history of one symbol with a timezone-naive index."""
        frame = pd.DataFrame(yf.Ticker(symbol).history(start=first_day, end=day_after_last))
        # NOTE(review): some yfinance releases return a tz-aware index, which
        # cannot be joined to the tz-naive Wikipedia dates - confirm against
        # the installed version. Stripping is a no-op for a naive index.
        if getattr(frame.index, "tz", None) is not None:
            frame.index = frame.index.tz_localize(None)
        return frame

    # Same treatment for both indexes: prefix the price/volume columns and
    # drop the dividend/split columns.
    index_frames = {}
    for symbol, prefix in (("^DJI", "dow"), ("^IXIC", "nas")):
        renames = {"Open": f"{prefix}_open", "High": f"{prefix}_high",
                   "Low": f"{prefix}_low", "Close": f"{prefix}_close",
                   "Volume": f"{prefix}_vol"}
        index_frames[prefix] = (
            _daily_history(symbol)
            .rename(columns=renames)
            .drop(columns=["Dividends", "Stock Splits"])
        )
    market = index_frames["dow"].merge(index_frames["nas"],
                                       left_index=True, right_index=True, how="left")

    hist = _daily_history(ticker)
    combined = hist.merge(market, left_index=True, right_index=True, how="left")

    # Collect one Series per article and build the frame in one go so every
    # article keeps all of its own dates (column-by-column assignment would
    # align everything to the first article's dates).
    wiki_views = {}
    for key_word in kw_list_2:
        response = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        wiki_views[key_word] = views.set_index("timestamp")["views"]
    wiki = pd.DataFrame(wiki_views)
    wiki.index.name = "timestamp"

    # Right join: every Wikipedia day is kept, with or without stock data.
    combined = combined.merge(wiki, left_index=True, right_index=True, how="right")

    return combined

In [33]:
# Example: AAPL and index prices plus Wikipedia views for all of 2021.
# kw_list_1 (defined in the first example-pull cell) is passed here as the
# list of Wikipedia article titles.
apple = Market_scraper(kw_list_1,"AAPL","20210101","20211231")
apple.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,dow_open,dow_high,dow_low,...,nas_open,nas_high,nas_low,nas_close,nas_vol,Apple,Apple Inc.,IPhone,MacBook,MacOS
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01,,,,,,,,,,,...,,,,,,2636,12694,7252,511,24790
2021-01-02,,,,,,,,,,,...,,,,,,2857,14297,7816,618,27483
2021-01-03,,,,,,,,,,,...,,,,,,2952,15899,8019,698,28429
2021-01-04,132.533082,132.622413,125.823047,128.453461,143301900.0,0.0,0.0,30627.470703,30674.279297,29881.820312,...,12958.519531,12958.719727,12543.240234,12698.450195,6546740000.0,3529,17389,8188,691,35098
2021-01-05,127.937286,130.766226,127.480679,130.041611,97664900.0,0.0,0.0,30204.25,30504.890625,30141.779297,...,12665.650391,12828.269531,12665.650391,12818.959961,6904420000.0,3365,17277,7997,707,35234


## Variable Calculation, Wiki Data

In [75]:
# Wikipedia articles whose daily views feed the Wiki_total column.
kw_list = ["Apple Inc.", "IPhone", "MacBook", "MacOS", "Apple Watch"]

# Daily page views for those articles from 1 Jan 2019 through 25 Apr 2022.
apple_wiki = wiki_scraper(kw_list, "20190101", "20220425")

In [76]:
# Wiki_total: combined daily views of the five articles (input column for Wiki_variables).
apple_wiki["Wiki_total"] = (apple_wiki["Apple Inc."] + apple_wiki["IPhone"] + apple_wiki["MacBook"] + apple_wiki["MacOS"] + apple_wiki["Apple Watch"])

In [77]:
# Preview the per-article views next to their total.
apple_wiki.head()

Unnamed: 0_level_0,Apple Inc.,IPhone,MacBook,MacOS,Apple Watch,Wiki_total
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-01,9342,8377,713,3258,1864,23554
2019-01-02,12458,9109,881,4518,2472,29438
2019-01-03,17971,10870,865,4525,2513,36744
2019-01-04,16865,9986,855,4828,2368,34902
2019-01-05,13002,9762,849,4120,3189,30922


In [78]:
# Momentum_1: today's total as a percentage of the total 7 rows earlier
apple_wiki["Wiki_Moment_1"] =  (apple_wiki["Wiki_total"] / apple_wiki["Wiki_total"].shift(7)) * 100
# Momentum_2: difference to the total 7 rows earlier, scaled by 100
apple_wiki["Wiki_Moment_2"] =  (apple_wiki["Wiki_total"] - apple_wiki["Wiki_total"].shift(7)) * 100

# Momentum_1_s: same ratio with a three-row shift (instead of 7)
apple_wiki["Wiki_Moment_1_s"] =  (apple_wiki["Wiki_total"] / apple_wiki["Wiki_total"].shift(3)) * 100
# Momentum_2_s: same difference with a three-row shift
apple_wiki["Wiki_Moment_2_s"] =  (apple_wiki["Wiki_total"] - apple_wiki["Wiki_total"].shift(3)) * 100

In [79]:
# Moving average over a 7-day time window (needs a datetime index)
apple_wiki["Wiki_MAvg"] = apple_wiki["Wiki_total"].rolling("7d").mean()
# Disparity: today's total as a percentage of the moving average
apple_wiki["Disparity"] = (apple_wiki["Wiki_total"]/apple_wiki["Wiki_MAvg"]) * 100
# Rate of change (conventional definition): percent change versus 7 and 3 rows earlier
apple_wiki["Wiki_ROC"] = (apple_wiki["Wiki_total"]-apple_wiki["Wiki_total"].shift(7))/(apple_wiki["Wiki_total"].shift(7)) *100
apple_wiki["Wiki_ROC_s"] = (apple_wiki["Wiki_total"]-apple_wiki["Wiki_total"].shift(3))/(apple_wiki["Wiki_total"].shift(3)) *100
# Rate of change as defined in the paper (doesn't make sense but kept just in case)
apple_wiki['Wiki_Rocp'] = (apple_wiki["Wiki_total"]/apple_wiki["Wiki_Moment_2"]) *100
# Exponential moving average: previous row's moving average moved toward
# today's total by the smoothing factor 2/(7+1)
apple_wiki["Wiki_EMA"] = (apple_wiki["Wiki_total"]-apple_wiki["Wiki_MAvg"].shift(1))*(2/(7+1))+apple_wiki["Wiki_MAvg"].shift(1)
# Display the new column as the cell output
apple_wiki["Wiki_EMA"]

timestamp
2019-01-01             NaN
2019-01-02    25025.000000
2019-01-03    29058.000000
2019-01-04    31159.500000
2019-01-05    31100.125000
                  ...     
2022-04-21    56468.607143
2022-04-22    55352.892857
2022-04-23    55864.035714
2022-04-24    54365.285714
2022-04-25    55309.571429
Name: Wiki_EMA, Length: 1211, dtype: float64

In [90]:
# 1 when the 7-day moving average rose versus the previous row, else 0.
# The boolean comparison is cast directly to integers; replacing True/False
# through a mapping relies on dtype downcasting that recent pandas deprecates.
apple_wiki["Wiki_MAvg_Move"] = (apple_wiki["Wiki_MAvg"] > apple_wiki["Wiki_MAvg"].shift(1)).astype("int64")


timestamp
2019-01-01    0
2019-01-02    1
2019-01-03    1
2019-01-04    1
2019-01-05    0
             ..
2022-04-21    0
2022-04-22    0
2022-04-23    1
2022-04-24    0
2022-04-25    1
Name: Wiki_MAvg_Move, Length: 1211, dtype: int64

In [80]:
# Preview the frame with the indicator columns added so far.
apple_wiki.head()

Unnamed: 0_level_0,Apple Inc.,IPhone,MacBook,MacOS,Apple Watch,Wiki_total,Wiki_Moment_1,Wiki_Moment_2,Wiki_Moment_1_s,Wiki_Moment_2_s,Wiki_MAvg,Disparity,Wiki_ROC,Wiki_ROC_s,Wiki_Rocp,Wiki_EMA
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2019-01-01,9342,8377,713,3258,1864,23554,,,,,23554.0,100.0,,,,
2019-01-02,12458,9109,881,4518,2472,29438,,,,,26496.0,111.103563,,,,25025.0
2019-01-03,17971,10870,865,4525,2513,36744,,,,,29912.0,122.840332,,,,29058.0
2019-01-04,16865,9986,855,4828,2368,34902,,,148.178653,1134800.0,31159.5,112.010783,,48.178653,,31159.5
2019-01-05,13002,9762,849,4120,3189,30922,,,105.041103,148400.0,31112.0,99.389303,,5.041103,,31100.125


In [81]:
def Wiki_variables(df):
    '''
    Description: Adds indicator columns computed from the daily wikipedia view
    total to the dataframe imported from wiki_scraper and returns it.

    Warning: You must calculate "Wiki_total" first, the sum of all the wiki
    pages' daily hit counts. The dataframe also needs a datetime index because
    the moving averages use time-based windows ("7d" and "3d").

    inputs:
    ------
    df: dataframe with a "Wiki_total" column. It is modified in place.

    return:
    ------
    df: the same dataframe with these columns added:
        Wiki_Moment_1/_2 (+ _s three-row variants), Wiki_MAvg, Wiki_MAvg_s,
        Wiki_Disparity, Wiki_Disparity_s, Wiki_ROC, Wiki_ROC_s, Wiki_Rocp,
        Wiki_EMA, the RSI working columns (Wiki_diff, Wiki_gain, Wiki_loss,
        Wiki_avg_gain, Wiki_avg_loss, Wiki_rs), Wiki_RSI, and a 0/1 *_Move
        flag per indicator that is 1 when the value rose versus the previous row.
    '''
    total = df["Wiki_total"]
    total_7_back = total.shift(7)
    total_3_back = total.shift(3)

    # Momentum: ratio and difference versus 7 rows earlier, scaled by 100
    df["Wiki_Moment_1"] = (total / total_7_back) * 100
    df["Wiki_Moment_2"] = (total - total_7_back) * 100
    # Same two measures with a three-row shift
    df["Wiki_Moment_1_s"] = (total / total_3_back) * 100
    df["Wiki_Moment_2_s"] = (total - total_3_back) * 100

    # Moving averages over 7-day and 3-day time windows
    df["Wiki_MAvg"] = total.rolling("7d").mean()
    df["Wiki_MAvg_s"] = total.rolling("3d").mean()

    # Disparity: total as a percentage of each moving average
    df["Wiki_Disparity"] = (total / df["Wiki_MAvg"]) * 100
    df["Wiki_Disparity_s"] = (total / df["Wiki_MAvg_s"]) * 100

    # Rate of change (conventional definition): percent change versus 7 / 3 rows earlier
    df["Wiki_ROC"] = (total - total_7_back) / total_7_back * 100
    df["Wiki_ROC_s"] = (total - total_3_back) / total_3_back * 100
    # Rate of change as defined in the paper (doesn't make sense but kept just in case)
    df["Wiki_Rocp"] = (total / df["Wiki_Moment_2"]) * 100

    # Exponential moving average: previous row's moving average moved toward
    # today's total by the smoothing factor 2/(7+1)
    previous_mavg = df["Wiki_MAvg"].shift(1)
    df["Wiki_EMA"] = (total - previous_mavg) * (2 / (7 + 1)) + previous_mavg

    # Relative Strength Index, based on a 14-row window
    df["Wiki_diff"] = total.diff(1)
    df["Wiki_gain"] = df["Wiki_diff"].clip(lower=0).round(2)  # rises only, falls become 0
    df["Wiki_loss"] = df["Wiki_diff"].clip(upper=0).round(2)  # falls only (<= 0), rises become 0
    df["Wiki_avg_gain"] = df["Wiki_gain"].rolling(14).mean()
    df["Wiki_avg_loss"] = df["Wiki_loss"].rolling(14).mean()
    # Wiki_avg_loss is <= 0, so the relative strength uses its magnitude;
    # dividing by the signed value pushes the RSI outside the 0-100 range.
    df["Wiki_rs"] = df["Wiki_avg_gain"] / df["Wiki_avg_loss"].abs()
    df["Wiki_RSI"] = 100 - (100 / (1.0 + df["Wiki_rs"]))

    # Move flags: 1 when the source column is above its previous row, else 0
    # (the first row and rows with missing values compare as 0).
    move_sources = {
        "Wiki_Move": "Wiki_total",
        "Wiki_MAvg_Move": "Wiki_MAvg",
        "Wiki_MAvg_s_Move": "Wiki_MAvg_s",
        "Wiki_EMA_Move": "Wiki_EMA",
        "Wiki_Disparity_Move": "Wiki_Disparity",
        "Wiki_Disparity_s_Move": "Wiki_Disparity_s",
        "Wiki_RSI_Move": "Wiki_RSI",
    }
    for flag, source in move_sources.items():
        df[flag] = (df[source] > df[source].shift(1)).astype("int64")

    return df

In [None]:
def Google_variables(df):
    '''
    Description: Adds indicator columns computed from the daily Google Trends
    total to the dataframe and returns it. Mirrors Wiki_variables.

    Warning: "Google_total" must already be a column of df, and df needs a
    datetime index because the moving averages use time-based windows
    ("7d" and "3d").

    inputs:
    ------
    df: dataframe with a "Google_total" column. It is modified in place.

    return:
    ------
    df: the same dataframe with these columns added:
        Google_Moment_1/_2 (+ _s three-row variants), Google_MAvg,
        Google_MAvg_s, Google_Disparity, Google_Disparity_s, Google_ROC,
        Google_ROC_s, Google_Rocp, Google_EMA, the RSI working columns
        (Google_diff, Google_gain, Google_loss, Google_avg_gain,
        Google_avg_loss, Google_rs), Google_RSI, and a 0/1 *_Move flag per
        indicator that is 1 when the value rose versus the previous row.
    '''
    total = df["Google_total"]
    total_7_back = total.shift(7)
    total_3_back = total.shift(3)

    # Momentum: ratio and difference versus 7 rows earlier, scaled by 100
    df["Google_Moment_1"] = (total / total_7_back) * 100
    df["Google_Moment_2"] = (total - total_7_back) * 100
    # Same two measures with a three-row shift
    df["Google_Moment_1_s"] = (total / total_3_back) * 100
    df["Google_Moment_2_s"] = (total - total_3_back) * 100

    # Moving averages over 7-day and 3-day time windows. The 3-day average and
    # the prefixed disparity columns are required by the *_Move flags below.
    df["Google_MAvg"] = total.rolling("7d").mean()
    df["Google_MAvg_s"] = total.rolling("3d").mean()

    # Disparity: total as a percentage of each moving average. The columns are
    # prefixed so several *_variables functions can run on one frame without
    # overwriting a shared "Disparity" column.
    df["Google_Disparity"] = (total / df["Google_MAvg"]) * 100
    df["Google_Disparity_s"] = (total / df["Google_MAvg_s"]) * 100

    # Rate of change (conventional definition): percent change versus 7 / 3 rows earlier
    df["Google_ROC"] = (total - total_7_back) / total_7_back * 100
    df["Google_ROC_s"] = (total - total_3_back) / total_3_back * 100
    # Rate of change as defined in the paper (doesn't make sense but kept just in case)
    df["Google_Rocp"] = (total / df["Google_Moment_2"]) * 100

    # Exponential moving average: previous row's moving average moved toward
    # today's total by the smoothing factor 2/(7+1)
    previous_mavg = df["Google_MAvg"].shift(1)
    df["Google_EMA"] = (total - previous_mavg) * (2 / (7 + 1)) + previous_mavg

    # Relative Strength Index, based on a 14-row window
    df["Google_diff"] = total.diff(1)
    df["Google_gain"] = df["Google_diff"].clip(lower=0).round(2)  # rises only, falls become 0
    df["Google_loss"] = df["Google_diff"].clip(upper=0).round(2)  # falls only (<= 0), rises become 0
    df["Google_avg_gain"] = df["Google_gain"].rolling(14).mean()
    df["Google_avg_loss"] = df["Google_loss"].rolling(14).mean()
    # Google_avg_loss is <= 0, so the relative strength uses its magnitude;
    # dividing by the signed value pushes the RSI outside the 0-100 range.
    df["Google_rs"] = df["Google_avg_gain"] / df["Google_avg_loss"].abs()
    df["Google_RSI"] = 100 - (100 / (1.0 + df["Google_rs"]))

    # Move flags: 1 when the source column is above its previous row, else 0
    # (the first row and rows with missing values compare as 0).
    move_sources = {
        "Google_Move": "Google_total",
        "Google_MAvg_Move": "Google_MAvg",
        "Google_MAvg_s_Move": "Google_MAvg_s",
        "Google_EMA_Move": "Google_EMA",
        "Google_Disparity_Move": "Google_Disparity",
        "Google_Disparity_s_Move": "Google_Disparity_s",
        "Google_RSI_Move": "Google_RSI",
    }
    for flag, source in move_sources.items():
        df[flag] = (df[source] > df[source].shift(1)).astype("int64")

    return df

In [None]:
def Stock_variables(df):
    # Momentum 1
    df["Stock_Moment_1"] =  (df["Close"] / df["Close"].shift(5)) * 100
    # Momentum_2
    df["Stock_Moment_2"] =  (df["Close"] - df["Close"].shift(5)) * 100
    # Momentum_1_s three day shift (instead of 5)
    df["Stock_Moment_1_s"] =  (df["Close"] / df["Close"].shift(3)) * 100
    # Momentum_2_s
    df["Stock_Moment_2_s"] =  (df["Close"] - df["Close"].shift(3)) * 100
    # Moving average
    df["Stock_MAvg"] = df["Close"].rolling("5d").mean()
    # Disparity
    df["Disparity"] = (df["Close"]/df["Stock_MAvg"]) * 100
    # Rate of Change Normal Way
    df["Stock_ROC"] = (df["Close"]-df["Close"].shift(5))/(df["Close"].shift(5)) *100
    df["Stock_ROC_s"] = (df["Close"]-df["Close"].shift(3))/(df["Close"].shift(3)) *100
    #Rate of Change Paper Way (doesn't make sense but just in case)
    df['Stock_Rocp'] = (df["Close"]/df["Stock_Moment_2"]) *100
    # Exponential Moving Average
    df["Stock_EMA"] = (df["Close"]-df["Stock_MAvg"].shift(1))*(2/(5+1))+df["Stock_MAvg"].shift(1)

    # calculating the Relative Strength Index, based on 14 day window
    df["Stock_diff"] = df["Close"].diff(1)
    df["Stock_gain"] = df["Stock_diff"].clip(lower=0).round(2) #keeps all values above or below a given threshold, lower=lower bound
    df["Stock_loss"] = df["Stock_diff"].clip(upper=0).round(2)
    df['Stock_avg_gain'] = df['Stock_gain'].rolling(14).mean()
    df['Stock_avg_loss'] = df['Stock_loss'].rolling(14).mean()
    df['Stock_rs'] = df['Stock_avg_gain'] / df['Stock_avg_loss']
    df['Stock_RSI'] = 100 - (100 / (1.0 + df['Stock_rs']))

    # Calculatiing the Move Variables 
    df["Stock_Move"] = df["Close"] > df["Close"].shift(1) 
    df["Stock_Move"] = df["Stock_Move"].replace({True:1,False: 0})
    
    df["Stock_MAvg_Move"] = df["Stock_MAvg"] > df["Stock_MAvg"].shift(1) 
    df["Stock_MAvg_Move"] = df["Stock_MAvg_Move"].replace({True:1,False: 0})
    df["Stock_MAvg_s_Move"] = df["Stock_MAvg_s"] > df["Stock_MAvg_s"].shift(1) 
    df["Stock_MAvg_s_Move"] = df["Stock_MAvg_s_Move"].replace({True:1,False: 0})

    df["Stock_EMA_Move"] = df["Stock_EMA"] > df["Stock_EMA"].shift(1) 
    df["Stock_EMA_Move"] = df["Stock_EMA_Move"].replace({True:1,False: 0})

    df["Stock_Disparity_Move"] = df["Stock_Disparity"] > df["Stock_Disparity"].shift(1) 
    df["Stock_Disparity_Move"] = df["Stock_Disparity_Move"].replace({True:1,False: 0})
    df["Stock_Disparity_s_Move"] = df["Stock_Disparity_s"] > df["Stock_Disparity_s"].shift(1) 
    df["Stock_Disparity_s_Move"] = df["Stock_Disparity_s_Move"].replace({True:1,False: 0})

    df["Stock_RSI_Move"] = df["Stock_RSI"] > df["Stock_RSI"].shift(1) 
    df["Stock_RSI_Move"] = df["Stock_RSI_Move"].replace({True:1,False: 0})

    return df

In [None]:
def NASDAQ_variables(df):
    """
    Description:
    ------------

    Adds NASDAQ technical-indicator columns (all prefixed "nas_") computed
    from the "nas_close" column: momentum, moving averages, disparity,
    rate of change, an EMA approximation, a 14-row RSI, and 0/1 "_Move"
    flags marking whether a value rose compared with the previous row.

    input:
    -----
    df: dataframe holding a numeric "nas_close" column. The moving averages
        use time-based windows ("5d" / "3d"), which pandas only accepts on a
        monotonic datetime-like index.

    return:
    -------
    df: the same dataframe object that was passed in (it is modified in
        place), now carrying the extra "nas_" columns.
    """
    close = df["nas_close"]

    # Momentum 1: close as a percentage of the close 5 rows earlier
    df["nas_Moment_1"] = (close / close.shift(5)) * 100
    # Momentum 2: difference to the close 5 rows earlier, scaled by 100
    df["nas_Moment_2"] = (close - close.shift(5)) * 100
    # Short ("_s") variants use a 3 row shift instead of 5
    df["nas_Moment_1_s"] = (close / close.shift(3)) * 100
    df["nas_Moment_2_s"] = (close - close.shift(3)) * 100

    # Moving averages. "5d"/"3d" are calendar-time windows, not row counts.
    df["nas_MAvg"] = close.rolling("5d").mean()
    # The "_s" moving average was referenced further down but never created,
    # which raised a KeyError.
    # NOTE(review): the 3 day window follows the "_s" convention of the
    # momentum / ROC columns above -- confirm this is the intended window.
    df["nas_MAvg_s"] = close.rolling("3d").mean()

    # Disparity: close as a percentage of its moving average. The column used
    # to be stored as "Disparity" while the Move section reads
    # "nas_Disparity"; "nas_Disparity_s" was missing altogether.
    df["nas_Disparity"] = (close / df["nas_MAvg"]) * 100
    df["nas_Disparity_s"] = (close / df["nas_MAvg_s"]) * 100

    # Rate of Change Normal Way (percentage change over 5 and 3 rows)
    df["nas_ROC"] = (close - close.shift(5)) / close.shift(5) * 100
    df["nas_ROC_s"] = (close - close.shift(3)) / close.shift(3) * 100
    # Rate of Change Paper Way (doesn't make sense but just in case)
    df["nas_Rocp"] = (close / df["nas_Moment_2"]) * 100

    # Exponential Moving Average, seeded with the previous row's moving
    # average and a smoothing factor of 2 / (5 + 1)
    previous_mavg = df["nas_MAvg"].shift(1)
    df["nas_EMA"] = (close - previous_mavg) * (2 / (5 + 1)) + previous_mavg

    # Relative Strength Index over a 14 row window
    df["nas_diff"] = close.diff(1)
    # clip() caps values at the given bound: gains keep only the rises,
    # losses keep only the drops (stored as non-positive numbers)
    df["nas_gain"] = df["nas_diff"].clip(lower=0).round(2)
    df["nas_loss"] = df["nas_diff"].clip(upper=0).round(2)
    df["nas_avg_gain"] = df["nas_gain"].rolling(14).mean()
    df["nas_avg_loss"] = df["nas_loss"].rolling(14).mean()
    # The average loss is non-positive, so divide by its magnitude; dividing
    # by the signed value made rs negative and pushed RSI outside 0-100.
    df["nas_rs"] = df["nas_avg_gain"] / df["nas_avg_loss"].abs()
    df["nas_RSI"] = 100 - (100 / (1.0 + df["nas_rs"]))

    # Move variables: 1 when the value is higher than on the previous row,
    # otherwise 0 (comparisons against NaN count as 0)
    df["nas_Move"] = (close > close.shift(1)).astype(int)
    for column in (
        "nas_MAvg",
        "nas_MAvg_s",
        "nas_EMA",
        "nas_Disparity",
        "nas_Disparity_s",
        "nas_RSI",
    ):
        df[column + "_Move"] = (df[column] > df[column].shift(1)).astype(int)

    return df

In [None]:
def dowDAQ_variables(df):
    """
    Description:
    ------------

    Adds Dow technical-indicator columns (all prefixed "dow_") computed
    from the "dow_close" column: momentum, moving averages, disparity,
    rate of change, an EMA approximation, a 14-row RSI, and 0/1 "_Move"
    flags marking whether a value rose compared with the previous row.

    input:
    -----
    df: dataframe holding a numeric "dow_close" column. The moving averages
        use time-based windows ("5d" / "3d"), which pandas only accepts on a
        monotonic datetime-like index.

    return:
    -------
    df: the same dataframe object that was passed in (it is modified in
        place), now carrying the extra "dow_" columns.
    """
    close = df["dow_close"]

    # Momentum 1: close as a percentage of the close 5 rows earlier
    df["dow_Moment_1"] = (close / close.shift(5)) * 100
    # Momentum 2: difference to the close 5 rows earlier, scaled by 100
    df["dow_Moment_2"] = (close - close.shift(5)) * 100
    # Short ("_s") variants use a 3 row shift instead of 5
    df["dow_Moment_1_s"] = (close / close.shift(3)) * 100
    df["dow_Moment_2_s"] = (close - close.shift(3)) * 100

    # Moving averages. "5d"/"3d" are calendar-time windows, not row counts.
    df["dow_MAvg"] = close.rolling("5d").mean()
    # The "_s" moving average was referenced further down but never created,
    # which raised a KeyError.
    # NOTE(review): the 3 day window follows the "_s" convention of the
    # momentum / ROC columns above -- confirm this is the intended window.
    df["dow_MAvg_s"] = close.rolling("3d").mean()

    # Disparity: close as a percentage of its moving average. The column used
    # to be stored as "Disparity" while the Move section reads
    # "dow_Disparity"; "dow_Disparity_s" was missing altogether.
    df["dow_Disparity"] = (close / df["dow_MAvg"]) * 100
    df["dow_Disparity_s"] = (close / df["dow_MAvg_s"]) * 100

    # Rate of Change Normal Way (percentage change over 5 and 3 rows)
    df["dow_ROC"] = (close - close.shift(5)) / close.shift(5) * 100
    df["dow_ROC_s"] = (close - close.shift(3)) / close.shift(3) * 100
    # Rate of Change Paper Way (doesn't make sense but just in case)
    df["dow_Rocp"] = (close / df["dow_Moment_2"]) * 100

    # Exponential Moving Average, seeded with the previous row's moving
    # average and a smoothing factor of 2 / (5 + 1)
    previous_mavg = df["dow_MAvg"].shift(1)
    df["dow_EMA"] = (close - previous_mavg) * (2 / (5 + 1)) + previous_mavg

    # Relative Strength Index over a 14 row window
    df["dow_diff"] = close.diff(1)
    # clip() caps values at the given bound: gains keep only the rises,
    # losses keep only the drops (stored as non-positive numbers)
    df["dow_gain"] = df["dow_diff"].clip(lower=0).round(2)
    df["dow_loss"] = df["dow_diff"].clip(upper=0).round(2)
    df["dow_avg_gain"] = df["dow_gain"].rolling(14).mean()
    df["dow_avg_loss"] = df["dow_loss"].rolling(14).mean()
    # The average loss is non-positive, so divide by its magnitude; dividing
    # by the signed value made rs negative and pushed RSI outside 0-100.
    df["dow_rs"] = df["dow_avg_gain"] / df["dow_avg_loss"].abs()
    df["dow_RSI"] = 100 - (100 / (1.0 + df["dow_rs"]))

    # Move variables: 1 when the value is higher than on the previous row,
    # otherwise 0 (comparisons against NaN count as 0)
    df["dow_Move"] = (close > close.shift(1)).astype(int)
    for column in (
        "dow_MAvg",
        "dow_MAvg_s",
        "dow_EMA",
        "dow_Disparity",
        "dow_Disparity_s",
        "dow_RSI",
    ):
        df[column + "_Move"] = (df[column] > df[column].shift(1)).astype(int)

    return df