# Ford Stock Data

In [1]:
import pandas as pd
import os
#%pip install pytrends
import pytrends
from pytrends.request import TrendReq
#pip install pageviewapi
import pageviewapi
#%pip install yfinance
import yfinance as yf

In [10]:
def Big_scraper(kw_list_1, kw_list_2, ticker, start, end):
    """
    Description:
    ------------

    Builds one daily dataframe for a stock from three sources:

    1. Google Trends: hourly, indexed interest for each keyword in Google
       News, summed per calendar day.
    2. Yahoo Finance: the daily price history of the ticker.
    3. Wikipedia: daily page views of each listed article.

    The Google Trends frame is the left side of both merges, so the result
    has one row per day returned by Google Trends; days without stock data
    (for example weekends) hold NaN in the stock columns.

    input:
    -----
    kw_list_1: list of up to 5 keywords looked up on Google Trends
               (Google News results) for the given dates.

    kw_list_2: list of Wikipedia article titles (any length) whose daily
               view counts are pulled. May be empty.

    ticker: ticker abbreviation of the desired stock as an upper-case
            string, e.g. "AAPL" for Apple Inc.

    start: first day of the timeline, entered as "YYYYMMDD".

    end: last day of the timeline (inclusive for all three sources),
         entered as "YYYYMMDD".

    return:
    -------

    combined: dataframe indexed by day containing the daily keyword sums
    from Google Trends, the stock columns returned by Yahoo Finance, and one
    column of daily views per Wikipedia article. When a name appears in both
    keyword lists the Google column gets the suffix _x and the Wikipedia
    column the suffix _y.

    raises:
    -------
    ValueError: if start or end is not a valid "YYYYMMDD" date.
    """

    # Parse strictly so a malformed date fails here rather than deep inside
    # one of the scraping libraries.
    starter = pd.to_datetime(start, format="%Y%m%d")
    ender = pd.to_datetime(end, format="%Y%m%d")

    # --- Google Trends (Google News) --------------------------------------
    trends_client = TrendReq(hl='en-US', tz=360, retries=10)
    hourly_hits = trends_client.get_historical_interest(
        kw_list_1,
        year_start=starter.year, month_start=starter.month,
        day_start=starter.day, hour_start=1,
        year_end=ender.year, month_end=ender.month,
        day_end=ender.day, hour_end=23,
        cat=0, geo='', gprop='news', sleep=60)

    daily_hits = (
        hourly_hits
        # Drop the flag column by name: slicing off "the last column" would
        # remove a real keyword whenever isPartial is not present.
        .drop(columns="isPartial", errors="ignore")
        .reset_index()
        # Hours that appear more than once in the response are kept once.
        .drop_duplicates(subset="date")
        .groupby(pd.Grouper(key="date", freq="D"))
        .sum()
    )

    # --- Yahoo Finance ----------------------------------------------------
    # yfinance treats `end` as exclusive; request one extra day so the last
    # requested day is covered like it is for the other two sources.
    prices = pd.DataFrame(
        yf.Ticker(ticker).history(start=starter, end=ender + pd.Timedelta(days=1)))
    # The Trends index is timezone-naive; strip any exchange timezone so the
    # two indexes can be aligned on the calendar day.
    if isinstance(prices.index, pd.DatetimeIndex) and prices.index.tz is not None:
        prices = prices.tz_localize(None)

    combined = daily_hits.merge(prices, left_index=True, right_index=True, how="left")

    # --- Wikipedia page views ---------------------------------------------
    page_views = {}
    for key_word in kw_list_2:
        response = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        page_views[key_word] = views.set_index("timestamp")["views"]

    if page_views:
        # Concatenating aligns the articles on the union of their dates;
        # assigning column by column would cut every article down to the
        # dates of the first one.
        wiki_views = pd.concat(page_views, axis=1).sort_index()
        combined = combined.merge(wiki_views, left_index=True, right_index=True, how="left")

    return combined

In [11]:
# Keywords passed to Big_scraper as its Google Trends list (at most 5).
kw_list_1 = [
    "Ford",
    "F-150",
    "Ford Bronco",
    "Ford Mustang",
    "Ford Stock",
]

# Wikipedia article titles passed to Big_scraper as its page-view list.
kw_list_2 = [
    "Ford Motor Company",
    "Ford Mustang",
    "Ford F Series",
    "Ford Bronco",
    "Lincoln Navigator",
    "Lincoln Aviator",
    "Ford GT",
]

In [12]:
# Scrape all three sources for Ford (ticker "F") from 1 to 14 January 2021.
Ford = Big_scraper(
    kw_list_1,
    kw_list_2,
    ticker="F",
    start="20210101",
    end="20210114",
)

In [13]:
# Display the combined Google News / stock / Wikipedia frame returned by Big_scraper.
Ford

Unnamed: 0_level_0,Ford,F-150,Ford Bronco_x,Ford Mustang_x,Ford Stock,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ford Motor Company,Ford Mustang_y,Ford F Series,Ford Bronco_y,Lincoln Navigator,Lincoln Aviator,Ford GT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-01-01,592,0,0,21,31,,,,,,,,4819,3186,6,1724,653,766,1967
2021-01-02,555,0,0,0,0,,,,,,,,5138,3312,4,1667,779,1209,1915
2021-01-03,384,0,47,0,0,,,,,,,,5356,3576,12,1774,746,1109,1842
2021-01-04,568,0,67,46,0,8.721023,8.75072,8.344861,8.433952,85043100.0,0.0,0.0,5467,3276,7,1574,728,734,1640
2021-01-05,442,0,40,15,0,8.384458,8.631933,8.374559,8.562639,70127800.0,0.0,0.0,5402,3294,12,1613,676,704,1510
2021-01-06,753,0,0,20,0,8.701226,8.84971,8.592337,8.750721,72590200.0,0.0,0.0,5156,3110,16,1524,609,623,1414
2021-01-07,614,18,0,0,14,8.84971,8.988297,8.790317,8.968499,77117100.0,0.0,0.0,5001,2952,8,1450,603,551,1402
2021-01-08,331,0,0,14,0,9.008095,9.047691,8.800216,8.909104,59162200.0,0.0,0.0,5044,3024,7,1659,580,568,1443
2021-01-09,620,0,12,11,14,,,,,,,,5163,3548,5,2315,1181,737,1633
2021-01-10,477,0,12,0,0,,,,,,,,5364,3519,5,2176,1190,644,1525


## Individual functions

In [350]:
def google_trends(kw_list, year_start, month_start, year_end, month_end, day_end, day_start=1, hour_start=0, hour_end=23):
    """
    Description:
    ------------

    Grabs historical, indexed, hourly Google Trends data for the keywords
    (restricted to Google News) and condenses it to one row per day holding
    the sum of that day's hourly values.

    input:
    -----
    kw_list: list of up to 5 keywords to look up for the given timeline.

    The remaining arguments set the timeline and are plain integers:
    year_start / year_end: YYYY
    month_start / month_end: M
    day_start / day_end: D (day_start defaults to 1)
    hour_start / hour_end: H (default to 0 and 23)

    return:
    -------

    dataframe indexed by day with one column per keyword containing the
    daily sum of the hourly values.
    """

    trends_client = TrendReq(hl='en-US', tz=360, retries=10)
    hourly_hits = trends_client.get_historical_interest(
        kw_list,
        year_start=year_start, month_start=month_start,
        day_start=day_start, hour_start=hour_start,
        year_end=year_end, month_end=month_end,
        day_end=day_end, hour_end=hour_end,
        cat=0, geo='', gprop='news', sleep=60)

    return (
        hourly_hits
        # Drop the flag column by name: slicing off "the last column" would
        # remove a real keyword whenever isPartial is not present.
        .drop(columns="isPartial", errors="ignore")
        .reset_index()
        # Hours that appear more than once in the response are kept once.
        .drop_duplicates(subset="date")
        .groupby(pd.Grouper(key="date", freq="D"))
        .sum()
    )

In [335]:
def stock_stats(ticker: str, start_date: str, end_date: str):
    """
    Description: Fetches the historical daily stock data for one ticker
    through yfinance and returns it as a dataframe.

    inputs:
    ------
    ticker: ticker abbreviation of the desired stock as an upper-case
    string, e.g. "AAPL" for Apple Inc.

    start_date: start of the desired timeline, entered as "YYYY-MM-DD".

    end_date: end of the desired timeline, entered as "YYYY-MM-DD".
    NOTE(review): yfinance documents `end` as exclusive - confirm whether
    callers expect the end day itself to be included.

    return:
    ------
    dataframe holding whatever columns yfinance's `history` returns for the
    requested range (the sample output in this notebook shows Open, High,
    Low, Close, Volume, Dividends and Stock Splits).
    """
    price_history = yf.Ticker(ticker).history(start=start_date, end=end_date)
    return pd.DataFrame(price_history)

In [320]:
def wiki_scraper(kw_list: list, start_date: str, end_date: str):
    '''
    Description: Pulls how many times each Wikipedia page was viewed per day.

    inputs:
    ------

    kw_list: list of English Wikipedia page names to scrape, of any length

    start_date: start of the desired timeline, entered as "YYYYMMDD"

    end_date: end of the desired timeline, entered as "YYYYMMDD"

    return:
    ------
    dataframe indexed by day ("timestamp") with one column of daily views
    per page name. Days on which a page has no entry hold NaN. An empty
    kw_list gives an empty dataframe.
    '''
    page_views = {}
    for key_word in kw_list:
        response = pageviewapi.per_article('en.wikipedia', key_word, start_date, end_date,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        page_views[key_word] = views.set_index("timestamp")["views"]

    if not page_views:
        return pd.DataFrame()

    # Concatenating aligns the pages on the union of their dates; assigning
    # column by column would cut every page down to the dates of the first.
    return pd.concat(page_views, axis=1).sort_index()

In [6]:
# Wikipedia article titles whose daily page views are scraped.
kw_list_2 = [
    "Ford Motor Company",
    "Ford Mustang",
    "Ford F Series",
    "Ford Bronco",
    "Lincoln Navigator",
    "Lincoln Aviator",
    "Ford GT",
]

In [None]:
# Daily Google News hits for the Google Trends keyword list across December
# 2019. day_start defaults to 1; `day_end` is a required argument.
# NOTE(review): day_end=31 and the use of `kw_list_1` assume the intent was
# the whole month for the Ford keywords - confirm.
google_trends(kw_list_1, 2019, 12, 2019, 12, day_end=31)

In [341]:
def joiner(google_trends, yahoo_finace, wiki_pagecount):
    """
    Description: joins the three stock data sets into one dataframe,
    aligned on their indexes.

    input:
    ------
    google_trends: dataframe of daily Google News hit counts per keyword.
    It is the left side of both joins, so its index decides which rows the
    result has.

    yahoo_finace: dataframe of stock information (prices, volume, dividends
    and splits per day).

    wiki_pagecount: dataframe of daily Wikipedia page views per article.

    return:
    -------
    combined: the google_trends rows with the matching stock and Wikipedia
    columns attached; rows without a match hold NaN in those columns.
    Column names that clash between the two sides of a join receive the
    suffixes _x (left) and _y (right).
    """
    index_join = dict(left_index=True, right_index=True, how="left")
    return (
        google_trends
        .merge(yahoo_finace, **index_join)
        .merge(wiki_pagecount, **index_join)
    )