# UBS Stock Data

In [1]:
import pandas as pd
import os
#%pip install pytrends
import pytrends
from pytrends.request import TrendReq
#pip install pageviewapi
import pageviewapi
#%pip install yfinance
import yfinance as yf

In [5]:
def Big_scraper(kw_list_1, kw_list_2, ticker, start, end):

    """
    Description:
    ------------

    Builds one daily DataFrame for the requested date range from three sources:

    1. Google Trends: hourly "interest over time" scores restricted to Google
       News (gprop='news') are fetched for kw_list_1 and averaged per calendar day.
    2. yfinance: the price history of `ticker` is left-joined onto the daily
       trends index, so days without trading keep a row with NaN price columns.
    3. Wikipedia: daily page views of every article in kw_list_2 are left-joined
       onto the same index.


    input:
    -----
    kw_list_1: List of up to 5 key words that will be scraped from Google Trends
             (Google News interest) for the dates given.

    kw_list_2: List of Wikipedia article titles (unlimited length) whose daily
            page-view counts will be pulled.

    ticker: the ticker abbreviation of the desired stock. Must be entered as an
            all-capitalized string, e.g. Apple Inc. would be "AAPL".

    start: the first day of the timeline to scrape, as a "YYYYMMDD" string.

    end: the last day of the timeline to scrape (inclusive), as a "YYYYMMDD" string.

    return:
    -------

    combined: a DataFrame indexed by day containing

    - the mean daily Google News interest score of each key word in kw_list_1,
    - the columns returned by yfinance's history() for the ticker (for example
      Open, High, Low, Close, Volume, Dividends, Stock Splits),
    - the daily Wikipedia page views of each article in kw_list_2.

    When a name appears in both kw_list_1 and kw_list_2 the merge suffixes apply:
    the Google Trends column ends in _x and the Wikipedia column ends in _y.

    raises:
    -------
    ValueError: if start or end cannot be parsed as YYYYMMDD.
    """

    # Parse the dates directly; only the first 8 characters (YYYYMMDD) are used.
    start_date = pd.to_datetime(start[:8], format="%Y%m%d")
    end_date = pd.to_datetime(end[:8], format="%Y%m%d")

    # --- Google Trends (news) -------------------------------------------------
    # Named trends_client so the imported `pytrends` module is not shadowed.
    trends_client = TrendReq(hl='en-US', tz=360, retries=10)
    hourly_interest = trends_client.get_historical_interest(
        kw_list_1,
        year_start=start_date.year, month_start=start_date.month,
        day_start=start_date.day, hour_start=1,
        year_end=end_date.year, month_end=end_date.month,
        day_end=end_date.day, hour_end=23,
        cat=0, geo='', gprop='news', sleep=60)

    # Drop the bookkeeping column by name instead of by position.
    hourly_interest = hourly_interest.drop(columns="isPartial", errors="ignore")
    # Repeated timestamps in the index are collapsed to their first occurrence.
    hourly_interest = hourly_interest.reset_index().drop_duplicates(subset="date")
    # Collapse the hourly scores to the mean score of each day.
    daily_interest = hourly_interest.groupby(pd.Grouper(key="date", freq="D")).mean()

    # --- Stock prices ---------------------------------------------------------
    # yfinance treats `end` as exclusive; ask for one extra day so the requested
    # end date is covered like it is for the trends and Wikipedia data.
    prices = yf.Ticker(ticker).history(
        start=start_date, end=end_date + pd.Timedelta(days=1))
    # A timezone-aware price index cannot be joined to the tz-naive trends
    # index, so strip the timezone while keeping the local calendar dates.
    if isinstance(prices.index, pd.DatetimeIndex) and prices.index.tz is not None:
        prices = prices.tz_localize(None)

    combined = daily_interest.merge(prices, left_index=True, right_index=True, how="left")

    # --- Wikipedia page views -------------------------------------------------
    page_views = {}
    for key_word in kw_list_2:
        response = pageviewapi.per_article('en.wikipedia', key_word, start, end,
                                           access='all-access', agent='all-agents',
                                           granularity='daily')
        views = pd.DataFrame(dict(response)["items"])
        views["timestamp"] = pd.to_datetime(views["timestamp"], format="%Y%m%d%H")
        page_views[key_word] = views.set_index("timestamp")["views"]

    if page_views:
        # Concatenate once so every article keeps all of its own dates, rather
        # than being aligned to whatever dates the first article happened to have.
        wiki_views = pd.concat(page_views, axis=1)
        combined = combined.merge(wiki_views, left_index=True, right_index=True, how="left")

    return combined

In [17]:
# Search terms passed to Big_scraper as kw_list_1 (Google Trends key words).
kw_list_1 = [
    "UBS",
    "UBS Financial Services Inc.",
    "UBS Investment Bank",
    "UBS Global Wealth Management",
    "UBS Asset Management",
]

# Article titles passed to Big_scraper as kw_list_2 (Wikipedia page views).
kw_list_2 = [
    "UBS",
    "Union Bank of Switzerland",
    "UBS tax evasion controversies",
    "Banking in Switzerland",
]

In [18]:
# Scrape 2021-01-01 through 2021-12-31 for the UBS ticker.
UBS = Big_scraper(
    kw_list_1=kw_list_1,
    kw_list_2=kw_list_2,
    ticker="UBS",
    start="20210101",
    end="20211231",
)

In [None]:
# Runs the scrape again with the same arguments as the `UBS` cell and keeps the
# result under a separate name.
UBS_2 = Big_scraper(
    kw_list_1=kw_list_1,
    kw_list_2=kw_list_2,
    ticker="UBS",
    start="20210101",
    end="20211231",
)

In [None]:
# Preview the first five rows of the second scrape.
UBS_2.head(n=5)

Unnamed: 0_level_0,Ford,F-150,Ford Bronco_x,Ford Mustang_x,Ford Stock,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ford Motor Company,Ford Mustang_y,Ford F Series,Ford Bronco_y,Lincoln Navigator,Lincoln Aviator,Ford GT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-01-01,25.73913,0.0,0.0,0.913043,1.347826,,,,,,,,4819,3186,6.0,1724,653,766,1967
2021-01-02,23.125,0.0,0.0,0.0,0.0,,,,,,,,5138,3312,4.0,1667,779,1209,1915
2021-01-03,16.0,0.0,1.958333,0.0,0.0,,,,,,,,5356,3576,12.0,1774,746,1109,1842
2021-01-04,23.666667,0.0,2.791667,1.916667,0.0,8.721023,8.75072,8.344861,8.433952,85043100.0,0.0,0.0,5467,3276,7.0,1574,728,734,1640
2021-01-05,18.416667,0.0,1.666667,0.625,0.0,8.384458,8.631933,8.374559,8.562639,70127800.0,0.0,0.0,5402,3294,12.0,1613,676,704,1510


In [None]:
# Persist the scraped frame. The file is named after the ticker it holds (UBS);
# the previous "2021_Ford_Data.csv" name mislabelled the data and could
# overwrite an export that really contains Ford data.
UBS_2.to_csv("2021_UBS_Data.csv")

In [None]:
# Preview the first five rows of the first scrape.
UBS.head(n=5)

Unnamed: 0_level_0,Ford,F-150,Ford Bronco_x,Ford Mustang_x,Ford Stock,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ford Motor Company,Ford Mustang_y,Ford F Series,Ford Bronco_y,Lincoln Navigator,Lincoln Aviator,Ford GT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-01-01,592,0,0,21,31,,,,,,,,4819,3186,6.0,1724,653,766,1967
2021-01-02,555,0,0,0,0,,,,,,,,5138,3312,4.0,1667,779,1209,1915
2021-01-03,384,0,47,0,0,,,,,,,,5356,3576,12.0,1774,746,1109,1842
2021-01-04,568,0,67,46,0,8.721023,8.75072,8.344861,8.433952,85043100.0,0.0,0.0,5467,3276,7.0,1574,728,734,1640
2021-01-05,442,0,40,15,0,8.384458,8.631933,8.374559,8.562639,70127800.0,0.0,0.0,5402,3294,12.0,1613,676,704,1510
