# Web Scraping with Pandas

Based on the Data Professor youtube video:
https://youtu.be/JUSFaWkAASI

### Load Dependencies

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import yfinance as yf

In [2]:
# Import DB user and password
from api_keys import mysql_hostname
from api_keys import mysql_port
from api_keys import mysql_username
from api_keys import mysql_pass

In [3]:
# MySQL specific connection string
database_name = 'etlprojectdb'
table_price = 'price'
table_companies = 'companies'
database_url = f"mysql+mysqlconnector://{mysql_username}:{mysql_pass}@{mysql_hostname}:{mysql_port}/{database_name}"

In [4]:
# Create the engine
from sqlalchemy import create_engine
engine = create_engine(database_url)
connection = engine.connect()

### Create Web Scraping

In [5]:
# Wikipedia List of List of S&P 500 companies
url_wikipedia = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Output csv data file name
file_out = os.path.join('List_of_S6P_500_companies.csv')

In [6]:
def load_data(url, table_no):
    html = pd.read_html(url_wikipedia, header=0)
    df = html[table_no]
    return df

In [7]:
companies_list_df = load_data(url_wikipedia, table_no=0)
companies_list_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,ABIOMED Inc,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands Inc,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [8]:
# Save data as csv
companies_list_df.to_csv(file_out, index=False)

In [9]:
#getting the information for the Comp Table for query

comp_tab_df = companies_list_df

comp_tab_df.drop(columns=['SEC filings', 'Headquarters Location','CIK','Founded'],axis=1, inplace=True)

In [10]:
#remaning columns for loading
comp_tab_df.rename(columns={"Symbol": "comp_tick", "Security": "comp_name", "GICS Sector": "sect_name",
                             "GICS Sub-Industry":"sub_sect_name",
                             "Date first added":"first_trade_date"}, inplace=True)

comp_tab_df.set_index('comp_tick', inplace=True)
comp_tab_df.head(10)

Unnamed: 0_level_0,comp_name,sect_name,sub_sect_name,first_trade_date
comp_tick,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,3M Company,Industrials,Industrial Conglomerates,1976-08-09
ABT,Abbott Laboratories,Health Care,Health Care Equipment,1964-03-31
ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,2012-12-31
ABMD,ABIOMED Inc,Health Care,Health Care Equipment,2018-05-31
ACN,Accenture plc,Information Technology,IT Consulting & Other Services,2011-07-06
ATVI,Activision Blizzard,Communication Services,Interactive Home Entertainment,2015-08-31
ADBE,Adobe Inc.,Information Technology,Application Software,1997-05-05
AMD,Advanced Micro Devices Inc,Information Technology,Semiconductors,2017-03-20
AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,2015-07-09
AES,AES Corp,Utilities,Independent Power Producers & Energy Traders,1998-10-02


In [11]:
#spit the row that has two dates
companies_list_df['first_trade_date'].iloc[53].split('(')[0]

'1983-11-30 '

In [12]:
#remove the secon date
companies_list_df['first_trade_date'].iloc[53]=companies_list_df['first_trade_date'].iloc[53].split('(')[0]

In [13]:
#replacing na values with 0
companies_list_df.replace(to_replace="nan", value="0", inplace=False)

companies_list_df.head(505)

Unnamed: 0_level_0,comp_name,sect_name,sub_sect_name,first_trade_date
comp_tick,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,3M Company,Industrials,Industrial Conglomerates,1976-08-09
ABT,Abbott Laboratories,Health Care,Health Care Equipment,1964-03-31
ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,2012-12-31
ABMD,ABIOMED Inc,Health Care,Health Care Equipment,2018-05-31
ACN,Accenture plc,Information Technology,IT Consulting & Other Services,2011-07-06
...,...,...,...,...
YUM,Yum! Brands Inc,Consumer Discretionary,Restaurants,1997-10-06
ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,2019-12-23
ZBH,Zimmer Biomet,Health Care,Health Care Equipment,2001-08-07
ZION,Zions Bancorp,Financials,Regional Banks,2001-06-22


In [None]:
#load to sql
#comp_tab_df.to_sql(table_companies, con = engine, if_exists = 'append', chunksize = 50)


### Using Yfinance for retrieve stock prices

In [14]:
# List of Stock Symbols
list_symbol_companies = list(companies_list_df.index)

In [15]:
# Retrieve the stock prices
stock_price = yf.download(list_symbol_companies)

[*********************100%***********************]  505 of 505 completed

2 Failed downloads:
- BRK.B: No data found, symbol may be delisted
- BF.B: 1d data not available for startTime=-2208988800 and endTime=1603845261. Only 100 years worth of day granularity data are allowed to be fetched per request.


In [16]:
stock_price.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1962-01-02,,,,,,,,,,,...,,902400.0,,,,,,,,
1962-01-03,,,,,,,,,,,...,,1200000.0,,,,,,,,
1962-01-04,,,,,,,,,,,...,,1088000.0,,,,,,,,
1962-01-05,,,,,,,,,,,...,,1222400.0,,,,,,,,
1962-01-08,,,,,,,,,,,...,,1388800.0,,,,,,,,


In [17]:
stock_price['Close']['AAPL']

Date
1962-01-02           NaN
1962-01-03           NaN
1962-01-04           NaN
1962-01-05           NaN
1962-01-08           NaN
                 ...    
2020-10-21    116.870003
2020-10-22    115.750000
2020-10-23    115.040001
2020-10-26    115.050003
2020-10-27    116.599998
Name: AAPL, Length: 14810, dtype: float64

In [18]:
stock_price = stock_price.loc['2010-01-01':]

stock_price.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010-01-04,20.579165,4.496876,39.094307,6.604801,,22.60673,8.74,18.022673,33.464951,37.09,...,2824700.0,27809100.0,1051400.0,5112800.0,,2962200.0,782400.0,168800.0,3974600.0,
2010-01-05,20.355618,5.005957,38.861965,6.616219,,22.44544,8.53,17.877064,33.671783,37.700001,...,3469700.0,30174700.0,763400.0,3255800.0,,3298700.0,1718100.0,168800.0,5605500.0,
2010-01-06,20.283297,4.798555,39.200817,6.51098,,22.23321,8.4,17.976341,34.029732,37.619999,...,6329200.0,35044700.0,1595100.0,2634300.0,,4178900.0,1277300.0,385300.0,12615200.0,
2010-01-07,20.257004,4.939965,39.191139,6.498945,,21.876654,8.4,18.125263,33.99791,36.889999,...,9252700.0,27192100.0,1096100.0,4508500.0,,2452400.0,1683500.0,183600.0,24716800.0,
2010-01-08,20.250422,4.84569,39.346043,6.54215,,22.114357,8.23,18.21792,33.86269,36.689999,...,5005800.0,24891800.0,803600.0,2460100.0,,3772300.0,2149500.0,266500.0,6903000.0,


### Save data as csv

In [19]:
# Save data as csv
stock_price.to_csv('stock_price.csv', index=False)

In [20]:
clean_stock_price = stock_price

#clean_stock_price.drop(columns=['Adj Close', 'High','Low','Open'],axis=1, inplace=True)
clean_stock_price.drop(columns=['Open'],axis=1, inplace=True)

In [21]:
clean_stock_price.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XLNX,XOM,XRAY,XRX,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010-01-04,20.579165,4.496876,39.094307,6.604801,,22.60673,8.74,18.022673,33.464951,37.09,...,2824700.0,27809100.0,1051400.0,5112800.0,,2962200.0,782400.0,168800.0,3974600.0,
2010-01-05,20.355618,5.005957,38.861965,6.616219,,22.44544,8.53,17.877064,33.671783,37.700001,...,3469700.0,30174700.0,763400.0,3255800.0,,3298700.0,1718100.0,168800.0,5605500.0,
2010-01-06,20.283297,4.798555,39.200817,6.51098,,22.23321,8.4,17.976341,34.029732,37.619999,...,6329200.0,35044700.0,1595100.0,2634300.0,,4178900.0,1277300.0,385300.0,12615200.0,
2010-01-07,20.257004,4.939965,39.191139,6.498945,,21.876654,8.4,18.125263,33.99791,36.889999,...,9252700.0,27192100.0,1096100.0,4508500.0,,2452400.0,1683500.0,183600.0,24716800.0,
2010-01-08,20.250422,4.84569,39.346043,6.54215,,22.114357,8.23,18.21792,33.86269,36.689999,...,5005800.0,24891800.0,803600.0,2460100.0,,3772300.0,2149500.0,266500.0,6903000.0,


In [22]:
price_df = pd.DataFrame(columns = ["comp_tick","date","close_price","volume","currency"])

In [23]:
row_df = pd.DataFrame(columns = ["comp_tick","close_price","volume","currency"])
count = 505
tick = 1
comp = 0
for row in clean_stock_price.itertuples():

    data = row
    date = row[0]
    row_dict=[]
    for i in list_symbol_companies:

        row_dict.append({'comp_tick':list_symbol_companies[comp], 'date':date,'close_price':data[tick], 'volume':  data[count+tick],'currency':"USD"})
        #print(row_dict)
        
        tick = 1+tick
        comp = comp+1
    tick = 1
    comp = 0
    price_df = price_df.append(row_dict)

In [24]:
#del price_df
price_df.head(100)

Unnamed: 0,comp_tick,date,close_price,volume,currency
0,MMM,2010-01-04,20.579165,22.389128,USD
1,ABT,2010-01-04,4.496876,4.770000,USD
2,ABBV,2010-01-04,39.094307,40.380001,USD
3,ABMD,2010-01-04,6.604801,7.643214,USD
4,ACN,2010-01-04,,,USD
...,...,...,...,...,...
95,CNC,2010-01-04,17.748413,26.500000,USD
96,CNP,2010-01-04,29.257708,41.424999,USD
97,CERN,2010-01-04,44.974976,61.430000,USD
98,CF,2010-01-04,24.081123,30.500000,USD


In [25]:
price_df_new = price_df
price_df_new.set_index('comp_tick', inplace=True)
price_df_new.head(10)

Unnamed: 0_level_0,date,close_price,volume,currency
comp_tick,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,2010-01-04,20.579165,22.389128,USD
ABT,2010-01-04,4.496876,4.77,USD
ABBV,2010-01-04,39.094307,40.380001,USD
ABMD,2010-01-04,6.604801,7.643214,USD
ACN,2010-01-04,,,USD
ATVI,2010-01-04,22.60673,26.629999,USD
ADBE,2010-01-04,8.74,8.74,USD
AMD,2010-01-04,18.022673,26.129908,USD
AAP,2010-01-04,33.464951,42.07,USD
AES,2010-01-04,37.09,37.09,USD


In [26]:
price_df_new.head(100)

Unnamed: 0_level_0,date,close_price,volume,currency
comp_tick,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MMM,2010-01-04,20.579165,22.389128,USD
ABT,2010-01-04,4.496876,4.770000,USD
ABBV,2010-01-04,39.094307,40.380001,USD
ABMD,2010-01-04,6.604801,7.643214,USD
ACN,2010-01-04,,,USD
...,...,...,...,...
CNC,2010-01-04,17.748413,26.500000,USD
CNP,2010-01-04,29.257708,41.424999,USD
CERN,2010-01-04,44.974976,61.430000,USD
CF,2010-01-04,24.081123,30.500000,USD


In [None]:
#Load to the sql database

#price_df.to_sql(table_price, con = engine, if_exists = 'append', chunksize = 1000)