# Project 9: Web Scraping, APIs & Wrappers (US Stocks)

## Web Scraping - the Dow Jones Constituents

In [1]:
import pandas as pd

In [3]:
# Wikipedia article that carries the table of current Dow Jones constituents.
djia_url = "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
# read_html returns one DataFrame per HTML table found on the page.
page_tables = pd.read_html(djia_url)
# NOTE(review): the constituents table is picked by position (index 1);
# this silently selects a different table if the page layout changes.
const = page_tables[1]

In [4]:
# Display the scraped table to confirm that index 1 is the constituents list.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date Added,Notes,"Index Weighting (Apr 30, 2020)"
0,3M,NYSE,NYSE: MMM,Conglomerate,1976-08-09,as Minnesota Mining and Manufacturing,4.35%
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30,,2.68%
2,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,,8.01%
3,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12,,3.87%
4,Caterpillar Inc.,NYSE,NYSE: CAT,Construction and Mining,1991-05-06,,3.34%
5,Chevron Corporation,NYSE,NYSE: CVX,Petroleum industry,2008-02-19,also 1930-07-18 to 1999-11-01,2.63%
6,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,,1.21%
7,The Coca-Cola Company,NYSE,NYSE: KO,Food industry,1987-03-12,also 1932-05-26 to 1935-11-20,1.31%
8,Dow Inc.,NYSE,NYSE: DOW,Chemical industry,2019-04-02,,1.04%
9,ExxonMobil,NYSE,NYSE: XOM,Petroleum industry,1928-10-01,as Standard Oil of New Jersey,1.32%


In [5]:
# Keep only the first five columns (Company through Date Added); the notes and
# index-weighting columns are dropped. .copy() detaches the result from the
# originally scraped frame.
first_five = slice(None, 5)
const = const.iloc[:, first_five].copy()
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date Added
0,3M,NYSE,NYSE: MMM,Conglomerate,1976-08-09
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30
2,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19
3,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12
4,Caterpillar Inc.,NYSE,NYSE: CAT,Construction and Mining,1991-05-06
5,Chevron Corporation,NYSE,NYSE: CVX,Petroleum industry,2008-02-19
6,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08
7,The Coca-Cola Company,NYSE,NYSE: KO,Food industry,1987-03-12
8,Dow Inc.,NYSE,NYSE: DOW,Chemical industry,2019-04-02
9,ExxonMobil,NYSE,NYSE: XOM,Petroleum industry,1928-10-01


In [6]:
# Give the date column an identifier-friendly name so it can be reached with
# attribute access (const.Date_Added) in the cells below.
# Reassign the result instead of using inplace=True: the cell then states
# explicitly which object it produces and the call stays chainable.
const = const.rename(columns = {"Date Added": "Date_Added"})

In [7]:
# Parse the date strings in Date_Added into pandas datetimes and store them
# back in the same column.
const["Date_Added"] = pd.to_datetime(const["Date_Added"])

In [8]:
# Check the column dtypes after the conversion (Date_Added should now be datetime64[ns]).
const.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Company     30 non-null     object        
 1   Exchange    30 non-null     object        
 2   Symbol      30 non-null     object        
 3   Industry    30 non-null     object        
 4   Date_Added  30 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(4)
memory usage: 1.3+ KB


## Normalizing Unicode Strings and Getting the Ticker Symbols

In [9]:
import unicodedata

In [11]:
# Inspect one raw value: the scraped symbol carries a non-breaking space (\xa0)
# between the exchange prefix and the ticker.
const.Symbol[0]

'NYSE:\xa0MMM'

In [None]:
# Show every raw Symbol value as a plain Python list.
const.Symbol.to_list()

In [None]:
# Column overview of the frame (non-null counts and dtypes).
const.info()

In [12]:
# Preview the normalized symbols without assigning anything yet.
# NFKD normalization turns the non-breaking space (\xa0) into a regular space.
# The built-in .str.normalize replaces the per-element lambda and passes
# missing values through instead of raising on them.
const.Symbol.str.normalize("NFKD")

0     NYSE: MMM
1     NYSE: AXP
2          AAPL
3      NYSE: BA
4     NYSE: CAT
5     NYSE: CVX
6          CSCO
7      NYSE: KO
8     NYSE: DOW
9     NYSE: XOM
10     NYSE: GS
11     NYSE: HD
12    NYSE: IBM
13         INTC
14    NYSE: JNJ
15    NYSE: JPM
16    NYSE: MCD
17    NYSE: MRK
18         MSFT
19    NYSE: NKE
20    NYSE: PFE
21     NYSE: PG
22    NYSE: RTX
23    NYSE: TRV
24    NYSE: UNH
25     NYSE: VZ
26      NYSE: V
27    NYSE: WMT
28          WBA
29    NYSE: DIS
Name: Symbol, dtype: object

In [13]:
# Store the NFKD-normalized symbols back in the Symbol column so that the
# non-breaking space (\xa0) becomes a regular space.
# The built-in .str.normalize replaces the per-element lambda and passes
# missing values through instead of raising on them.
const["Symbol"] = const["Symbol"].str.normalize("NFKD")

In [14]:
# Re-inspect the first value: the non-breaking space is now a regular space.
const.Symbol[0]

'NYSE: MMM'

In [15]:
# Derive the bare ticker: split on ": " and keep the last piece, so that
# "NYSE: MMM" becomes "MMM" while symbols without a prefix ("AAPL") stay as is.
# .str[-1] picks the last element of each split list in a vectorized way and
# yields NaN for missing symbols instead of raising like the lambda would.
const["Ticker"] = const["Symbol"].str.split(": ").str[-1]

In [16]:
# Display the frame with the newly added Ticker column.
const

Unnamed: 0,Company,Exchange,Symbol,Industry,Date_Added,Ticker
0,3M,NYSE,NYSE: MMM,Conglomerate,1976-08-09,MMM
1,American Express,NYSE,NYSE: AXP,Financial services,1982-08-30,AXP
2,Apple Inc.,NASDAQ,AAPL,Information technology,2015-03-19,AAPL
3,Boeing,NYSE,NYSE: BA,Aerospace and defense,1987-03-12,BA
4,Caterpillar Inc.,NYSE,NYSE: CAT,Construction and Mining,1991-05-06,CAT
5,Chevron Corporation,NYSE,NYSE: CVX,Petroleum industry,2008-02-19,CVX
6,Cisco Systems,NASDAQ,CSCO,Information technology,2009-06-08,CSCO
7,The Coca-Cola Company,NYSE,NYSE: KO,Food industry,1987-03-12,KO
8,Dow Inc.,NYSE,NYSE: DOW,Chemical industry,2019-04-02,DOW
9,ExxonMobil,NYSE,NYSE: XOM,Petroleum industry,1928-10-01,XOM


In [17]:
# Collect the tickers into a plain Python list for the price download further down.
ticker_list = const["Ticker"].tolist()

In [18]:
# Display the extracted tickers.
ticker_list

['MMM',
 'AXP',
 'AAPL',
 'BA',
 'CAT',
 'CVX',
 'CSCO',
 'KO',
 'DOW',
 'XOM',
 'GS',
 'HD',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'MCD',
 'MRK',
 'MSFT',
 'NKE',
 'PFE',
 'PG',
 'RTX',
 'TRV',
 'UNH',
 'VZ',
 'V',
 'WMT',
 'WBA',
 'DIS']

In [21]:
# Save the cleaned constituents table (without the index column) to data/const.csv.
# Create the target folder first: to_csv does not create missing directories,
# so the write would fail on a fresh checkout where "data/" does not exist yet.
from pathlib import Path

Path("data").mkdir(parents = True, exist_ok = True)
const.to_csv("data/const.csv", index = False)

## Loading and Saving Historical Stock Prices

In [None]:
import pandas as pd
import yfinance as yf

In [None]:
# Tickers to download. This relies on ticker_list built in the scraping section
# above, so that section must have been run in the same kernel session.
ticker_list

In [None]:
# Fetch the price history for every ticker in ticker_list in a single request.
# NOTE(review): yfinance documents `end` as exclusive, so 2020-03-31 itself
# would not be part of the result - confirm that this is intended.
prices = yf.download(
    ticker_list,
    start = "2007-01-01",
    end = "2020-03-31",
)

In [None]:
# Inspect the raw download.
prices

In [None]:
# Summarize the downloaded frame (index range, columns, dtypes).
prices.info()

In [None]:
# Reduce the download to its "Close" part and detach it with .copy().
# Presumably the downloaded columns are two-level (field, ticker), which would
# leave one Close column per ticker - verify against the output above.
# NOTE(review): the cell rebinds `prices`, so running it a second time looks up
# "Close" in the already reduced frame.
prices = prices["Close"].copy()

In [None]:
# Summarize the frame again after reducing it to the Close prices.
prices.info()

In [None]:
# Inspect the Close prices.
prices

In [None]:
# Save the Close prices, including the index, to const_prices.csv.
# NOTE(review): this file goes to the working directory, whereas the
# constituents table was written to data/const.csv - confirm the intended location.
prices.to_csv("const_prices.csv")

In [None]:
# Fetch the index itself (symbol "^DJI") over the same date window that is
# used for the constituents.
dji = yf.download(
    "^DJI",
    start = "2007-01-01",
    end = "2020-03-31",
)

In [None]:
# Inspect the downloaded index data.
dji

In [None]:
# Save the index data, including the index column, to dji.csv.
# NOTE(review): this file goes to the working directory, whereas the
# constituents table was written to data/const.csv - confirm the intended location.
dji.to_csv("dji.csv")