# S&P 500 ETL Project

### This notebook performs the following tasks:

1. Create a connection to the MySQL database hosted on the Linode server
2. Query the database
3. Serve as a baseline for the script that will be run on the Flask server

### Load Libraries

In [73]:
import datetime as dt
import json
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import StrMethodFormatter
from sqlalchemy import create_engine

# Import DB user and password
from api_keys_kf import mysql_hostname
from api_keys_kf import mysql_port
from api_keys_kf import mysql_username
from api_keys_kf import mysql_pass

### Connect to the Database

In [3]:
# MySQL specific connection string
database_name = 'etlprojectdb'
table_price = 'price'
table_companies = 'companies'

# Percent-encode the credentials so that characters such as "@", ":", "/" or "#"
# inside the username or password cannot be mistaken for URL delimiters.
# NOTE(review): assumes api_keys_kf stores the raw (not already URL-encoded)
# values — confirm, otherwise they would be encoded twice.
database_url = (
    "mysql+mysqlconnector://"
    f"{quote_plus(str(mysql_username))}:{quote_plus(str(mysql_pass))}"
    f"@{mysql_hostname}:{mysql_port}/{database_name}"
)

In [4]:
# Create the engine from the connection string
# (create_engine is imported in the imports cell at the top of the notebook).
engine = create_engine(database_url)

# Open one connection up front. The query cells shown in this notebook pass
# `engine` to pandas directly rather than this handle; call connection.close()
# once it is no longer needed so the connection is returned to the pool.
connection = engine.connect()

### Time Query

In [5]:
# Current date and time as reported by the database server
pd.read_sql(sql="SELECT now();", con=engine)

Unnamed: 0,now()
0,2020-10-24 16:01:12


### Query Database

In [23]:
# Load every row of the companies table into a DataFrame
companies_df = pd.read_sql(
    sql=f"SELECT * FROM {table_companies}",
    con=engine,
)

In [24]:
# Preview the first five company rows
companies_df.head(n=5)

Unnamed: 0,id,comp_tick,comp_name,sect_name,sub_sect_name,first_trade_date
0,12,A,Agilent Technologies Inc,Health Care,Health Care Equipment,2000-06-05
1,29,AAL,American Airlines Group,Industrials,Airlines,2015-03-23
2,9,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,2015-07-09
3,47,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals",1982-11-30
4,3,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,2012-12-31


In [68]:
# Load every row of the price table into a DataFrame
price_df = pd.read_sql(
    sql=f"SELECT * FROM {table_price}",
    con=engine,
)

In [77]:
# Preview the price rows (at most the first 5000 of them)
price_df.head(n=5000)

Unnamed: 0,id,comp_tick,date,close_price,volume,currency
0,1,ZTS,2010-01-04,,,USD
1,2,ZTS,2010-01-05,,,USD
2,3,ZTS,2010-01-06,,,USD
3,4,ZTS,2010-01-07,,,USD
4,5,ZTS,2010-01-08,,,USD
...,...,...,...,...,...,...
2723,2724,ZTS,2020-10-19,159.99,1271100.0,USD
2724,2725,ZTS,2020-10-20,159.69,2145700.0,USD
2725,2726,ZTS,2020-10-21,161.18,1456700.0,USD
2726,2727,ZTS,2020-10-22,162.38,878800.0,USD


In [78]:
# Price rows for the AAPL ticker only, followed by a preview of the first five
app_pri = pd.read_sql(
    sql=f"SELECT * FROM {table_price} where comp_tick = 'AAPL'",
    con=engine,
)
app_pri.head(n=5)

Unnamed: 0,id,comp_tick,date,close_price,volume,currency
0,4,AAPL,2010-01-04,7.64321,493730000.0,USD
1,509,AAPL,2010-01-05,7.65643,601905000.0,USD
2,1014,AAPL,2010-01-06,7.53464,552160000.0,USD
3,1519,AAPL,2010-01-07,7.52071,477131000.0,USD
4,2024,AAPL,2010-01-08,7.57071,447611000.0,USD


### Data Wrangling

In [85]:
# Company details for AAPL joined with its price rows; ordering by p.date
# descending and keeping one row yields the most recent price on record.
Comp_info = pd.read_sql(
    "Select c.comp_name, c.comp_tick, c.sect_name, "
    "p.close_price AS LATEST_CLOSE_PRICE, p.date AS LATEST_PRICE_DATE "
    f"from {table_companies} AS c "
    f"join {table_price} as p on p.comp_tick = c.comp_tick "
    "WHERE c.comp_tick = 'AAPL' "
    "order by p.date desc limit 1",
    con=engine,
)

In [86]:
# Display the Comp_info DataFrame as the cell output
Comp_info

Unnamed: 0,comp_name,comp_tick,sect_name,LATEST_CLOSE_PRICE,LATEST_PRICE_DATE
0,Apple Inc.,AAPL,Information Technology,115.04,2020-10-23


In [109]:
# Closing prices of every company in the 'Information Technology' sector
# for price dates between 2020-10-01 and 2020-10-23.
sect_info = pd.read_sql(
    "Select c.comp_tick, c.comp_name, c.sect_name, c.sub_sect_name, "
    "p.close_price AS CLOSE_PRICE, p.date "
    f"from {table_companies} AS c "
    f"join {table_price} as p on p.comp_tick = c.comp_tick "
    "where c.sect_name = 'Information Technology' "
    "and p.date between '2020-10-01'and '2020-10-23'",
    con=engine,
)

In [110]:
# Preview the sector rows (at most the first 100 of them)
sect_info.head(n=100)

Unnamed: 0,comp_tick,comp_name,sect_name,sub_sect_name,CLOSE_PRICE,date
0,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals",116.79,2020-10-01
1,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,225.18,2020-10-01
2,ADBE,Adobe Inc.,Information Technology,Application Software,499.51,2020-10-01
3,ADI,"Analog Devices, Inc.",Information Technology,Semiconductors,118.99,2020-10-01
4,ADP,Automatic Data Processing,Information Technology,Internet Services & Infrastructure,137.70,2020-10-01
...,...,...,...,...,...,...
95,FIS,Fidelity National Information Services,Information Technology,Data Processing & Outsourced Services,145.15,2020-10-02
96,FISV,Fiserv Inc,Information Technology,Data Processing & Outsourced Services,101.65,2020-10-02
97,FLIR,FLIR Systems,Information Technology,Electronic Equipment & Instruments,35.49,2020-10-02
98,FLT,FleetCor Technologies Inc,Information Technology,Data Processing & Outsourced Services,234.46,2020-10-02


In [97]:
# Closing prices of all companies for price dates between
# 2020-10-15 and 2020-10-23.
date_range_info = pd.read_sql(
    "Select c.comp_tick, c.comp_name, c.sect_name, c.sub_sect_name, "
    "p.close_price AS CLOSE_PRICE, p.date "
    f"from {table_companies} AS c "
    f"join {table_price} as p on p.comp_tick = c.comp_tick "
    "where p.date between '2020-10-15'and '2020-10-23'",
    con=engine,
)



In [98]:
# Display the date_range_info DataFrame as the cell output
date_range_info

Unnamed: 0,comp_tick,sect_name,sub_sect_name,CLOSE_PRICE,date
0,A,Health Care,Health Care Equipment,105.32,2020-10-15
1,AAL,Industrials,Airlines,12.23,2020-10-15
2,AAP,Consumer Discretionary,Automotive Retail,157.52,2020-10-15
3,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals",120.71,2020-10-15
4,ABBV,Health Care,Pharmaceuticals,85.23,2020-10-15
...,...,...,...,...,...
3530,YUM,Consumer Discretionary,Restaurants,101.28,2020-10-23
3531,ZBH,Health Care,Health Care Equipment,142.71,2020-10-23
3532,ZBRA,Information Technology,Electronic Equipment & Instruments,303.72,2020-10-23
3533,ZION,Financials,Regional Banks,33.20,2020-10-23


In [107]:
# Closing price and volume of every company on the single date 2020-10-23
date_info = pd.read_sql(
    "Select c.comp_tick, c.sect_name, c.sub_sect_name, "
    "p.close_price AS CLOSING_PRICE, p.volume, p.date "
    f"from {table_companies} AS c "
    f"join {table_price} as p on p.comp_tick = c.comp_tick "
    "where p.date ='2020-10-23'",
    con=engine,
)



In [108]:
# Display the date_info DataFrame as the cell output
date_info

Unnamed: 0,comp_tick,sect_name,sub_sect_name,CLOSING_PRICE,volume,date
0,A,Health Care,Health Care Equipment,106.17,833900.0,2020-10-23
1,AAL,Industrials,Airlines,12.60,90598100.0,2020-10-23
2,AAP,Consumer Discretionary,Automotive Retail,154.72,543600.0,2020-10-23
3,AAPL,Information Technology,"Technology Hardware, Storage & Peripherals",115.04,82396600.0,2020-10-23
4,ABBV,Health Care,Pharmaceuticals,84.34,4237600.0,2020-10-23
...,...,...,...,...,...,...
500,YUM,Consumer Discretionary,Restaurants,101.28,1039400.0,2020-10-23
501,ZBH,Health Care,Health Care Equipment,142.71,723300.0,2020-10-23
502,ZBRA,Information Technology,Electronic Equipment & Instruments,303.72,293500.0,2020-10-23
503,ZION,Financials,Regional Banks,33.20,3093000.0,2020-10-23


In [105]:
# Closing price and volume of the AAL ticker for price dates between
# 2020-10-15 and 2020-10-23.
comp_perf = pd.read_sql(
    "Select c.comp_tick, c.sect_name, c.sub_sect_name, "
    "p.close_price AS CLOSING_PRICE, p.volume, p.date "
    f"from {table_companies} AS c "
    f"join {table_price} as p on p.comp_tick = c.comp_tick "
    "where c.comp_tick = 'AAL' "
    "and p.date between '2020-10-15'and '2020-10-23'",
    con=engine,
)


In [106]:
# Display the comp_perf DataFrame as the cell output
comp_perf

Unnamed: 0,comp_tick,sect_name,sub_sect_name,CLOSING_PRICE,volume,date
0,AAL,Industrials,Airlines,12.23,33776100.0,2020-10-15
1,AAL,Industrials,Airlines,12.46,32717300.0,2020-10-16
2,AAL,Industrials,Airlines,12.56,50560700.0,2020-10-19
3,AAL,Industrials,Airlines,12.8,46748800.0,2020-10-20
4,AAL,Industrials,Airlines,12.75,37503600.0,2020-10-21
5,AAL,Industrials,Airlines,13.15,90788900.0,2020-10-22
6,AAL,Industrials,Airlines,12.6,90598100.0,2020-10-23


In [None]:
# Convert the table to JSON

In [41]:
# Serialize Comp_info to a JSON string with one object per row, then parse
# that string back into Python objects (a list of dicts).
# `json` is imported in the imports cell at the top of the notebook.
# NOTE(review): for orient="records" pandas documents the default date_format
# as 'epoch'; pass date_format="iso" if date columns should come out as ISO
# strings — confirm what the consumer of this JSON expects.
result = Comp_info.to_json(orient="records")
parsed = json.loads(result)

In [43]:
# Print the Python objects obtained by parsing the JSON string
print(parsed)

[{'comp_name': 'Apple Inc.', 'sect_name': 'Information Technology'}]


### Data Analysis