# ETL Project

In [2]:
#Dependencies
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

## Extract CSVs into DataFrame

In [3]:
# Load the Top 250 CSV into a DataFrame and preview its first rows
top250_file = "data/Top250.csv"
top250 = pd.read_csv(filepath_or_buffer=top250_file)
top250.head(n=5)

Unnamed: 0,Rank,Restaurant,Content,Sales,YOY_Sales,Units,YOY_Units,Headquarters,Segment_Category
0,1,McDonald's,,40412,4.9%,13846,-0.5%,,Quick Service & Burger
1,2,Starbucks,,21380,8.6%,15049,3.0%,,Quick Service & Coffee Cafe
2,3,Chick-fil-A,While Popeyes got a lot of the chicken buzz in...,11320,13.0%,2470,5.0%,,Quick Service & Chicken
3,4,Taco Bell,,11293,9.0%,6766,2.7%,,Quick Service & Mexican
4,5,Burger King,,10204,2.7%,7346,0.2%,,Quick Service & Burger


In [4]:
# Load the Future 50 CSV into a DataFrame and preview its first rows
future50_file = "data/Future50.csv"
future50 = pd.read_csv(filepath_or_buffer=future50_file)
future50.head(n=5)

Unnamed: 0,Rank,Restaurant,Location,Sales,YOY_Sales,Units,YOY_Units,Unit_Volume,Franchising
0,1,Evergreens,"Seattle, Wash.",24,130.5%,26,116.7%,1150,No
1,2,Clean Juice,"Charlotte, N.C.",44,121.9%,105,94.4%,560,Yes
2,3,Slapfish,"Huntington Beach, Calif.",21,81.0%,21,90.9%,1370,Yes
3,4,Clean Eatz,"Wilmington, N.C.",25,79.7%,46,58.6%,685,Yes
4,5,Pokeworks,"Irvine, Calif.",49,77.1%,50,56.3%,1210,Yes


In [5]:
# Load the Independence 100 CSV into a DataFrame and preview its first rows
ind100_file = "data/Independence100.csv"
ind100 = pd.read_csv(filepath_or_buffer=ind100_file)
ind100.head(n=5)

Unnamed: 0,Rank,Restaurant,Sales,Average Check,City,State,Meals Served
0,1,Carmine's (Times Square),39080335.0,40,New York,N.Y.,469803.0
1,2,The Boathouse Orlando,35218364.0,43,Orlando,Fla.,820819.0
2,3,Old Ebbitt Grill,29104017.0,33,Washington,D.C.,892830.0
3,4,LAVO Italian Restaurant & Nightclub,26916180.0,90,New York,N.Y.,198500.0
4,5,Bryant Park Grill & Cafe,26900000.0,62,New York,N.Y.,403000.0


## Transform DataFrames

In [6]:
#Drop any null values

#Rename any columns

#Make the Sales units match across tables



### Join Future50 and Ind100 on location

In [16]:
# Step 1: Work on an independent (deep) copy so the raw future50 frame stays untouched
future50_df = future50.copy(deep=True)
future50_df.head(n=5)


<bound method NDFrame.head of     Rank                        Restaurant                  Location  Sales  \
0      1                        Evergreens            Seattle, Wash.     24   
1      2                       Clean Juice           Charlotte, N.C.     44   
2      3                          Slapfish  Huntington Beach, Calif.     21   
3      4                        Clean Eatz          Wilmington, N.C.     25   
4      5                         Pokeworks            Irvine, Calif.     49   
5      6                       Playa Bowls             Belmar,  N.J.     39   
6      7                  The Simple Greek            Blue Bell, Pa.     24   
7      8                         Melt Shop            New York, N.Y.     20   
8      9                        Creamistry      Yorba Linda,  Calif.     24   
9     10              Joella's Hot Chicken           Louisville, Ky.     29   
10    11                     Eggs Up Grill         Spartanburg, S.C.     30   
11    12              

In [8]:
# Step 2: Split the Future50 "Location" strings ("City, State") into a new
# DataFrame with the city in column 0 and the state in column 1.
# Splitting on "," alone keeps the space(s) that follow the comma on the state
# value (e.g. " Wash.", or "  N.J." for "Belmar,  N.J."), so both parts are
# stripped; otherwise the states would not compare equal to cleanly formatted
# state values when joining on location.
future50_split = (
    future50_df["Location"]
    .str.split(",", n=1, expand=True)
    .apply(lambda part: part.str.strip())
)
future50_split.head()

Unnamed: 0,0,1
0,Seattle,Wash.
1,Charlotte,N.C.
2,Huntington Beach,Calif.
3,Wilmington,N.C.
4,Irvine,Calif.


In [9]:
# Step 3: Add City and State columns to future50_df, then drop the Location column.
# Written as assign() + drop(errors="ignore") so the cell is safe to re-run:
# assign() simply overwrites City/State, and the drop no longer raises a
# KeyError when Location was already removed by a previous run of this cell.
future50_df = (
    future50_df
    .assign(City=future50_split[0], State=future50_split[1])
    .drop(columns=["Location"], errors="ignore")
)
future50_df.head()

Unnamed: 0,Rank,Restaurant,Sales,YOY_Sales,Units,YOY_Units,Unit_Volume,Franchising,City,State
0,1,Evergreens,24,130.5%,26,116.7%,1150,No,Seattle,Wash.
1,2,Clean Juice,44,121.9%,105,94.4%,560,Yes,Charlotte,N.C.
2,3,Slapfish,21,81.0%,21,90.9%,1370,Yes,Huntington Beach,Calif.
3,4,Clean Eatz,25,79.7%,46,58.6%,685,Yes,Wilmington,N.C.
4,5,Pokeworks,49,77.1%,50,56.3%,1210,Yes,Irvine,Calif.


In [10]:
# Step 4: Join Future50 and Ind100 on location
# TODO: the join itself is not performed here yet; this cell only previews ind100
ind100.head(n=5)

Unnamed: 0,Rank,Restaurant,Sales,Average Check,City,State,Meals Served
0,1,Carmine's (Times Square),39080335.0,40,New York,N.Y.,469803.0
1,2,The Boathouse Orlando,35218364.0,43,Orlando,Fla.,820819.0
2,3,Old Ebbitt Grill,29104017.0,33,Washington,D.C.,892830.0
3,4,LAVO Italian Restaurant & Nightclub,26916180.0,90,New York,N.Y.,198500.0
4,5,Bryant Park Grill & Cafe,26900000.0,62,New York,N.Y.,403000.0


In [None]:
#Aggregate Ind100 by summing (sales or meals) to find overall top

#Group by Restaurant, summing by (sales or meals)

#Set rank to index

#Reset index to show new rank

## Load DataFrames into Database

In [12]:
# Import pgadmin password from python file
from etlpw import pw

In [13]:
# Create the database connection to the local Postgres "restaurant_db" database.
# The password is percent-encoded before being placed in the URL so that
# characters such as "@", "/" or ":" in it are not misread as URL delimiters
# when SQLAlchemy parses the connection string.
engine = create_engine(
    f'postgresql://postgres:{quote_plus(pw)}@localhost:5432/restaurant_db'
)

In [14]:
# Confirm tables: list the table names visible through the engine.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of names and works on both.
inspect(engine).get_table_names()

['future50', 'ind100', 'top250']

In [20]:
# Append the Future 50 rows to the "future50" SQL table (DataFrame index is not written)
future50.to_sql("future50", engine, if_exists="append", index=False)


In [21]:
# Append the Independence 100 rows to the "ind100" SQL table (DataFrame index is not written)
ind100.to_sql("ind100", engine, if_exists="append", index=False)

In [22]:
# Add dataframes to sql for top250 df.
top250.to_sql(name='top250', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the ind100 table

In [15]:
# Read the ind100 table back from the database and preview the first rows
pd.read_sql_query(sql='select * from ind100', con=engine).head(n=5)

Unnamed: 0,Rank,Restaurant,Sales,Average Check,City,State,Meals Served
0,1,Carmine's (Times Square),39080335.0,40,New York,N.Y.,469803.0
1,2,The Boathouse Orlando,35218364.0,43,Orlando,Fla.,820819.0
2,3,Old Ebbitt Grill,29104017.0,33,Washington,D.C.,892830.0
3,4,LAVO Italian Restaurant & Nightclub,26916180.0,90,New York,N.Y.,198500.0
4,5,Bryant Park Grill & Cafe,26900000.0,62,New York,N.Y.,403000.0
