This file contains all the commands required to set up the DB locally.
Please change the 'username_and_password' variable in the next code section according to your local environment settings!

Import packages and start engine connection:

In [1]:
import os

import pandas as pd
#import MySQLdb
import pymysql as mdb
mdb.install_as_MySQLdb()
#import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Connection URL format: "mysql://username:password@host:port"
# The credentials ("username:password") are read from the YELP_DB_CREDENTIALS
# environment variable, so they do not have to be written into the notebook.
# TODO: If that variable is not set, the fallback value below is used -
# change it according to your local settings!!!
# NOTE: characters that have a special meaning inside a URL (e.g. '@' or '/')
# must be URL-encoded when they appear in the password.
username_and_password = os.environ.get('YELP_DB_CREDENTIALS', 'root:ru,gcur')
engine = create_engine(f"mysql://{username_and_password}@localhost")
con = engine.connect()

# Path template for the dataset CSV files; '%s' is replaced with the table name.
dataset_dir = '../dataset/yelp_%s.csv'

Create the 'yelp' schema:

In [None]:
# Create the schema that holds every table built in this notebook.
# The statement is wrapped in text() because SQLAlchemy 2.0 no longer accepts
# plain strings in Connection.execute(); IF NOT EXISTS keeps the cell re-runnable.
con.execute(text('CREATE SCHEMA IF NOT EXISTS yelp'))

business table:

In [2]:
# Load the business CSV and expose its unnamed leading column as 'id'.
business_df = (
    pd.read_csv(dataset_dir % 'business', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'id'})
)

In [None]:
# Create the yelp.business table.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.business (
  id INT AUTO_INCREMENT PRIMARY KEY,
	business_id VARCHAR(22) NOT NULL, 
	name VARCHAR(66) NOT NULL, 
	neighborhood VARCHAR(40), 
	address VARCHAR(118) NOT NULL, 
	city VARCHAR(50), 
	state VARCHAR(3) NOT NULL, 
	postal_code VARCHAR(8), 
	latitude DECIMAL(38, 15) NOT NULL, 
	longitude DECIMAL(38, 18) NOT NULL, 
	stars FLOAT NOT NULL, 
	review_count INT NOT NULL, 
	is_open BOOL NOT NULL, 
	categories VARCHAR(286)
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
business_df.to_sql(
  con=con,
  schema='yelp',
  name='business',
  if_exists='append',
  index=False
)

business_hours table:

In [None]:
# Load the business_hours CSV and expose its unnamed leading column as 'id'.
business_hours_df = (
    pd.read_csv(dataset_dir % 'business_hours', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'id'})
)

# Create the yelp.business_hours table.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.business_hours (
  id INT AUTO_INCREMENT PRIMARY KEY,
	business_id VARCHAR(22) NOT NULL, 
	monday VARCHAR(11), 
	tuesday VARCHAR(11), 
	wednesday VARCHAR(11), 
	thursday VARCHAR(11), 
	friday VARCHAR(11), 
	saturday VARCHAR(11), 
	sunday VARCHAR(11)
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
business_hours_df.to_sql(
  con=con,
  schema='yelp',
  name='business_hours',
  if_exists='append',
  index=False
)

checkin table:

In [10]:
# Load the checkin CSV and expose its unnamed leading column as 'id'.
checkin_df = (
    pd.read_csv(dataset_dir % 'checkin', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'id'})
)

# Create the yelp.checkin table.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.checkin (
  id INT AUTO_INCREMENT PRIMARY KEY,
	business_id VARCHAR(22) NOT NULL, 
	weekday VARCHAR(3) NOT NULL, 
	hour TIME NOT NULL, 
	checkins INT NOT NULL
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
checkin_df.to_sql(
  con=con,
  schema='yelp',
  name='checkin',
  if_exists='append',
  index=False
)

user table:

In [6]:
# Load the user CSV and expose its unnamed leading column as 'id'.
user_df = (
    pd.read_csv(dataset_dir % 'user', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'id'})
)

In [None]:
# Create the yelp.user table.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.user (
  id INT AUTO_INCREMENT PRIMARY KEY,
	user_id VARCHAR(22) NOT NULL,
	name VARCHAR(33),
	review_count INT NOT NULL,
	yelping_since DATE NOT NULL,
	friends MEDIUMTEXT,
	useful INT NOT NULL,
	funny INT NOT NULL,
	cool INT NOT NULL,
	fans INT NOT NULL,
	elite MEDIUMTEXT,
	average_stars FLOAT NOT NULL,
	compliment_hot INT NOT NULL,
	compliment_more INT NOT NULL,
	compliment_profile INT NOT NULL,
	compliment_cute INT NOT NULL,
	compliment_list INT NOT NULL,
	compliment_note INT NOT NULL,
	compliment_plain INT NOT NULL,
	compliment_cool INT NOT NULL,
	compliment_funny INT NOT NULL,
	compliment_writer INT NOT NULL,
	compliment_photos INT NOT NULL
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
user_df.to_sql(
  con=con,
  schema='yelp',
  name='user',
  if_exists='append',
  index=False
)

review table:

In [7]:
review_df = pd.read_csv(dataset_dir % 'review', encoding='utf-8')
# Make needed changes to the DF:
review_df = review_df.rename(columns={'Unnamed: 0': 'id'})
review_df = review_df.drop_duplicates('id')

# In order to improve query times on the existing VARCHAR IDs, we add the integer
# "id" of the matching user and business rows to the review table, as an
# improvement to the existing "business_id" & "user_id" columns.
# The new columns are called "new_user_id" & "new_business_id".
#
# Each lookup frame is de-duplicated on its key before merging: a left merge
# against a lookup with repeated keys would multiply review rows and thereby
# repeat review "id" values, which yelp.review declares as its PRIMARY KEY.
# This cell needs user_df and business_df from the cells above.
user_ids_df = (
    user_df[['id', 'user_id']]
    .drop_duplicates('user_id')
    .rename(columns={'id': 'new_user_id'})
)
review_df = review_df.merge(user_ids_df, on='user_id', how='left')

business_ids_df = (
    business_df[['id', 'business_id']]
    .drop_duplicates('business_id')
    .rename(columns={'id': 'new_business_id'})
)
review_df = review_df.merge(business_ids_df, on='business_id', how='left')

In [4]:

# Create the yelp.review table, including the integer new_user_id and
# new_business_id columns next to the VARCHAR user_id and business_id columns.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.review (
  id INT AUTO_INCREMENT PRIMARY KEY,
	review_id VARCHAR(22) NOT NULL, 
	user_id VARCHAR(22) NOT NULL, 
  new_user_id INT,
	business_id VARCHAR(22) NOT NULL, 
  new_business_id INT,
	stars INT NOT NULL, 
	date DATE NOT NULL, 
	text VARCHAR(5010) NOT NULL, 
	useful INT NOT NULL, 
	funny INT NOT NULL, 
	cool INT NOT NULL
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
review_df.to_sql(
  con=con,
  schema='yelp',
  name='review',
  if_exists='append',
  index=False
)

tip table:

In [20]:
# Load the tip CSV and expose its unnamed leading column as 'id'.
tip_df = (
    pd.read_csv(dataset_dir % 'tip', encoding='utf-8')
    .rename(columns={'Unnamed: 0': 'id'})
)

# Create the yelp.tip table.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.tip (
  id INT AUTO_INCREMENT PRIMARY KEY,
	text VARCHAR(510), 
	date DATE NOT NULL, 
	likes DECIMAL(38, 0) NOT NULL, 
	business_id VARCHAR(22) NOT NULL, 
	user_id VARCHAR(22) NOT NULL
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written ('id' is a regular column),
# so the former index_label argument had no effect and was removed.
tip_df.to_sql(
  con=con,
  schema='yelp',
  name='tip',
  if_exists='append',
  index=False
)

state table (this table isn't included in the original data; it is one we found in order to answer our business questions):

In [4]:
# Load the per-state small-business CSV (not part of the Yelp dataset files).
state_df = pd.read_csv('../small_business_data_by_state_usa_2017.csv', encoding='utf-8')

# Create the yelp.state table, keyed by the state name column.
# The DDL is wrapped in text() because SQLAlchemy 2.0 no longer accepts plain
# strings in Connection.execute().
con.execute(text(
  """
  CREATE TABLE yelp.state (
  name VARCHAR(2) PRIMARY KEY,
	businesses INT NOT NULL
	)
  """
))

# Append the DataFrame rows to the table created above.
# index=False: the DataFrame index is not written, so the former index_label
# argument had no effect and was removed.
state_df.to_sql(
  con=con,
  schema='yelp',
  name='state',
  if_exists='append',
  index=False
)