Run this file in order to create your local data warehouse to be used for our project's business questions.
This data warehouse is constructed according to our star schema.

Import packages and start engine connection:

In [25]:
import os

import pandas as pd
#import MySQLdb
import pymysql as mdb
mdb.install_as_MySQLdb()
#import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.sql import text

# Creating connection: "mysql://username:password@host:port"
# Credentials come from the YELP_DW_DB_CREDENTIALS environment variable
# (format "username:password") so a local password does not have to be edited
# into this file; the previously hard-coded value is kept as the fallback.
# TODO: Change this according to your local settings!!!
username_and_password = os.environ.get('YELP_DW_DB_CREDENTIALS', 'root:ru,gcur')
engine = create_engine(f"mysql://{username_and_password}@localhost")
con = engine.connect()

# Path template for the source CSV files; '%s' is replaced by the dataset
# name (e.g. 'business') when the files are loaded.
dataset_dir = '../dataset/yelp_%s.csv'

Create the yelp_dw schema:

In [2]:
# Create the warehouse schema. IF NOT EXISTS keeps this cell re-runnable when
# the schema is already present. The statement is wrapped in text() because
# passing a raw SQL string to Connection.execute() is deprecated in
# SQLAlchemy 1.4 and rejected in 2.0.
con.execute(text('CREATE SCHEMA IF NOT EXISTS yelp_dw'))

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x2376cc37dd8>

Now we'll create the DFs that will be added to the DW.
Before inserting the data into the DW, we first need to clean problematic data, such as rows with missing values.
These can include rows with null values, or rows whose foreign keys refer to records that don't exist in the data.

We'll start by getting the data from the CSV files:

In [15]:
# Load the three raw Yelp CSV exports (business, review, user) into DataFrames,
# in that order, by filling the dataset name into the shared path template.
business_df, review_df, user_df = (
    pd.read_csv(dataset_dir % dataset_name, encoding='utf-8')
    for dataset_name in ('business', 'review', 'user')
)

Create the User dim & data:

In [31]:
# Create the User dim:
# One row per user, keyed by user_id. sum_votes, sum_compliments and grade are
# derived values that are computed when the table is populated from yelp.user.
# The DDL is wrapped in text() because passing a raw SQL string to
# Connection.execute() is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
con.execute(
  text(
    """
    CREATE TABLE yelp_dw.user_dim (
      user_id VARCHAR(22) PRIMARY KEY NOT NULL,
      name VARCHAR(33),
      review_count INT NOT NULL,
      sum_votes INT NOT NULL,
      fans INT NOT NULL,
      elite MEDIUMTEXT,
      sum_compliments INT NOT NULL,
      grade FLOAT NOT NULL
    );
    """
  )
)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x23700b5c2b0>

In [28]:
# Create a helper view to be used in the creation of the user_dim data.
# The view is an aggregate without GROUP BY, so it exposes a single row holding
# the maxima over yelp.user of: review count, fans, summed votes
# (useful + funny + cool) and summed compliments (all compliment_* columns).
# CREATE OR REPLACE is used because the view lives in the source `yelp` schema
# rather than in yelp_dw: it survives a rebuild of the warehouse schema, and a
# plain CREATE VIEW would then fail on the next run of this cell.
# text() is required because raw SQL strings passed to Connection.execute()
# are deprecated in SQLAlchemy 1.4 and rejected in 2.0.
con.execute(
  text(
    """
    CREATE OR REPLACE VIEW yelp.max_user_values AS
      SELECT
        MAX(review_count) AS max_review_count,
        MAX(fans) AS max_fans,
        MAX(useful + funny + cool) AS max_votes,
        MAX(compliment_hot + compliment_more + compliment_profile + compliment_cute + compliment_list + compliment_note + compliment_plain + compliment_cool + compliment_funny + compliment_writer + compliment_photos) AS max_compliments
      FROM
        yelp.user
    """
  )
)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x23701c5e748>

In [32]:
# Populate user_dim from yelp.user.
# The inner query derives sum_votes and sum_compliments per user and, via the
# CROSS JOIN with the single-row view yelp.max_user_values, attaches the
# global maxima to every user row. The outer query then computes grade as a
# weighted sum of each metric divided by its maximum:
#   0.3 * votes + 0.2 * review_count + 0.3 * fans + 0.2 * compliments
# The target columns are listed explicitly so the insert does not silently
# depend on the physical column order of user_dim.
# text() is required because raw SQL strings passed to Connection.execute()
# are deprecated in SQLAlchemy 1.4 and rejected in 2.0.
# NOTE(review): if any of the maxima is 0 the division cannot yield a valid
# value for the NOT NULL grade column - confirm the source data rules this out.
# NOTE(review): on SQLAlchemy 2.x DML is not auto-committed; confirm whether
# an explicit commit is needed in your setup.
con.execute(
  text(
    """
    INSERT INTO yelp_dw.user_dim
      (user_id, name, review_count, sum_votes, fans, elite, sum_compliments, grade)
    SELECT
      user_id,
      name,
      review_count,
      sum_votes,
      fans,
      elite,
      sum_compliments,
      0.3 * sum_votes / max_votes + 0.2 * review_count / max_review_count + 0.3 * fans / max_fans + 0.2 * sum_compliments / max_compliments AS grade
    FROM
      (SELECT
         user_id,
         name,
         review_count,
         useful + funny + cool AS sum_votes,
         fans,
         elite,
         compliment_hot + compliment_more + compliment_profile + compliment_cute + compliment_list + compliment_note + compliment_plain + compliment_cool + compliment_funny + compliment_writer + compliment_photos AS sum_compliments,
         max_review_count,
         max_votes,
         max_fans,
         max_compliments
       FROM
         yelp.user CROSS JOIN yelp.max_user_values
      ) AS users
    """
  )
)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x237018e6668>

Check for problems with the DFs' data:

We used the following code to collect all 'state' values so that we could manually compare them with the data we got from the internet regarding small businesses in the USA circa 2017.
You don't have to run the following code cell:

In [23]:
# Collect the distinct 'state' values of the business data, sort them and
# write them to a CSV file for manual inspection.
# The index is reset after sorting: the pre-sort index reflects the arbitrary
# iteration order of the set, so without the reset the index column written
# to the CSV would differ from run to run.
states_df = (
    pd.DataFrame({'state': list(set(business_df['state'].to_list()))})
    .sort_values(by='state', ascending=True)
    .reset_index(drop=True)
)
states_df.to_csv('../states.csv', index=True, encoding='utf-8')

Start by creating the dimensions:

In [None]:
# Create the Year dimension:
# One row per distinct review year, with a surrogate key year_id.
# Statements are wrapped in text() because raw SQL strings passed to
# Connection.execute() are deprecated in SQLAlchemy 1.4 and rejected in 2.0.
con.execute(
  text(
    """
    CREATE TABLE yelp_dw.year_dim (
      year_id INT AUTO_INCREMENT PRIMARY KEY,
      year INT
    );
    """
  )
)

# Populate it with the distinct years found in yelp.review. ROW_NUMBER()
# numbers the grouped rows to provide year_id. The target columns are listed
# explicitly so the insert does not depend on the table's column order.
# NOTE(review): on SQLAlchemy 2.x DML is not auto-committed; confirm whether
# an explicit commit is needed in your setup.
con.execute(
  text(
    """
    INSERT INTO yelp_dw.year_dim (year_id, year)
    SELECT
      ROW_NUMBER() OVER () AS year_id,
      YEAR(date) as year
    FROM yelp.review
    GROUP BY year
    """
  )
)

# Create the Vote dimension:
# One row per distinct total vote count (useful + funny + cool) of a review,
# with a surrogate key vote_id.
# Statements are wrapped in text() because raw SQL strings passed to
# Connection.execute() are deprecated in SQLAlchemy 1.4 and rejected in 2.0.
con.execute(
  text(
    """
    CREATE TABLE yelp_dw.vote_dim (
      vote_id INT AUTO_INCREMENT PRIMARY KEY,
      sum_votes INT NOT NULL
    );
    """
  )
)

# Populate it with the distinct vote totals found in yelp.review. ROW_NUMBER()
# numbers the grouped rows to provide vote_id. The target columns are listed
# explicitly so the insert does not depend on the table's column order.
# NOTE(review): on SQLAlchemy 2.x DML is not auto-committed; confirm whether
# an explicit commit is needed in your setup.
con.execute(
  text(
    """
    INSERT INTO yelp_dw.vote_dim (vote_id, sum_votes)
    SELECT
      ROW_NUMBER() OVER () AS vote_id,
      useful + funny + cool as sum_votes
    FROM yelp.review
    GROUP BY sum_votes
    """
  )
)

Create the fact table: