# Data Cleaning in SQL

# Access Database

In [1]:
import psycopg2 as pg
import pandas as pd
import pandas.io.sql as pd_sql

import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Notebook-wide pandas display settings:
# no cap on displayed columns, at most 200 rows, 3 digits of float precision.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200
pd.options.display.precision = 3

In [2]:
# Use the ggplot look for every figure, then enlarge the default font.
# The font size is set after the style so the style sheet cannot override it.
plt.style.use('ggplot')
plt.rcParams['font.size'] = 18

In [3]:
# Connection settings for the Postgres database that holds the ticket sales data.
connection_args = dict(
    host='localhost',           # Postgres server reachable on this machine
    dbname='fox_ticket_sales',  # database queried throughout the notebook
    port=5432,                  # Postgres port (originally noted as the port opened on AWS)
)

# Single shared connection used by every SQL cell below.
connection = pg.connect(**connection_args)

In [4]:
# Cursor used by the cells below to execute the CREATE VIEW statements and commits.
cursor = connection.cursor()

# Data Cleaning

- There are a number of shows listed that have no ticket sales data. These will be removed from the dataset.
- There are a number of columns that are unnecessary for analysis and will be removed. See below:
    - gross_gate
    - currency
    - venue_name
    - venue_city
    - venue_state
    - venue_country
    - promoter
- **Assumptions**
    - For any concert that has a max ticket price of $0.00, assume that the price for all tickets is the minimum ticket price (i.e. min ticket price = max ticket price)
    - For co-headlining tours, multiple acts are listed as headliners, separated by `/`; the first act listed in the headliner column will be treated as the show's headliner. Each additional co-headliner adds 2 to the num_support column (i.e. a co-headliner holds twice the weight of a regular supporting act). This will make it easier to interpret artist features later on.
    - Any show whose num_support value is greater than 4 is very likely to be billed as a "festival". A new feature called "show_type" will be created to label each show as festival, multiple shows (more than one show in the listing), co-headliner, or single headliner; the first label that applies, in that order, is used.
- **Add the following time-based features**
    - Year
    - Month
    - Season
    - Day of Week
    - Time of Week (Weekday vs Weekend)

In [5]:
# Optional reset, intentionally left disabled: uncomment to drop v_past_shows
# (and a past_shows table, if one exists) before re-creating the view.
# NOTE(review): v_shows_artists selects from v_past_shows, so this DROP VIEW will
# presumably be rejected while that view exists unless it is dropped first or
# CASCADE is used -- confirm before relying on this cell.
# drop_views = """    
#               DROP VIEW v_past_shows;
#               DROP TABLE IF EXISTs past_shows;
#               """

# cursor.execute(drop_views)
# cursor.execute('commit;')

In [6]:
# Build v_past_shows: one row per show that has ticket-sales data, carrying the
# time-based features, the weighted support-act count and the show type.
#
# Notes on the SQL:
# - CREATE OR REPLACE lets this cell be re-run (Restart & Run All) without first
#   dropping the view; the output columns keep the same names, order and types.
# - Inside the outer SELECT, bare `month`, `day_of_week`, `num_support` and
#   `headliner` refer to the CTE columns, not to the output aliases that share
#   their names.
# - TO_CHAR(..., 'Month') / 'Day' blank-pad names to 9 characters; the FM prefix
#   removes that padding so values compare cleanly (e.g. 'May', not 'May      ').
past_shows_view = """
    CREATE OR REPLACE VIEW v_past_shows AS
    WITH past_shows_cte AS (
        SELECT event_date::date AS event_date,
               DATE_PART('year', event_date::date)::int AS year,
               DATE_PART('month', event_date::date)::int AS month,
               DATE_PART('dow', event_date::date) AS day_of_week,
               num_shows,
               headliner,
               support,
               -- Comma-separated supporting acts (0 when support is NULL) plus
               -- 2 for every '/'-separated co-headliner after the first.
               COALESCE(LENGTH(support) - LENGTH(REPLACE(support, ',', '')) + 1, 0)
                   + 2 * COALESCE(LENGTH(headliner) - LENGTH(REPLACE(headliner, '/', '')), 0)
                   AS num_support,
               tickets_sold,
               gross_usd,
               venue_capacity,
               percentage_sold,
               ticket_price_min,
               ticket_price_max
          FROM ticket_sales
         WHERE tickets_sold IS NOT NULL  -- shows without ticket sales data are dropped
    )
    SELECT event_date,
           year,
           TO_CHAR(event_date::date, 'FMMonth') AS month,
           CASE WHEN month IN (12, 1, 2) THEN 'Winter'
                WHEN month IN (3, 4, 5) THEN 'Spring'
                WHEN month IN (6, 7, 8) THEN 'Summer'
                ELSE 'Autumn'
            END AS season,
           TO_CHAR(event_date::date, 'FMDay') AS day_of_week,
           -- DATE_PART('dow') numbers Sunday as 0 through Saturday as 6, so this
           -- labels Sunday-Wednesday 'Weekday' and Thursday-Saturday 'Weekend'.
           -- NOTE(review): confirm this split is the intended one.
           CASE WHEN day_of_week IN (0, 1, 2, 3) THEN 'Weekday'
                ELSE 'Weekend'
            END AS time_of_week,
           num_shows,
           -- First matching label wins: festival, multiple shows, co-headliner.
           CASE WHEN num_support > 4 THEN 'festival'
                WHEN num_shows > 1 THEN 'multiple shows'
                WHEN POSITION('/' IN headliner) > 0 THEN 'co-headliner'
                ELSE 'single headliner'
            END AS show_type,
           -- Keep only the first listed act and strip double quotes from its name.
           REPLACE(SPLIT_PART(headliner, '/', 1), '"', '') AS headliner,
           support,
           num_support,
           tickets_sold,
           gross_usd,
           venue_capacity,
           percentage_sold,
           ticket_price_min,
           -- A max price of 0 is treated as "single price": fall back to the minimum.
           CASE WHEN ticket_price_max = 0 THEN ticket_price_min
                ELSE ticket_price_max
            END AS ticket_price_max
      FROM past_shows_cte;
"""

cursor.execute(past_shows_view)
connection.commit()

- For a number of the shows, the tour name or festival name is listed as the headliner. In order to scrape artist data from the web, the headliner name needs to be the actual headlining act. This occurs about 25 times throughout the table and will be fixed manually below. Two listings ("Disney Junior House Party" and "The Read Live!") are excluded from the table entirely.

In [7]:
# Build v_shows_artists on top of v_past_shows, replacing tour/festival/event
# names in the headliner column with the actual headlining act.
#
# - CREATE OR REPLACE lets this cell be re-run without first dropping the view.
# - CASE branches are evaluated top to bottom and the first match wins; rows that
#   match no pattern keep their original headliner.
# - Inside the query, bare `headliner` and `year` refer to v_past_shows columns.
# - The two listings named in the WHERE clause are excluded entirely.
artists_cleaned = """
    CREATE OR REPLACE VIEW v_shows_artists AS
    SELECT event_date,
           year,
           month,
           season,
           day_of_week,
           time_of_week,
           num_shows,
           show_type,
           CASE WHEN headliner ILIKE '%Ragga Muffins%' AND year = 2009 THEN 'K''naan'
                WHEN headliner ILIKE '%Ragga Muffins%' AND year = 2010 THEN 'Barrington Levy'
                WHEN headliner ILIKE '%Tribute to Norton Buffalo%' THEN 'Steve Miller Band'
                WHEN headliner ILIKE '%The HARD Tour%' THEN 'Crystal Castles'
                WHEN headliner ILIKE '%STS9%' THEN 'STS9'
                WHEN headliner ILIKE '%Outside Lands Music & Arts Festival%' THEN 'Sean Hayes'
                WHEN headliner ILIKE '%Experience Hendrix%' THEN 'Buddy Guy'
                WHEN headliner ILIKE '%Mos Def%' THEN 'Mos Def'
                WHEN headliner ILIKE '%Oakland United:%' THEN 'Primus'
                WHEN headliner ILIKE '%Les Claypool%' THEN 'Primus'
                WHEN headliner ILIKE '%Electric Bounce House%' THEN 'Axwell ^ Ingrosso'
                WHEN headliner ILIKE '%Peppa Pig%' THEN 'Peppa Pig'
                WHEN headliner ILIKE '%106 KMEL Holiday House Of Soul%' THEN 'Miguel'
                WHEN headliner ILIKE '%Julien Baker%' THEN 'Julien Baker'
                WHEN headliner ILIKE '%Lauryn Hill%' THEN 'Lauryn Hill'
                WHEN headliner ILIKE '%Ella Mai%' THEN 'Ella Mai'
                WHEN headliner ILIKE '%Neal Schon%' THEN 'Neal Schon'
                WHEN headliner ILIKE '%Chris D&#39;Elia%' THEN 'Chris D''Elia'
                WHEN headliner ILIKE '%The Specials%' THEN 'The Specials'
                WHEN headliner ILIKE '%Little Feat%' THEN 'Little Feat'
                WHEN headliner ILIKE '%The Flecktones%' THEN 'Bela Fleck & The Flecktones'
                WHEN headliner ILIKE '%Local Natives%' THEN 'Local Natives'
                WHEN headliner ILIKE '%Mac Dre B-day Celebration%' THEN 'Lil Jon'
                WHEN headliner ILIKE '%Paul Simon%' THEN 'Paul Simon'
                WHEN headliner ILIKE '%Kirk Franklin%' THEN 'Kirk Franklin'
                WHEN headliner ILIKE '%Daniel Caesar%' THEN 'Daniel Caesar'
                WHEN headliner ILIKE '%King Crimson%' THEN 'King Crimson'
                ELSE headliner
            END AS headliner,
           support,
           num_support,
           tickets_sold,
           gross_usd,
           venue_capacity,
           percentage_sold,
           ticket_price_min,
           ticket_price_max
      FROM v_past_shows
     WHERE headliner NOT IN ('Disney Junior House Party', 'The Read Live!');
"""

cursor.execute(artists_cleaned)
connection.commit()

In [8]:
# Load every row of the cleaned view into a DataFrame over the open connection.
query = """
        SELECT *
          FROM v_shows_artists;
        """
artists_cleaned_df = pd_sql.read_sql(sql=query, con=connection)

In [9]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
artists_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 17 columns):
event_date          858 non-null object
year                858 non-null int64
month               858 non-null object
season              858 non-null object
day_of_week         858 non-null object
time_of_week        858 non-null object
num_shows           858 non-null int64
show_type           858 non-null object
headliner           858 non-null object
support             644 non-null object
num_support         858 non-null int64
tickets_sold        858 non-null float64
gross_usd           858 non-null float64
venue_capacity      858 non-null float64
percentage_sold     858 non-null float64
ticket_price_min    858 non-null float64
ticket_price_max    858 non-null float64
dtypes: float64(6), int64(3), object(8)
memory usage: 114.1+ KB


# Save Final Table as Dataframe

- For use in "Artist_Feature_Web_Scraping" notebook

In [59]:
# Serialise the cleaned frame to disk so the next notebook can load it.
pickle_path = 'Data/sql_shows_artists.pkl'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(artists_cleaned_df, pickle_file)