In [1]:
# Import dependencies
import sqlite3
import pandas as pd
import numpy as np
from numpy import nan

In [2]:
# Connect to the SQLite database located one directory above this notebook.
# A forward slash is used on purpose: the previous '..\database.sqlite' only
# resolved on Windows, and '\d' is an invalid escape sequence that newer
# Python versions warn about. '/' works on every platform, Windows included.
conn = sqlite3.connect('../database.sqlite')

In [3]:
# Read all regional tables from SQLite into a single pandas DataFrame.
# Every table gets the same filter (skip the 'totals' category and year 2017),
# and the per-table SELECTs are combined with UNION, which also removes exact
# duplicate rows.
region_tables = (
    'east_central_table',
    'north_east_table',
    'pacific_table',
    'south_table',
    'south_east_table',
    'south_west_table',
    'west_central_table',
)

# One filtered SELECT per table. The table names are the fixed constants
# listed above (never user input), so formatting them into the SQL is safe.
select_template = """
SELECT
*
FROM {table}
where category not in ('totals') and year not in (2017)
"""

region_query = "\nunion\n".join(
    select_template.format(table=table_name) for table_name in region_tables
)

bi_df = pd.read_sql_query(region_query, conn)

# Drop only the first column returned by the query
# (presumably a saved row index - TODO confirm against the table schema).
bi_df = bi_df.drop(bi_df.columns[[0]], axis=1)
bi_df.head()

Unnamed: 0,category,gender,age,year,division,population
0,video_game,Male,18-24,2013,east_central,26
1,video_game,Male,18-24,2013,north_east,47
2,video_game,Male,18-24,2013,pacific,62
3,video_game,Male,18-24,2013,south,61
4,video_game,Male,18-24,2013,south_east,23


In [4]:
# Path to the division coordinate CSV
div_data = '../clean_data/nielsen_div_coordinate.csv'

# Load the file into a DataFrame, discard its leading column, then preview it
div_df = pd.read_csv(div_data)
div_df = div_df.drop(columns=div_df.columns[:1])
div_df.head()

Unnamed: 0,division,Latitude,Longitude
0,east_central,41.885151,-86.971436
1,north_east,42.233242,-72.815566
2,pacific,40.415126,-126.62062
3,south,35.918664,-79.692964
4,south_east,34.709733,-87.076199


In [5]:
# Join the coordinate columns from div_df onto bi_df, matching rows on the
# shared 'division' column (pandas' default inner join).
bi_df = bi_df.merge(div_df, on='division')
bi_df.head()

Unnamed: 0,category,gender,age,year,division,population,Latitude,Longitude
0,video_game,Male,18-24,2013,east_central,26,41.885151,-86.971436
1,video_game,Male,25-34,2013,east_central,37,41.885151,-86.971436
2,video_game,Male,35-44,2013,east_central,49,41.885151,-86.971436
3,video_game,Male,45-54,2013,east_central,31,41.885151,-86.971436
4,video_game,Male,55+,2013,east_central,13,41.885151,-86.971436


In [6]:
# Shorten the verbose labels in the 'category' column (this replaces cell
# values; no column is renamed).
# The result is assigned back to the column instead of calling
# bi_df['category'].replace(..., inplace=True): in-place edits on a selected
# column are chained assignment, which newer pandas warns about and which does
# not update the DataFrame at all once Copy-on-Write is enabled.
bi_df['category'] = bi_df['category'].replace({
    'all_live_theater/concerts/dance-attended_last_12_months': 'live_event',
    'social_media_[social_media_user]': 'social_media',
    'supermarket_goer_last_4_weeks': 'supermarket',
})

# Show the distinct division values
bi_df['division'].unique()

array(['east_central', 'north_east', 'pacific', 'south', 'south_east',
       'south_west', 'west_central'], dtype=object)

In [7]:
# Convert the raw codes in the category, division and age columns into
# display labels. The gender column is left untouched.
# Each mapping is applied only to its own column, in a single replace() call,
# so a matching string in any other column can never be rewritten by accident.

# Raw category code -> audience label
category_labels = {
    'video_game': 'Video Game Players',
    'tv': 'Cable TV Watchers',
    'radio': 'Radio Listeners',
    'magazine': 'Magazine Readers',
    'movie_goers': 'Movie Goers',
    'supermarket': 'Supermarket Shopper',
    'tablet_owner': 'Tablet Owners',
    'live_event': 'Live Events Goers',
    'social_media': 'Social Media Users',
    'digital_music': 'Digital Music Users',
    'streaming_video': 'Streaming Video Watchers',
}

# Raw division code -> title-cased division name
division_labels = {
    'east_central': 'East Central',
    'north_east': 'North East',
    'pacific': 'Pacific',
    'south': 'South',
    'south_east': 'South East',
    'south_west': 'South West',
    'west_central': 'West Central',
}

# Fine-grained age bracket -> wider age bracket.
# NOTE: only the labels are merged; population values are not summed, so
# several rows can end up sharing the same age label.
age_groups = {
    '18-24': '18-34',
    '25-34': '18-34',
    '35-44': '35-54',
    '45-54': '35-54',
}

bi_df = bi_df.replace({
    'category': category_labels,
    'division': division_labels,
    'age': age_groups,
})

bi_df.head()

Unnamed: 0,category,gender,age,year,division,population,Latitude,Longitude
0,Video Game Players,Male,18-34,2013,East Central,26,41.885151,-86.971436
1,Video Game Players,Male,18-34,2013,East Central,37,41.885151,-86.971436
2,Video Game Players,Male,35-54,2013,East Central,49,41.885151,-86.971436
3,Video Game Players,Male,35-54,2013,East Central,31,41.885151,-86.971436
4,Video Game Players,Male,55+,2013,East Central,13,41.885151,-86.971436


In [8]:
# Write the cleaned DataFrame to a CSV file, leaving out the row index
bi_csv_path = '../bi_data/bi_data.csv'
bi_df.to_csv(bi_csv_path, index=False)