# Create Athena Database Schema

In [9]:
import pandas as pd
import boto3
import sagemaker

# Notebook-wide AWS handles, all derived from the ambient SageMaker/boto3 configuration.
sess = sagemaker.Session()
# Default bucket of the SageMaker session; used further down for the Athena staging directory.
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
# Region comes from a fresh boto3 session, not from `sess`.
region = boto3.Session().region_name

# Install and import PyAthena

In [10]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0
from pyathena import connect

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


# Set the private S3 bucket

In [11]:
# Root of the private S3 bucket that holds the raw input folders.
s3_private_path = "s3://ads508-team4-raw"

# List the top-level prefixes in the private bucket

In [12]:
!aws s3 ls $s3_private_path

                           PRE assets/
                           PRE demographics/
                           PRE plays/
                           PRE psychographics/
                           PRE users/


# Create Athena Database and Tables

In [13]:
# Athena database that will hold the external tables.
database_name = "ads508team4"

# One table name per raw data set.
table_1, table_2, table_3 = "assets", "plays", "users"
table_4, table_5 = "demographics", "psychographics"

In [14]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries.
# It is placed under the SageMaker session's default bucket (`bucket`), not under the
# private raw-data bucket.
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [15]:
# Open the PyAthena connection that every pd.read_sql call in this notebook goes through.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [12]:
# DDL for the project database; IF NOT EXISTS keeps the cell safe to re-run.
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)

In [13]:
# Run whatever is currently held in `statement` through the Athena connection.
pd.read_sql(statement, conn)

# Make sure the database is created

In [14]:
# List the databases visible to this connection to confirm the new one is present.
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
# Display only the first few rows.
df_show.head(5)

Unnamed: 0,database_name
0,ads508team4
1,default
2,dsoaws


In [15]:
# Create table assets
# The database, table and bucket are filled in from the notebook-level variables
# (database_name, table_1, s3_private_path). Previously the DDL had no placeholders,
# so the .format() call did nothing and the names were silently hard-coded.
create_table_1 = """CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}

(
         showtype string,
         genre string,
         running_minutes int,
         source_language string,
         asset_id int,
         season_id int,
         series_id int,
         studio_id int
)

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\n'
STORED AS TEXTFILE
LOCATION '{s3_path}/assets/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database=database_name, table=table_1, s3_path=s3_private_path
)

# Register the external table (no-op if it already exists), then preview a few rows.
pd.read_sql(create_table_1, conn)
pd.read_sql("SELECT * FROM {}.{} LIMIT 10".format(database_name, table_1), conn)

Unnamed: 0,showtype,genre,running_minutes,source_language,asset_id,season_id,series_id,studio_id
0,Movies,Sci-Fi,146,English,1,,,325
1,TV,Documentary and Biography,43,English,2,4.0,5.0,7
2,TV,Reality,22,English,3,15.0,22.0,442
3,TV,Reality,22,English,4,15.0,22.0,442
4,TV,Reality,22,English,5,15.0,22.0,442
5,TV,Comedy,23,English,6,12.0,20.0,397
6,TV,Comedy,23,English,7,13.0,20.0,397
7,TV,Comedy,23,English,8,13.0,20.0,397
8,TV,Kids,12,English,9,50.0,6.0,47
9,TV,Comedy,19,English,10,35.0,41.0,442


In [16]:
# Create table plays
# The database, table and bucket are filled in from the notebook-level variables
# (database_name, table_2, s3_private_path). Previously the DDL had no placeholders,
# so the .format() call did nothing and the names were silently hard-coded.
create_table_2 = """CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}

(
         user_id string,
         platform string,
         asset_id int,
         minutes_viewed int
)

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\n'
STORED AS TEXTFILE
LOCATION '{s3_path}/plays/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database=database_name, table=table_2, s3_path=s3_private_path
)

# Register the external table (no-op if it already exists), then preview a few rows.
pd.read_sql(create_table_2, conn)
pd.read_sql("SELECT * FROM {}.{} LIMIT 10".format(database_name, table_2), conn)

Unnamed: 0,user_id,platform,asset_id,minutes_viewed
0,765000000000,android,13758,28
1,412000000000,android,13825,28
2,1500000000000,iOS,93,105
3,490000000000,android,6226,7
4,68719476744,android,3762,1
5,258000000000,android,4673,44
6,1240000000000,android,10526,1
7,1080000000000,android,14441,0
8,1220000000000,android,4808,28
9,756000000000,android,15019,11


In [17]:
# Create table users
# The database, table and bucket are filled in from the notebook-level variables
# (database_name, table_3, s3_private_path). Previously the DDL had no placeholders,
# so the .format() call did nothing and the names were silently hard-coded.
create_table_3 = """CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}

(
         user_id string,
         country_code string
)

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\n'
STORED AS TEXTFILE
LOCATION '{s3_path}/users/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database=database_name, table=table_3, s3_path=s3_private_path
)

# Register the external table (no-op if it already exists), then preview a few rows.
pd.read_sql(create_table_3, conn)
pd.read_sql("SELECT * FROM {}.{} LIMIT 10".format(database_name, table_3), conn)

Unnamed: 0,user_id,country_code
0,781684000000,ID
1,781684000000,MY
2,781684000000,ID
3,781684000000,ID
4,781684000000,ID
5,781684000000,ID
6,781684000000,ID
7,781684000000,ID
8,781684000000,MY
9,781684000000,ID


In [18]:
# Create table demographics
# The database, table and bucket are filled in from the notebook-level variables
# (database_name, table_4, s3_private_path). Previously the DDL had no placeholders,
# so the .format() call did nothing and the names were silently hard-coded.
create_table_4 = """CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}

(
         user_id string,
         platform string,
         level_1 string,
         level_2 string,
         level_3 string,
         confidence_score float
)

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\n'
STORED AS TEXTFILE
LOCATION '{s3_path}/demographics/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database=database_name, table=table_4, s3_path=s3_private_path
)

# Register the external table (no-op if it already exists), then preview a few rows.
pd.read_sql(create_table_4, conn)
pd.read_sql("SELECT * FROM {}.{} LIMIT 10".format(database_name, table_4), conn)

Unnamed: 0,user_id,platform,level_1,level_2,level_3,confidence_score
0,172000000000,android,Demographics,Income,Medium,1.0
1,326000000000,android,Demographics,Income,Medium,1.0
2,17179869245,android,Demographics,Income,Medium,1.0
3,996000000000,android,Demographics,Income,Low,1.0
4,1610000000000,android,Demographics,Income,Low,1.0
5,1280000000000,iOS,Demographics,Income,High,1.0
6,1280000000000,android,Demographics,Income,High,1.0
7,8589934678,android,Demographics,Income,Low,1.0
8,1560000000000,android,Demographics,Income,Low,1.0
9,902000000000,android,Demographics,Income,Low,1.0


In [19]:
# Create table psychographics
# The database, table and bucket are filled in from the notebook-level variables
# (database_name, table_5, s3_private_path). Previously the DDL had no placeholders,
# so the .format() call did nothing and the names were silently hard-coded.
#
# NOTE(review): the preview shows level_3 values made up only of quote/backslash
# characters for some rows. The raw CSV presumably contains quoted fields, which a plain
# comma-delimited row format does not unquote -- confirm against the source file and
# consider a CSV SerDe if so.
create_table_5 = """CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}

(
         user_id string,
         platform string,
         level_1 string,
         level_2 string,
         level_3 string,
         confidence_score float
)

ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LINES TERMINATED BY '\\n'
STORED AS TEXTFILE
LOCATION '{s3_path}/psychographics/'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database=database_name, table=table_5, s3_path=s3_private_path
)

# Register the external table (no-op if it already exists), then preview a few rows.
pd.read_sql(create_table_5, conn)
pd.read_sql("SELECT * FROM {}.{} LIMIT 10".format(database_name, table_5), conn)

Unnamed: 0,user_id,platform,level_1,level_2,level_3,confidence_score
0,352000000000,android,Psychographics,Social Media Fans,"""\""""\""""""",1.0
1,51539607784,android,Psychographics,Social Media Fans,"""\""""\""""""",1.0
2,1320000000000,android,Psychographics,Social Media Fans,"""\""""\""""""",1.0
3,137000000000,android,Psychographics,Movies Lovers,Romance Movies Fans,0.56
4,928000000000,android,Psychographics,Movies Lovers,Comedy Movies Fans,0.8
5,498000000000,android-tv,Psychographics,Movies Lovers,Horror Movies Fans,0.97
6,137000000000,web,Psychographics,Movies Lovers,Action and Adventure Movies Fans,0.37
7,1330000000000,android,Psychographics,Movies Lovers,Comedy Movies Fans,0.11
8,1370000000000,android,Psychographics,Movies Lovers,Family Movies Fans,0.27
9,816000000000,android,Psychographics,Movies Lovers,Family Movies Fans,0.42


# Make sure the tables are created

In [20]:
# List the tables registered in the project database to confirm all five were created.
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
# Display only the first few rows.
df_show.head(5)

Unnamed: 0,tab_name
0,assets
1,demographics
2,plays
3,psychographics
4,users


# Create a sub-master table for demographics (includes everything except for psychographics info)

In [2]:
# NOTE: this cell relies on `database_name` and `conn` from the setup cells above.
# After a kernel restart those cells must be re-run first, otherwise it stops with
# "NameError: name 'database_name' is not defined".

# Pull every table into pandas in full, one query/DataFrame pair per table.
plays = """SELECT * FROM {}.plays""".format(database_name)
df_plays = pd.read_sql(plays, conn)

users = """SELECT * FROM {}.users""".format(database_name)
df_users = pd.read_sql(users, conn)

assets = """SELECT * FROM {}.assets""".format(database_name)
df_assets = pd.read_sql(assets, conn)

demographics = """SELECT * FROM {}.demographics""".format(database_name)
df_demographics = pd.read_sql(demographics, conn)

psychographics = """SELECT * FROM {}.psychographics""".format(database_name)
df_psychographics = pd.read_sql(psychographics, conn)

# demographics -> users -> plays, inner-joined on user_id at each step.
result1 = df_demographics.merge(df_users, how='inner', on='user_id')
result_2 = result1.merge(df_plays, how='inner', on='user_id')

NameError: name 'database_name' is not defined