# Create Athena Database Schema

In [52]:
import boto3
import sagemaker

# Region configured for the default boto3 session; passed to the Athena
# connection later in the notebook.
region = boto3.Session().region_name

# SageMaker session, the execution role returned by SageMaker, and the
# session's default S3 bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# Import PyAthena

In [53]:
from pyathena import connect

# Set the private bucket

In [54]:
# S3 URI of the team's private bucket; it is listed with the AWS CLI in the
# next cell.
s3_private_path = "s3://ads508-team4-raw"

# List files in the Private bucket

In [55]:
!aws s3 ls $s3_private_path

2022-03-19 18:39:22     691755 assets.csv
2022-03-19 18:39:29    4819081 demographics.csv
2022-03-19 18:39:36   14187619 plays.csv
2022-03-19 18:39:21   32759612 psychographics.csv
2022-03-19 18:39:42    1759323 users.csv


# Create Athena Database

In [56]:
# Name of the Athena database this notebook creates.
# NOTE(review): the name contains a hyphen. Athena documents underscore as the
# only supported special character in database names, so any statement that
# embeds this name must quote it -- consider renaming with an underscore.
database_name = "ads508team4-finalproject"

In [57]:
# Temporary S3 location that Athena queries use as their staging directory,
# placed under the session's default bucket.
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [58]:
# Open the PyAthena connection for the notebook's region, pointing it at the
# S3 staging directory configured earlier.
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [59]:
# Build the DDL that creates the Athena database if it is missing.
# The database name contains a hyphen, and an unquoted identifier with a
# hyphen is rejected by Athena's DDL parser, so the name is wrapped in
# backticks to make the statement parse.
statement = "CREATE DATABASE IF NOT EXISTS `{}`".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS ads508team4-finalproject


# Load and check each file

In [60]:
import pandas as pd

def load_csv_from_s3(client, bucket, key):
    """Fetch one CSV object from S3 and parse it into a DataFrame.

    Args:
        client: boto3 S3 client used for the ``get_object`` call.
        bucket: Bucket name (no ``s3://`` prefix).
        key: Object key of the CSV file inside ``bucket``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` (default options)
        from the object's body stream.
    """
    response = client.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(response.get("Body"))


s3_client = boto3.client("s3")

# Raw-data bucket and the object key of each CSV file stored in it.
BUCKET = 'ads508-team4-raw'
KEY_assets = 'assets.csv'
KEY_demographics = 'demographics.csv'
KEY_plays = 'plays.csv'
KEY_psychographics = 'psychographics.csv'
KEY_users = 'users.csv'

# One DataFrame per source file, all read through the same helper so the
# download/parse logic lives in a single place.
# NOTE(review): the previews below show an `Unnamed: 0` column and float-valued
# user_id values; this suggests the CSVs were exported with their index and
# with ids already stored as floats -- confirm before relying on either.
df_assets = load_csv_from_s3(s3_client, BUCKET, KEY_assets)
df_demographics = load_csv_from_s3(s3_client, BUCKET, KEY_demographics)
df_plays = load_csv_from_s3(s3_client, BUCKET, KEY_plays)
df_psychographics = load_csv_from_s3(s3_client, BUCKET, KEY_psychographics)
df_users = load_csv_from_s3(s3_client, BUCKET, KEY_users)

In [61]:
# Preview the first rows of the frame loaded from assets.csv.
df_assets.head()

Unnamed: 0,show_type,genre,running_minutes,source_language,asset_id,season_id,series_id,studio_id
0,Movies,Sci-Fi,146,English,1,,,325.0
1,TV,Documentary and Biography,43,English,2,4.0,5.0,7.0
2,TV,Reality,22,English,3,15.0,22.0,442.0
3,TV,Reality,22,English,4,15.0,22.0,442.0
4,TV,Reality,22,English,5,15.0,22.0,442.0


In [62]:
# Preview the first rows of the frame loaded from demographics.csv.
df_demographics.head()

Unnamed: 0,user_id,platform,level_1,level_2,level_3,confidence_score
0,172000000000.0,android,Demographics,Income,Medium,1.0
1,326000000000.0,android,Demographics,Income,Medium,1.0
2,17179870000.0,android,Demographics,Income,Medium,1.0
3,996000000000.0,android,Demographics,Income,Low,1.0
4,1610000000000.0,android,Demographics,Income,Low,1.0


In [63]:
# Preview the first rows of the frame loaded from plays.csv.
df_plays.head()

Unnamed: 0,user_id,platform,asset_id,minutes_viewed
0,765000000000.0,android,13758,28
1,412000000000.0,android,13825,28
2,1500000000000.0,iOS,93,105
3,490000000000.0,android,6226,7
4,68719480000.0,android,3762,1


In [64]:
# Preview the first rows of the frame loaded from psychographics.csv.
df_psychographics.head()

Unnamed: 0,user_id,platform,level_1,level_2,level_3,confidence_score
0,352000000000.0,android,Psychographics,Social Media Fans,"\""\""",1.0
1,51539610000.0,android,Psychographics,Social Media Fans,"\""\""",1.0
2,1320000000000.0,android,Psychographics,Social Media Fans,"\""\""",1.0
3,137000000000.0,android,Psychographics,Movies Lovers,Romance Movies Fans,0.56
4,928000000000.0,android,Psychographics,Movies Lovers,Comedy Movies Fans,0.8


In [65]:
# Preview the first rows of the frame loaded from users.csv.
df_users.head()

Unnamed: 0,user_id,country_code
0,781684000000.0,ID
1,781684000000.0,MY
2,781684000000.0,ID
3,781684000000.0,ID
4,781684000000.0,ID
