In [40]:
# Importing packages 

# Standard-library helpers: environment lookup, interactive password prompt, and URL escaping
import os
from getpass import getpass
from urllib.parse import quote

# Imports the create_engine function to create a connection to a database and text to construct textual sql expressions
from sqlalchemy import create_engine, text

# Import pandas for data manipulation and analysis
import pandas as pd

In [41]:
# Setting up database connection details.
# Host, username and schema fall back to the project defaults but can be
# overridden with environment variables (DB_HOST, DB_USERNAME, DB_SCHEMA).
host = os.environ.get("DB_HOST", "isba-dev-01.c3cckk0gsovf.us-east-1.rds.amazonaws.com")
username = os.environ.get("DB_USERNAME", "admin")
schema = os.environ.get("DB_SCHEMA", "sql_project")

# The password is a secret and must not be stored in the notebook: read it from
# the DB_PASSWORD environment variable, or prompt for it when it is not set.
password = os.environ.get("DB_PASSWORD") or getpass("MySQL password: ")

db_config = {
    "host": host,
    "username": username,
    "password": password,
    "schema": schema
}

# Establishing connection to a MySQL database
# driver://username:password@host/database
# The credentials are percent-encoded so that characters with a special meaning
# in a URL (such as '@', ':' or '/') cannot corrupt the connection string.
engine = create_engine(
    f"mysql+pymysql://{quote(db_config['username'], safe='')}:{quote(db_config['password'], safe='')}"
    f"@{db_config['host']}/{db_config['schema']}"
)

In [42]:
# Load both CSV exports, one DataFrame per file
video_info_df, author_info_df = (
    pd.read_csv(csv_path) for csv_path in ('video_info.csv', 'author_info.csv')
)

In [43]:
# Preview the first five rows of the video data
video_info_df.head(5)

Unnamed: 0,video_id,duration,collect_count,comment_count,digg_count,play_count,share_count
0,7322451384014507307,25,71200,839,308100,6800000,4247
1,7246918411303914795,34,5231,492,67000,1100000,76
2,7329607001980505390,46,44300,482,198900,2100000,1048
3,7340435685964369182,39,30100,736,294000,3500000,1357
4,7328966158911163678,46,421,97,2960,196600,121


In [44]:
# Preview the first five rows of the author data
author_info_df.head(5)

Unnamed: 0,video_id,author_id,username,digg_count,follower_count,heart_count,video_count
0,7322451384014507307,7050194696207123462,songofskin,21700,909100,26700000,307
1,7246918411303914795,7242768202300474414,luckytobeskincare,202,8459,1400000,127
2,7329607001980505390,15971434,xo.murielle,254300,36000,991800,38
3,7340435685964369182,6652722185097363461,sheis_alyssamarie,45000,548500,13600000,869
4,7328966158911163678,7323435900599370794,skincare.school04,9,952,3429,2


In [45]:
# To bring the table into 3NF we remove the transitive dependency by dropping author_id.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# makes this cell safe to re-run after the column has already been removed.
author_info_df = author_info_df.drop(columns=['author_id'], errors='ignore')

# Checking if column is successfully dropped
author_info_df.head()

Unnamed: 0,video_id,username,digg_count,follower_count,heart_count,video_count
0,7322451384014507307,songofskin,21700,909100,26700000,307
1,7246918411303914795,luckytobeskincare,202,8459,1400000,127
2,7329607001980505390,xo.murielle,254300,36000,991800,38
3,7340435685964369182,sheis_alyssamarie,45000,548500,13600000,869
4,7328966158911163678,skincare.school04,9,952,3429,2


In [46]:
# Name of the SQL table that stores the video-level metrics
table1 = 'video_info'

In [27]:
# Insert the video rows into the target table without writing the DataFrame index.
# "append" adds to whatever rows already exist, so re-running this cell inserts the same rows again.
video_info_df.to_sql(name=table1, con=engine, if_exists="append", index=False)

12

In [47]:
# Name of the SQL table that stores the author-level metrics
table2 = 'author_info'

In [13]:
# Insert the author rows into the target table without writing the DataFrame index.
# "append" adds to whatever rows already exist, so re-running this cell inserts the same rows again.
author_info_df.to_sql(name=table2, con=engine, if_exists="append", index=False)

12

## Query 1: Descriptive Analytics

Business Question: What are the average engagement metrics per video duration category?

In [48]:
# Bucket every video by duration and average the engagement metrics per bucket.
# CASE branches are evaluated in order, so the second branch only sees durations
# above 30; using "<= 60" (instead of BETWEEN 31 AND 60) leaves no gap for
# non-integer durations between 30 and 31 seconds.
# ORDER BY MIN(duration) lists the buckets from shortest to longest; without it
# the row order of a GROUP BY result is not guaranteed.
query1 = text(f'''
WITH categorized_videos AS (
    SELECT
        video_id,
        duration,
        CASE
            WHEN duration <= 30 THEN 'Less than 30s'
            WHEN duration <= 60 THEN '30s-1min'
            ELSE '1+ min'
        END AS duration_category,
        digg_count,
        share_count,
        play_count
    FROM
        {table1}
)
SELECT
    duration_category,
    AVG(digg_count) AS avg_digg_count,
    AVG(share_count) AS avg_share_count,
    AVG(play_count) AS avg_play_count
FROM
    categorized_videos
GROUP BY
    duration_category
ORDER BY
    MIN(duration);
''')

# Run the query and load the result set into a DataFrame
query1_result_df = pd.read_sql(query1, engine)

In [49]:
# Display the average engagement metrics for each duration category
query1_result_df

Unnamed: 0,duration_category,avg_digg_count,avg_share_count,avg_play_count
0,Less than 30s,768100.0,5163.0,19925000.0
1,30s-1min,335694.29,2774.86,4053042.86
2,1+ min,73700.0,281.0,1200000.0


Insight: Shorter videos (30 seconds or less) appear most often in the keyword search results and have the highest average engagement metrics.

Recommendation: Make shorter videos (30 seconds or less) to sell or feature products.

Prediction: Shorter videos should drive more engagement, which leads to wider reach and higher conversions.

## Query 2: Diagnostic Analytics 

Business Question: Does a higher number of followers correlate with higher engagement metrics on videos produced by authors?

In [55]:
# Ranking authors by number of followers and seeing if it indicates higher video engagements.
# The CTE assigns a rank to every row of the author table by follower_count,
# highest first (RANK() gives tied rows the same rank and leaves a gap after them).
# The outer query joins those ranks to the video table on video_id and returns
# each video's engagement counts, ordered from the most-followed author down.
query2 = text(f'''
WITH authors_ranked AS (
    SELECT
        a.video_id,
        a.username,
        a.follower_count,
        RANK() OVER (ORDER BY a.follower_count DESC) AS author_rank
    FROM
        {table2} a
)
SELECT
	r.username,
    r.author_rank,
    v.collect_count,
    v.comment_count,
    v.digg_count,
    v.play_count,
    v.share_count
FROM
    {table1} v
JOIN
    authors_ranked r ON v.video_id = r.video_id
ORDER BY 
    r.author_rank;
''')

# Run the query and load the result set into a DataFrame
query2_result_df = pd.read_sql(query2, engine)

In [56]:
# Display each video's engagement counts alongside its author's follower rank
query2_result_df

Unnamed: 0,username,author_rank,collect_count,comment_count,digg_count,play_count,share_count
0,roryeliza,1,30300,8020,559600,6600000,2935
1,mattrandon,2,6420,548,73700,1200000,281
2,songofskin,3,71200,839,308100,6800000,4247
3,sheis_alyssamarie,4,30100,736,294000,3500000,1357
4,luellebrand,5,83000,2795,1700000,62200000,12600
5,xo.murielle,6,44300,482,198900,2100000,1048
6,ellie_thorburn2,7,69400,3322,965800,8100000,3110
7,myphxm,8,471500,2781,1200000,14400000,13800
8,luckytobeskincare,9,5231,492,67000,1100000,76
9,preppyxcalio,10,2626,991,27400,474700,87


Insight: User myphxm has the highest engagement rate on her featured video despite ranking only 8th by follower count, so a higher follower count doesn't determine engagement.

Recommendation: Feature your products with users who have a high engagement rate rather than simply a high follower count.

Prediction: More product engagement and increased conversions