# LinkedIn Job Postings

## ADS 508 Impacting the Business with a Distributed Data Science Pipeline

In [None]:
# Import packages
import boto3
import numpy as np
import pandas as pandas
import pandas as pd  # the cells below call pd.read_sql, so the `pd` alias must exist
import sagemaker
from pyathena import connect

## Data Ingestion

### CSV to S3

In [None]:
# Resolve the region and account ID from the default boto3 session
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# SageMaker session, its default S3 bucket, and the execution role
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# SageMaker client bound to the resolved region
sm = boto3.Session().client(
    service_name="sagemaker",
    region_name=region,
)

In [None]:
# Set S3 Source Location (Public bucket)
# The prefix already ends with "/", so commands using it should not append another "/".
s3_public_path = "s3://linkedin-postings/raw_data/"

In [None]:
# Private destination prefix inside the SageMaker session's default bucket
s3_private_path = f"s3://{bucket}/linkedin-postings/raw_data/"
print(s3_private_path)

In [None]:
# Copy data from Public S3 bucket to Private S3 bucket
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "postings.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "salaries.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "job_skills.csv"

In [None]:
# Print the private destination prefix; the listing in the next cell shows what was copied there
print(s3_private_path)

In [None]:
!aws s3 ls $s3_private_path/

### Create Athena Database 

In [None]:
# Name of the Athena database that the statements below create and query
database_name: str = "linkedin_data"

In [None]:
# Staging prefix in the default bucket - a temporary directory for Athena queries
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [None]:
# Open a PyAthena connection that uses the staging directory defined above
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [None]:
# Create the database unless it already exists
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"

pd.read_sql(sql=statement, con=conn)

### Verify database has been created

In [None]:
# List the databases visible through this connection and show the first five
statement = "SHOW DATABASES"

df_show = pd.read_sql(sql=statement, con=conn)
df_show.head(n=5)

### Create Athena Tables from CSV Files

In [None]:
table_name = "postings"

# SQL statement to create the postings table over the raw CSV data.
#
# The previous column list (marketplace, customer_id, star_rating, ...) with a
# tab delimiter and gzip compression described a different dataset; the file
# copied above is an uncompressed, comma-separated "postings.csv".
#
# NOTE(review): the column names below follow the public LinkedIn Job Postings
# dataset layout as I understand it - confirm against the CSV header row.
# NOTE(review): OpenCSVSerde is used because postings presumably contain quoted
# free text with commas; every column is declared as string and should be CAST
# at query time. OpenCSVSerde does not support line breaks embedded in a field,
# so multi-line descriptions may need cleaning before this table is reliable.
# NOTE(review): LOCATION is the shared raw_data/ prefix, which also holds
# salaries.csv and job_skills.csv; Athena reads every object under LOCATION,
# so each file should live under its own prefix - TODO restructure the copy.
# NOTE(review): IF NOT EXISTS keeps any table already created with the old
# definition; drop that table first so this schema takes effect.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        job_id string,
        company_name string,
        title string,
        description string,
        max_salary string,
        pay_period string,
        location string,
        company_id string,
        views string,
        med_salary string,
        min_salary string,
        formatted_work_type string,
        applies string,
        original_listed_time string,
        remote_allowed string,
        job_posting_url string,
        application_url string,
        application_type string,
        expiry string,
        closed_time string,
        formatted_experience_level string,
        skills_desc string,
        listed_time string,
        posting_domain string,
        sponsored string,
        work_type string,
        currency string,
        compensation_type string,
        normalized_salary string,
        zip_code string,
        fips string
    )
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
    LOCATION '{}'
    TBLPROPERTIES ('skip.header.line.count'='1')
    """.format(database_name, table_name, s3_private_path)

# Execute statement
pd.read_sql(statement, conn)
print("Created postings table")

In [None]:
# Load the postings table into a DataFrame. The table itself is queried here:
# `statement` still holds the CREATE TABLE text, which returns no table rows.
postings_df = pd.read_sql(
    "SELECT * FROM {}.{}".format(database_name, table_name), conn
)

In [None]:
table_name_2 = "salaries"

# SQL statement to create the salaries table over the raw CSV data.
#
# The previous column list (marketplace, customer_id, star_rating, ...) with a
# tab delimiter and gzip compression described a different dataset; the file
# copied above is an uncompressed, comma-separated "salaries.csv".
#
# NOTE(review): the column names and types below follow the public LinkedIn
# Job Postings dataset layout as I understand it - confirm against the CSV
# header row.
# NOTE(review): LOCATION is the shared raw_data/ prefix, which also holds
# postings.csv and job_skills.csv; Athena reads every object under LOCATION,
# so each file should live under its own prefix - TODO restructure the copy.
# NOTE(review): IF NOT EXISTS keeps any table already created with the old
# definition; drop that table first so this schema takes effect.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        salary_id bigint,
        job_id bigint,
        max_salary double,
        med_salary double,
        min_salary double,
        pay_period string,
        currency string,
        compensation_type string
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\\n'
    LOCATION '{}'
    TBLPROPERTIES ('skip.header.line.count'='1')
    """.format(database_name, table_name_2, s3_private_path)

# Execute statement
pd.read_sql(statement, conn)
print("Created salaries table")

In [None]:
# Load the salaries table into a DataFrame. The table itself is queried here:
# `statement` still holds the CREATE TABLE text, which returns no table rows.
salaries_df = pd.read_sql(
    "SELECT * FROM {}.{}".format(database_name, table_name_2), conn
)

In [None]:
table_name_3 = "job_skills"

# SQL statement to create the job_skills table over the raw CSV data.
#
# The previous column list (marketplace, customer_id, star_rating, ...) with a
# tab delimiter and gzip compression described a different dataset; the file
# copied above is an uncompressed, comma-separated "job_skills.csv".
#
# NOTE(review): the column names and types below follow the public LinkedIn
# Job Postings dataset layout as I understand it - confirm against the CSV
# header row.
# NOTE(review): LOCATION is the shared raw_data/ prefix, which also holds
# postings.csv and salaries.csv; Athena reads every object under LOCATION,
# so each file should live under its own prefix - TODO restructure the copy.
# NOTE(review): IF NOT EXISTS keeps any table already created with the old
# definition; drop that table first so this schema takes effect.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        job_id bigint,
        skill_abr string
    )
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\\n'
    LOCATION '{}'
    TBLPROPERTIES ('skip.header.line.count'='1')
    """.format(database_name, table_name_3, s3_private_path)

# Execute statement
pd.read_sql(statement, conn)
print("Created job_skills table")

In [None]:
# Load the job_skills table into a DataFrame. The table itself is queried here:
# `statement` still holds the CREATE TABLE text, which returns no table rows.
job_skills_df = pd.read_sql(
    "SELECT * FROM {}.{}".format(database_name, table_name_3), conn
)

### Verify tables have been created successfully

In [None]:
# List the tables registered in the database and show the first five
statement = f"SHOW TABLES in {database_name}"

df_show = pd.read_sql(sql=statement, con=conn)
df_show.head(n=5)

In [None]:
# Preview three rows of the postings table to check the data looks correct
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name}
    LIMIT 3
    """

pd.read_sql(sql=statement, con=conn)

In [None]:
# Preview three rows of the salaries table
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name_2}
    LIMIT 3
    """

pd.read_sql(sql=statement, con=conn)

In [None]:
# Preview three rows of the job_skills table
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name_3}
    LIMIT 3
    """

pd.read_sql(sql=statement, con=conn)

## Data Exploration