# LinkedIn Job Postings

## ADS 508 Impacting the Business with a Distributed Data Science Pipeline

In [1]:
# Import packages
import subprocess
import warnings

import boto3
import numpy as np
import pandas as pd
import sagemaker
from pyathena import connect

warnings.filterwarnings('ignore')

## Data Ingestion

### CSV to S3

In [2]:
# SageMaker context: the session, its default S3 bucket, and the execution role
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()

# AWS context: region of the default boto3 session and the caller's account id
region = boto3.Session().region_name
account_id = (
    boto3.client("sts")
    .get_caller_identity()
    .get("Account")
)

# SageMaker API client bound to that region
sm = boto3.Session().client(
    service_name="sagemaker",
    region_name=region,
)

In [3]:
# S3 source prefix holding the raw LinkedIn CSV files (described by the author
# as a public bucket). The value already ends with "/", so code that builds on
# it must not append another slash.
s3_public_path = "s3://linkedin-postings/raw_data/"

In [4]:
# Destination prefix for the raw files inside the session's default bucket
s3_private_path = f"s3://{bucket}/linkedin-postings/raw_data/"
print(s3_private_path)

s3://sagemaker-us-east-1-937572952481/linkedin-postings/raw_data/


In [5]:
# Copy data from Public S3 bucket to Private S3 bucket
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "postings.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "salaries.csv"
!aws s3 cp --recursive $s3_public_path/ $s3_private_path/ --exclude "*" --include "job_skills.csv"

In [6]:
# Show the destination prefix again; printing the path does not by itself
# verify that the copy succeeded
print(s3_private_path)

s3://sagemaker-us-east-1-937572952481/linkedin-postings/raw_data/


In [7]:
!aws s3 ls $s3_private_path/

### Create Athena Database 

In [8]:
# Name of the Athena database used by the CREATE DATABASE / CREATE TABLE cells
database_name = "linkedin_data"

In [9]:
# Temporary S3 directory handed to Athena for its query output
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [10]:
# Open a PyAthena connection for this region; s3_staging_dir is passed as the
# S3 staging location for the queries run through it
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [11]:
# Create the Athena database unless it is already present
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"

pd.read_sql(statement, conn)

### Verify database has been created

In [12]:
# List the databases visible through this connection; the new one should appear
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head()

Unnamed: 0,database_name
0,default
1,dsoaws
2,linkedin_data


### Create Athena Tables from CSV Files

In [13]:
table_name = "postings"

# Athena reads every object under a table's LOCATION, and the shared raw_data/
# prefix receives postings.csv, salaries.csv and job_skills.csv. Stage this
# table's file under its own prefix so the table only ever sees postings.csv.
postings_location = "{}/{}/".format(s3_private_path.rstrip("/"), table_name)
subprocess.run(
    ["aws", "s3", "cp", s3_public_path.rstrip("/") + "/postings.csv", postings_location],
    check=True,  # fail loudly instead of creating a table over an empty prefix
)

# Drop any earlier definition so the schema and LOCATION below always take effect
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name}', conn)

# DDL for the external postings table.
# NOTE(review): the source file is named *.csv and is copied uncompressed, yet
# the table is declared tab-delimited with compressionType=gzip - confirm the
# delimiter and compression settings against the actual file.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        job_id string,
        company_name string,
        title string,
        description string,
        max_salary float,
        pay_period string,
        location string,
        company_id string,
        views int,
        med_salary float,
        min_salary float,
        formatted_work_type string,
        applies int,
        original_listed_time string,
        remote_allowed string,
        job_posting_url string,
        application_url string,
        application_type string,
        expiry string,
        closed_time string,
        formatted_experience_level string,
        skills_desc string,
        listed_time string,
        posting_domain string,
        sponsored int,
        work_type string,
        currency string,
        compensation_type string,
        normalized_salary float,
        zip_code int,
        fips int
    ) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' 
    LINES TERMINATED BY '\\n' 
    LOCATION '{}' 
    TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')
    """.format(database_name, table_name, postings_location)

# Execute statement
pd.read_sql(statement, conn)
print("Created postings table")

Created postings table


In [14]:
# Load the postings table into a DataFrame by querying it. `statement` still
# holds the CREATE TABLE DDL at this point, so it must not be reused here.
# This pulls the whole table through Athena; add a LIMIT while iterating.
postings_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name}", conn)

In [15]:
table_name_2 = "salaries"

# Athena reads every object under a table's LOCATION, and the shared raw_data/
# prefix receives postings.csv, salaries.csv and job_skills.csv. Stage this
# table's file under its own prefix so the table only ever sees salaries.csv.
salaries_location = "{}/{}/".format(s3_private_path.rstrip("/"), table_name_2)
subprocess.run(
    ["aws", "s3", "cp", s3_public_path.rstrip("/") + "/salaries.csv", salaries_location],
    check=True,  # fail loudly instead of creating a table over an empty prefix
)

# Drop any earlier definition so the schema and LOCATION below always take effect
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_2}', conn)

# DDL for the external salaries table.
# NOTE(review): product_category is not part of the postings schema and may be
# a leftover from another dataset - confirm against the salaries.csv header.
# NOTE(review): the source file is named *.csv and is copied uncompressed, yet
# the table is declared tab-delimited with compressionType=gzip - confirm the
# delimiter and compression settings against the actual file.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        salary_id string,
        job_id string,
        max_salary float,
        med_salary float,
        min_salary float,
        pay_period string,
        product_category string,
        currency string,
        compensation_type string
    ) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' 
    LINES TERMINATED BY '\\n' 
    LOCATION '{}' 
    TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')
    """.format(database_name, table_name_2, salaries_location)

# Execute statement
pd.read_sql(statement, conn)
print("Created salaries table")

Created salaries table


In [16]:
# Load the salaries table into a DataFrame by querying it. `statement` still
# holds the CREATE TABLE DDL at this point, so it must not be reused here.
salaries_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name_2}", conn)

In [17]:
table_name_3 = "job_skills"

# Athena reads every object under a table's LOCATION, and the shared raw_data/
# prefix receives postings.csv, salaries.csv and job_skills.csv. Stage this
# table's file under its own prefix so the table only ever sees job_skills.csv.
job_skills_location = "{}/{}/".format(s3_private_path.rstrip("/"), table_name_3)
subprocess.run(
    ["aws", "s3", "cp", s3_public_path.rstrip("/") + "/job_skills.csv", job_skills_location],
    check=True,  # fail loudly instead of creating a table over an empty prefix
)

# Drop any earlier definition so the schema and LOCATION below always take effect
pd.read_sql(f'DROP TABLE IF EXISTS {database_name}.{table_name_3}', conn)

# DDL for the external job_skills table.
# NOTE(review): the source file is named *.csv and is copied uncompressed, yet
# the table is declared tab-delimited with compressionType=gzip - confirm the
# delimiter and compression settings against the actual file.
statement = """
    CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
        job_id string,
        skill_abr string
    ) 
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' 
    LINES TERMINATED BY '\\n' 
    LOCATION '{}' 
    TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1')
    """.format(database_name, table_name_3, job_skills_location)

# Execute statement
pd.read_sql(statement, conn)
print("Created job_skills table")

Created job_skills table


In [18]:
# Load the job_skills table into a DataFrame by querying it. `statement` still
# holds the CREATE TABLE DDL at this point, so it must not be reused here.
job_skills_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name_3}", conn)

### Verify tables have been created successfully

In [19]:
# List the tables registered in the database
statement = f"SHOW TABLES in {database_name}"

df_show = pd.read_sql(statement, conn)
df_show.head()

Unnamed: 0,tab_name
0,job_skills
1,postings
2,salaries


In [24]:
# Peek at the first three rows of the postings table as a sanity check
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name}
    LIMIT 3
    """

pd.read_sql(statement, conn)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date


In [21]:
# Peek at the first three rows of the salaries table as a sanity check
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name_2}
    LIMIT 3
    """

pd.read_sql(statement, conn)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date


In [22]:
# Peek at the first three rows of the job_skills table as a sanity check
statement = f"""
    SELECT * 
    FROM {database_name}.{table_name_3}
    LIMIT 3
    """

pd.read_sql(statement, conn)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date


## Data Exploration