In [1]:
import pyspark
import boto3

from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import IntegerType


In [None]:
#Download files from s3 blossom-data-engs bucket
s3_client = boto3.client('s3')
s3_client.download_file('blossom-data-engs','all-us-stocks-tickers-company-info-logos.zip', 'us-stocks.zip')
s3_client.download_file('blossom-data-engs','data-scientist-job-market-in-the-us.zip', 'data-scentist.zip')

In [2]:
#Load csv files into company and alldata frames
spark = SparkSession.builder.getOrCreate()
companies = spark.read.csv(
            "companies.csv", 
            header=True, inferSchema=True)

alldata = spark.read.csv( "alldata.csv", 
            header=True, inferSchema=True)

In [3]:
#Check to see if location column in dataframe loads fine
alldata.select('location').show()

+--------------------+
|            location|
+--------------------+
|                null|
|                 GA.|
|                null|
|            database|
|                null|
| has served as on...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
| has an open posi...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [48]:
companies.count()
companies.columns
alldata.columns
#Rename description column in campanies to des
companies = companies.withColumnRenamed('description', 'description2')
companies = companies.filter(companies.description2.isNotNull())
alldata = alldata.filter(alldata.description.isNotNull())


In [49]:
alldata.select('location').show()

+--------------------+
|            location|
+--------------------+
|                null|
|                 GA.|
|            database|
| has served as on...|
|                null|
|                null|
| has an open posi...|
|                null|
| or other quantit...|
| Algorithms/Incor...|
|                null|
|                null|
| regulatory and p...|
|                null|
|                null|
|                null|
|                null|
|                null|
| impactful solutions|
|                null|
+--------------------+
only showing top 20 rows



In [50]:
#Join the two data set and assign to inner_join dataframe
inner_join = companies.join(alldata, companies['company name'] == alldata.company)

In [51]:
inner_join.select('position', 'company','location','ticker', 'industry', 'description').show()

+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
|            position|        company|            location|              ticker|            industry|         description|
+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
|Experience follow...|         design|                null| The fund general...|http://www.invesc...|                code|
|Facilitates the d...|    development|                null| The fund general...| sale or distribu...| and planning of ...|
|Facilitates the d...|    development|                null| The fund general...| services and pro...| and planning of ...|
|              Travel| transportation|                null| The Shares are i...| they provide inv...| and moving expen...|
|Apply health scie...|    development| and risk factors...| The fund general...| sale or distribu...| and conduct of s...|
|Apply health sc

In [52]:
from pyspark.ml.feature import NGram, Tokenizer

#creaet an Ngram function to generate from description
def ngram_func(num, job):
    tokens = Tokenizer(inputCol='description', outputCol='tokens')
    jobs = tokens.transform(job)
    nums = NGram(n= num, inputCol='tokens', outputCol="ngram")
    jobs = nums.transform(jobs)
    return jobs

In [53]:
#Create a Bigram with the joint data set an assign to job1 dataframe
job1 = ngram_func(2, inner_join)

In [74]:
#create a unigram 
job2 = ngram_func(1, inner_join)

In [54]:
job1.columns

['ticker',
 'company name',
 'short name',
 'industry',
 'description2',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3',
 'position',
 'company',
 'description',
 'reviews',
 'location',
 'tokens',
 'ngram']

In [55]:
job1.select('location').show()

+--------------------+
|            location|
+--------------------+
|                null|
|                null|
|                null|
|                null|
| and risk factors...|
| and risk factors...|
|                null|
|                null|
| implement and/or...|
|                null|
| and deployment o...|
| and deployment o...|
| re-designing cod...|
| re-designing inf...|
|        publications|
|                null|
| and water sector...|
|                null|
|                null|
| and Use public t...|
+--------------------+
only showing top 20 rows



In [56]:
job1.select('ngram').show()

+--------------------+
|               ngram|
+--------------------+
|             [ code]|
|[ and, and planni...|
|[ and, and planni...|
|[ and, and moving...|
|[ and, and conduc...|
|[ and, and conduc...|
|[ and, and moving...|
|[ and, and moving...|
|            [ build]|
|[ and, and moving...|
|         [ building]|
|         [ building]|
|[ and, and implem...|
|[ and, and implem...|
|[ and, and analys...|
|[ and, and manage...|
|            [ power]|
|[ and, and implem...|
|[ and, and implem...|
|[ and, and land-u...|
+--------------------+
only showing top 20 rows



In [57]:
#Create  a split on the the location column and append with an alias called city and assign to new_jobs dataframe
new_jobs = job1.select(
    'industry', 'ngram','location', F.split(job1['location'], ',')[0].alias('city'))

In [75]:
new_jobs2 = job2.select(
    'industry', 'ngram','location', F.split(job2['location'], ',')[0].alias('city'))

In [58]:
new_jobs.show()

+--------------------+--------------------+--------------------+--------------------+
|            industry|               ngram|            location|                city|
+--------------------+--------------------+--------------------+--------------------+
|http://www.invesc...|             [ code]|                null|                null|
| sale or distribu...|[ and, and planni...|                null|                null|
| services and pro...|[ and, and planni...|                null|                null|
| they provide inv...|[ and, and moving...|                null|                null|
| sale or distribu...|[ and, and conduc...| and risk factors...| and risk factors...|
| services and pro...|[ and, and conduc...| and risk factors...| and risk factors...|
| they provide inv...|[ and, and moving...|                null|                null|
| they provide inv...|[ and, and moving...|                null|                null|
|http://www.invesc...|            [ build]| implement 

In [62]:
#Create a function to reuse the Ngram DataFrame
def data_frequency(input_job, input_col):
    q= input_job.select(input_col, F.explode('ngram').alias('ngram')).groupby([ 'ngram', input_col]).count()
    q = q.withColumnRenamed('count', 'frequency')
    q.orderBy(q.frequency.desc())
    return q
    
    

In [63]:
#Pass new_jobs dataframe  and city column 
dt1= data_frequency(new_jobs, 'city')


In [64]:
dt.show()

+--------------------+--------------------+---------+
|               ngram|                city|frequency|
+--------------------+--------------------+---------+
|         relating to|                null|        1|
|philadelphia; us-...|                null|        1|
|     montpelier; us-|                null|        1|
|           access to|                null|        1|
|                 air|                null|        1|
|   qualifications in|                null|        1|
|       candidate who|                null|        1|
|   broader technical|                null|        1|
|       landscape for|           Cambridge|        1|
|markets.position ...|           Cambridge|        1|
|optimization, rec...|           Cambridge|        1|
|             glp and|           Cambridge|        1|
|    in microbiology,|           Cambridge|        1|
|     good laboratory|           Cambridge|        1|
|            in early| target expressio...|        2|
|         for quality|      

In [65]:
#pass industry dataframe
dt2= data_frequency(new_jobs, 'industry')

In [66]:
dt2.show()

+--------------------+--------------------+---------+
|               ngram|            industry|frequency|
+--------------------+--------------------+---------+
|implementation ac...| services and pro...|        1|
|         of projects| services and pro...|        1|
| new instrumentation| sale or distribu...|        2|
| collaboratively and| sale or distribu...|        1|
|  opportunities with|http://www.invesc...|        1|
|       project needs|Engineering & Con...|        1|
|     next generation|http://www.invesc...|        3|
|       data curation|Application Software|        1|
|       breaking down| sale or distribu...|        1|
|            research| sale or distribu...|        1|
|        will support|  Drug Manufacturers|        2|
|deliverables supp...|  Drug Manufacturers|        1|
|         of position| sale or distribu...|        1|
|       in cambridge,|  Drug Manufacturers|        4|
|               in an|  Drug Manufacturers|        1|
|         of research|      

In [124]:
x= new_jobs.select(
'industry', 
F.explode('ngram').alias('ngram')
).groupby(['ngram', 'industry']).count()

x = x.withColumnRenamed('count', 'frequency')
x.orderBy(x.frequency.desc())

DataFrame[ngram: string, industry: string, frequency: bigint]

In [125]:
x.show()

+--------------------+--------------------+---------+
|               ngram|            industry|frequency|
+--------------------+--------------------+---------+
|implementation ac...| services and pro...|        1|
|         of projects| services and pro...|        1|
| new instrumentation| sale or distribu...|        2|
| collaboratively and| sale or distribu...|        1|
|  opportunities with|http://www.invesc...|        1|
|       project needs|Engineering & Con...|        1|
|     next generation|http://www.invesc...|        3|
|       data curation|Application Software|        1|
|       breaking down| sale or distribu...|        1|
|            research| sale or distribu...|        1|
|        will support|  Drug Manufacturers|        2|
|deliverables supp...|  Drug Manufacturers|        1|
|         of position| sale or distribu...|        1|
|       in cambridge,|  Drug Manufacturers|        4|
|               in an|  Drug Manufacturers|        1|
|         of research|      

In [None]:
new_jobs

In [77]:
#Passing unigram
dt3= data_frequency(new_jobs2, 'city')
dt3.show()

+---------------+--------------------+---------+
|          ngram|                city|frequency|
+---------------+--------------------+---------+
|          novel| gene expression ...|        2|
|        limited|            military|        6|
|      involving|                null|        4|
|        level).|                null|        1|
|         online|                null|        1|
|           some|                null|        3|
|    performance|           Cambridge|        2|
|       guidance|           Cambridge|        1|
|       projects| marketing and bu...|        1|
|      antibody,|                null|        1|
|     pipelines,|                null|        1|
|          cures|                null|        2|
|       amazon’s| globally. You wi...|        1|
|       meetings| data prioritizat...|        1|
|         travel|                null|        2|
|        elevate|                null|        1|
|      standards|                null|        1|
|            125|   

In [78]:
dt4= data_frequency(new_jobs2, 'location')
dt4.show()

+---------------+--------------------+---------+
|          ngram|            location|frequency|
+---------------+--------------------+---------+
|          novel| gene expression ...|        2|
|        limited|            military|        6|
|      involving|                null|        4|
|        level).|                null|        1|
|         online|                null|        1|
|           some|                null|        3|
|       systems,|       Cambridge, MA|        1|
|           safe|       Cambridge, MA|        1|
|       projects| marketing and bu...|        1|
|      antibody,|                null|        1|
|     pipelines,|                null|        1|
|          cures|                null|        2|
|       amazon’s| globally. You wi...|        1|
|       meetings| data prioritizat...|        1|
|         travel|                null|        2|
|        elevate|                null|        1|
|      standards|                null|        1|
|            125|   