In [None]:
import boto3

#initialise s3 client
s3 = boto3.client('s3', region_name='eu-west-1')
#Download file from S3 bucket
bucket = 'blossom-data-engs'
s3.download_file(bucket, 'alldata.csv', 'alldata.csv')
s3.download_file(bucket, 'companies.csv', 'companies.csv')

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as d
from pyspark.sql import Window
from pyspark import SparkConf 
from pyspark.context import SparkContext
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.ml.feature import NGram,Tokenizer

In [10]:
# create spark session
spark = SparkSession.builder.getOrCreate()

In [11]:
companies = spark.read.csv('companies.csv', header=True, inferSchema=True, multiLine=True)
companies.count()

6512

In [12]:
alldata = spark.read.csv('alldata.csv', header=True, inferSchema=True, multiLine=True)
alldata.count()

13513

In [13]:
companies.show()

+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+------------+------------------+------------------+--------------------+--------------------+
|ticker|        company name|          short name|            industry|         description|             website|    logo|                 ceo|            exchange|  market cap|            sector|             tag 1|               tag 2|              tag 3
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+------------+------------------+------------------+--------------------+--------------------+
|     A|Agilent Technolog...|             Agilent|Medical Diagnosti...|Agilent Technolog...|http://www.agilen...|   A.png| Michael R. McMullen|New York Stock Ex...| 24218068096|        Healthcare|        Healthcare|Diagnostics &

In [62]:
alldata.select('location\r').show()

+--------------------+
|           location
+--------------------+
|  Atlanta, GA 30301 |
|                null|
|                null|
| has an open posi...|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
|                null|
| or other quantit...|
| Algorithms/Incor...|
|                null|
|                null|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [63]:
alldata = alldata.withColumnRenamed('location\r', 'location')

In [89]:
companies.columns
companies = companies.filter(companies.description_name.isNotNull())
alldata = alldata.filter(alldata.description.isNotNull())

In [90]:
companies.columns

['ticker',
 'company_name',
 'short name',
 'industry',
 'description_name',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3\r']

In [91]:
#setting alias and do an inner join
com = companies.alias('com')
ald = alldata.alias('ald')
innerjoin = com.join(ald, com.company_name == ald.company, 'inner')

In [92]:
innerjoin.columns

['ticker',
 'company_name',
 'short name',
 'industry',
 'description_name',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3\r',
 'position',
 'company',
 'description',
 'reviews',
 'location']

In [93]:
#select the joined columns
innerjoin.select('location', 'company').show()

+--------------------+--------------------+
|            location|             company|
+--------------------+--------------------+
|          Austin, TX|   Cubic Corporation|
|          Austin, TX| The Hershey Company|
|          Austin, TX| Abbott Laboratories|
|          Austin, TX| Centene Corporation|
|    Austin, TX 78746|           eBay Inc.|
|         Boulder, CO|        Trimble Inc.|
|    Boston, MA 02210|   Cabot Corporation|
|    Boston, MA 02108| Celgene Corporation|
|   Chicago, IL 60664|               AECOM|
|         Chicago, IL|  Kemper Corporation|
|         Chicago, IL|Discover Financia...|
|   Chicago, IL 60664|               AECOM|
|   Chicago, IL 60661|          TransUnion|
|         Chicago, IL|       Wipro Limited|
|         Chicago, IL| Synchrony Financial|
|   Chicago, IL 60661|          TransUnion|
|         Chicago, IL|Discover Financia...|
|   Chicago, IL 60661|          TransUnion|
|                null|Vanda Pharmaceuti...|
|Washington, DC 20037|Vanda Phar

In [152]:
#function to generate N-gram
def Ngm(datainput, n):
    tokens = Tokenizer(inputCol='description', outputCol='tokens')
    dfs = tokens.transform(datainput)
    ngrm = NGram(n=n, inputCol='tokens', outputCol='ngram')
    dfs = ngrm.transform(dfs)
    return dfs

In [155]:
ng1 = Ngm(innerjoin, 1)

In [154]:
ng2 = Ngm(innerjoin,n=2)

In [156]:
ng1.columns

['ticker',
 'company_name',
 'short name',
 'industry',
 'description_name',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3\r',
 'position',
 'company',
 'description',
 'reviews',
 'location',
 'tokens',
 'ngram']

In [157]:
ng2.columns

['ticker',
 'company_name',
 'short name',
 'industry',
 'description_name',
 'website',
 'logo',
 'ceo',
 'exchange',
 'market cap',
 'sector',
 'tag 1',
 'tag 2',
 'tag 3\r',
 'position',
 'company',
 'description',
 'reviews',
 'location',
 'tokens',
 'ngram']

In [158]:
ng1.select('location').show()

+--------------------+
|            location|
+--------------------+
|          Austin, TX|
|          Austin, TX|
|          Austin, TX|
|          Austin, TX|
|    Austin, TX 78746|
|         Boulder, CO|
|    Boston, MA 02210|
|    Boston, MA 02108|
|   Chicago, IL 60664|
|         Chicago, IL|
|         Chicago, IL|
|   Chicago, IL 60664|
|   Chicago, IL 60661|
|         Chicago, IL|
|         Chicago, IL|
|   Chicago, IL 60661|
|         Chicago, IL|
|   Chicago, IL 60661|
|                null|
|Washington, DC 20037|
+--------------------+
only showing top 20 rows



In [159]:
ng2.select('location').show()

+--------------------+
|            location|
+--------------------+
|          Austin, TX|
|          Austin, TX|
|          Austin, TX|
|          Austin, TX|
|    Austin, TX 78746|
|         Boulder, CO|
|    Boston, MA 02210|
|    Boston, MA 02108|
|   Chicago, IL 60664|
|         Chicago, IL|
|         Chicago, IL|
|   Chicago, IL 60664|
|   Chicago, IL 60661|
|         Chicago, IL|
|         Chicago, IL|
|   Chicago, IL 60661|
|         Chicago, IL|
|   Chicago, IL 60661|
|                null|
|Washington, DC 20037|
+--------------------+
only showing top 20 rows



In [163]:
#split the location column and append with an alias called city and assign to a new dataframe
newdf = ng1.select('industry', 'ngram', 'location', d.split(ng1['location'], ',')[0].alias('city'))

In [164]:
newdf1 = ng2.select('industry', 'ngram', 'location', d.split(ng2['location'], ',')[0].alias('city'))

In [165]:
newdf.show()

+--------------------+--------------------+--------------------+----------+
|            industry|               ngram|            location|      city|
+--------------------+--------------------+--------------------+----------+
| Aerospace & Defense|[job, summary:, ,...|          Austin, TX|    Austin|
|Consumer Packaged...|[job, title:, foo...|          Austin, TX|    Austin|
|     Medical Devices|[at, abbott,, we'...|          Austin, TX|    Austin|
|   Health Care Plans|[provide, vision,...|          Austin, TX|    Austin|
|Retail - Apparel ...|[at, ebay,, you, ...|    Austin, TX 78746|    Austin|
|   Computer Hardware|[data, scientist,...|         Boulder, CO|   Boulder|
|           Chemicals|[position, summar...|    Boston, MA 02210|    Boston|
|       Biotechnology|[other, locations...|    Boston, MA 02108|    Boston|
|Engineering & Con...|[aecom, is, activ...|   Chicago, IL 60664|   Chicago|
|           Insurance|[position, summar...|         Chicago, IL|   Chicago|
|     Credit

In [166]:
newdf.select('industry','ngram','location', 'city')

DataFrame[industry: string, ngram: array<string>, location: string, city: string]

In [167]:
#A function to create 2 spark DataFrames which has 3 columns
def data_frames(inputframe, colinput):
    qry = inputframe.select(colinput, d.explode('ngram').alias('ngram')).groupby(['ngram',colinput]).count()
    qry = qry.withColumnRenamed('count', 'frequency')
    qry.orderBy(qry.frequency.desc())
    return qry

In [168]:
#pass bigram and new spark dataframe with city column
spd = data_frames(newdf, 'city')

In [169]:
spd.show()

+------------+-------+---------+
|       ngram|   city|frequency|
+------------+-------+---------+
|integration,| Austin|        1|
|       siri,| Austin|        1|
|   excellent| Austin|        1|
|    relevant| Austin|        6|
|           –| Austin|        1|
|         key|Boulder|        1|
|         her|Boulder|        1|
|      status| Boston|        2|
|   establish|Chicago|        1|
|       teams|Chicago|       12|
|         (or|Chicago|        3|
| skillsshare|Chicago|        1|
|      401(k)|Chicago|        1|
|      master|Chicago|        1|
|   required:|Chicago|        2|
|     systems|Chicago|        2|
|     cutting|Chicago|        2|
|  strategies|Chicago|        3|
|        true|Chicago|        1|
|    optimize|Chicago|        1|
+------------+-------+---------+
only showing top 20 rows



In [171]:
#pass bigram and new spark dataframe with industry column
spd2 = data_frames(newdf, 'industry')

In [173]:
spd2.show()

+--------------------+--------------------+---------+
|               ngram|            industry|frequency|
+--------------------+--------------------+---------+
|           barkthins|Consumer Packaged...|        1|
|             creates|Consumer Packaged...|        1|
|        formulation,|Consumer Packaged...|        1|
|            payments|   Health Care Plans|        2|
|           retention|   Health Care Plans|        1|
|       interpersonal|   Health Care Plans|        1|
|           business.|Retail - Apparel ...|        1|
|          employment|Retail - Apparel ...|        3|
|        development,|   Computer Hardware|        1|
|                  at|   Computer Hardware|        1|
|        agriculture,|   Computer Hardware|        1|
|                some|Engineering & Con...|        4|
|              signed|Engineering & Con...|        6|
|implementationstrong|           Insurance|        1|
|                 -in|           Insurance|        1|
|              highly|     C

In [175]:
#pass unigram and new spark dataframe with city column
spd3 = data_frames(newdf1, 'city')
spd3.show()

+--------------------+-------+---------+
|               ngram|   city|frequency|
+--------------------+-------+---------+
|  related challenges| Austin|        1|
|     college degree,| Austin|        1|
|   opportunities for| Austin|        1|
|         skills used| Austin|        1|
|      color, gender,| Austin|        1|
|        life. today,| Austin|        1|
|          study data| Austin|        1|
| economics functions| Austin|        1|
|        and content.| Austin|        1|
|    proven technical| Austin|        1|
|      versatile work| Austin|        1|
|         sex, sexual| Austin|        1|
|            taking a|Boulder|        1|
|professional comm...|Boulder|        1|
|        when needed.|Boulder|        1|
|      and logistics.|Boulder|        1|
|     and affirmative|Boulder|        1|
|   protected factor.|Boulder|        1|
|      development of| Boston|        2|
|          key trends| Boston|        1|
+--------------------+-------+---------+
only showing top

In [176]:
#pass unigram and new spark dataframe with industry column
spd3 = data_frames(newdf1, 'industry')
spd3.show()

+--------------------+--------------------+---------+
|               ngram|            industry|frequency|
+--------------------+--------------------+---------+
|   between disparate| Aerospace & Defense|        1|
|            in clear| Aerospace & Defense|        1|
|analysis methodology| Aerospace & Defense|        1|
|      manner through| Aerospace & Defense|        1|
|          and around|Consumer Packaged...|        1|
|     master’s degree|     Medical Devices|        2|
|    understanding of|   Health Care Plans|        1|
|         will ensure|   Health Care Plans|        1|
|            within a|   Health Care Plans|        2|
|    complex analyses|   Health Care Plans|        1|
|machines, supervi...|   Health Care Plans|        1|
|theoretical knowl...|   Health Care Plans|        1|
|           this role|Retail - Apparel ...|        3|
|deployment infras...|Retail - Apparel ...|        1|
|       aggregate and|Retail - Apparel ...|        1|
|   computing systems|Retail