### This project is aimed at understanding the various Spark functions needed for batch processing.
#### Datasets
     - Stocks data from Kaggle
     - Data scientist jobs data from Kaggle


In [None]:
import pyspark

from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [None]:
# Create the Spark session used to read the CSV files in the cells below;
# getOrCreate() returns an existing session if one is already running.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Download the two dataset archives from the S3 bucket and unpack the CSV
# files they contain into the working directory.

import zipfile

import boto3

s3 = boto3.client('s3')  # S3 client used to fetch the archives

# (object key in the bucket, local file name to save it as)
archives = [
    ('all-us-stocks-tickers-company-info-logos.zip', 'us_stocks.zip'),
    ('data-scientist-job-market-in-the-us.zip', 'ds_jobs.zip'),
]

for key, local_zip in archives:
    s3.download_file('blossom-data-engs', key, local_zip)
    # The next cell reads plain CSV files, so extract them here. Only .csv
    # members are unpacked, to keep any non-CSV archive content out of the
    # working directory.
    # NOTE(review): assumes companies.csv and alldata.csv sit at the root of
    # these archives -- confirm against the actual zip contents.
    with zipfile.ZipFile(local_zip) as archive:
        csv_members = [m for m in archive.namelist() if m.lower().endswith('.csv')]
        archive.extractall(members=csv_members)

In [None]:
# Load both CSV files into DataFrames: the first row supplies the column
# names (header=True), column types are inferred from the data
# (inferSchema=True), and multiLine=True allows a record to span several lines.
companies = spark.read.csv("companies.csv", header=True, inferSchema=True,multiLine=True)
alldata = spark.read.csv("alldata.csv", header=True, inferSchema=True, multiLine=True)

In [62]:
# Rename the `description` column in both DataFrames so the two columns
# have distinct names (company_description vs. job_description).
companies = companies.withColumnRenamed('description','company_description')
alldata = alldata.withColumnRenamed('description','job_description')

In [63]:
# Merge the two DataFrames on rows where alldata's `company` equals
# companies' `company name`. No `how` argument is passed, so Spark's default
# inner join applies and rows without a match on either side are dropped.
merged_data = companies.join(alldata, alldata['company'] == companies['company name'])

In [65]:
# Add a `city` column holding the part of `location` before the first comma;
# all existing columns are kept ('*').
merged_data = merged_data.select('*', F.split(alldata['location'], ',')[0].alias('city'))

In [126]:
# Row counts of the two source DataFrames.
companies.count(), alldata.count()

(6512, 8714)

In [142]:
from pyspark.ml.feature import NGram, Tokenizer

In [143]:
#This function creates the ngrams from the dataframe using the specified column
def create_ngram(df, col, n=2):
    """Tokenize a text column and append its n-grams.

    Args:
        df: Spark DataFrame containing the text column.
        col: Name of the column to tokenize.
        n: Number of consecutive tokens per n-gram. Defaults to 2 (bigrams),
            the value that was previously hard-coded.

    Returns:
        A new DataFrame with two added columns: ``tokens`` (the output of
        ``Tokenizer`` applied to ``col``) and ``ngrams`` (the n-grams that
        ``NGram`` builds from ``tokens``).
    """
    tokenizer = Tokenizer(inputCol=col, outputCol='tokens')
    ngram = NGram(n=n, inputCol='tokens', outputCol='ngrams')
    # Tokenize first: NGram reads the 'tokens' column the tokenizer adds.
    return ngram.transform(tokenizer.transform(df))


In [149]:
# Function that builds the n-gram frequency DataFrame.
def create_freq_df(df, col):
    """Count how often each n-gram occurs for every value of ``col``.

    Args:
        df: Spark DataFrame that has an array column named ``ngrams``.
        col: Name of the column to group the n-grams by.

    Returns:
        A DataFrame with the columns ``col``, ``ngrams`` and ``frequency``,
        ordered by ``frequency`` from highest to lowest.
    """
    # One output row per (col value, single n-gram).
    exploded = df.select(col, F.explode('ngrams').alias('ngrams'))
    counts = exploded.groupBy([col, 'ngrams']).count()
    freq = counts.withColumnRenamed('count', 'frequency')
    return freq.orderBy(freq['frequency'].desc())


In [144]:
# Apply create_ngram to merged_data to build n-grams from the company_description column.
new_data = create_ngram(merged_data, 'company_description')

In [150]:
# Count how often each company-description n-gram occurs within each industry,
# most frequent first.
industry_freq_df = create_freq_df(new_data, 'industry')

In [151]:
# Preview the most frequent (industry, n-gram) pairs.
industry_freq_df.show()

+------------------+--------------------+---------+
|          industry|              ngrams|frequency|
+------------------+--------------------+---------+
|     Biotechnology|                is a|       52|
|Drug Manufacturers|        and vaccines|       47|
|     Biotechnology|             corp is|       42|
|     Biotechnology|            to treat|       41|
|     Biotechnology|  and commercializes|       36|
|     Biotechnology|          cancer and|       34|
|     Biotechnology|         designed to|       34|
|     Biotechnology|discovers, develops,|       34|
|     Biotechnology|        treat cancer|       34|
|     Biotechnology|  abraxane, istodax,|       34|
|     Biotechnology|        company that|       34|
|     Biotechnology| revlimid, thalomid,|       34|
|     Biotechnology|and immune-inflam...|       34|
|     Biotechnology|   related diseases.|       34|
|     Biotechnology|       diseases. its|       34|
|     Biotechnology|          brands are|       34|
|     Biotec

In [152]:
# Build n-grams from the job_description column, then count how often each
# one occurs within each city.
job_df = create_ngram(merged_data, 'job_description')
city_freq_df = create_freq_df(job_df,'city')

In [153]:
# Preview the most frequent (city, n-gram) pairs.
city_freq_df.show()

+---------+-----------------+---------+
|     city|           ngrams|frequency|
+---------+-----------------+---------+
|Cambridge|       ability to|      130|
|Cambridge|           in the|      130|
|Cambridge|           of the|      104|
|Cambridge|    experience in|       93|
|Cambridge|           to the|       85|
|San Diego|           in the|       82|
|Cambridge|             in a|       79|
|Cambridge|          and the|       78|
|Cambridge|             as a|       67|
|Cambridge|  experience with|       65|
|Cambridge|          to work|       65|
|Cambridge|equal opportunity|       64|
|Cambridge|        sanofi is|       59|
|Cambridge|          well as|       58|
|Cambridge|          as well|       57|
|Cambridge|     committed to|       56|
|Cambridge|        regard to|       55|
|Cambridge|          will be|       55|
|San Diego|           of the|       54|
|Cambridge|  gender identity|       48|
+---------+-----------------+---------+
only showing top 20 rows

