In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType

In [2]:
# No visible cell creates `spark`, so obtain the session explicitly: getOrCreate()
# returns the already-active session when the kernel provides one, and starts a
# new one on a fresh kernel, keeping Restart & Run All working either way.
spark = SparkSession.builder.getOrCreate()

# Load the salaries CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
csv = spark.read.csv('CourseFiles/Latest_Data_Science_Salaries.csv', header=True, inferSchema=True)

In [3]:
# List the column names of the loaded DataFrame.
csv.columns

['Job Title',
 'Employment Type',
 'Experience Level',
 'Expertise Level',
 'Salary',
 'Salary Currency',
 'Company Location',
 'Salary in USD',
 'Employee Residence',
 'Company Size',
 'Year']

In [4]:
# Register the DataFrame as the temporary view "DataScienceSalaries".
# NOTE(review): no visible cell queries this view — confirm it is still needed.
csv.createOrReplaceTempView("DataScienceSalaries")

In [5]:
# Average 2023 USD salary of senior Data Engineers per company location,
# highest first.
#
# All predicates run before any projection, so each one only references columns
# that are still present (the earlier select() dropped 'Experience Level' before
# filtering on it). Year is compared with an integer, matching the integer type
# inferred for that column, instead of relying on an implicit string cast.
# The initial select() is omitted: groupBy/agg alone determine the output columns.
(
    csv
    .filter(
        (func.col('Job Title') == 'Data Engineer')
        & (func.col('Year') == 2023)
        & (func.col('Experience Level') == 'Senior')
    )
    .groupBy('Company Location')
    .agg(func.mean('Salary in USD').alias('Average USD Salaries'))
    .orderBy(func.desc('Average USD Salaries'))
    .withColumnRenamed('Company Location', 'Country')
    .show()
)

+--------------+--------------------+
|       Country|Average USD Salaries|
+--------------+--------------------+
| United States|  168786.66101694916|
|       Germany|            141479.0|
|United Kingdom|            129320.0|
|        Canada|            127500.0|
|       Ireland|            102569.0|
|        Sweden|             86374.0|
|      Colombia|             66000.0|
|     Argentina|             65000.0|
|      Portugal|             53983.5|
|       Estonia|             50529.0|
+--------------+--------------------+



In [6]:
# Number of rows whose job title is exactly 'Data Engineer'.
csv.where(func.col('Job Title') == 'Data Engineer').count()

702

In [7]:
# Mean USD salary for each job title, sorted from highest to lowest;
# show() prints only the first 20 rows by default.
(
    csv
    .groupBy('Job Title')
    .agg(func.mean('Salary in USD').alias('Average Salaries'))
    .orderBy(func.col('Average Salaries').desc())
    .show()
)

+--------------------+------------------+
|           Job Title|  Average Salaries|
+--------------------+------------------+
|Analytics Enginee...|          399880.0|
|Data Science Tech...|          375000.0|
|Managing Director...|          300000.0|
|  AWS Data Architect|          258000.0|
|Cloud Data Architect|          250000.0|
|        AI Architect|          237484.0|
| Data Analytics Lead|          211255.5|
|Director of Data ...|209800.31578947368|
|Principal Data Sc...|193988.44444444444|
|Principal Data En...|          192500.0|
|Machine Learning ...|189538.76923076922|
|         ML Engineer|188072.42372881356|
|Data Science Manager|186074.05084745763|
|Staff Machine Lea...|          185000.0|
|Head of Data Science|182629.63636363635|
|        Head of Data|          181454.4|
|   Applied Scientist|  180935.306122449|
|   Research Engineer|176613.52631578947|
|Business Intellig...|          175522.5|
|Machine Learning ...|         173787.35|
+--------------------+------------------+

In [9]:
import pandas as pd

In [11]:
# Convert the Spark DataFrame into a pandas DataFrame for local inspection.
# toPandas() collects every row to the driver, so this is only suitable for
# data that fits in memory.
df = csv.toPandas()

In [12]:
# Preview the first five rows of the pandas DataFrame.
df.head()

Unnamed: 0,Job Title,Employment Type,Experience Level,Expertise Level,Salary,Salary Currency,Company Location,Salary in USD,Employee Residence,Company Size,Year
0,Data Engineer,Full-Time,Senior,Expert,210000,United States Dollar,United States,210000,United States,Medium,2023
1,Data Engineer,Full-Time,Senior,Expert,165000,United States Dollar,United States,165000,United States,Medium,2023
2,Data Engineer,Full-Time,Senior,Expert,185900,United States Dollar,United States,185900,United States,Medium,2023
3,Data Engineer,Full-Time,Senior,Expert,129300,United States Dollar,United States,129300,United States,Medium,2023
4,Data Scientist,Full-Time,Senior,Expert,140000,United States Dollar,United States,140000,United States,Medium,2023


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Salary,Salary in USD,Year
count,3300.0,3300.0,3300.0
mean,204662.3,142095.983939,2022.495455
std,727938.3,69028.235512,0.716355
min,14000.0,15000.0,2020.0
25%,94169.0,90000.0,2022.0
50%,140000.0,136000.0,2023.0
75%,190000.0,185000.0,2023.0
max,30400000.0,450000.0,2023.0


In [14]:
# Print per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3300 entries, 0 to 3299
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           3300 non-null   object
 1   Employment Type     3300 non-null   object
 2   Experience Level    3300 non-null   object
 3   Expertise Level     3300 non-null   object
 4   Salary              3300 non-null   int32 
 5   Salary Currency     3300 non-null   object
 6   Company Location    3300 non-null   object
 7   Salary in USD       3300 non-null   int32 
 8   Employee Residence  3300 non-null   object
 9   Company Size        3300 non-null   object
 10  Year                3300 non-null   int32 
dtypes: int32(3), object(8)
memory usage: 245.0+ KB
