# 26 Pandas Analyzing

Doing basic analytics using pandas e.g. mean, max, mix, etc., and also grouping results to analyse and identify trends.

In [28]:
# import pandas
import pandas as pd
from datasets import load_dataset

# load data
dataset = load_dataset("lukebarousse/data_jobs")
df = dataset["train"].to_pandas()

# Data cleaning - transform to date and time data type

df["job_posted_date"] = pd.to_datetime(df["job_posted_date"])

In [29]:
# we now have the details on the date column when calling the describe method that details all numerical columns

df.describe()

Unnamed: 0,job_posted_date,salary_year_avg,salary_hour_avg
count,785741,22003.0,10662.0
mean,2023-06-25 16:02:11.860248576,123286.274072,47.016598
min,2023-01-01 00:00:04,15000.0,8.0
25%,2023-03-20 10:05:48,90000.0,27.5
50%,2023-06-29 06:11:38,115000.0,45.98
75%,2023-09-27 01:01:16,150000.0,61.159996
max,2023-12-31 23:59:58,960000.0,391.0
std,,48312.449482,21.890738


In [30]:
# also check summary info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [31]:
# we can also get some additional info using additional methods
# numerical methods will return an error the moment a column with strings is included. We need to precise the columns we want

df["salary_year_avg"].median()

np.float64(115000.0)

In [32]:
# we always need to investigate the min and max values to ensure there is nothing wrong with the data

# the min = 15000 (as seen using the df.describe() method)

df["salary_year_avg"].min() # just checking we get the same min

np.float64(15000.0)

In [33]:
# we want to find the index value of the min value to investigate
# there's a method for that! df.idxmin()

min_salary = df["salary_year_avg"].idxmin()

df.iloc[min_salary]

# we find out that the job is in brazil. Looking into it further, 15000 isn't that bad of a salary in Brazil.

job_title_short                                              Data Engineer
job_title                                           Data Engineer - Hadoop
job_location                                                        Brazil
job_via                                                       via LinkedIn
job_schedule_type                                                Full-time
job_work_from_home                                                   False
search_location                                                     Brazil
job_posted_date                                        2023-12-09 10:05:30
job_no_degree_mention                                                 True
job_health_insurance                                                 False
job_country                                                         Brazil
salary_rate                                                           year
salary_year_avg                                                    15000.0
salary_hour_avg          

In [34]:
# we used the unique() method to find out the unique values.

df["job_title_short"].unique()

array(['Senior Data Engineer', 'Data Analyst', 'Data Engineer',
       'Business Analyst', 'Data Scientist', 'Machine Learning Engineer',
       'Senior Data Analyst', 'Cloud Engineer', 'Senior Data Scientist',
       'Software Engineer'], dtype=object)

In [35]:
# it'S cool and all, but not very useful. We want the unique value counts instead.

df["job_title_short"].value_counts()

job_title_short
Data Analyst                 196075
Data Engineer                186241
Data Scientist               172286
Business Analyst              49063
Software Engineer             44929
Senior Data Engineer          44563
Senior Data Scientist         36957
Senior Data Analyst           29216
Machine Learning Engineer     14080
Cloud Engineer                12331
Name: count, dtype: int64

In [36]:
# what if we want to dive deeper and combine the analysis we've done before e.g. what are the min salaries of each of the job titles
# we use the df.groupby() method

df.groupby("job_title_short")["salary_year_avg"].min()

job_title_short
Business Analyst             16500.0
Cloud Engineer               42000.0
Data Analyst                 25000.0
Data Engineer                15000.0
Data Scientist               27000.0
Machine Learning Engineer    30000.0
Senior Data Analyst          30000.0
Senior Data Engineer         35000.0
Senior Data Scientist        45000.0
Software Engineer            28000.0
Name: salary_year_avg, dtype: float64

In [37]:
df.groupby("job_title_short")["salary_year_avg"].median()

job_title_short
Business Analyst              85000.0
Cloud Engineer                90000.0
Data Analyst                  90000.0
Data Engineer                125000.0
Data Scientist               127500.0
Machine Learning Engineer    106415.0
Senior Data Analyst          111175.0
Senior Data Engineer         147500.0
Senior Data Scientist        155500.0
Software Engineer             99150.0
Name: salary_year_avg, dtype: float64

In [38]:
# we can also group by additional info e.g. country

df.groupby(["job_title_short", "job_country"])["salary_year_avg"].median()

job_title_short    job_country
Business Analyst   Afghanistan        NaN
                   Albania            NaN
                   Algeria            NaN
                   Angola             NaN
                   Argentina      71100.0
                                   ...   
Software Engineer  Venezuela          NaN
                   Vietnam        53600.0
                   Yemen              NaN
                   Zambia             NaN
                   Zimbabwe           NaN
Name: salary_year_avg, Length: 1387, dtype: float64

In [39]:
# we can perform aggregations on multiple columns. E.g. what if we also want to look at the hourly salary data

df.groupby("job_title_short")[["salary_year_avg", "salary_hour_avg"]].median()


Unnamed: 0_level_0,salary_year_avg,salary_hour_avg
job_title_short,Unnamed: 1_level_1,Unnamed: 2_level_1
Business Analyst,85000.0,40.362499
Cloud Engineer,90000.0,40.0
Data Analyst,90000.0,32.615002
Data Engineer,125000.0,58.5
Data Scientist,127500.0,47.5
Machine Learning Engineer,106415.0,47.535
Senior Data Analyst,111175.0,47.5
Senior Data Engineer,147500.0,61.5
Senior Data Scientist,155500.0,47.620003
Software Engineer,99150.0,47.560001


In [40]:
# what if I want to do multiple aggregations
# use the agg method

df.groupby("job_title_short")[["salary_year_avg", "salary_hour_avg"]].agg(["min", "max", "median"])

Unnamed: 0_level_0,salary_year_avg,salary_year_avg,salary_year_avg,salary_hour_avg,salary_hour_avg,salary_hour_avg
Unnamed: 0_level_1,min,max,median,min,max,median
job_title_short,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Business Analyst,16500.0,387460.0,85000.0,10.835,132.5,40.362499
Cloud Engineer,42000.0,280000.0,90000.0,18.5,91.0,40.0
Data Analyst,25000.0,650000.0,90000.0,8.0,391.0,32.615002
Data Engineer,15000.0,525000.0,125000.0,8.0,184.5,58.5
Data Scientist,27000.0,960000.0,127500.0,8.0,237.5,47.5
Machine Learning Engineer,30000.0,325000.0,106415.0,10.0,87.5,47.535
Senior Data Analyst,30000.0,425000.0,111175.0,13.0,130.0,47.5
Senior Data Engineer,35000.0,425000.0,147500.0,12.5,170.0,61.5
Senior Data Scientist,45000.0,890000.0,155500.0,17.5,150.0,47.620003
Software Engineer,28000.0,375000.0,99150.0,11.0,102.5,47.560001


# 26 Problem

## 1.26.1

Calculate the mean of the salary_year_avg column and display the result.

In [45]:
mean_salary = df["salary_year_avg"].mean()
mean_salary

np.float64(123286.27407182401)

## 1.26.2

Calculate the median of the salary_year_avg column and display the result.

In [46]:
median_salary = df["salary_year_avg"].median()
median_salary

np.float64(115000.0)

## 1.26.3

Group the DataFrame by job_title_short and calculate the average salary_hour_avg for each job title. Display the result.

In [48]:
avg_hourly_by_job = df.groupby("job_title_short")["salary_hour_avg"].mean()
avg_hourly_by_job

job_title_short
Business Analyst             42.723415
Cloud Engineer               45.560715
Data Analyst                 38.106918
Data Engineer                57.196998
Data Scientist               49.361403
Machine Learning Engineer    48.477935
Senior Data Analyst          47.661218
Senior Data Engineer         62.848496
Senior Data Scientist        54.105631
Software Engineer            46.452019
Name: salary_hour_avg, dtype: float64

## 1.26.4

Find the job title with the lowest average hourly salary and display the job title and the salary.

In [50]:
min_salary = df["salary_hour_avg"].idxmin()

lowest_hourly = df.iloc[min_salary][["job_title_short", "salary_hour_avg"]]

lowest_hourly

job_title_short    Data Scientist
salary_hour_avg               8.0
Name: 88185, dtype: object

## 1.26.5

Group the DataFrame by job_country and calculate the total number of job postings for each country. Display the result.

In [51]:
job_by_country = df.groupby("job_country").size()

job_by_country

job_country
Afghanistan      16
Albania         119
Algeria         111
Angola           11
Argentina      8736
               ... 
Venezuela        69
Vietnam        2414
Yemen            10
Zambia          104
Zimbabwe         81
Length: 160, dtype: int64