# 25 Pandas Cleaning

In [1]:
# importing pandas and loading the data in a dataframe

import pandas as pd
from datasets import load_dataset

dataset = load_dataset("lukebarousse/data_jobs")
df = dataset["train"].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


## 25.1 Check df info

In [2]:
# first thing we'll do is check the dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   job_title_short        785741 non-null  object 
 1   job_title              785740 non-null  object 
 2   job_location           784696 non-null  object 
 3   job_via                785733 non-null  object 
 4   job_schedule_type      773074 non-null  object 
 5   job_work_from_home     785741 non-null  bool   
 6   search_location        785741 non-null  object 
 7   job_posted_date        785741 non-null  object 
 8   job_no_degree_mention  785741 non-null  bool   
 9   job_health_insurance   785741 non-null  bool   
 10  job_country            785692 non-null  object 
 11  salary_rate            33067 non-null   object 
 12  salary_year_avg        22003 non-null   float64
 13  salary_hour_avg        10662 non-null   float64
 14  company_name           785723 non-nu

## 25.2 Change data type (to date time type)

In [4]:
# the job_posted_date is stored as an object i.e. a string. We want to change it's type to do a whole lot with it!
# we'll use the pd.to_datetime() function

pd.to_datetime(df.job_posted_date)


0        2023-06-16 13:44:15
1        2023-01-14 13:18:07
2        2023-10-10 13:14:55
3        2023-07-04 13:01:41
4        2023-08-07 14:29:36
                 ...        
785736   2023-03-13 06:16:16
785737   2023-03-12 06:18:18
785738   2023-03-12 06:32:36
785739   2023-03-12 06:32:15
785740   2023-03-13 06:16:31
Name: job_posted_date, Length: 785741, dtype: datetime64[ns]

In [6]:
# but doing that, the original data in the dataframe hasn't been changed.
# we need to assign the new changed format to itself in the dataframe to modify it at the source.

df["job_posted_date"] = pd.to_datetime(df.job_posted_date) # to reassign a value, we use the brackets rather than the dot syntax 

In [8]:
# and now the column type has changed

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

## 25.3 Create a new column

In [9]:
# there are now a lot of other properties available. if we write df.job_posted_date.dt. there are a lot of options

df.job_posted_date.dt.month # this is a property and not a method, so we don't have to use the brackets.

0          6
1          1
2         10
3          7
4          8
          ..
785736     3
785737     3
785738     3
785739     3
785740     3
Name: job_posted_date, Length: 785741, dtype: int32

In [11]:
# but that month value is nowhere in our dataframe. We want to update it
# we need to add a new column

df["job_posted_month"] = df.job_posted_date.dt.month # we need to use the brackets to designate the name of the new column

In [13]:
df.info()

# we now have the job_posted_month column.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

## 25.4 Sort the df by a certain column

In [17]:
# let's now organise our data in chronological order, from first to last published using the job_posted_date column
# we will use the pd.DataFrame.sort_values method

df.sort_values(by="job_posted_date", inplace=True)

# the inplace=True modifies the actual dataframe and sorts everything in chronological order. Otherwise, it's just a "view" of the df in order.

In [18]:
# now in chronological order

df

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,job_posted_month
108804,Data Analyst,Data Analyst,"New York, NY",via CareerBuilder,Full-time,False,"New York, United States",2023-01-01 00:00:04,False,False,United States,,,,Metasys Technologies,"['sql', 'snowflake', 'visio', 'jira', 'conflue...","{'analyst_tools': ['visio'], 'async': ['jira',...",1
96906,Data Analyst,Data Analyst,"Bloomfield, CT",via CareerBuilder,Full-time,False,"New York, United States",2023-01-01 00:00:07,True,False,United States,,,,Diverse Lynx,"['sql', 'sas', 'sas']","{'analyst_tools': ['sas'], 'programming': ['sq...",1
98438,Data Analyst,Data Analyst,"Washington, DC",via CareerBuilder,Full-time,False,"New York, United States",2023-01-01 00:00:22,False,True,United States,,,,Guidehouse,"['sql', 'python', 'r', 'azure', 'snowflake', '...","{'analyst_tools': ['tableau', 'excel'], 'cloud...",1
110290,Data Analyst,Data Analyst,"Fairfax, VA",via CareerBuilder,Full-time,False,"New York, United States",2023-01-01 00:00:24,False,False,United States,,,,Protask,"['sql', 'jira']","{'async': ['jira'], 'programming': ['sql']}",1
72158,Senior Data Analyst,Senior Data Analyst / Platform Experience,"Worcester, MA",via LinkedIn,Full-time,False,"New York, United States",2023-01-01 00:00:27,False,True,United States,,,,Atria Wealth Solutions,"['sql', 'atlassian', 'jira']","{'async': ['jira'], 'other': ['atlassian'], 'p...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351830,Data Engineer,Data engineer,"San Francisco, CA",via Talent.com,Full-time,False,Sudan,2023-12-31 23:40:31,False,False,Sudan,,,,JBL Resources,"['azure', 'power bi', 'word']","{'analyst_tools': ['power bi', 'word'], 'cloud...",12
362129,Data Engineer,Data engineer,"Sunnyvale, CA",via Talent.com,Full-time,False,Sudan,2023-12-31 23:40:31,True,False,Sudan,,,,ITCO Solutions,"['sql', 'python', 'java', 'snowflake', 'airflo...","{'cloud': ['snowflake'], 'libraries': ['airflo...",12
327439,Data Engineer,Staff engineer data,"Southfield, MI",via Talent.com,Full-time,False,Sudan,2023-12-31 23:40:32,False,False,Sudan,year,140000.0,,Credit Acceptance,"['express', 'kubernetes']","{'other': ['kubernetes'], 'webframeworks': ['e...",12
112373,Senior Data Analyst,Marketing Data & Analytics Product Owner - Sen...,"Boston, MA (+1 other)",via Boston Consulting Group,Full-time,False,"New York, United States",2023-12-31 23:59:58,False,False,United States,,,,Boston Consulting Group,"['sql', 'python', 'r', 'gdpr', 'tableau', 'pow...","{'analyst_tools': ['tableau', 'power bi'], 'li...",12


## 25.5 Drop a column

In [21]:
# let's remove the salary_hour_avg
# we can use the drop() method

df.drop(labels="salary_hour_avg", axis=1, inplace=True)

# need to precise the name of the label
# need to precise if index/row or column that we want to remove with the axis. Axis = 0 -» indes / Axis = 1 -» column
# need to use the inplace argument to actually remove the column from the dataframe rather than just giving out an updated view of it

## 25.6 Drop rows with empty values

In [22]:
# let's remove rows without values in the yearly salary
# we can use the dropna() method

df.dropna(axis=0, subset=["salary_year_avg"], inplace=True)

# precised axis=0 to remove rows. We don't have to precise necessarily as axis=0 by default
# the precise the name of the column(s) to look into to find blank values and remove the associated rows. Need to be a list. Can be multiple columns.
# need to use the inplace argument to actually remove the column from the dataframe rather than just giving out an updated view of it


In [23]:
df.info()

# we now only have 22003 rows -» the ones with yearly salary info
# we don't have hourly salary info
# we have extra month column
# we changed the data type of the date column
# we sorted in chronological order of publication

<class 'pandas.core.frame.DataFrame'>
Index: 22003 entries, 108883 to 327439
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   job_title_short        22003 non-null  object        
 1   job_title              22003 non-null  object        
 2   job_location           21723 non-null  object        
 3   job_via                22003 non-null  object        
 4   job_schedule_type      21987 non-null  object        
 5   job_work_from_home     22003 non-null  bool          
 6   search_location        22003 non-null  object        
 7   job_posted_date        22003 non-null  datetime64[ns]
 8   job_no_degree_mention  22003 non-null  bool          
 9   job_health_insurance   22003 non-null  bool          
 10  job_country            22003 non-null  object        
 11  salary_rate            22003 non-null  object        
 12  salary_year_avg        22003 non-null  float64       
 13  

# 25 Problems

## 1.25.1

Sort the DataFrame by salary_year_avg in descending order and display the first 10 rows.

In [24]:
# calling the df as df1 to avoid conflict with above course notes

import pandas as pd
from datasets import load_dataset

dataset = load_dataset("lukebarousse/data_jobs")
df1 = dataset["train"].to_pandas()

In [26]:
df1.sort_values(by="salary_year_avg", ascending=False, inplace=True)

df1.head(10)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
554784,Data Scientist,Data Scientist,"Madison, SD",via Cooperative Career Center,Full-time,False,"Illinois, United States",2023-05-04 19:06:06,False,True,United States,year,960000.0,,"East River Electric Power Cooperative, Inc.","['java', 'c++', 'python', 'r']","{'programming': ['java', 'c++', 'python', 'r']}"
665811,Senior Data Scientist,Senior Data Scientist,"Pretoria, South Africa",via Adzuna,Full-time,False,South Africa,2023-11-02 10:31:43,False,False,South Africa,year,890000.0,,MSP Staffing LTD,"['python', 'sql', 'java', 'c#', 'databricks', ...","{'cloud': ['databricks', 'azure'], 'libraries'..."
168402,Data Analyst,Data Analyst,Anywhere,via Y Combinator,Full-time,True,India,2023-02-20 15:13:44,True,False,India,year,650000.0,,Mantys,,
387378,Data Scientist,Geographic Information Systems Analyst - GIS A...,"New York, NY",via ZipRecruiter,Full-time,False,"New York, United States",2023-12-27 18:00:12,False,False,United States,year,585000.0,,ReServe,['excel'],{'analyst_tools': ['excel']}
160521,Data Scientist,Staff Data Scientist/Quant Researcher,Anywhere,via LinkedIn,Full-time,True,Sudan,2023-08-16 16:11:58,False,False,Sudan,year,550000.0,,Selby Jennings,"['python', 'sql']","{'programming': ['python', 'sql']}"
404932,Data Engineer,Hybrid - Data Engineer - Up to $600k,"New York, NY",via LinkedIn,Full-time,False,"Florida, United States",2023-04-05 16:10:48,False,False,United States,year,525000.0,,Durlston Partners,"['python', 'c++']","{'programming': ['python', 'c++']}"
554140,Data Scientist,Staff Data Scientist - Business Analytics,Anywhere,via LinkedIn,Full-time,True,"Texas, United States",2023-09-01 19:24:02,False,False,United States,year,525000.0,,Selby Jennings,['sql'],{'programming': ['sql']}
417241,Senior Data Scientist,Senior Data Scientist,Anywhere,via LinkedIn,Full-time,True,"Texas, United States",2023-01-31 16:03:46,False,False,United States,year,475000.0,,Glocomms,,
618461,Senior Data Scientist,VP Data Science & Research,"Menlo Park, CA",via LinkedIn,Full-time,False,"California, United States",2023-11-08 12:23:39,False,False,United States,year,463500.0,,WhatsApp,['go'],{'programming': ['go']}
124265,Data Scientist,Data Scientist (L5) - Messaging,"Los Gatos, CA",via IT JobServe,Full-time,False,"California, United States",2023-08-17 08:04:09,False,False,United States,year,450000.0,,Netflix,"['python', 'r', 'sql', 'spark', 'excel']","{'analyst_tools': ['excel'], 'libraries': ['sp..."


## 1.25.2

Create a new column is_remote that indicates whether the job is remote (True) or not (False) based on the job_work_from_home column.

In [28]:
df1["is_remote"] = df1["job_work_from_home"]

## 1.25.3

Create a new column job_posted_year that extracts the year from the job_posted_date column and display the first 5 rows.

In [30]:
df1["job_posted_date"] = pd.to_datetime(df.job_posted_date)

df1["job_posted_year"] = df1.job_posted_date.dt.year

df1.head(5)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,is_remote,job_posted_year
554784,Data Scientist,Data Scientist,"Madison, SD",via Cooperative Career Center,Full-time,False,"Illinois, United States",2023-05-04 19:06:06,False,True,United States,year,960000.0,,"East River Electric Power Cooperative, Inc.","['java', 'c++', 'python', 'r']","{'programming': ['java', 'c++', 'python', 'r']}",False,2023.0
665811,Senior Data Scientist,Senior Data Scientist,"Pretoria, South Africa",via Adzuna,Full-time,False,South Africa,2023-11-02 10:31:43,False,False,South Africa,year,890000.0,,MSP Staffing LTD,"['python', 'sql', 'java', 'c#', 'databricks', ...","{'cloud': ['databricks', 'azure'], 'libraries'...",False,2023.0
168402,Data Analyst,Data Analyst,Anywhere,via Y Combinator,Full-time,True,India,2023-02-20 15:13:44,True,False,India,year,650000.0,,Mantys,,,True,2023.0
387378,Data Scientist,Geographic Information Systems Analyst - GIS A...,"New York, NY",via ZipRecruiter,Full-time,False,"New York, United States",2023-12-27 18:00:12,False,False,United States,year,585000.0,,ReServe,['excel'],{'analyst_tools': ['excel']},False,2023.0
160521,Data Scientist,Staff Data Scientist/Quant Researcher,Anywhere,via LinkedIn,Full-time,True,Sudan,2023-08-16 16:11:58,False,False,Sudan,year,550000.0,,Selby Jennings,"['python', 'sql']","{'programming': ['python', 'sql']}",True,2023.0


## 1.25.4

Drop rows where the salary_year_avg column has missing values and display the first 5 rows of the cleaned DataFrame.

In [31]:
df1.dropna(axis=0, subset=["salary_year_avg"], inplace=True)

df1.head(5)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,is_remote,job_posted_year
554784,Data Scientist,Data Scientist,"Madison, SD",via Cooperative Career Center,Full-time,False,"Illinois, United States",2023-05-04 19:06:06,False,True,United States,year,960000.0,,"East River Electric Power Cooperative, Inc.","['java', 'c++', 'python', 'r']","{'programming': ['java', 'c++', 'python', 'r']}",False,2023.0
665811,Senior Data Scientist,Senior Data Scientist,"Pretoria, South Africa",via Adzuna,Full-time,False,South Africa,2023-11-02 10:31:43,False,False,South Africa,year,890000.0,,MSP Staffing LTD,"['python', 'sql', 'java', 'c#', 'databricks', ...","{'cloud': ['databricks', 'azure'], 'libraries'...",False,2023.0
168402,Data Analyst,Data Analyst,Anywhere,via Y Combinator,Full-time,True,India,2023-02-20 15:13:44,True,False,India,year,650000.0,,Mantys,,,True,2023.0
387378,Data Scientist,Geographic Information Systems Analyst - GIS A...,"New York, NY",via ZipRecruiter,Full-time,False,"New York, United States",2023-12-27 18:00:12,False,False,United States,year,585000.0,,ReServe,['excel'],{'analyst_tools': ['excel']},False,2023.0
160521,Data Scientist,Staff Data Scientist/Quant Researcher,Anywhere,via LinkedIn,Full-time,True,Sudan,2023-08-16 16:11:58,False,False,Sudan,year,550000.0,,Selby Jennings,"['python', 'sql']","{'programming': ['python', 'sql']}",True,2023.0
