# Pandas Applying Functions

**df.apply(lambda x: x * 2)**

In [29]:
# Import Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Load Data
# Fetch the 'lukebarousse/data_jobs' dataset and convert its 'train' split to a pandas DataFrame
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()

# Data Cleanup (to convert job_posted_date to datetime)
# The column is overwritten in place with the parsed values
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

### Notes

We haven't analyzed the job_skills column yet, in part because each value is stored as a string, while we need it to be a list.

That's where the apply() method helps us clean up that column.

Let's start with some simple examples first.

In [30]:
# Let's inspect the salary_year_avg column.
# Series.notna() builds a boolean mask that is True where a salary is present;
# .copy() gives us an independent frame, so adding columns to df_salary later
# does not affect df.
df_salary = df.loc[df['salary_year_avg'].notna()].copy()

# Apply method on COLUMNS

In [31]:
# What we want to do now is to calculate the projected salary for next year
# we'll consider an inflation of 3% and apply to the salary_year_avg

def projected_salary(salary, inflation_factor=1.03):
    """Return ``salary`` scaled by ``inflation_factor``.

    Parameters
    ----------
    salary : number
        Current yearly salary.
    inflation_factor : float, default 1.03
        Multiplier applied to ``salary``. The default corresponds to the
        3% inflation assumed in this notebook.

    Returns
    -------
    number
        ``salary * inflation_factor``.
    """
    return salary * inflation_factor

# Series.apply() calls projected_salary once for every value in the column
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(projected_salary)

# Show the original and the projected salary side by side
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [32]:
# Another way to write the code above is using an anonymous function, Lambda!
# The lambda receives one salary value at a time and returns it multiplied by 1.03 (+3%)
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)

df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


In [33]:
# Yet another way - a simpler one - is to multiply the whole column by 1.03.
# Arithmetic on a Series is applied element-wise, so no apply() call is needed here.
df_salary['salary_year_inflated'] = df_salary['salary_year_avg'] * 1.03
df_salary[['salary_year_avg', 'salary_year_inflated']]

Unnamed: 0,salary_year_avg,salary_year_inflated
28,109500.0,112785.00
77,140000.0,144200.00
92,120000.0,123600.00
100,228222.0,235068.66
109,89000.0,91670.00
...,...,...
785624,139216.0,143392.48
785641,150000.0,154500.00
785648,221875.0,228531.25
785682,157500.0,162225.00


### Fixing the job_skills column
(changing the data type from string to list)

To change the type from string to list, we need to use the **.apply()** method - unlike the previous example, there is no *simpler* column-wide shortcut.

In [34]:
# To check and confirm that the values in the job_skills column are strings,
# we can call the built-in type() function on a single value
# (here the value in the row labelled 1)

type(df['job_skills'][1])

str

In [35]:
# In the past, we used the built-in list() function to convert a value to a list.
# But list() iterates over its argument, so calling it on a string
# gives us a list of the individual characters!
list(df['job_skills'] [1])

['[',
 "'",
 'r',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'y',
 't',
 'h',
 'o',
 'n',
 "'",
 ',',
 ' ',
 "'",
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'n',
 'o',
 's',
 'q',
 'l',
 "'",
 ',',
 ' ',
 "'",
 'p',
 'o',
 'w',
 'e',
 'r',
 ' ',
 'b',
 'i',
 "'",
 ',',
 ' ',
 "'",
 't',
 'a',
 'b',
 'l',
 'e',
 'a',
 'u',
 "'",
 ']']

In [36]:
# so, instead, we're going to use the ast.literal_eval(node_or_string) function,
# which parses a string containing a Python literal (here a list of strings)
# into the corresponding Python object
import ast # ast is part of the standard library, so we don't need to install it

ast.literal_eval(df['job_skills'][1])


['r', 'python', 'sql', 'nosql', 'power bi', 'tableau']

In [37]:
# to confirm the parsed value is now a list we can use the built-in type() function
type(ast.literal_eval(df['job_skills'][1]))

list

In [38]:
# Note that we cannot run the ast.literal_eval() method on the entire column
# because it will throw a value error
# instead, we can use the apply() method to apply the ast.literal_eval() method to each row


# IMPORTANT: If we run this code, the next Jupyter cell will throw an error. So this cell has been commented out
# First we are going to create a function called clean_list
# def clean_list(skill_list):
#     # we need to check with notna() so that NaN values are skipped (not parsed)
#     if pd.notna(skill_list):
#         return ast.literal_eval(skill_list)

# df['job_skills'] = df['job_skills'].apply(clean_list) 

In [39]:
# Once again, because the clean_list function is so simple, we can re-write this code as a lambda function.
# Only str values are parsed: NaN/None are returned untouched, and so are values that
# have already been converted to lists. That makes this cell safe to re-run: a
# pd.notna(...) check fails on the second run, because pd.notna(<list>) returns an
# array rather than a single True/False.
df['job_skills'] = df['job_skills'].apply(
    lambda skill_list: ast.literal_eval(skill_list) if isinstance(skill_list, str) else skill_list
)
df.head(5)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"[r, python, sql, nosql, power bi, tableau]","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"[python, sql, c#, azure, airflow, dax, docker,...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"[python, c++, java, matlab, aws, tensorflow, k...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"[bash, python, oracle, aws, ansible, puppet, j...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."


# Apply method on ROWS

Let's say that the salary increase will depend on the role.
- Senior roles will have a 5% increase
- Other roles will have a 3% increase

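In a situation like this the function needs more than one column of each record, so we call **apply()** on the whole DataFrame instead of on a single column. Passing `axis=1` makes pandas hand every row to the function, one at a time.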

In [44]:
# Instead of providing the column 'salary_year_avg' as shown below...
# df_salary['salary_year_inflated'] = df_salary['salary_year_avg'].apply(lambda salary: salary * 1.03)
# ...we will provide the entire dataframe
# df_salary['salary_year_inflated'] = df_salary.apply(<function>, axis=1) || axis = 1 means we want to apply the function to each row

# But first, let's create the function
def projected_salary(row, senior_factor=1.05, default_factor=1.03):
    """Return the projected salary for one job posting.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(..., axis=1)``)
        Must provide the keys ``'job_title_short'`` and ``'salary_year_avg'``.
    senior_factor : float, default 1.05
        Multiplier used when the title contains the word ``'Senior'``.
    default_factor : float, default 1.03
        Multiplier used for every other title.

    Returns
    -------
    number
        ``row['salary_year_avg']`` multiplied by the selected factor.
    """
    title = row['job_title_short']
    # A missing title (NaN or None) is not a string, and "'Senior' in <non-string>"
    # raises TypeError, so any non-string title is treated as a non-senior role.
    is_senior = isinstance(title, str) and 'Senior' in title
    factor = senior_factor if is_senior else default_factor
    return factor * row['salary_year_avg']

# axis=1 makes DataFrame.apply pass each row to projected_salary
df_salary['salary_year_inflated'] = df_salary.apply(projected_salary, axis=1)

df_salary[['job_title_short','salary_year_avg', 'salary_year_inflated']]
# Note: rows whose job_title_short contains 'Senior' get the 5% increase, all other rows get 3%
# (the truncated preview below happens to show only non-senior rows, hence 3% on every visible line)


Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00


In [None]:
# The same rule can be written inline as a lambda:
# pick the factor from the job title first, then scale the salary with it
df_salary['salary_year_inflated'] = df_salary.apply(
    lambda row: (1.05 if 'Senior' in row['job_title_short'] else 1.03) * row['salary_year_avg'],
    axis=1,
)
df_salary[['job_title_short','salary_year_avg', 'salary_year_inflated']]
# Rows with 'Senior' in job_title_short get +5%, every other row gets +3%
# (the truncated preview below only shows non-senior rows, so only +3% is visible)

Unnamed: 0,job_title_short,salary_year_avg,salary_year_inflated
28,Data Scientist,109500.0,112785.00
77,Data Engineer,140000.0,144200.00
92,Data Engineer,120000.0,123600.00
100,Data Scientist,228222.0,235068.66
109,Data Analyst,89000.0,91670.00
...,...,...,...
785624,Data Engineer,139216.0,143392.48
785641,Data Engineer,150000.0,154500.00
785648,Data Scientist,221875.0,228531.25
785682,Data Scientist,157500.0,162225.00
