# Extracting rows with salary information

Emilio Lehoucq - 6/6/24

## Importing libraries

In [1]:
import pandas as pd
import re

## Reading data

In [2]:
# Load the CESNET messages export and report how many rows it holds
df = pd.read_csv('cesnet_data_march_6_2024.csv')
n_rows = df.shape[0]
print(f'Number of rows: {n_rows}')

Number of rows: 2308


## Identifying rows with salary information

In [3]:
# Number of characters of context kept on each side of a potential salary
N_CHARACTERS = 60

# A currency sign (€, £ or $), an optional whitespace character, then 1 to 6 digits
SALARY_PATTERN = re.compile(r'[€£$]\s?\d{1,6}')

# A context only counts when it contains at least one of these substrings.
# This is a plain substring test, so 'pay' also matches e.g. 'payment'.
SALARY_KEYWORDS = ('salary', 'compensation', 'pay')


def find_salary_contexts(text, n_characters=N_CHARACTERS):
    """Return the text surrounding each currency amount that mentions pay.

    For every match of SALARY_PATTERN in `text`, take up to `n_characters`
    characters before and after the match, replace newlines with spaces,
    strip surrounding whitespace and lowercase the result. A context is kept
    only if it contains one of SALARY_KEYWORDS.

    Parameters
    ----------
    text : str
        Message text to scan. Any non-string value (e.g. a missing value
        read from the CSV) is treated as containing no salary information.
    n_characters : int, default N_CHARACTERS
        Number of characters of context to keep on each side of a match.

    Returns
    -------
    list of str
        One cleaned context per qualifying match, in order of appearance;
        empty when nothing qualifies.
    """
    # re cannot scan non-strings; treat them as "no salary found"
    if not isinstance(text, str):
        return []

    contexts = []
    for match in SALARY_PATTERN.finditer(text):
        # Clamp at 0: a negative start would be counted from the END of the
        # string, which yields an empty or wrong slice when the amount sits
        # within the first n_characters of a longer text
        context_start = max(0, match.start() - n_characters)
        # Slicing past the end of the string is safe, so no clamp is needed
        context_end = match.end() + n_characters
        context = text[context_start:context_end].replace('\n', ' ').strip().lower()
        if any(keyword in context for keyword in SALARY_KEYWORDS):
            contexts.append(context)
    return contexts


# Columns to store data
df['salary_potential_string'] = None
df['salary_potential_flag'] = None

# Loop through the text of each row in the dataframe
for i, text in df['text'].items():
    contexts = find_salary_contexts(text)
    # Flag is True when at least one context qualified
    df.at[i, 'salary_potential_flag'] = bool(contexts)
    # Store the list of contexts, or None when nothing was found
    df.at[i, 'salary_potential_string'] = contexts if contexts else None

# NOTE: a fallback for rows without a currency sign was tried and dropped. It
# looked for salary-like numbers (regex r'\b\d{1,3}[,.]?\d{3}\b') in texts
# mentioning 'salary', 'compensation' or 'pay'. On inspecting its output, the
# author found it did not seem to surface any salary information.

# Get subset of rows with potential salary
df_subset = df[df['salary_potential_string'].notna()]
n_flagged = df_subset['salary_potential_flag'].eq(True).sum()
print(f'There are {n_flagged} rows with potential salary')

There are 61 rows with potential salary


## Save to CSV

In [4]:
# Write the rows with potential salary information to disk, without the index
df_subset.to_csv(
    path_or_buf='cesnet_data_march_6_2024_subset_with_salary.csv',
    index=False,
)