In [37]:
import pandas as pd
import re

### Explanation

The following function splits the salary column into four new columns:
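* Lower Range (number of dollars, e.g. `$68K` becomes 68000; missing if the salary cannot be parsed)
* Upper Range (number of dollars, e.g. `$94K` becomes 94000; missing if the salary cannot be parsed)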
* Source (Glassdoor or Employer)
* Hourly vs. Annual (based on 'K' indicating annual salaries)
    
**Arguments**:
* df (pd.DataFrame): The input DataFrame containing the salary column.
* salary_col (str): The name of the salary column. 

**Returns**:
* pd.DataFrame: A DataFrame with the new salary components.

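Note that there are two versions of this function; you only need to use one of them. The first (`split_salary_column_v1`) uses a regular expression, and the second (`split_salary_column_v2`) uses string manipulation. The first version is the recommended one, because its single pattern spells out the expected salary format explicitly.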

In [41]:
def split_salary_column_v1(df, salary_col="Salary"):
    """Split a salary text column into four component columns using a regex.

    Each value of ``df[salary_col]`` is converted to ``str`` and searched for
    a range of the form ``$<low>K - $<high>K ... (Glassdoor est.)`` or
    ``... (Employer est.)``, where ``<low>`` and ``<high>`` are 1-3 digits.

    Args:
        df (pd.DataFrame): Input frame. It is not modified.
        salary_col (str): Name of the column holding the salary text.

    Returns:
        pd.DataFrame: A copy of ``df`` with four added (or overwritten) columns:

        * "Lower Range" / "Upper Range": the K figures multiplied by 1000.
        * "Source": "Glassdoor" or "Employer".
        * "Hourly vs. Annual": "Annual" for every row that matched.

        Rows whose text does not match the pattern get a missing value
        (None/NaN) in all four columns.

    Raises:
        KeyError: If ``salary_col`` is not a column of ``df``.
    """
    # The dot after "est" is escaped so that only a literal period matches.
    salary_pattern = re.compile(
        r"\$(\d{1,3})K\s*-\s*\$(\d{1,3})K.*\((Glassdoor|Employer) est\.\)"
    )

    # One list per output column, filled row by row in input order.
    lower_range, upper_range, source, hourly_annual = [], [], [], []

    for salary in df[salary_col]:
        match = salary_pattern.search(str(salary))
        if match:
            lower, upper, src = match.groups()
            lower_range.append(int(lower) * 1000)
            upper_range.append(int(upper) * 1000)
            source.append(src)
            # The pattern only accepts 'K' amounts, which are treated as annual.
            hourly_annual.append("Annual")
        else:
            lower_range.append(None)
            upper_range.append(None)
            source.append(None)
            hourly_annual.append(None)

    # Build the result on a copy so the caller's frame keeps its original
    # columns and repeated calls on the same input do not interfere.
    result = df.copy()
    result["Lower Range"] = lower_range
    result["Upper Range"] = upper_range
    result["Source"] = source
    result["Hourly vs. Annual"] = hourly_annual

    return result


def split_salary_column_v2(df, salary_col="Salary"):
    """Split a salary text column into four component columns using string operations.

    Each value of ``df[salary_col]`` is converted to ``str`` and expected to
    look like ``<low> - <high> (<source> est.)``: exactly one "-" separating
    the two amounts and exactly one "(" after the upper amount.

    Args:
        df (pd.DataFrame): Input frame. It is not modified.
        salary_col (str): Name of the column holding the salary text.

    Returns:
        pd.DataFrame: A copy of ``df`` with four added (or overwritten) columns:

        * "Lower Range" / "Upper Range": the amount as an integer, multiplied
          by 1000 when it contains "K".
        * "Source": "Glassdoor", "Employer", or None if neither word appears.
        * "Hourly vs. Annual": "Annual" when the lower amount contains "K",
          otherwise "Hourly".

        Rows that cannot be parsed (wrong number of "-" or "(" separators, or
        an amount that is not a plain integer) get a missing value (None/NaN)
        in all four columns.

    Raises:
        KeyError: If ``salary_col`` is not a column of ``df``.
    """
    lower_range, upper_range, source, hourly_annual = [], [], [], []

    for salary in df[salary_col]:
        # Convert per element so missing values become text ("nan"/"None")
        # and simply fail the format check below.
        text = str(salary)
        try:
            # Split the range at "-" into its lower and upper halves.
            salary_parts = text.split("-")
            if len(salary_parts) != 2:
                raise ValueError("Invalid format")

            # Strip the dollar sign from each amount; the upper half also
            # carries the "(... est.)" suffix, which is split off at "(".
            lower = salary_parts[0].replace("$", "").strip()
            upper, src_part = salary_parts[1].split("(")
            upper = upper.replace("$", "").strip()

            # Convert to numeric values (assuming 'K' means thousands)
            lower_val = int(lower.replace("K", "")) * 1000 if "K" in lower else int(lower)
            upper_val = int(upper.replace("K", "")) * 1000 if "K" in upper else int(upper)

            # Extract source (Glassdoor or Employer)
            if "Glassdoor" in src_part:
                source_val = "Glassdoor"
            elif "Employer" in src_part:
                source_val = "Employer"
            else:
                source_val = None

            # Determine if salary is annual or hourly (decided by the lower amount only)
            pay_type = "Annual" if "K" in lower else "Hourly"
        except ValueError:
            # Every parsing failure above (bad separator count, failed
            # unpacking, non-integer amount) surfaces as ValueError; anything
            # else, such as KeyboardInterrupt, is allowed to propagate.
            lower_val = upper_val = source_val = pay_type = None

        lower_range.append(lower_val)
        upper_range.append(upper_val)
        source.append(source_val)
        hourly_annual.append(pay_type)

    # Build the result on a copy so the caller's frame keeps its original
    # columns and repeated calls on the same input do not interfere.
    result = df.copy()
    result["Lower Range"] = lower_range
    result["Upper Range"] = upper_range
    result["Source"] = source
    result["Hourly vs. Annual"] = hourly_annual

    return result

In [43]:
# Load the salary listings; the CSV path is relative to the current working directory.
df = pd.read_csv('Software Engineer Salaries.csv')
# Preview the first rows to see the format of the Salary column.
df.head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.)
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.)
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.)
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.)
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.)


In [45]:
# Parse the Salary column with the regex-based version.
df1 = split_salary_column_v1(df)

In [47]:
# Preview the first rows, including the four derived salary columns.
df1.head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,Lower Range,Upper Range,Source,Hourly vs. Annual
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),68000.0,94000.0,Glassdoor,Annual
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),61000.0,104000.0,Employer,Annual
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),95000.0,118000.0,Glassdoor,Annual
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),97000.0,145000.0,Employer,Annual
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),85000.0,108000.0,Glassdoor,Annual


In [49]:
# Parse the same loaded frame with the string-manipulation version and preview the result.
df2 = split_salary_column_v2(df)
df2.head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,Lower Range,Upper Range,Source,Hourly vs. Annual
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),68000.0,94000.0,Glassdoor,Annual
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),61000.0,104000.0,Employer,Annual
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),95000.0,118000.0,Glassdoor,Annual
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),97000.0,145000.0,Employer,Annual
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),85000.0,108000.0,Glassdoor,Annual
