In [43]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv("indeed_data_science_jobs.csv")

print(df.head(20))


    Position                                          Job Title  \
0          1                                     Data Scientist   
1          2  Senior Artificial Intelligence Researcher for ...   
2          3                              Senior Data Scientist   
3          4                                     Data Scientist   
4          5                                     Data Scientist   
5          6                                     Data Scientist   
6          7                   Business Data Scientist/Engineer   
7          8                                   Data Scientist I   
8          9                                     Data Scientist   
9         10                           Associate Data Scientist   
10        11                                     Data Scientist   
11        12                                     Data Scientist   
12        13                                   Systems Engineer   
13        14                              Senior Data Scientis

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Position           200 non-null    int64 
 1   Job Title          200 non-null    object
 2   Company            200 non-null    object
 3   Location           200 non-null    object
 4   Salary             62 non-null     object
 5   Short Description  200 non-null    object
 6   Posted At          16 non-null     object
 7   Job link           200 non-null    object
dtypes: int64(1), object(7)
memory usage: 12.6+ KB


In [45]:
import re

df_2 = df.copy()


def classify_job_title(title):
    title = title.lower()  # Convert to lowercase for easier matching

    # Regex patterns for different roles
    if re.search(r'scien', title):
        return 'Data Scientist'
    elif re.search(r'analy', title):
        return 'Data Analyst'
    elif re.search(r'engineer', title):
        return 'Data Engineer'
    else:
        return 'Other'

# Apply the function to classify job titles
df_2['Job Profile'] = df_2['Job Title'].apply(classify_job_title)


In [46]:
print(df_2['Job Profile'].unique)

<bound method Series.unique of 0      Data Scientist
1      Data Scientist
2      Data Scientist
3      Data Scientist
4      Data Scientist
            ...      
195    Data Scientist
196    Data Scientist
197    Data Scientist
198             Other
199    Data Scientist
Name: Job Profile, Length: 200, dtype: object>


In [47]:
# Function to extract the minimum numeric salary
def extract_min_salary(salary):
    if pd.isna(salary):  # Handle NaN values
        return None
    
    # Use regex to extract all numeric values from the salary string
    numeric_values = re.findall(r'\d+', salary)
    
    if numeric_values:
        # Convert to integers and return the minimum value
        min_salary = min(map(int, numeric_values))
        return min_salary
    else:
        return None  # Return None if no numeric value is found

# Apply the function to the Salary column
df_2['Salary'] = df_2['Salary'].apply(extract_min_salary)

# Convert the Salary column to numeric, forcing errors to NaN
df_2['Salary'] = pd.to_numeric(df_2['Salary'], errors='coerce')

# Convert salary values from thousands to actual scale (e.g., 103 -> 103000)
df_2['Salary'] = df_2['Salary'] * 1000

# Fill NaN values in the Salary column with the average salary
df_2['Salary'].fillna(70000, inplace=True)

# Round the 'Salary' column to two decimal places
df_2['Salary'] = df_2['Salary'].round(2)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2['Salary'].fillna(70000, inplace=True)


In [48]:
# Drop the 'Posted At' and 'Short Description' columns
df_2.drop(columns=['Posted At', 'Short Description'], inplace=True)

# Display the DataFrame to confirm the columns have been dropped
print(df_2.head())


   Position                                          Job Title  \
0         1                                     Data Scientist   
1         2  Senior Artificial Intelligence Researcher for ...   
2         3                              Senior Data Scientist   
3         4                                     Data Scientist   
4         5                                     Data Scientist   

                                          Company  \
0                                     Robert Half   
1  Johns Hopkins Applied Physics Laboratory (APL)   
2               Modern Technology Solutions, Inc.   
3                        Twitch Interactive, Inc.   
4               US Office of Personnel Management   

                          Location    Salary  \
0  Remote in Los Angeles, CA 90024       0.0   
1                 Laurel, MD 20723   70000.0   
2             Huntsville, AL 35895   70000.0   
3                      Seattle, WA   70000.0   
4                           Remote  103000.0

In [49]:
# Function to extract city and state from the location string
def extract_city_state(location):
    if pd.isna(location):
        return pd.Series([None, None])
    
    # Regex to capture city and state
    match = re.search(r'(?P<city>[A-Za-z\s]+),\s*(?P<state>[A-Z]{2})', location)
    
    if match:
        return pd.Series([match.group('city').strip(), match.group('state')])
    else:
        return pd.Series([None, None])

# Apply the function and create new columns
df_2[['City', 'State']] = df_2['Location'].apply(extract_city_state)

# Drop the original Location column if no longer needed
df_2.drop(columns=['Location'], inplace=True)

# Display the updated DataFrame
print(df_2.head())


   Position                                          Job Title  \
0         1                                     Data Scientist   
1         2  Senior Artificial Intelligence Researcher for ...   
2         3                              Senior Data Scientist   
3         4                                     Data Scientist   
4         5                                     Data Scientist   

                                          Company    Salary  \
0                                     Robert Half       0.0   
1  Johns Hopkins Applied Physics Laboratory (APL)   70000.0   
2               Modern Technology Solutions, Inc.   70000.0   
3                        Twitch Interactive, Inc.   70000.0   
4               US Office of Personnel Management  103000.0   

                                            Job link     Job Profile  \
0  https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...  Data Scientist   
1  https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...  Data Scientist   
2  https

In [50]:
# Save the final DataFrame to a CSV file
df_2.to_csv('final_job_data.csv', index=False)
