In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import random

import os
import re
from pprint import pprint
from collections import Counter


In [None]:
# Setting the seed for NumPy's random generator 
np.random.seed(42)

In [None]:
# Defining the file path 
import sys, os
path_to_file = 'C:/Users/omika/Job-Recommender-System'
print(path_to_file)

C:/Users/omika/Job-Recommender-System


## Data Set

In [None]:
# Loads job_skills.csv file into dataframe df_job_skills
# Display first five rows  
df_job_skills = pd.read_csv("job_skills.csv")
df_job_skills.head()

Unnamed: 0,job_link,job_skills
0,https://www.linkedin.com/jobs/view/housekeeper...,"Building Custodial Services, Cleaning, Janitor..."
1,https://www.linkedin.com/jobs/view/assistant-g...,"Customer service, Restaurant management, Food ..."
2,https://www.linkedin.com/jobs/view/school-base...,"Applied Behavior Analysis (ABA), Data analysis..."
3,https://www.linkedin.com/jobs/view/electrical-...,"Electrical Engineering, Project Controls, Sche..."
4,https://www.linkedin.com/jobs/view/electrical-...,"Electrical Assembly, Point to point wiring, St..."


In [None]:
# Loads linkedin_job_postings.csv file into dataframe df_job_posting
df_job_posting = pd.read_csv("linkedin_job_postings.csv")
# Converting boolean flag coloumns to python boolean values 
df_job_posting['got_summary'] = df_job_posting['got_summary'].map(lambda x: True if x=='t' else False)
df_job_posting['got_ner'] = df_job_posting['got_ner'].map(lambda x: True if x=='t' else False)
df_job_posting['is_being_worked'] = df_job_posting['is_being_worked'].map(lambda x: True if x=='t' else False)
# Display the first 5 rows 
df_job_posting.head()

Unnamed: 0,job_link,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type
0,https://www.linkedin.com/jobs/view/account-exe...,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite
1,https://www.linkedin.com/jobs/view/registered-...,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite
2,https://www.linkedin.com/jobs/view/restaurant-...,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite
3,https://www.linkedin.com/jobs/view/independent...,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite
4,https://www.linkedin.com/jobs/view/group-unit-...,2024-01-19 09:45:09.215838+00,False,False,False,Group/Unit Supervisor (Systems Support Manager...,"IRS, Office of Chief Counsel","Chamblee, GA",2024-01-17,Gadsden,United States,Supervisor Travel-Information Center,Mid senior,Onsite


In [20]:
# Loads a sample of the job_summary.csv file into dataframe df
# Display the first five rows of the data 
df = pd.read_csv("C:/Users/omika/Job-Recommender-System/job_summary.csv", skiprows=lambda x: x > 0 and random.random() >=0.2)
df.head()

Unnamed: 0,job_link,job_summary
0,https://uk.linkedin.com/jobs/view/commercial-a...,Commercial account executive\nSheffield\nFull ...
1,https://www.linkedin.com/jobs/view/restaurant-...,Description\nOur\nRestaurant Team/Shift Leader...
2,https://www.linkedin.com/jobs/view/material-ha...,Job Details:\nJob Title : Quality Liaison (599...
3,https://au.linkedin.com/jobs/view/experienced-...,"Restaurant Description:\nLocated in Canberra, ..."
4,https://au.linkedin.com/jobs/view/store-manage...,"Who We Are\nWe are EssilorLuxottica, a global ..."


## Data Cleaning

In [None]:
# Creating a unique identifier using job link for each job 
df_job_skills['job_link_cleaned'] = df_job_skills['job_link'].map(lambda x: x.split('-')[-1])
df_job_posting['job_link_cleaned'] = df_job_posting['job_link'].map(lambda x: x.split('-')[-1])


In [None]:
# Count the unique rows from skills table 
df_job_skills['job_link_cleaned'].value_counts()

job_link_cleaned
3797940160    4
3799818268    4
3797022448    3
3802713233    3
3802204204    3
             ..
3755476892    1
3733960939    1
3781185783    1
3331580456    1
3683460063    1
Name: count, Length: 1294972, dtype: int64

In [None]:
# Count the unique rows from posting table 
df_job_posting['job_link_cleaned'].value_counts()

job_link_cleaned
jobs          664
3797940160      5
3804421891      4
3799818270      4
3799818268      4
             ... 
3654168951      1
3787735858      1
3801865312      1
3805761853      1
3803057508      1
Name: count, Length: 1346169, dtype: int64

In [None]:
# Removing duplicates from both the tables 
df_job_skills = df_job_skills.drop_duplicates(['job_link_cleaned'], keep='first')
df_job_posting = df_job_posting.drop_duplicates(['job_link_cleaned'], keep='first')

In [None]:
# Printing (rows,columns)
print(df_job_skills.shape)
print(df_job_posting.shape)

(1294972, 3)
(1346169, 15)


## Merge

In [None]:
# Merging both the dataframes into single file df 
df = df_job_posting.merge(df_job_skills, on="job_link_cleaned", how="inner")
df = df.drop(columns = ['job_link_x', 'job_link_y'])
df = df[df['job_skills'].notnull()]
df = df.reset_index(drop=True)
print(df.shape)
df.head()

(1292887, 15)


Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_link_cleaned,job_skills
0,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,3802078767,"Medical equipment sales, Key competitors, Term..."
1,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,3803386312,"Nursing, Bachelor of Science in Nursing, Maste..."
2,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,3771464419,"Restaurant Operations Management, Inventory Ma..."
3,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,3797661348,"Real Estate, Customer Service, Sales, Negotiat..."
4,2024-01-21 08:08:19.663033+00,True,True,False,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,3790954711,"Nursing, BSN, Medical License, Virtual RN, Nur..."


In [13]:
# Reordering the columns with "job_link_cleaned" as the first column
df = df.reindex(columns=['job_link_cleaned', 'last_processed_time', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type', 'job_skills'])
print(df.shape)
df.head()

(1292887, 15)


Unnamed: 0,job_link_cleaned,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills
0,3802078767,2024-01-21 07:12:29.00256+00,True,True,False,Account Executive - Dispensing (NorCal/Norther...,BD,"San Diego, CA",2024-01-15,Coronado,United States,Color Maker,Mid senior,Onsite,"Medical equipment sales, Key competitors, Term..."
1,3803386312,2024-01-21 07:39:58.88137+00,True,True,False,Registered Nurse - RN Care Manager,Trinity Health MI,"Norton Shores, MI",2024-01-14,Grand Haven,United States,Director Nursing Service,Mid senior,Onsite,"Nursing, Bachelor of Science in Nursing, Maste..."
2,3771464419,2024-01-21 07:40:00.251126+00,True,True,False,RESTAURANT SUPERVISOR - THE FORKLIFT,Wasatch Adaptive Sports,"Sandy, UT",2024-01-14,Tooele,United States,Stand-In,Mid senior,Onsite,"Restaurant Operations Management, Inventory Ma..."
3,3797661348,2024-01-21 07:40:00.308133+00,True,True,False,Independent Real Estate Agent,Howard Hanna | Rand Realty,"Englewood Cliffs, NJ",2024-01-16,Pinehurst,United States,Real-Estate Clerk,Mid senior,Onsite,"Real Estate, Customer Service, Sales, Negotiat..."
4,3790954711,2024-01-21 08:08:19.663033+00,True,True,False,Registered Nurse (RN),Trinity Health MI,"Muskegon, MI",2024-01-14,Muskegon,United States,Nurse Practitioner,Mid senior,Onsite,"Nursing, BSN, Medical License, Virtual RN, Nur..."


# Basic Data Cleaning

Perform basic data cleaning to remove unicode characters and Markdown that might cause issues when exporting

In [14]:
# Function to clean strings
def clean_text(text):
    # Remove extra spaces and tabs
    cleaned_text = re.sub(r'\s+', ' ', text)

    # Remove Unicode characters and Markdown
    cleaned_text = re.sub(r'[^\x00-\x7F\\]+', ' ', cleaned_text)

    return cleaned_text.strip()

# List of columns to clean (string type)
columns_to_clean = ['job_title', 'company', 'job_location', 'search_city', 'search_country',
                    'search_position', 'job_level', 'job_type', 'job_skills']

# Replace all NaN (float) values in the target columns with an empty string ('')
df[columns_to_clean] = df[columns_to_clean].fillna('')

# Clean the specified columns
for col in columns_to_clean:
    df[col] = df[col].apply(clean_text)

# Now 'df' contains the cleaned columns
# Reset index
df.reset_index(drop=True, inplace=True)

In [15]:
# Export the DataFrame to a CSV file
df.to_csv('linkedin_job_posts_skills.csv', index=True, index_label='sn')  # also export the index

In [16]:
df.to_csv("C:/Users/omika/Job-Recommender-System/job_summary_sample.csv", index=False)