In [1]:
################################
##### Merging two datasets #####
################################
import pandas as pd

df1 = pd.read_csv("fake_job_postings.csv")
df1.drop(['benefits', 'company_profile', 'employment_type', 'salary_range',
          'industry', 'department', 'required_experience', 'required_education', 'job_id', 'function',], axis=1, inplace=True)
df2 = pd.read_csv("job_train.csv")
df2 = df2[df2['fraudulent']==1]
merged_df = pd.concat([df1, df2])
# merged_df.to_csv("final_data.csv")
merged_df.head()

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,0
3,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0
4,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0


In [2]:
############################
##### Define Functions #####
############################

import unicodedata

def remove_accents(text):
    if pd.isna(text):
        return text
    # Normalize to NFD (decompose accented chars), then filter out combining marks
    return ''.join(c for c in unicodedata.normalize('NFD', str(text))
                   if unicodedata.category(c) != 'Mn')

def clean_accents(column, print_change):
    cleaned_lst = []
    for val in merged_df[column]:
        if not pd.isna(val) and str(val).strip() != "": 
            cleaned_val = remove_accents(val)
            if val != cleaned_val:
                cleaned_lst.append((val, cleaned_val))
    cleaned_lst = list(set(cleaned_lst))
    if print_change:
        for cleaned in cleaned_lst:
            print(cleaned)

def has_non_latin(text):
    if pd.isna(text):
        return 0
    for char in text:
        if ord(char) > 127:
            return 1
    return 0

def remove_non_latin(text):
    if pd.isna(text):
        return pd.NA
    cleaned_text = ""
    for char in text:
        if ord(char) > 127:
            continue
        cleaned_text += char
    return cleaned_text

In [3]:
###########################
##### Data Processing #####
###########################

# drop rows with missing description (18336 --> 18334 samples)
merged_df = merged_df.dropna(subset=['description'])

### Text Columns ###

# convert accents to basic latin
text_cols = ['location', 'description', 'requirements']
for col in text_cols:
    clean_accents(col, False)

# mask and remove non-basic-latin
for col in text_cols:
    merged_df[f"{col}_has_non_latin"] = merged_df[col].apply(has_non_latin)
    merged_df[col] = merged_df[col].apply(remove_non_latin)

# Make new feature that has binary value for whether requirements was missing or not
merged_df["has_requirements"] = merged_df["requirements"].notna()
# Merge description with requirements so they are in one new feature - called description_and_requirements
merged_df["description_and_requirements"] = merged_df["description"] + merged_df["requirements"].fillna("")

### LOCATION ###

# Extract country and state
pattern1 = r'(^[A-Z]{2},\s*[A-Z0-9]{1,3})'
merged_df['country_state'] = merged_df['location'].str.extract(pattern1, expand=False)
# Extract country
pattern2 = r'(^[A-Z]{2})'
merged_df['country'] = merged_df['location'].str.extract(pattern2, expand=False)
# Manage Remote jobs
merged_df['is_remote'] = merged_df['location'].str.lower().str.contains('remote|work from home', na=False) & merged_df['country_state'].isna()
merged_df.loc[merged_df['is_remote'], 'country_state'] = "Remote"
merged_df.loc[merged_df['is_remote'], 'country'] = "Remote"
merged_df.drop(columns=["is_remote"], inplace=True)

# New column - location mask
merged_df["has_location"] = merged_df["location"].notna()
# New column - detailed location mask - something beyond just the country code
merged_df["has_location_details"] = (merged_df["location"].str.lower().str.strip() == merged_df["country"].str.lower().str.strip()).fillna(False)

merged_df.head()

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent,location_has_non_latin,description_has_non_latin,requirements_has_non_latin,has_requirements,description_and_requirements,country_state,country,has_location,has_location_details
0,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0,0,0,0,True,"Food52, a fast-growing, James Beard Award-winn...","US, NY",US,True,False
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0,0,1,1,True,Organised - Focused - Vibrant - Awesome!Do you...,,NZ,True,False
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,0,0,0,1,True,"Our client, located in Houston, is actively se...","US, IA",US,True,False
3,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI Environmental Systems Resea...,"EDUCATION:Bachelors or Masters in GIS, busines...",0,1,0,0,0,1,1,True,THE COMPANY: ESRI Environmental Systems Resea...,"US, DC",US,True,False
4,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0,0,1,1,True,JOB TITLE: Itemization Review ManagerLOCATION:...,"US, FL",US,True,False


In [None]:
# merged_df.to_csv("final_data.csv")