In [1]:
################################
##### Merging two datasets #####
################################
import pandas as pd

df1 = pd.read_csv("fake_job_postings.csv")
df1.drop(['benefits', 'company_profile', 'employment_type', 'salary_range',
          'industry', 'department', 'required_experience', 'required_education', 'job_id', 'function',], axis=1, inplace=True)
df2 = pd.read_csv("job_train.csv")
df2 = df2[df2['fraudulent']==1]
merged_df = pd.concat([df1, df2])
# merged_df.to_csv("final_data.csv")
merged_df.head()

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,0
3,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0
4,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0


In [2]:
######################################
##### Data Processing (Combined) #####
######################################

# drop rows with missing description (18336 --> 18334 samples)
merged_df = merged_df.dropna(subset=['description'])

# Make new feature that has binary value for whether requirements was missing or not
merged_df["has_requirements"] = merged_df["requirements"].notna()
# Merge description with requirements so they are in one new feature - called description_and_requirements
merged_df["description_and_requirements"] = merged_df["description"] + merged_df["requirements"].fillna("")


### LOCATION ###

# Extract country and state
pattern1 = r'(^[A-Z]{2},\s*[A-Z0-9]{1,3})'
merged_df['country_state'] = merged_df['location'].str.extract(pattern1, expand=False)
# Extract country
pattern2 = r'(^[A-Z]{2})'
merged_df['country'] = merged_df['location'].str.extract(pattern2, expand=False)
# Manage Remote jobs
merged_df['is_remote'] = merged_df['location'].str.lower().str.contains('remote|work from home', na=False) & merged_df['country_state'].isna()
merged_df.loc[merged_df['is_remote'], 'country_state'] = "Remote"
merged_df.loc[merged_df['is_remote'], 'country'] = "Remote"
merged_df.drop(columns=["is_remote"], inplace=True)

# New column - location mask
merged_df["has_location"] = merged_df["location"].notna()
# New column - detailed location mask - something beyond just the country code
merged_df["has_location_details"] = (merged_df["location"].str.lower().str.strip() == merged_df["country"].str.lower().str.strip()).fillna(False)


merged_df.head()

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent,has_requirements,description_and_requirements,country_state,country,has_location,has_location_details
0,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0,True,"Food52, a fast-growing, James Beard Award-winn...","US, NY",US,True,False
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0,True,Organised - Focused - Vibrant - Awesome!Do you...,,NZ,True,False
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,0,True,"Our client, located in Houston, is actively se...","US, IA",US,True,False
3,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0,True,THE COMPANY: ESRI – Environmental Systems Rese...,"US, DC",US,True,False
4,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0,True,JOB TITLE: Itemization Review ManagerLOCATION:...,"US, FL",US,True,False


In [9]:
# Assuming your dataframe is called 'df' and column is 'text_column'
def find_unicode_chars(text_series):
    """Find all unique non-ASCII unicode characters in a text series"""
    unicode_chars = set()
    
    for text in text_series.dropna():
        for char in str(text):
            # Check if character is non-ASCII (ord > 127)
            if ord(char) > 127:
                unicode_chars.add(char)
    
    return sorted(unicode_chars, key=lambda x: ord(x))

# Get unique unicode characters
unicode_list = find_unicode_chars(merged_df['description_and_requirements'])

# Print them with their unicode codes and names
import unicodedata

print("Unique Unicode Characters Found:")
print("-" * 60)
for char in unicode_list:
    try:
        name = unicodedata.name(char)
    except ValueError:
        name = "Unknown"
    print(f"'{char}' | U+{ord(char):04X} | {name}")

Unique Unicode Characters Found:
------------------------------------------------------------
'' | U+0085 | Unknown
'' | U+0092 | Unknown
'' | U+0096 | Unknown
'' | U+009F | Unknown
' ' | U+00A0 | NO-BREAK SPACE
'¢' | U+00A2 | CENT SIGN
'£' | U+00A3 | POUND SIGN
'¤' | U+00A4 | CURRENCY SIGN
'§' | U+00A7 | SECTION SIGN
'¨' | U+00A8 | DIAERESIS
'©' | U+00A9 | COPYRIGHT SIGN
'«' | U+00AB | LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
'­' | U+00AD | SOFT HYPHEN
'®' | U+00AE | REGISTERED SIGN
'°' | U+00B0 | DEGREE SIGN
'´' | U+00B4 | ACUTE ACCENT
'·' | U+00B7 | MIDDLE DOT
'»' | U+00BB | RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
'¼' | U+00BC | VULGAR FRACTION ONE QUARTER
'½' | U+00BD | VULGAR FRACTION ONE HALF
'¿' | U+00BF | INVERTED QUESTION MARK
'Á' | U+00C1 | LATIN CAPITAL LETTER A WITH ACUTE
'Â' | U+00C2 | LATIN CAPITAL LETTER A WITH CIRCUMFLEX
'Å' | U+00C5 | LATIN CAPITAL LETTER A WITH RING ABOVE
'É' | U+00C9 | LATIN CAPITAL LETTER E WITH ACUTE
'Î' | U+00CE | LATIN CAPITAL LETTER I WI

In [None]:
# Option 1: Replace specific characters with their text equivalents
replacements = {
    '—': '--',        # em dash
    '–': '-',         # en dash
    '"': '"',         # smart quotes
    '"': '"',
    ''': "'",
    ''': "'",
    '…': '...',       # ellipsis
    '®': '(R)',
    '©': '(C)',
    '™': '(TM)',
}

df['text_column'] = df['text_column'].str.replace('|'.join(replacements.keys()), 
                                                    lambda m: replacements[m.group()], 
                                                    regex=True)

# Option 2: Remove accents from letters (keeping base letters)
import unicodedata

def remove_accents(text):
    if pd.isna(text):
        return text
    # Normalize to NFD (decompose accented chars), then filter out combining marks
    return ''.join(c for c in unicodedata.normalize('NFD', str(text))
                   if unicodedata.category(c) != 'Mn')

df['text_column'] = df['text_column'].apply(remove_accents)

# Option 3: Convert to closest ASCII equivalent
df['text_column'] = df['text_column'].str.encode('ascii', errors='ignore').str.decode('ascii')

In [3]:
# print(merged_df.isnull().sum())
print(df1.duplicated().sum())
print(df2.duplicated().sum())
print(merged_df.duplicated().sum())

df1_unique = df1.drop_duplicates()
df2_unique = df2.drop_duplicates()
overlap_count = pd.merge(df1_unique, df2_unique, how='inner').shape[0]
print(overlap_count)
print(len(df2))

397
10
852
446
456


In [4]:
merged_df.info()
print(merged_df.nunique())
merged_df.duplicated().sum()
merged_df.drop_duplicates(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 18334 entries, 0 to 8938
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   title                         18334 non-null  object
 1   location                      17980 non-null  object
 2   description                   18334 non-null  object
 3   requirements                  15564 non-null  object
 4   telecommuting                 18334 non-null  int64 
 5   has_company_logo              18334 non-null  int64 
 6   has_questions                 18334 non-null  int64 
 7   fraudulent                    18334 non-null  int64 
 8   has_requirements              18334 non-null  bool  
 9   description_and_requirements  18334 non-null  object
 10  country_state                 15735 non-null  object
 11  country                       17980 non-null  object
 12  has_location                  18334 non-null  bool  
 13  has_location_details  

In [5]:
textual = ['title', 'company_profile', 'description',
       'requirements', 'benefits']
categorical = ['location', 'employment_type', 'industry']
continuous = ['salary_range']
label = ['fraudulent']

merged_df.isnull().sum()

title                              0
location                         341
description                        0
requirements                    2637
telecommuting                      0
has_company_logo                   0
has_questions                      0
fraudulent                         0
has_requirements                   0
description_and_requirements       0
country_state                   2509
country                          341
has_location                       0
has_location_details               0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Combine relevant text columns (you can modify this based on which columns you want to use)
merged_df['text'] = merged_df['title'].fillna('') + ' ' + merged_df['description'].fillna('')

# Split the data
X = merged_df['text']
y = merged_df['fraudulent']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_vec, y_train)

# Make predictions and print classification report
y_pred = model.predict(X_test_vec)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(model.coef_)

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3336
           1       0.98      0.30      0.46       161

    accuracy                           0.97      3497
   macro avg       0.97      0.65      0.72      3497
weighted avg       0.97      0.97      0.96      3497

[[ 0.67765924  1.34207408 -0.17258051 ... -0.18184741 -0.0522385
  -0.0719484 ]]


In [6]:

# len(df3)
# len(merged_df)

In [7]:
merged_df.drop_duplicates(inplace=True)
# Combine relevant text columns (you can modify this based on which columns you want to use)
merged_df['text'] = merged_df['title'].fillna('') + ' ' + merged_df['description'].fillna('')

# Split the data
X = merged_df['text']
y = merged_df['fraudulent']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_vec, y_train)

# Make predictions and print classification report
y_pred = model.predict(X_test_vec)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(model.coef_)

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3321
           1       0.95      0.35      0.51       176

    accuracy                           0.97      3497
   macro avg       0.96      0.67      0.75      3497
weighted avg       0.97      0.97      0.96      3497

[[ 0.28998688  1.26500827 -0.21935576 ... -0.17891268 -0.05137752
  -0.06817463]]


In [8]:
merged_df.duplicated().sum()

np.int64(0)

In [9]:
import pandas as pd

df1 = pd.read_csv("fake_job_postings.csv")
df1.drop(['benefits', 'company_profile', 'employment_type', 'salary_range', 'industry', 'department', 'required_experience', 'required_education', 'job_id', 'function',], axis=1, inplace=True)
df2 = pd.read_csv("job_train.csv")
merged_df = pd.concat([df1, df2])

merged_df

Unnamed: 0,title,location,description,requirements,telecommuting,has_company_logo,has_questions,fraudulent
0,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever","Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,0,1,0,0
3,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,0
4,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,0
...,...,...,...,...,...,...,...,...
8935,Financial Analyst,"GR, I, Paiania","Financial analysis, reporting and review of de...",Postgraduate degree required. Economics or Fin...,0,1,1,0
8936,Customer Service Associate - Part Time,"CA, ON, Peterborough",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,0,1,0,0
8937,Sales Manager,"UA, 61, Ternopil","Responsibilitiesactive sales, realization of c...",Main requirementsexperience with Sales (2+ yea...,0,0,1,0
8938,Administrative Assistant,"US, CA, Santa Ana",In addition to clerical and administrative du...,,0,0,0,1


In [12]:
# print(merged_df.isnull().sum())
print(df1.duplicated().sum())
print(df2.duplicated().sum())
print(merged_df.duplicated().sum())

df1_unique = df1.drop_duplicates()
df2_unique = df2.drop_duplicates()
overlap_count = pd.merge(df1_unique, df2_unique, how='inner').shape[0]
print(overlap_count)
print(len(df2))

314
105
9337
8921
8940


In [None]:
##### Oct 30 Flora #####

# Explore "description"
missing_desc = merged_df[merged_df['description'].isna()]
# print(missing_desc[['description', 'fraudulent', 'requirements', 'location']])

### Explore "location" ###
# Extract country and state
pattern1 = r'(^[A-Z]{2},\s*[A-Z0-9]{1,3})'
merged_df['country_state'] = merged_df['location'].str.extract(pattern1, expand=False)
# Extract country
pattern2 = r'(^[A-Z]{2})'
merged_df['country'] = merged_df['location'].str.extract(pattern2, expand=False)
# Manage Remote jobs
merged_df['is_remote'] = merged_df['location'].str.lower().str.contains('remote|work from home', na=False) & merged_df['country_state'].isna()
merged_df.loc[merged_df['is_remote'], 'country_state'] = "Remote"
merged_df.loc[merged_df['is_remote'], 'country'] = "Remote"
merged_df.drop(columns=["is_remote"], inplace=True)

# New column - location mask
merged_df["has_location"] = merged_df["location"].notna()
# New column - detailed location mask - something beyond just the country code
merged_df["has_location_details"] = (merged_df["location"].str.lower().str.strip() == merged_df["country"].str.lower().str.strip()).fillna(False)


# unique_outliers = outliers[["location"]].drop_duplicates()
# print("Outlier rows:")
# print(len(outliers["location"].unique()))
# # unique_outliers.to_csv("location_outliers")
# print(unique_outliers)

42       US
173      US
230      US
368      US
392      US
         ..
17816    US
2566     US
4370     US
5584     GB
6858     BH
Name: location, Length: 98, dtype: object


In [None]:
# Make new feature that has binary value for whether requirements was missing or not
merged_df["has_requirements"] = merged_df["requirements"].notna()
print(merged_df["has_requirements"].head(10))

# Merge description with requirements so they are in one new feature - called description_and_requirements
merged_df["description_and_requirements"] = merged_df["description"].fillna("") + merged_df["requirements"].fillna("")
print(merged_df["description_and_requirements"].head(5))

0     True
1     True
2     True
3     True
4     True
5    False
6     True
7     True
8     True
9     True
Name: has_requirements, dtype: bool
0    Food52, a fast-growing, James Beard Award-winn...
1    Organised - Focused - Vibrant - Awesome!Do you...
2    Our client, located in Houston, is actively se...
3    THE COMPANY: ESRI – Environmental Systems Rese...
4    JOB TITLE: Itemization Review ManagerLOCATION:...
Name: description_and_requirements, dtype: object
