In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np 
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split

# Lift every pandas display limit (columns, rows, total width, cell width) so
# frames are rendered in full rather than truncated.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

# Two CSV inputs are loaded side by side; judging by the file names `df2` is a
# cleaned variant of `df` -- TODO confirm how the two relate.
df = pd.read_csv('glassdoor_dataset_v10.csv')
df2 = pd.read_csv('glassdoor_cleaned.csv')


In [None]:
# Preview the first two rows of df2 to check its column layout.
df2.head(2)

Unnamed: 0,id,industryName,country_code,employee_count,review_date_time,rating_overall,rating_ceo,rating_business_outlook,rating_work_life_balance,rating_culture_and_values,rating_diversity_and_inclusion,rating_senior_leadership,rating_recommend_to_friend,rating_career_opportunities,rating_compensation_and_benefits,is_current_job,length_of_employment,job_title,location,pros,cons,summary,count_helpful,company_id,org_uuid,amount_of_funding_rounds_until_now,total_funding_until_now,rating_ceo_imputed,rating_business_outlook_imputed,rating_recommend_to_friend_imputed,employment_status_imputed,year,has_stress,DURING_COVID,POST_COVID,during_covid_1_6_months,during_covid_2_6_months,during_covid_3_6_months,employment_status_FREELANCE,employment_status_INTERN,employment_status_PART_TIME,employment_status_REGULAR,employment_status_RESERVE,employment_status_SELF_EMPLOY,employment_status_TEMPORARY,employment_status_UNKNOWN,in_each_period,processed_location,DV_has_stress,DV_has_stress_MENTALROBERTA
0,6036,Internet & Web Services,USA,10000+,2023-07-01 23:09:36.113,4,DISAPPROVE,NEGATIVE,5,3,4,3,POSITIVE,5,4,False,2,Senior Consultant,"Salt Lake City, UT","Great environment, lots of great and meaningful networking opportunities","Last in, first out for layoffs.",Would consider returning,0.0,6036.0,05554f65-6aa9-4dd1-6271-8ce2d60f10c4,3,8108000000.0,DISAPPROVE,NEGATIVE,POSITIVE,REGULAR,2023,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,,0,0.0002
1,6036,Internet & Web Services,USA,10000+,2023-05-09 10:51:07.920,4,DISAPPROVE,NEGATIVE,3,3,4,3,POSITIVE,3,4,False,2,Cyber Risk & Financial Advisory Analyst,"Detroit, MI","Worked with knowledgeable and helpful professionals, great networking, ability to up-skill professionally at Deloitte's expense, well-being and hybrid reimbursement subsidies, corporate expense card","You have to ""find"" a job within a job - you must be staffed on a project once your training is complete, otherwise your utilization will go down. Your resource manager and Coach are not super helpful throughout this process for new-hires, it is essentially up to you to network and reach out to people to find a project. Deloitte also announced layoffs in April and the whole process in which they decided to lay people off was rather unprofessional, and they gave very little explanation as to why a specific person was being let go - they cited a combination of performance and economic/business conditions, but the firm was still profitable YoY.",Cyber Risk Analyst in Risk and Financial Advisory,3.0,6036.0,05554f65-6aa9-4dd1-6271-8ce2d60f10c4,3,8108000000.0,DISAPPROVE,NEGATIVE,POSITIVE,REGULAR,2023,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,United States of America,0,0.0003


### First step is cleaning and preprocessing text

### Now we preprocess the pros/cons/summary columns 

In [None]:
%pip install spacy nltk tqdm
%python -m spacy download en_core_web_sm
%pip install spacy

import re
import spacy
import multiprocessing as mp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm

# Shared, module-level NLP resources used by preprocess_single().
# The spaCy pipeline is loaded with NER, parser and tagger switched off;
# the cleaning code only reads token.text and token.is_alpha.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])
# NLTK's English stopword list, held in a set for constant-time membership tests.
stop_set = set(stopwords.words("english"))
# A single Porter stemmer instance reused for every token.
stemmer = PorterStemmer()

# Patterns are compiled once here and reused for every text.
#
# url_regex has three alternatives, tried in order:
#   1. anything starting with http:// or https://
#   2. anything starting with "www."
#   3. a bare "name.tld" run (2-6 letter suffix) plus any trailing non-space
# NOTE(review): alternative 3's character class contains "@", so it also
# matches e-mail addresses, and it matches any "word.word" run such as a
# missing space after a full stop ("pay.bad").
url_regex = re.compile(
    r'https?://\S+|www\.\S+|[-a-zA-Z0-9@:%._+~#=]{2,256}\.[a-zA-Z]{2,6}\S*'
)
# email_regex matches local-part@domain.tld (2-6 letter suffix) between word
# boundaries.
email_regex = re.compile(
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}\b'
)

# Contraction-expansion rules as (compiled pattern, replacement template)
# pairs, applied in list order by fast_uncontract(). Group 1 is the (empty)
# leading word boundary and group 2 the stem that is kept; both are copied
# back into the replacement so the original capitalisation survives.
contractions = [
    # "<aux>n't" -> "<aux> not"
    (re.compile(r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't"), r"\1\2 not"),
    # "'ll" -> " will", "'re" -> " are", "'ve" -> " have"
    (re.compile(r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll"), r"\1\2 will"),
    (re.compile(r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re"), r"\1\2 are"),
    (re.compile(r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve"), r"\1\2 have"),
    # Irregular forms whose stem changes when expanded.
    (re.compile(r"(\b)([Cc]a)n't"), r"\1\2n not"),
    (re.compile(r"(\b)([Ii])'m"), r"\1\2 am"),
    (re.compile(r"(\b)([Ll]et)'s"), r"\1\2 us"),
    (re.compile(r"(\b)([Ii]t)'s"), r"\1\2 is"),
    (re.compile(r"(\b)([Tt]here)'s"), r"\1\2 is"),
    (re.compile(r"(\b)([Ww])on't"), r"\1\2ill not"),
    (re.compile(r"(\b)([Ss])han't"), r"\1\2hall not"),
    (re.compile(r"(\b)([Yy])(?:'all|a'll)"), r"\1\2ou all"),
]


def fast_uncontract(text):
    """Expand common English contractions in ``text``.

    Every rule in ``contractions`` is applied once, in list order, e.g.
    "can't" -> "can not", "it's" -> "it is", "won't" -> "will not".

    The rules are written against the ASCII apostrophe, so the typographic
    apostrophe U+2019 is first normalised to "'"; without that, contractions
    typed as "don’t" or "it’s" would pass through unexpanded. As a side
    effect every U+2019 in the returned text is an ASCII apostrophe.

    Parameters:
        text (str): raw input text.

    Returns:
        str: the text with the known contractions expanded.
    """
    text = text.replace("\u2019", "'")
    for pattern, repl in contractions:
        text = pattern.sub(repl, text)
    return text


def preprocess_single(text):
    """Clean one text and return it as a space-joined string of stems.

    Steps, in order:
      1. expand contractions with fast_uncontract();
      2. replace e-mail addresses with the placeholder "EMAIL" and URLs with
         "URL" (the placeholders are then tokenized like any other word);
      3. tokenize with spaCy and keep purely alphabetic tokens only;
      4. lowercase, drop words found in ``stop_set``, and Porter-stem the rest.

    Parameters:
        text (str): raw input text.

    Returns:
        str: the surviving stems joined by single spaces ("" if none survive).
    """
    # 1. Normalize text
    text = fast_uncontract(text)
    # E-mails must be replaced before URLs: url_regex's bare-domain
    # alternative allows "@" and would otherwise consume every address,
    # leaving email_regex nothing to match.
    text = email_regex.sub("EMAIL", text)
    text = url_regex.sub("URL", text)

    # 2. Tokenize only. Just token.text and token.is_alpha are read below and
    #    both come from the tokenizer, so make_doc() is used to skip the
    #    remaining pipeline components that nlp(text) would run.
    doc = nlp.make_doc(text)

    # 3. Filter and stem: non-alphabetic tokens (punctuation, digits) are
    #    dropped, stopwords are checked on the lowercased form.
    lowered = (token.text.lower() for token in doc if token.is_alpha)
    tokens = [stemmer.stem(word) for word in lowered if word not in stop_set]

    return " ".join(tokens)


def process_batch(batch):
    """Clean every text in ``batch`` with preprocess_single().

    This is the unit of work handed to the multiprocessing pool: it takes one
    list of raw texts and returns the list of cleaned texts in the same order.
    """
    return list(map(preprocess_single, batch))



KeyboardInterrupt: 

In [None]:
def preprocess_column(series, workers=mp.cpu_count()):
    """Clean every entry of a text column and return the results as a list.

    The texts are split into batches of 5000 and each batch is cleaned with
    process_batch(), in worker processes when more than one is useful.

    Parameters:
        series (pandas.Series): column of raw texts. Missing values are
            treated as empty text.
        workers (int | None): upper bound on worker processes. The default is
            the CPU count captured when this function was defined; ``None``
            means "use the current CPU count". Values below 1 fall back to
            in-process execution.

    Returns:
        list[str]: one cleaned string per input row, in the original order.
    """
    # astype(str) alone turns missing values into the literal text "nan",
    # which would survive cleaning as a bogus token; blank those rows instead.
    missing_flags = series.isna().tolist()
    texts = [
        "" if is_missing else text
        for text, is_missing in zip(series.astype(str).tolist(), missing_flags)
    ]
    if not texts:
        return []

    batch_size = 5000
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

    if workers is None:
        workers = mp.cpu_count()
    # More processes than batches would only sit idle.
    workers = max(1, min(workers, len(batches)))

    if workers == 1:
        # Run in-process: no pool start-up and nothing has to be pickled.
        results = [process_batch(batch) for batch in tqdm(batches)]
    else:
        # NOTE(review): under the "spawn" start method worker processes must
        # be able to import process_batch; functions defined only in a
        # notebook cell may not be -- move them to a .py module if the pool
        # hangs or raises AttributeError.
        with mp.Pool(workers) as pool:
            # imap yields results in submission order, so row order is kept.
            results = list(
                tqdm(pool.imap(process_batch, batches), total=len(batches))
            )

    # Flatten the per-batch lists back into one list aligned with `series`.
    return [item for sublist in results for item in sublist]


# Add a cleaned companion column, named "<column>_clean", for each of the
# three free-text review fields.
for text_column in ("pros", "cons", "summary"):
    df[f"{text_column}_clean"] = preprocess_column(df[text_column])