# Analyzing Sentiment in Glassdoor Employer Reviews

In [1]:
import os
import time
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import joblib

import torch

import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
load_path = "data/glassdoor/glassdoor_split_stoken.parquet"

In [13]:
%time df = pd.read_parquet(load_path)

Wall time: 1min 50s


In [14]:
len(df['review_num'].unique())

6754236

# Load Models

In [5]:
# load tfidf vectorizer
tfidf_vectorizer = joblib.load("assets/glassdoor/tfidf_vectorizer.jl")

In [6]:
# load nmf_fro
nmf_fro = joblib.load("assets/glassdoor/nmf_fro.jl")

In [15]:
# # load tf vectorizer
# tf_vectorizer = joblib.load("assets/glassdoor/tf_vectorizer.jl")

In [16]:
# # load lda
# lda = joblib.load("assets/glassdoor/lda.jl")

In [7]:
# Topics in NMF model (Frobenius norm):
# Work/Life Balance : work life balance environment place hours hard flexible home fun
# Culture           : great benefits place environment culture team job workers experience coworkers
# Experience        : good benefits job experience salary environment training pretty health workers
# Management        : management upper team poor staff senior new communication bad training
# Growth Potential  : company culture growing growth working small years like grow opportunities
# Coworkers         : people nice working don friendly really like lot hire smart
# Hours             : pay low hours benefits decent time better competitive long flexible
# Respectfulness    : employees time care don like job listen treat better make

nmf_fro_topic_labels = {
    0 : "Work/Life Balance",
    1 : "Culture",
    2 : "Compensation",
    3 : "Coworkers",
    4 : "Management",
    5 : "Respectfulness",
    6 : "Benefits",
    7 : "Flexibility"                  
}

lda_topic_labels = {
    0 : "Management",
    1 : "Training",
    2 : "Compensation",
    3 : "Coworkers",
    4 : "Culture",
    5 : "Benefits",
    6 : "Flexibility",
    7 : "Work/Life Balance"                  
}

# Assign Topics

... and drop short sentence fragments

... and reorder columns

In [8]:
def drop_short_sents(df, column, num_char):
    df.drop(df[df[column].map(len) < num_char].index, inplace=True)

In [9]:
def assign_topic(df, column, vectorizer, topic_model):
    vectorized = vectorizer.transform(df[column])
    topic_scores = topic_model.transform(vectorized)
    df['topic'] = np.argmax(topic_scores, axis=1)

In [9]:
def append_df_to_JSON(file, df):
    json_df = df.to_json(
        # path_or_buf=file_path,
        orient='records',
        lines=True
    )
    with open(file, 'a', encoding='utf8') as f:
        f.write(json_df)
        f.write("\n")

In [33]:
column_order = [
    'review_num',
    'type',
    'topic',
    'text',
    'num_words'
]

save_path = "data/glassdoor/glassdoor_topics.json.gz"

for next_df in tqdm(data_loader):
    drop_short_sents(next_df, 'text', 3)
    assign_topic(next_df, 'text', tfidf_vectorizer, nmf_fro)
    next_df['num_words'] = next_df['text'].str.split().str.len()
    next_df = next_df[column_order]
    append_df_to_JSON(save_path, next_df)

KeyboardInterrupt: 

In [10]:
drop_short_sents(df, 'text', 3)
print(len(df['review_num'].unique()))

6161582


In [15]:
%time assign_topic(df, 'text', tfidf_vectorizer, nmf_fro)
print(len(df['review_num'].unique()))

violation: 1.0
violation: 0.03526301313398225
violation: 0.0003064962303773821
violation: 2.4429237300247963e-06
Converged at iteration 5
6754236


In [34]:
df

Unnamed: 0,review_num,text,type,topic,num_targets
0,0,The people to work with,1,3,0
0,0,Hire more hardworking people and schedule fair...,3,3,0
0,0,And always understaff.,2,0,0
0,0,11-14 hour shifts.,2,7,0
1,1,"Ensure a solid foundation of training, allowan...",3,7,0
...,...,...,...,...,...
6754234,6754234,Long hours with little compensation,2,7,0
6754235,6754235,Really enjoyed being able to spend time with a...,1,1,0
6754235,6754235,Gained valuable experience in the field with p...,1,7,0
6754235,6754235,If you don't like working outside or getting d...,2,7,0


In [39]:
tqdm.pandas()

In [40]:
df['num_words'] = df['text'].progress_apply(lambda x: len(x.split()))
# %time df['num_words'] = df['text'].str.split().str.len()
print(len(df['review_num'].unique()))

HBox(children=(FloatProgress(value=0.0, max=40569801.0), HTML(value='')))


6754236


In [41]:
column_order = [
    'review_num',
    'type',
    'topic',
    'text',
    'num_words',
    'num_targets'
]

%time df = df[column_order]
print(len(df['review_num'].unique()))

Wall time: 9.52 s
6754236


In [42]:
df

Unnamed: 0,review_num,type,topic,text,num_words,num_targets
0,0,1,3,The people to work with,5,0
0,0,3,3,Hire more hardworking people and schedule fair...,19,0
0,0,2,0,And always understaff.,3,0
0,0,2,7,11-14 hour shifts.,3,0
1,1,3,7,"Ensure a solid foundation of training, allowan...",23,0
...,...,...,...,...,...,...
6754234,6754234,2,7,Long hours with little compensation,5,0
6754235,6754235,1,1,Really enjoyed being able to spend time with a...,17,0
6754235,6754235,1,7,Gained valuable experience in the field with p...,10,0
6754235,6754235,2,7,If you don't like working outside or getting d...,16,0


In [43]:
print(len(df['review_num'].unique()))

6754236


In [63]:
save_path = "data/glassdoor/glassdoor_topics.parquet"
df.to_parquet(save_path)

# Convert JSON to Stata

In [3]:
save_path = "data/glassdoor/glassdoor_topics.json.gz"

df_to_save = pd.read_json(save_path, lines=True)

In [23]:
df_to_save['topic_labels'] = df_to_save['topic'].map(lda_topic_labels)
df_to_save

Unnamed: 0,review_id,type,topic,text,num_words,topic_labels
0,0,1,3,The people to work with,5,Coworkers
1,0,3,3,Hire more hardworking people and schedule fair...,19,Coworkers
2,0,2,0,And always understaff.,3,Management
3,0,2,7,11-14 hour shifts.,3,Work/Life Balance
4,1,2,4,Extensive changes in upper level management ov...,10,Culture
5,1,1,1,A company in rapid growth and expansion,7,Training
6,1,3,6,"Ensure a solid foundation of training, allowan...",23,Flexibility
7,2,2,4,Managers do not manage.,4,Culture
8,2,1,2,"Get to know the industry, profit margins, etc.",8,Compensation
9,2,3,5,"Take care of your employees, better.",6,Benefits


In [24]:
df_to_save.to_clipboard(excel=True)

In [3]:
df_to_save['num_words'] = df_to_save['text'].str.split().str.len()

In [4]:
df_to_save.rename(columns={'review_num':'review_id'}, inplace=True)

In [5]:
df_to_save

Unnamed: 0,review_id,type,topic,text,num_words
0,0,1,3,The people to work with,5
1,0,3,3,Hire more hardworking people and schedule fair...,19
2,0,2,0,And always understaff.,3
3,0,2,7,11-14 hour shifts.,3
4,1,2,4,Extensive changes in upper level management ov...,10
...,...,...,...,...,...
26297935,4590139,1,2,"I had a 5 month assignment at a university, go...",14
26297936,4590139,1,6,Medical and Dental is available.,5
26297937,4590139,2,0,There is a lag between assignments and you hav...,15
26297938,4590140,1,7,Have had the opportunity to change positions e...,17


In [7]:
save_path = "data/glassdoor/glassdoor_topics_final.json.gz"

df_to_save.to_json(save_path, orient='records', lines=True, compression="gzip")

In [3]:
# df_to_save["text"] =  df_to_save["text"].str.replace('[^\x00-\x7F]','')

In [16]:
stata_path = "data/glassdoor/glassdoor_topics2.dta.gz"

df.to_stata(stata_path, write_index=False, version=118, compression='gzip')

In [17]:
import gzip
import shutil
with open("data/glassdoor/glassdoor_topics2.dta.gz", 'rb') as f_in:
    with gzip.open("data/glassdoor/glassdoor_topics3.dta.gz", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [10]:
df_stata = pd.read_stata("data/glassdoor/glassdoor_topics.dta", chunksize=1000)

In [11]:
count=0
for i, df in enumerate(df_stata):
    count+=len(df['review_num'].unique())
    if i % 1000 == 0:
        print(count)

270000
304000
392000
411000
885000
1097000
1386000
1904000
2019000
2033000
2047000
2189000
2509000
2874000
2962000
3385000
3442000
3467000
3662000
3667000
3896000
3995000
4041000
4754000
4771000
4984000
5113000
5637000
5722000
5859000
6105000
6227000


In [12]:
count

6786141

# Adding target words search

In [2]:
load_path = "data/glassdoor/glassdoor_topics.parquet"
df = pd.read_parquet(load_path)

In [3]:
from nltk.corpus import wordnet
import re

def all_synsets(word, pos=None):
    map = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV
    }
    if pos is None:
        pos_list = [wordnet.VERB, wordnet.ADJ, wordnet.NOUN, wordnet.ADV]
    else:
        pos_list = [map[pos]]
    ret = []
    for pos in pos_list:
        ret.extend(wordnet.synsets(word, pos=pos))
    return ret

def clean_senses(synsets):
    return [x for x in set(synsets) if '_' not in x]

def all_possible_synonyms(word, pos=None):
    ret = []
    for syn in all_synsets(word, pos=pos):
        ret.extend(syn.lemma_names())
    return clean_senses(ret)

In [4]:
targets = ['account', 'fraud', 'misreport', 'misrepresent']
synonyms = []
for target in targets:
    synonyms.extend(all_possible_synonyms(target))

In [5]:
set(synonyms).union(set(targets))

{'account',
 'accounting',
 'belie',
 'bill',
 'calculate',
 'chronicle',
 'cook',
 'describe',
 'dupery',
 'explanation',
 'fake',
 'faker',
 'falsify',
 'fraud',
 'fraudulence',
 'fudge',
 'history',
 'hoax',
 'humbug',
 'imposter',
 'impostor',
 'invoice',
 'manipulate',
 'misreport',
 'misrepresent',
 'pretender',
 'pseud',
 'pseudo',
 'put-on',
 'report',
 'score',
 'sham',
 'shammer',
 'story',
 'wangle'}

In [24]:
regex = '(' + '|'.join(set(synonyms).union(set(targets))) +')'
regex

'(shammer|account|sham|fraud|faker|misreport|pseud|imposter|impostor|pretender|humbug|fake|pseudo|misrepresent|fraudulence|put-on|hoax|dupery)'

In [25]:
%time df['num_targets'] = df['text'].str.count(regex, re.IGNORECASE)

Wall time: 9min 22s


In [None]:
df[json_loader['num_targets'] > 0]

In [32]:
# df['has_account']      = df['text'].str.lower().str.contains('account') * 1
# df['has_fraud']        = df['text'].str.lower().str.contains('fraud') * 1
# df['has_misreport']    = df['text'].str.lower().str.contains('misreport') * 1
# df['has_misrepresent'] = df['text'].str.lower().str.contains('misrepresent') * 1

In [28]:
# “account”, “fraud”, “misreport”, “misrepresent”
# df['contains_target_word'] = df['text'].str.lower().str.contains('(account|fraud|misreport|misrepresent)', regex=True) * 1

In [2]:
save_path="data/glassdoor/glassdoor_topics.parquet"
# df.to_parquet(save_path)
df = pd.read_parquet(save_path)

In [9]:
df.review_num = df.review_num + 1

In [12]:
df.drop('index', axis=1,inplace=True)

In [3]:
df

Unnamed: 0,review_num,type,topic,text,num_words,num_targets
0,1,1,3,The people to work with,5,0
1,1,3,3,Hire more hardworking people and schedule fair...,19,0
2,1,2,0,And always understaff.,3,0
3,1,2,7,11-14 hour shifts.,3,0
4,2,3,7,"Ensure a solid foundation of training, allowan...",23,0
...,...,...,...,...,...,...
40569796,6754235,2,7,Long hours with little compensation,5,0
40569797,6754236,1,1,Really enjoyed being able to spend time with a...,17,0
40569798,6754236,1,7,Gained valuable experience in the field with p...,10,0
40569799,6754236,2,7,If you don't like working outside or getting d...,16,0


In [59]:
df.reset_index(inplace=True)

In [46]:
df[df['text'].str.count('(\r|\n)') > 0]

Unnamed: 0,review_num,type,topic,text,num_words,num_targets


In [None]:
df.replace('(\r|\n)',  '. ', regex=True, inplace=True)

In [9]:
data_loader = pd.read_stata("data/glassdoor/glassdoor_topics_v2.dta", chunksize=10000)

In [10]:
for df in tqdm(data_loader):
    pass

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [11]:
df

Unnamed: 0,review_num,text,type,topic,num_words,num_targets
35860000,7672043,"HSI has excellent benefits, paid time off and ...",1,6,23,0
35860001,7672043,"We are the best in our business, realize that ...",3,7,12,0
35860002,7672043,Sold Nov 2014 and new management has changed t...,2,4,21,1
35860003,7672045,A very strategic agency with a great vision an...,1,4,15,0
35860004,7672045,It is a great place to work for as it provides...,1,0,19,0
...,...,...,...,...,...,...
35862215,7672513,A truly wonderful mission.,1,3,4,0
35862216,7672513,"Let your good people participate, genuinely, i...",3,3,11,0
35862217,7672514,Definitely a company where you can grow and le...,1,1,11,0
35862218,7672514,Friendly people and solid company culture.,1,3,6,0


In [2]:
data_path = "data/glassdoor/rev-anon.parquet"
df = pd.read_parquet(data_path)

In [3]:
df

Unnamed: 0,index,pros,cons,feedback
0,1,Team work and support. Room for personal and p...,You will need a vehicle in good working condition,
1,2,"Especially the warehouse positions, it isn't d...","Always changing how you do your job, and not a...",Get your shift managers a bit more hands on ex...
2,3,"Great people, great diverse environment",They need more flexible work arrangements,
3,4,1. Flexible Work Timing\r\n2. Work-Life Balanc...,Nothing that I know of,
4,5,The Individuals are mostly a joy to support,Lack of backbone by management..allow folks to...,Get strong managers and back them up!
...,...,...,...,...
7672509,7672510,"The more eager you are to learn, the more you ...",Interns don't get paid for the 'trial period'.,
7672510,7672511,Very passionate and dedicated group,Not a lot of balance,
7672511,7672512,"Great pay, bosses let you alone, loose mgmt.","Long hrs, stressful, and dirty but worth it fo...",Give assistance to techs
7672512,7672513,A truly wonderful mission. The most diverse st...,Delicate funding situation. Shifting political...,"Let your good people participate, genuinely, i..."
