# Prepare

In [57]:
#  data manipulation
import pandas as pd
import numpy as np

# natural language processing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import re

# split data
from sklearn.model_selection import train_test_split

# Quiet all warnings
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

**Get data**

In [2]:
glassdrs = pd.read_csv("../glassdoor_part3.csv", index_col=0)
glassdrs.head()

Unnamed: 0,url,pros,cons
0,https://www.glassdoor.com/Reviews/Memorial-Her...,Great coworkers Wonderful Work environment Tho...,"Overall, I love working at this hospital. Ther..."
1,https://www.glassdoor.com/Reviews/Chicago-Publ...,Great pay and benefits including union support...,"Heavy work load, long hours. Huge Class Sizes ..."
2,https://www.glassdoor.com/Reviews/Gensler-Revi...,Even though it's the largest firm in the world...,No cons but I have to type something in this b...
3,https://www.glassdoor.com/Reviews/Inova-Review...,,
4,https://www.glassdoor.com/Reviews/Five-Guys-Bu...,"Great benefits , get incentives and growth wit...",It takes a while to receive raises and promoti...


**Remove any row with nulls**

In [3]:
# remove any rows with nulls (dropna() without a subset checks every column, not only pros and cons)
glassdrs = glassdrs.dropna()

In [4]:
glassdrs.shape

(341, 3)

**Bin rating**

In [6]:
# Define bin edges. pd.cut is called with right=False below, so every bin is
# closed on the left and open on the right: [1, 2), [2, 3), [3, 4), [4, inf).
# The last edge is infinity rather than 5.0 because a finite 5.0 edge would
# leave a rating of exactly 5.0 outside every bin (it would be binned as NaN).
bin_edges = [1.0, 2.0, 3.0, 4.0, float('inf')]

# Define bin labels (one label per interval between consecutive edges)
bin_labels = ['One', 'Two', 'Three', 'Four']

# Bin the 'rating' column
# NOTE(review): the dataframe preview earlier in this notebook lists only the
# url, pros and cons columns - confirm a 'rating' column exists before running this cell.
glassdrs['binned_rating'] = pd.cut(glassdrs['rating'], bins=bin_edges, labels=bin_labels, right=False)
glassdrs.head(3)

**Clean strings**

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace or a single quote.

In [7]:
def clean(string):
    """
    This function puts a string in lowercase, normalizes any unicode characters to ASCII, and removes
    every character that isn't an ASCII letter, a digit, whitespace, or a single quote.

    Unwanted characters are deleted, not replaced by a space, so 'work-life' becomes 'worklife'.
    Returns the cleaned string.
    """
    # Normalize unicode characters: NFKD splits accented letters into base letter + accent,
    # and the ascii encode with 'ignore' then drops everything that is not plain ASCII
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Lowercase first so the character class below only needs to list a-z
    string = string.lower()

    # Remove unwanted characters. The allowed characters are spelled out instead of using \w
    # because \w also matches the underscore, which is neither a letter nor a number.
    string = re.sub(r"[^a-z0-9'\s]", '', string)

    return string

**Lemmatize**
- Apply lemmatization to the data

In [8]:
def lemmatize(string):
    """
    Lemmatize every whitespace-separated word of the input string and return the lemmatized
    words joined back together with single spaces.
    """
    # A fresh WordNet lemmatizer is built on each call
    wnl = nltk.stem.WordNetLemmatizer()

    # Split on whitespace, lemmatize word by word, and re-join into one string
    return ' '.join(wnl.lemmatize(token) for token in string.split())


**Remove stop words**
- Remove all stop words from the data

In [9]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Return a copy of the input string with stopwords removed.

    The removal set starts as nltk's stock English stopword list. Words in 'exclude_words' are
    taken out of that set (so they survive in the output), then words in 'extra_words' are added
    to it (so they are removed as well). The string is split on whitespace and the surviving words
    are re-joined with single spaces.
    """
    # Start from the stock nltk stopwords
    words_to_remove = set(stopwords.words('english'))

    # Stock stopwords the caller wants to keep in the text
    if exclude_words:
        words_to_remove = words_to_remove - set(exclude_words)

    # Additional words the caller wants removed; applied after the exclusions
    if extra_words:
        words_to_remove = words_to_remove | set(extra_words)

    # Keep only the words that are not in the removal set
    kept_words = filter(lambda token: token not in words_to_remove, string.split())

    return ' '.join(kept_words)

**Split data**
- Apply a 60% training, 20% Validation, and 20% testing split to the data. (Random state 95)

In [10]:
def split_readmes(df):
    """
    Takes in a dataframe and performs a 60/20/20 split. Outputs a train, validate, and test dataframe.
    Both splits use random_state=95.
    """
    # Hold out 20% of all rows as the test set
    train_val, test = train_test_split(df, test_size=.2, random_state=95)

    # Take 25% of the remaining 80% (i.e. 20% of all rows) as validate, leaving 60% for train
    train, validate = train_test_split(train_val, test_size=.25, random_state=95)

    # Return the dataframe slices
    return train, validate, test

**Prepare data**

In [11]:
def prep_readmes(df, cols=None):
    """
    Takes in the dataframe and the name(s) of the column(s) that contain the corpus data. For every
    column '<col>' it adds two columns:

    - '<col>_cleaned_content': the column's values passed through clean()
    - '<col>_lemmatized': the cleaned values with stopwords removed and then lemmatized

    It then performs a train-validate-test split with split_readmes() and returns train, validate,
    and test.

    'cols' may be a list of column names or a single column name given as a string. With no columns
    the dataframe is split unchanged. The caller's dataframe is not modified: the new columns are
    added to the copy returned by DataFrame.assign.
    """
    # Avoid a mutable default argument, and accept a single column name as a plain string
    if cols is None:
        cols = []
    elif isinstance(cols, str):
        cols = [cols] if cols else []

    for col in cols:
        # Clean each value in the column
        cleaned_rows = [clean(value) for value in df[col].values]

        # Name the output column after the source column rather than after its position in 'cols',
        # so any column name, order, or number of columns is labelled correctly
        cleaned_col = f'{col}_cleaned_content'
        df = df.assign(**{cleaned_col: cleaned_rows})

        # Remove stopwords from the cleaned text, then lemmatize what is left
        df[f'{col}_lemmatized'] = df[cleaned_col].apply(lambda text: lemmatize(remove_stopwords(text)))

    # Split the dataframe (60/20/20, as implemented by split_readmes)
    train, validate, test = split_readmes(df)

    # Return train, validate, and test dataframes
    return train, validate, test

In [12]:
train, val, test = prep_readmes(glassdrs, ["pros", "cons"])

In [13]:
train.shape, val.shape, test.shape

((204, 7), (68, 7), (69, 7))

In [14]:
train.head()

Unnamed: 0,url,pros,cons,pros_cleaned_content,pros_lemmatized,cons_cleaned_content,cons_lemmatized
99,https://www.glassdoor.com/Reviews/Sally-Beauty...,Good place to work overall Good staff loved wo...,Challenging some days to keep up with all the ...,good place to work overall good staff loved wo...,good place work overall good staff loved worki...,challenging some days to keep up with all the ...,challenging day keep change hard holiday hour ...
32,https://www.glassdoor.com/Reviews/Carnegie-Mel...,"good client list, gain experience Lots of grea...","sometimes disorganized, team composition not a...",good client list gain experience lots of great...,good client list gain experience lot great stu...,sometimes disorganized team composition not al...,sometimes disorganized team composition always...
315,https://www.glassdoor.com/Reviews/TransUnion-R...,Great culture in engineering. Very organized. ...,Not much opportunity to transfer teams if you ...,great culture in engineering very organized te...,great culture engineering organized tech lead ...,not much opportunity to transfer teams if you ...,much opportunity transfer team wanted work dif...
132,https://www.glassdoor.com/Reviews/Block-Review...,- Remote - Mental Health taken seriously - Sup...,No cons. Very good experience. The lack of a c...,remote mental health taken seriously suppor...,remote mental health taken seriously supportiv...,no cons very good experience the lack of a coh...,con good experience lack coherent strategic fr...
100,https://www.glassdoor.com/Reviews/Motorola-Mob...,The company location is great Work-life balanc...,Limited pay just wish they can give a little b...,the company location is great worklife balance...,company location great worklife balance good w...,limited pay just wish they can give a little b...,limited pay wish give little bit limited hr ru...


## Preparation actions taken

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace or a single quote.
- Lemmatize
- Remove stop words
- 60, 20, 20 split