# Prepare

In [1]:
#  data manipulation
import pandas as pd
import numpy as np

# natural language processing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import re

# split data
from sklearn.model_selection import train_test_split

# Quiet all warnings
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

import wrangle

**Get data**

In [2]:
# Load the review data through the project-local `wrangle` module
# (its implementation is not shown in this notebook).
glassdrs = wrangle.combine_all_files()
# Preview the first rows of the loaded frame
glassdrs.head()

Unnamed: 0,url,pros,cons
0,https://www.glassdoor.com/Reviews/Memorial-Her...,Great coworkers Wonderful Work environment Tho...,"Overall, I love working at this hospital. Ther..."
1,https://www.glassdoor.com/Reviews/Chicago-Publ...,Great pay and benefits including union support...,"Heavy work load, long hours. Huge Class Sizes ..."
2,https://www.glassdoor.com/Reviews/Gensler-Revi...,Even though it's the largest firm in the world...,No cons but I have to type something in this b...
3,https://www.glassdoor.com/Reviews/Inova-Review...,,
4,https://www.glassdoor.com/Reviews/Five-Guys-Bu...,"Great benefits , get incentives and growth wit...",It takes a while to receive raises and promoti...


**Remove any row with nulls**

In [3]:
# Remove rows that contain nulls. Note: dropna() with no `subset` argument drops a row
# when ANY column is null, not only when `pros` or `cons` is null.
glassdrs = glassdrs.dropna()

In [4]:
# Row and column counts of the frame after the null rows were dropped
glassdrs.shape

(1023, 3)

**Bin rating**

In [5]:
# # Define bin edges
# bin_edges = [1.0, 2.0, 3.0, 4.0, 5.0]

# # Define bin labels
# bin_labels = ['One', 'Two', 'Three', 'Four']

# # Bin the 'rating' column
# glassdrs['binned_rating'] = pd.cut(glassdrs['rating'], bins=bin_edges, labels=bin_labels, right=False)
# glassdrs.head(3)

**Clean strings**

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [6]:
def clean(string):
    """
    Lowercase a string, fold unicode characters to ASCII, and strip everything that is
    not a letter, digit, whitespace character or single quote.

    Parameters
    ----------
    string : str
        Raw text to normalize.

    Returns
    -------
    str
        Lowercased text containing only ASCII letters, digits, whitespace and single quotes.
        Removed characters are deleted (not replaced with a space), so "work-life" becomes
        "worklife".
    """
    # Decompose accented characters (NFKD), then drop anything with no ASCII equivalent
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Remove unwanted characters and put string in lowercase.
    # An explicit character class is used instead of \w because \w also matches the
    # underscore, which would otherwise survive the cleaning step.
    string = re.sub(r"[^a-zA-Z0-9'\s]", '', string).lower()

    return string

**Lemmatize**
- Apply lemmatization to the data

In [7]:
def lemmatize(string):
    """
    Return a copy of the input text in which every whitespace-separated word has been
    passed through NLTK's WordNetLemmatizer (called with its default arguments).

    Words are re-joined with single spaces, so runs of whitespace in the input collapse.
    """
    # One lemmatizer instance serves the whole string
    wnl = nltk.stem.WordNetLemmatizer()

    # Lemmatize token by token and stitch the pieces back together
    return ' '.join(wnl.lemmatize(token) for token in string.split())


**Remove stop words**
- Remove all stop words from the data

In [8]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Strip stopwords from a string.

    The removal set starts as NLTK's English stopword list. Any `exclude_words` are taken
    out of that set first (so they are kept in the text), then any `extra_words` are added
    to it (so they are removed from the text). The input is split on whitespace and the
    surviving words are re-joined with single spaces.
    """
    # NLTK's stock list, minus the words the caller wants to keep
    blocked = set(stopwords.words('english')) - set(exclude_words or ())

    # Plus the caller's additional words to drop
    blocked.update(extra_words or ())

    # Walk the whitespace-separated tokens and keep those not in the removal set
    kept = []
    for token in string.split():
        if token in blocked:
            continue
        kept.append(token)

    return ' '.join(kept)

**Split data**
- Apply a 60% training, 20% validation, and 20% testing split to the data (random state 95).

In [9]:
def split_readmes(df, random_state=95):
    """
    Split a dataframe into train, validate, and test sets in 60/20/20 proportions.

    Parameters
    ----------
    df : DataFrame
        The data to split.
    random_state : int, default 95
        Seed passed to both train_test_split calls so the split is reproducible.

    Returns
    -------
    tuple
        (train, validate, test)
    """
    # Hold out 20% of all rows as the test set
    train_val, test = train_test_split(df, test_size=.2, random_state=random_state)

    # 25% of the remaining 80% is 20% of the total -> validate; train keeps the other 60%
    train, validate = train_test_split(train_val, test_size=.25, random_state=random_state)

    # Return the dataframe slices
    return train, validate, test

**Prepare data**

In [10]:
def prep_readmes(df, cols=None):
    """
    Clean, de-stopword and lemmatize one or more text columns, then split the dataframe.

    For every column name in `cols`, two columns are added to a copy of `df`:
    `<col>_cleaned_content` (the output of clean) and `<col>_lemmatized` (the cleaned text
    with stopwords removed and then lemmatized). The caller's dataframe is not modified.

    Parameters
    ----------
    df : DataFrame
        Dataframe holding the corpus columns.
    cols : list of str or str, optional
        Name(s) of the text column(s) to prepare. A single string is treated as one column
        name. When omitted or empty, no columns are added and the dataframe is only split.

    Returns
    -------
    tuple
        (train, validate, test) as produced by split_readmes.
    """
    # Avoid a shared mutable default, and accept a bare column name
    if cols is None:
        cols = []
    elif isinstance(cols, str):
        cols = [cols]

    # Work on a copy so the caller's dataframe is left untouched
    prepared = df.copy()

    for col in cols:
        # Clean every value in the column
        cleaned = prepared[col].apply(clean)

        # Name the new columns after the source column, so any number of columns is supported
        prepared[f'{col}_cleaned_content'] = cleaned

        # Remove stopwords first, then lemmatize what is left
        prepared[f'{col}_lemmatized'] = cleaned.apply(lambda text: lemmatize(remove_stopwords(text)))

    # Split the dataframe into train, validate, and test
    train, validate, test = split_readmes(prepared)

    # Return train, validate, and test dataframes
    return train, validate, test

In [11]:
# Prepare the `pros` and `cons` text columns and split the result into train/validate/test
train, val, test = prep_readmes(glassdrs, ["pros", "cons"])

In [12]:
# (rows, columns) of each of the three splits
train.shape, val.shape, test.shape

((613, 7), (205, 7), (205, 7))

In [13]:
# Preview the prepared training rows, including the added cleaned and lemmatized columns
train.head()

Unnamed: 0,url,pros,cons,pros_cleaned_content,pros_lemmatized,cons_cleaned_content,cons_lemmatized
556,https://www.glassdoor.com/Reviews/Medline-Indu...,"Flexible schedule, fast paced, independent wor...","Departments can be siloed, e-commerce could us...",flexible schedule fast paced independent work ...,flexible schedule fast paced independent work ...,departments can be siloed ecommerce could use ...,department siloed ecommerce could use improvem...
946,https://www.glassdoor.com/Reviews/OfficeMax-Re...,"Supportive management, community involvement, ...",Can be hard to move up No cons that i can thin...,supportive management community involvement ge...,supportive management community involvement ge...,can be hard to move up no cons that i can thin...,hard move con think living wage sofl pay lacki...
361,https://www.glassdoor.com/Reviews/Pier-1-Impor...,Solid place to work at Pier one was a great st...,No complaints great company and management I’m...,solid place to work at pier one was a great st...,solid place work pier one great store work exc...,no complaints great company and management im ...,complaint great company management im sorry st...
238,https://www.glassdoor.com/Reviews/UScellular-R...,"Genuine people and culture, fair pay and benef...",No major cons come to mind Tough to have work ...,genuine people and culture fair pay and benefi...,genuine people culture fair pay benefit proact...,no major cons come to mind tough to have work ...,major con come mind tough work life balance u ...
893,https://www.glassdoor.com/Reviews/Discount-Tir...,"Decent pay, great management, very well mainta...",Facial hair is not allowed. Must be clean shav...,decent pay great management very well maintain...,decent pay great management well maintained eq...,facial hair is not allowed must be clean shave...,facial hair allowed must clean shaven manageme...


In [2]:
# NOTE(review): this rebinds train/val/test, replacing the frames returned by prep_readmes above.
# Presumably wrangle_readmes() is the module version of the same pipeline -- confirm in wrangle.py.
train, val, test = wrangle.wrangle_readmes()

In [4]:
# Load the company ratings; the relative path is resolved against the current working directory.
# NOTE(review): the file appears to carry an unnamed index column (it would load as "Unnamed: 0");
# consider index_col=0 if that column is not needed -- confirm against the CSV.
rating = pd.read_csv("company_ratings.csv")

In [9]:
# (rows, columns) of the test split
test.shape

(88, 8)

In [8]:
# Attach ratings to the test reviews by matching `name` (left) to `employer_name` (right).
# how="inner" keeps only rows with a match in both frames, so reviews whose company has no
# rating row are dropped. The merged frame is displayed here but not assigned to a variable.
pd.merge(left = test, right=rating, how="inner", left_on="name", right_on="employer_name")

Unnamed: 0.1,url,pros,cons,name,pros_cleaned_content,pros_lemmatized,cons_cleaned_content,cons_lemmatized,Unnamed: 0,employer_name,rating
0,https://www.glassdoor.com/Reviews/Kindred-Hosp...,kindred is a great place to work Great place t...,health insurance choice is limit There are no ...,Kindred Hospitals,kindred is a great place to work great place t...,kindred great place work great place work flex...,health insurance choice is limit there are no ...,health insurance choice limit con underpay imm...,695,Kindred Hospitals,3.2
1,https://www.glassdoor.com/Reviews/Techtronic-I...,"Promote from within, ambitious culture, hands ...",Minimal connection with team members You are h...,Techtronic Industries North America,promote from within ambitious culture hands on...,promote within ambitious culture hand experien...,minimal connection with team members you are h...,minimal connection team member rely trust walm...,971,Techtronic Industries North America,3.7
2,https://www.glassdoor.com/Reviews/Magna-Intern...,Excellent health care benefits and 401k match ...,"Too many hours, needs to hire more people. Mag...",Magna International,excellent health care benefits and 401k match ...,excellent health care benefit 401k match magna...,too many hours needs to hire more people magna...,many hour need hire people magna electronics o...,813,Magna International,3.7
3,https://www.glassdoor.com/Reviews/Acosta-Revie...,"Set your own schedule, Great manager, Teamwork...",Can get boring and you have to be a self start...,Acosta,set your own schedule great manager teamwork p...,set schedule great manager teamwork perfect st...,can get boring and you have to be a self start...,get boring self starter sometimes working full...,884,Acosta,3.3
4,https://www.glassdoor.com/Reviews/Avis-Budget-...,Exciting to be in an international airport. Ve...,Unmotivated management. Could do better withou...,Avis Budget Group,exciting to be in an international airport ver...,exciting international airport self motivated ...,unmotivated management could do better without...,unmotivated management could better without le...,731,Avis Budget Group,3.4
...,...,...,...,...,...,...,...,...,...,...,...
65,https://www.glassdoor.com/Reviews/CDK-Global-R...,"People, leadership, culture, work-life balance...","For some, the pace of change could be too fast...",CDK Global,people leadership culture worklife balance gre...,people leadership culture worklife balance gre...,for some the pace of change could be too fast ...,pace change could fast that's organization tod...,712,CDK Global,3.0
66,https://www.glassdoor.com/Reviews/Parker-Hanni...,Opportunities for new college graduates Good e...,Base pay Location Jobs might sometimes be out ...,Parker Hannifin,opportunities for new college graduates good e...,opportunity new college graduate good employee...,base pay location jobs might sometimes be out ...,base pay location job might sometimes scope pa...,724,Parker Hannifin,3.8
67,https://www.glassdoor.com/Reviews/University-o...,"Transparent, honest, supportive executive team...","lots of work to do, many hours Nothing that I ...",University of Phoenix,transparent honest supportive executive team t...,transparent honest supportive executive team w...,lots of work to do many hours nothing that i h...,lot work many hour nothing seen con love worki...,947,University of Phoenix,4.1
68,https://www.glassdoor.com/Reviews/Cadence-Desi...,"- Large, stable company with resources - Encou...",- Lots of legacy software with extremely poor ...,Cadence Design Systems,large stable company with resources encourag...,large stable company resource encouragement pa...,lots of legacy software with extremely poor o...,lot legacy software extremely poor documentati...,849,Cadence Design Systems,4.3


## Preparation actions taken

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.
- Lemmatize
- Remove stop words
- 60, 20, 20 split