# Prepare

In [1]:
#  data manipulation
import pandas as pd
import numpy as np

# natural language processing
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
import re

# split data
from sklearn.model_selection import train_test_split

# Quiet all warnings
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("..")

**Get data**

In [2]:
# Load the raw sample, using the first CSV column as the dataframe index
glassdrs = pd.read_csv("../sample_raw_data.csv", index_col=0)
# Preview the first rows
glassdrs.head()

Unnamed: 0,url,name,rating,pros,cons
0,https://www.glassdoor.com/Reviews/Amazon-Revie...,Amazon,3.7,Gain useful experience and great benefits Real...,Not much room for advancement You have to be s...
1,https://www.glassdoor.com/Reviews/Deloitte-Rev...,Deloitte,4.0,"Well, it's BigD isn't it? Everyone on the ball...","Not really a con, but a very large, structured..."
2,https://www.glassdoor.com/Reviews/Walmart-Revi...,Walmart,3.3,Employee discount 10% and time and half holida...,Customers getting in the way Understaffing iss...
3,https://www.glassdoor.com/Reviews/Target-Revie...,Target,3.6,,
4,https://www.glassdoor.com/Reviews/McDonald-s-R...,McDonald's,3.5,"Very nice people, sometimes fun Great job and ...",Can be boring at times I’m going back in a min...


**Remove any row with nulls**

In [3]:
# drop every row that contains a null in any column, which removes reviews missing pros or cons text
glassdrs = glassdrs.dropna()

**Bin rating**

In [4]:
# Define bin edges. Because right=False below, every bin is left-closed/right-open
# ([1, 2), [2, 3), [3, 4), ...). The last edge is therefore open-ended (np.inf) so
# that a perfect 5.0 rating lands in the top bin instead of becoming NaN.
bin_edges = [1.0, 2.0, 3.0, 4.0, np.inf]

# Define bin labels (one per interval, named after the interval's lower bound)
bin_labels = ['One', 'Two', 'Three', 'Four']

# Bin the 'rating' column
glassdrs['binned_rating'] = pd.cut(glassdrs['rating'], bins=bin_edges, labels=bin_labels, right=False)
glassdrs.head(3)

Unnamed: 0,url,name,rating,pros,cons,binned_rating
0,https://www.glassdoor.com/Reviews/Amazon-Revie...,Amazon,3.7,Gain useful experience and great benefits Real...,Not much room for advancement You have to be s...,Three
1,https://www.glassdoor.com/Reviews/Deloitte-Rev...,Deloitte,4.0,"Well, it's BigD isn't it? Everyone on the ball...","Not really a con, but a very large, structured...",Four
2,https://www.glassdoor.com/Reviews/Walmart-Revi...,Walmart,3.3,Employee discount 10% and time and half holida...,Customers getting in the way Understaffing iss...,Three


**Clean strings**

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace or a single quote.

In [8]:
def clean(string):
    """
    This function puts a string in lowercase, folds unicode characters to plain ASCII, and removes every
    character that is not an ASCII letter, a digit, whitespace, or a single quote.

    Characters with no ASCII form after NFKD decomposition (for example curly quotes) are dropped rather
    than transliterated.
    """
    # Normalize unicode characters: decompose accented letters, then discard anything that is not ASCII
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Lowercase first so a single a-z range covers every letter, then remove unwanted characters.
    # An explicit character class is used instead of \w because \w also matches the underscore,
    # which is neither a letter nor a number.
    string = re.sub(r"[^a-z0-9'\s]", '', string.lower())

    return string

**Lemmatize**
- Apply lemmatization to the data

In [9]:
def lemmatize(string):
    """
    Split the input string on whitespace, lemmatize every word with NLTK's WordNet lemmatizer, and return
    the lemmas joined back together with single spaces.
    """
    # Build the lemmatizer
    wordnet = nltk.stem.WordNetLemmatizer()

    # Lemmatize word by word and stitch the results back into one string
    return ' '.join(wordnet.lemmatize(token) for token in string.split())


**Remove stop words**
- Remove all stop words from the data

In [11]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Return the input string with stopwords removed.

    The removal set starts from nltk's English stopword list. Words in 'exclude_words' are taken out of
    that list (so they are kept in the text), and words in 'extra_words' are then added to it (so they are
    removed from the text). The string is split on whitespace and re-joined with single spaces.
    """
    # Stock stopwords from nltk
    stock = set(stopwords.words('english'))

    # Stock stopwords the caller wants to keep in the text
    keep = set(exclude_words) if exclude_words else set()

    # Additional words the caller wants treated as stopwords
    additions = set(extra_words) if extra_words else set()

    # Final set of words to drop; additions are applied last, so they win over 'exclude_words'
    blocked = (stock - keep) | additions

    # Keep every whitespace-separated token that is not blocked
    return ' '.join(token for token in string.split() if token not in blocked)

**Split data**
- Apply a 60% training, 20% Validation, and 20% testing split to the data. (Random state 95)

In [12]:
def split_readmes(df, random_state=95):
    """
    Takes in a dataframe and performs a 60/20/20 split. Outputs a train, validate, and test dataframe.

    'random_state' seeds both shuffles so the split is reproducible; it defaults to 95.
    """
    # Hold out 20% of the rows for test...
    train_val, test = train_test_split(df, test_size=.2, random_state=random_state)

    # ...then 25% of the remaining 80% (20% of the total) for validate, leaving 60% for train
    train, validate = train_test_split(train_val, test_size=.25, random_state=random_state)

    # Return the dataframe slices
    return train, validate, test

**Prepare data**

In [20]:
def prep_readmes(df, cols=None):
    """
    Takes in the dataframe and the name(s) of the column(s) that contain the corpus data. For every column
    'col' it adds a '<col>_cleaned_content' column of cleaned text and a '<col>_lemmatized' column with
    stopwords removed and the remaining words lemmatized. It then performs a train-validate-test split and
    returns train, validate, and test.

    'cols' may be a list of column names or a single column name; when omitted, no text columns are
    processed and the dataframe is only split.
    """
    # Avoid a mutable default argument, and accept a single column name as well as a list of names
    if cols is None:
        cols = []
    elif isinstance(cols, str):
        cols = [cols]

    for col in cols:
        # Clean each value in the column
        cleaned_rows = [clean(text) for text in df[col].values]

        # Name the new columns after the source column rather than its position in 'cols', so any
        # number of columns (in any order) is handled. assign() returns a new dataframe, so the
        # caller's dataframe is not modified.
        cleaned_col = f'{col}_cleaned_content'
        df = df.assign(**{cleaned_col: cleaned_rows})

        # Remove stopwords from the cleaned text, then lemmatize what is left
        df[f'{col}_lemmatized'] = df[cleaned_col].apply(lambda x: lemmatize(remove_stopwords(x)))

    # Split the dataframe (60/20/20)
    train, validate, test = split_readmes(df)

    # Return train, validate, and test dataframes
    return train, validate, test

In [21]:
# Clean, lemmatize, and split the 'pros' and 'cons' review text
train, val, test = prep_readmes(glassdrs, ["pros", "cons"])

In [22]:
# Check the dimensions of each split
train.shape, val.shape, test.shape

((5, 10), (2, 10), (2, 10))

In [23]:
# Preview the training split, including the new cleaned and lemmatized columns
train.head()

Unnamed: 0,url,name,rating,pros,cons,binned_rating,pros_cleaned_content,pros_lemmatized,cons_cleaned_content,cons_lemmatized
4,https://www.glassdoor.com/Reviews/McDonald-s-R...,McDonald's,3.5,"Very nice people, sometimes fun Great job and ...",Can be boring at times I’m going back in a min...,Three,very nice people sometimes fun great job and g...,nice people sometimes fun great job great work...,can be boring at times im going back in a minu...,boring time im going back minute need little h...
2,https://www.glassdoor.com/Reviews/Walmart-Revi...,Walmart,3.3,Employee discount 10% and time and half holida...,Customers getting in the way Understaffing iss...,Three,employee discount 10 and time and half holiday...,employee discount 10 time half holiday pay adv...,customers getting in the way understaffing iss...,customer getting way understaffing issue negat...
6,https://www.glassdoor.com/Reviews/Accenture-Re...,Accenture,4.0,"Large company, lots of support, good parental ...","Raleigh is small office, fewer local clients t...",Four,large company lots of support good parental le...,large company lot support good parental leave ...,raleigh is small office fewer local clients th...,raleigh small office fewer local client office...
0,https://www.glassdoor.com/Reviews/Amazon-Revie...,Amazon,3.7,Gain useful experience and great benefits Real...,Not much room for advancement You have to be s...,Three,gain useful experience and great benefits real...,gain useful experience great benefit really sm...,not much room for advancement you have to be s...,much room advancement self motivated one hold ...
7,https://www.glassdoor.com/Reviews/IBM-Reviews-...,IBM,3.9,Great company. Fantastic trading program. Move...,No complaints. Great company work and people l...,Three,great company fantastic trading program move a...,great company fantastic trading program move p...,no complaints great company work and people lo...,complaint great company work people low pay co...


## Preparation actions taken

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace or a single quote.
- Lemmatize
- Remove stop words
- 60, 20, 20 split