# Hand labeling review sentences - Part 2

In this notebook, we will label one thousand more sentences from the CONs sections of reviews. These will be the last sentences we hand label.

Once again, I have to be careful to not actually show the content of the reviews.

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)

%matplotlib inline 
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import time

import re

import nltk
import nltk.data
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords


Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
#time how long it takes to load the raw reviews
start_time = time.time()

reviews = pd.read_csv(filepath_or_buffer='glassdoor_reviews_2.csv')

print('Took ', time.time()-start_time, ' seconds.', sep='')

  interactivity=interactivity, compiler=compiler, result=result)


Took 73.07739281654358 seconds.


In [3]:
#keep an untouched (deep) copy of the loaded reviews so the cleaning steps
#can always restart from it without re-reading the CSV
reviews_original = reviews.copy(deep=True)

In [4]:
start_time = time.time()

#each review's "Author Title" should be of format "Employee Status - Job Title"
# for example, "Current Employee - Senior Engineer"

#split every "Author Title" once with the vectorized string accessor
#(a missing title stays NaN here instead of raising AttributeError on .split)
title_parts = reviews_original['Author Title'].str.split(' - ')

#only consider reviews of proper format "Employee Status - Job Title" (exactly 2 parts)
#could be omitting some job titles with 'dash' in name,
#but decreasing number of reviews from 2631927 to 2615691 (<1% change, so don't care)
has_two_parts = title_parts.str.len() == 2

#the two halves of "Author Title" (NaN wherever the title is missing)
employee_status = title_parts.str[0]
job_title = title_parts.str[1]

#10 reviews have incorrect "Employee Status"
#("Employee Status" not like "Current Employee", "Former Intern", etc.)
invalid_statuses = ['module.emp-review.current-', #4 reviews
                    'module.emp-review.former-']  #6 reviews
keep_review = has_two_parts & ~employee_status.isin(invalid_statuses)

#filter once and take an explicit copy: reviews_original stays untouched and the
#column assignments below write to an independent DataFrame, not to a slice
reviews = reviews_original[keep_review].copy()

#break up "Author Title" into two columns: "Employee Status" and "Job Title"
reviews['Employee Status'] = employee_status[keep_review]
reviews['Job Title'] = job_title[keep_review]

#add extra column that states if employee is current or former employee
#(first word of "Employee Status")
reviews['current_or_former'] = reviews['Employee Status'].str.split(' ').str[0]

print('Took ' + str(time.time()-start_time) + ' seconds.')

Took 56.39093804359436 seconds.


In [5]:
#one row per company, with the number of reviews that company received
#('Company Id' and 'count' are the columns used further down)
companies = pd.read_csv(filepath_or_buffer='reviewed_companies.csv')

In [6]:
#restrict the analysis to companies with at least 100 reviews
#    (roughly a quarter of all companies reach that threshold)

minimum_reviews_to_consider = 100

#keep the companies whose review count meets the threshold
companies_at_least_min_reviews = companies.loc[
    companies['count'].ge(minimum_reviews_to_consider)
]

In [7]:
#keep only the reviews whose company is among the companies with at least 100 reviews
reviews_at_least_min_reviews = reviews.loc[
    reviews['Company Id'].isin(companies_at_least_min_reviews['Company Id'])
]

In [8]:
#keep only reviews whose 'Author Country' is USA
reviews_at_least_min_reviews_usa = reviews_at_least_min_reviews.loc[
    reviews_at_least_min_reviews['Author Country'].eq('USA')
]

In [9]:
#how many reviews to draw
size_of_sample = 500000

#draw size_of_sample USA reviews (companies with at least 100 reviews)
#fixed random_state keeps the draw reproducible
#reset_index() moves each review's original row label into a new 'index' column
reviews_sample = (
    reviews_at_least_min_reviews_usa
    .sample(random_state=21, n=size_of_sample)
    .reset_index()
)

In [None]:
size_of_pros_sample = 1000

#original review index, PROs and CONs for the first 1000 sampled reviews
#(.loc slicing is label based and inclusive, hence the -1)
reviews_small_sample = (
    reviews_sample
    .loc[:size_of_pros_sample-1, ['index','PROs','CONs']]
    .copy()
    .reset_index(drop=True)
)

In [12]:
#original review index and CONs for the first size_of_pros_sample sampled reviews
#.copy() makes this an independent DataFrame, so the column assignments made on it
#later write to it directly rather than to a slice of reviews_sample
#(avoids SettingWithCopyWarning)
reviews_small_sample_cons = reviews_sample.loc[:size_of_pros_sample-1,['index','CONs']].copy()

In [13]:
#force every CONs entry to be a string (any non-string value becomes its str() text)
reviews_small_sample_cons.loc[:,'CONs'] = reviews_small_sample_cons['CONs'].apply(str)

def replace_period(a_string):
    '''
    Rewrites every '.' and every '+' as '. ' (period followed by a space),
    so that the sentence tokenizer sees a boundary after each of them.

    Args:
        a_string: text to rewrite (str)

    Returns:
        the rewritten text (str)
    '''
    #each character is mapped independently, so inserted periods are not re-expanded
    substitutions = str.maketrans({'.': '. ', '+': '. '})

    return a_string.translate(substitutions)

#put a space after every period (and turn '+' into '. ') so TEXT1.TEXT2 gets split
reviews_small_sample_cons.loc[:,'CONs'] = reviews_small_sample_cons['CONs'].apply(replace_period)

#split each CONs text into a list of sentences
reviews_small_sample_cons.loc[:,'CONs_sentences'] = reviews_small_sample_cons['CONs'].apply(sent_tokenize)

In [14]:
def cons_to_df(series):
    '''
    Expands one review into a DataFrame with one row per sentence of its CONs.

    Args:
    series: one review, supporting key lookup of
        'index'          (index of review),
        'CONs'           (full CONs text),
        'CONs_sentences' (list of the sentences in CONs)

    Returns:
    DataFrame ((number of sentences in CONs) x 4)

        Example return:
        index     CONs                              sent_number   CONs_sentence
        525143    Bad pay! I hated the managers.    0             Bad pay!
        525143    Bad pay! I hated the managers.    1             I hated the managers.
    '''

    sentences = series['CONs_sentences']
    column_order = ['index','CONs','sent_number','CONs_sentence']

    #the scalar 'index' and 'CONs' values are repeated on every sentence row
    sentence_rows = pd.DataFrame({'sent_number':range(len(sentences)),
                                  'CONs_sentence':sentences,
                                  'index':series['index'],
                                  'CONs':series['CONs']})

    return sentence_rows[column_order]

In [15]:
#one small DataFrame per review (a row per CONs sentence), stacked into a single
#DataFrame with a fresh 0..n-1 index
cons_sentences_df = pd.concat(
    list(map(lambda idx: cons_to_df(reviews_small_sample_cons.loc[idx,:]),
             range(len(reviews_small_sample_cons)))),
    ignore_index=True,
)

In [17]:
def input_cons_categories(df):
    '''
    Enables user to classify cons as belonging into different categories.

    For every row of df (in index order) the row label and the sentence are shown
    and the typed answer is stored in the 'categories' column of that row.
    Typing "break" stops the session; so does an interrupted or closed prompt
    (KeyboardInterrupt / EOFError), so labels entered so far are kept.

    Args:
        DataFrame with sentences from reviews (needs a 'CONs_sentence' column).
        It is modified in place.

    Returns:
        The same DataFrame, with 'categories' filled in for every labeled row.
    '''

    #label of the last row that actually received a category (None if there is none)
    last_labeled_idx = None

    for idx in df.index:
        print('\n')
        print(idx)
        prompt = '\n' + str(df.loc[idx,'CONs_sentence']) + '\n\n Category CV, WLB, SM, CB, CO, or O (or "break"): '
        try:
            #strip stray spaces so that ' break' still stops and labels stay clean
            category = input(prompt).strip()
        except (EOFError, KeyboardInterrupt):
            #treat an interrupted prompt like "break" instead of losing the session
            category = 'break'

        if category == 'break':
            #report the tracked label rather than idx-1, which is wrong for
            #non-contiguous indexes and fails for non-integer ones
            print('Last index checked: {}'.format(last_labeled_idx))
            break

        df.loc[idx,'categories'] = category
        last_labeled_idx = idx

    return df

In [18]:
#sentences 1000-1099 (label-based slice, both ends included), as an independent copy
cons_sentences_df_1000_1099 = cons_sentences_df.loc[1000:1099,:].copy()

#placeholder label for every sentence until it is hand labeled
#(a scalar is broadcast to all rows; no row-by-row apply needed)
cons_sentences_df_1000_1099['categories'] = 'ToBeFilledIn'

In [20]:
#interactively hand label CONs sentences 1000-1099
cons_sentences_df_1000_1099 = input_cons_categories(df=cons_sentences_df_1000_1099)

In [21]:
#persist the hand labels for sentences 1000-1099
cons_sentences_df_1000_1099.to_csv(path_or_buf='cons_sentences_df_1000_1099.csv')

In [22]:
#sentences 1100-1999 (label-based slice, both ends included), as an independent copy
cons_sentences_df_1100_1999 = cons_sentences_df.loc[1100:1999,:].copy()

#placeholder label for every sentence until it is hand labeled
#(a scalar is broadcast to all rows; no row-by-row apply needed)
cons_sentences_df_1100_1999['categories'] = 'ToBeFilledIn'

In [24]:
#interactively hand label CONs sentences 1100-1999
#    an error occurred while inputting, so only sentences 1100-1584 got hand labeled here
cons_sentences_df_1100_1999 = input_cons_categories(df=cons_sentences_df_1100_1999)

In [26]:
#load the saved results of labeling 1100-1584
#(the CSV's 'Unnamed: 0' column becomes the row index)
cons_sentences_df_1100_1584 = pd.read_csv(
    filepath_or_buffer='index_categories_1100_1584.csv',
    index_col='Unnamed: 0',
)


In [30]:
#persist the hand labels for sentences 1100-1584
cons_sentences_df_1100_1584.to_csv(path_or_buf='cons_sentences_df_1100_1584.csv')

In [31]:
#remaining sentences 1585-1999 (label-based slice, both ends included), as an independent copy
cons_sentences_df_1585_1999 = (
    cons_sentences_df_1100_1999
    .loc[1585:1999, :]
    .copy()
)


In [None]:
#interactively hand label CONs sentences 1585-1999
cons_sentences_df_1585_1999 = input_cons_categories(df=cons_sentences_df_1585_1999)

In [35]:
#persist the hand labels for sentences 1585-1999
cons_sentences_df_1585_1999.to_csv(path_or_buf='cons_sentences_df_1585_1999.csv')