In [1]:
import pandas as pd
import numpy as np
import string

# Cap the number of rows pandas prints for a frame at 50
pd.set_option('display.max_rows', 50)

## Cleaning CSV Files

**Read in Data**

In [2]:
# Glassdoor ratings CSV files, listed in the order they are stacked below
csv_paths = [
    'data/glassdoor_ratings1_62.csv',
    'data/glassdoor_ratings2_37.csv',
    'data/glassdoor_ratings3_199.csv',
    'data/glassdoor_ratings5_797.csv',
    'data/glassdoor_ratings6_846.csv',
    'data/glassdoor_ratings7_453.csv',
]
df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in csv_paths)

# Stack the six frames row-wise; each frame keeps its own row labels for now
data = pd.concat([df1, df2, df3, df4, df5, df6])

# Report the combined size and how many company names occur more than once
print('Length of data set: ', len(data))
print('Number of duplicates: ', data['NAME'].duplicated().sum())

Length of data set:  2395
Number of duplicates:  883


### Clean indices and duplicates

In [3]:
# Number the rows 0..n-1; the per-file row labels move into an 'index' column
data = data.reset_index()

# Quick look at the combined frame
print(data.shape)
data.head()

(2395, 15)


Unnamed: 0,index,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",Our drive to change the world unites us!\n\nMi...,
2,2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,851 Reviews",We’re a diverse collective of thinkers and doe...,
3,3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
4,4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,804 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...


In [4]:
# Keep one row per company: for a repeated 'NAME' only the last occurrence survives
# (2395 -> 1512 rows in the recorded run), then discard the leftover 'index' column
data = data.drop_duplicates(subset='NAME', keep='last').drop(columns=['index'])

In [5]:
# Renumber the rows 0..n-1 after de-duplication. drop=True discards the old
# labels directly, so no throwaway 'index' column is created and then removed.
data = data.reset_index(drop=True)

# Sanity check
print('Shape: ', data.shape)

Shape:  (1512, 14)


### New column combining Description and Mission

In [6]:
# Replace missing text with a single space so the two columns can be concatenated.
# The result is assigned back to the column: fillna(inplace=True) on data['col']
# acts on an intermediate Series, which pandas deprecates and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data['MISSION'] = data['MISSION'].fillna(' ')
data['DESCRIPTION'] = data['DESCRIPTION'].fillna(' ')

# Plain string concatenation with no separator; a row with neither text becomes '  '
data['FULL_DESCRIPTION'] = data['DESCRIPTION'] + data['MISSION']

data.head()

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION,FULL_DESCRIPTION
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...,We work with leaders across sectors to tackle ...
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",,"Cornell is a private, Ivy League university re..."
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",,"Dude, let's hit the beach! And then we'll hit ..."
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",,"Sun, the beach, Nobel Prize winners, and a Sou..."
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,,Attending Iowa State University of Science and...


In [7]:
# TODO: insert a space between DESCRIPTION and MISSION in FULL_DESCRIPTION so that
# "Mission:" does not run straight on from the last word of the description.

### Clean up null values

In [8]:
# A FULL_DESCRIPTION of exactly two spaces carries no text; turn those entries into NaN
data['FULL_DESCRIPTION'] = data['FULL_DESCRIPTION'].where(data['FULL_DESCRIPTION'] != '  ')

In [9]:
# Remove rows without a RATING_DI value, reporting how many are affected
print('NaN values dropped from RATING_DI: ', data['RATING_DI'].isna().sum())
data = data.dropna(subset=['RATING_DI'])

# Same for rows without a FULL_DESCRIPTION (counted after the RATING_DI rows are gone)
print('NaN values dropped from FULL_DESCRIPTION: ', data['FULL_DESCRIPTION'].isna().sum())
data = data.dropna(subset=['FULL_DESCRIPTION'])


NaN values dropped from RATING_DI:  63
NaN values dropped from FULL_DESCRIPTION:  39


### Clean up NUM_REVIEWS
- return numbers only

In [10]:
def get_digits(s):
    """Return the review count embedded in a label as a string of digits.

    Punctuation (such as thousands separators) is stripped first, then the
    first whitespace-separated token made up solely of ASCII digits is
    returned, so 'See All 18,362 Reviews' gives '18362'. The number is found
    by content rather than by a fixed token position, so a label such as
    '18,362 Reviews' works too and a non-numeric word is never returned.

    Parameters
    ----------
    s : str
        Label text containing a number.

    Returns
    -------
    str
        The digits of the first numeric token; the caller converts it to int.

    Raises
    ------
    TypeError
        If ``s`` is not a string (for example NaN from a missing cell).
    ValueError
        If the label contains no all-digit token.
    """
    if not isinstance(s, str):
        raise TypeError('expected a string label, got {!r}'.format(s))

    # remove commas and any other punctuation
    cleaned = s.translate(str.maketrans('', '', string.punctuation))

    # return the first token that consists of digits only
    for token in cleaned.split():
        if all(ch in string.digits for ch in token):
            return token

    raise ValueError('no review count found in {!r}'.format(s))

In [11]:
# Reduce each review label to its number, then store the column as integers
data['NUM_REVIEWS'] = data['NUM_REVIEWS'].map(get_digits).astype(int)

In [12]:
# Keep only companies with more than MIN_REVIEWS reviews.
# .copy() makes the filtered result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
MIN_REVIEWS = 100
data = data[data['NUM_REVIEWS'] > MIN_REVIEWS].copy()

### New Column based on RATING_DI

**Companies with 100 or fewer reviews have already been dropped**

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns:
# the rating columns and NUM_REVIEWS
data.describe()

Unnamed: 0,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS
count,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0,1404.0
mean,3.970869,4.031695,3.892949,3.755484,3.541667,3.73312,3.657835,1389.367521
std,0.316496,0.426927,0.392372,0.407912,0.404142,0.417601,0.379747,3042.118459
min,2.4,1.0,1.9,1.7,1.9,2.3,2.3,101.0
25%,3.8,3.8,3.7,3.5,3.3,3.5,3.4,253.0
50%,4.0,4.1,3.9,3.8,3.5,3.8,3.7,530.0
75%,4.2,4.3,4.1,4.0,3.8,4.0,3.9,1509.5
max,4.9,5.0,5.0,4.9,4.9,4.9,4.8,74066.0


In [14]:
# Average Diversity and Inclusion rating across the remaining companies
data['RATING_DI'].mean()

4.031695156695156

In [15]:
# Number of companies whose RATING_DI is above 4.03 (the mean, rounded)
int((data['RATING_DI'] > 4.03).sum())

725

In [16]:
# Binary target: 1 when the Diversity and Inclusion rating exceeds 4.0, otherwise 0
data['TARGET'] = (data['RATING_DI'] > 4.0).astype(int)

In [17]:
# Display the frame to eyeball the result; pandas abbreviates long frames in the output
data

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION,FULL_DESCRIPTION,TARGET
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,5198,We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...,We work with leaders across sectors to tackle ...,1
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,1765,"Cornell is a private, Ivy League university re...",,"Cornell is a private, Ivy League university re...",1
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,1049,"Dude, let's hit the beach! And then we'll hit ...",,"Dude, let's hit the beach! And then we'll hit ...",1
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,1398,"Sun, the beach, Nobel Prize winners, and a Sou...",,"Sun, the beach, Nobel Prize winners, and a Sou...",1
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,1203,Attending Iowa State University of Science and...,,Attending Iowa State University of Science and...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507,dressbarn,10000+ Employees,"Mahwah, NJ","Department, Clothing, & Shoe Stores",3.8,2.7,3.7,3.6,3.0,3.0,2.8,781,"At dressbarn, in everything we do, we inspire ...",,"At dressbarn, in everything we do, we inspire ...",0
1508,JLG Industries\nPart of Oshkosh Corporation,1001 to 5000 Employees,"Mc Connellsburg, PA",Industrial Manufacturing,3.9,3.2,4.0,3.6,3.3,4.1,3.8,211,"JLG Industries, Inc. is the world’s leading de...",,"JLG Industries, Inc. is the world’s leading de...",0
1509,CallidusCloud\nAcquired by SAP,1001 to 5000 Employees,"Dublin, CA",Enterprise Software & Network Solutions,3.8,4.7,3.7,3.5,3.4,3.7,3.6,270,"CallidusCloud, now part of SAP, is the global ...",,"CallidusCloud, now part of SAP, is the global ...",1
1510,Chevron Phillips Chemical,5001 to 10000 Employees,"The Woodlands, TX",Chemical Manufacturing,3.8,3.1,3.5,3.6,2.9,3.9,3.0,253,"You may not realize it, but you’ve likely used...",,"You may not realize it, but you’ve likely used...",0


## Save final CSV

In [18]:
# Write the cleaned frame to disk, leaving the row index out of the file
data.to_csv(path_or_buf='data/glassdoor_ratings_main.csv', index=False)