In [19]:
import pandas as pd
import numpy as np
import string

pd.options.display.max_rows = 50

## Cleaning CSV Files

**Read in Data**

In [41]:
df1 = pd.read_csv('data/glassdoor_ratings1_62.csv')
df2 = pd.read_csv('data/glassdoor_ratings2_37.csv')
df3 = pd.read_csv('data/glassdoor_ratings3_199.csv')
df4 = pd.read_csv('data/glassdoor_ratings5_797.csv')
df5 = pd.read_csv('data/glassdoor_ratings6_846.csv')
df6 = pd.read_csv('data/glassdoor_ratings7_453.csv')  
df7 = pd.read_csv('data/glassdoor_ratings8_873.csv') #scrape # 8

#Combine data frames
data = pd.concat([df1, df2, df3, df4, df5, df6, df7])

print('Length of data set: ',len(data))
print('Number of duplicates: ', data['NAME'].duplicated().sum())

Length of data set:  3268
Number of duplicates:  1246


### Initial Cleaning
 - Handle null values
 - reset indices 
 - drop duplicates

In [42]:
## CHECK TO BE SURE THIS DOESNT NEED TO HAPPEN EARLIER

#Reset index to see all rows
data.reset_index(inplace=True)


In [8]:
# #change '  ' to null values
# data['FULL_DESCRIPTION'] = data['FULL_DESCRIPTION'].mask(data['FULL_DESCRIPTION']== '  ')

In [43]:
#Drop null values from RATING_DI
print('NaN values dropped from RATING_DI: ', data.RATING_DI.isna().sum())
data.dropna(subset=['RATING_DI'], inplace=True)

#Drop null values from DESCRIPTION
print('NaN values dropped from DESCRIPTION: ', data.DESCRIPTION.isna().sum())
data.dropna(subset=['DESCRIPTION'], inplace=True)




NaN values dropped from RATING_DI:  153
NaN values dropped from DESCRIPTION:  90


In [44]:
print('Number of duplicates: ', data['NAME'].duplicated().sum())
print('NaN values dropped from DESCRIPTION: ', data.DESCRIPTION.isna().sum())


Number of duplicates:  1088
NaN values dropped from DESCRIPTION:  0


In [45]:
#Drop duplicate companies by 'NAME' - drops size from 1941 to 1188
data.drop_duplicates(subset='NAME', keep='last', inplace=True)
data = data.drop(['index'], axis=1)

In [46]:
#reset index to begin at 0
data.reset_index(inplace=True)

#drop extra indices
# data = data.drop(['index', 'level_0'], axis=1)
data = data.drop(['index'], axis=1)


In [47]:
#Check out dataframe
print('Shape: ', data.shape)
data.head()

Shape:  (1937, 14)


Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,


### Inspecting data distribution

In [48]:
##inspect size column
data.SIZE.value_counts()


1001 to 5000 Employees     858
10000+ Employees           712
5001 to 10000 Employees    356
501 to 1000 Employees        5
Unknown                      4
201 to 500 Employees         1
51 to 200 Employees          1
Name: SIZE, dtype: int64

In [49]:
data.INDUSTRY.value_counts().to_frame()

Unnamed: 0,INDUSTRY
Health Care Services & Hospitals,213
Colleges & Universities,155
Computer Hardware & Software,76
Insurance Carriers,63
IT Services,62
...,...
Auto Repair & Maintenance,1
Ticket Sales,1
Gas Stations,1
Farm Support Services,1


### New column adding Description and Mission together 

In [50]:
# change null values in MISSION and DESCRIPTION to ' '
data['MISSION'].fillna(' ', inplace=True)
data['DESCRIPTION'].fillna(' ', inplace=True)

data['FULL_DESCRIPTION'] = np.add(data['DESCRIPTION'], data['MISSION'])

data.head()

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION,FULL_DESCRIPTION
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...,We work with leaders across sectors to tackle ...
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",,"Cornell is a private, Ivy League university re..."
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",,"Dude, let's hit the beach! And then we'll hit ..."
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",,"Sun, the beach, Nobel Prize winners, and a Sou..."
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,,Attending Iowa State University of Science and...


In [7]:
### WORK ON THIS ###
# how to add a space before "Mission:"??

### Clean up NUM_REVIEWS
- return numbers only

In [51]:
def get_digits(s):
    #remove commas
    exclude = set(string.punctuation)
    s = ''.join(num for num in s if num not in exclude)

    #return digits only
    return s.split()[2]

In [52]:
#apply function to NUM_REVIEWS and change datatype to int
data['NUM_REVIEWS'] = data['NUM_REVIEWS'].apply(get_digits).astype(int)

In [53]:
#removing companies with too few reviews
data = data[(data['NUM_REVIEWS'] > 100)]

### New Column basted on RATING_DI

**drop lower than xx reviews**

In [54]:
# Checking out rating stats
data.describe()

Unnamed: 0,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS
count,1924.0,1924.0,1924.0,1924.0,1924.0,1924.0,1924.0,1924.0
mean,3.899064,3.985655,3.814085,3.694335,3.460863,3.682848,3.593399,1216.289501
std,0.299122,0.429166,0.380497,0.392565,0.388888,0.403465,0.36629,2710.347067
min,2.4,1.0,1.9,1.7,1.9,2.3,2.3,101.0
25%,3.7,3.7,3.6,3.4,3.2,3.4,3.3,243.0
50%,3.9,4.0,3.8,3.7,3.4,3.7,3.6,460.5
75%,4.1,4.3,4.1,4.0,3.7,3.9,3.8,1225.25
max,4.9,5.0,5.0,4.9,4.9,4.9,4.8,74411.0


In [55]:
data.RATING_DI.mean()

3.9856548856548857

In [86]:
((len(data[(data['RATING_DI'] > 4.0)]) / (len(data))) * 100).loc[:4]


AttributeError: 'float' object has no attribute 'loc'

In [89]:
print('Percentage of companpies below threshold:', 
      (len(data[(data['RATING_DI'] < 4.0)]) / (len(data)))*100)
print('Percentage of companpies above threshold:', 
      (len(data[(data['RATING_DI'] > 4.0)]) / (len(data)))*100)

Percentage of companpies below threshold: 43.555093555093556
Percentage of companpies above threshold: 46.1018711018711


In [90]:
# New column for Target variable - companies with Diversity and Inclusion Rating over 4.0
data['TARGET'] = np.where(data['RATING_DI'] > 4.0 , 1, 0)

In [91]:
data #1924 companies

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION,FULL_DESCRIPTION,TARGET
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,5198,We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...,We work with leaders across sectors to tackle ...,1
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,1765,"Cornell is a private, Ivy League university re...",,"Cornell is a private, Ivy League university re...",1
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,1049,"Dude, let's hit the beach! And then we'll hit ...",,"Dude, let's hit the beach! And then we'll hit ...",1
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,1398,"Sun, the beach, Nobel Prize winners, and a Sou...",,"Sun, the beach, Nobel Prize winners, and a Sou...",1
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,1203,Attending Iowa State University of Science and...,,Attending Iowa State University of Science and...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1932,Valeo,10000+ Employees,"Paris, France",Transportation Equipment Manufacturing,3.6,3.9,3.4,3.0,2.9,3.5,3.4,2492,"Valeo is an automotive supplier, partner to al...",,"Valeo is an automotive supplier, partner to al...",0
1933,TVA,10000+ Employees,"Knoxville, TN",Energy,3.6,3.6,3.3,3.8,3.0,4.1,3.2,378,"For more than eight decades, the Tennessee Val...",Mission: Mission:\nTVA was built for the peopl...,"For more than eight decades, the Tennessee Val...",0
1934,Dierbergs Markets,1001 to 5000 Employees,"Chesterfield, MO",Grocery Stores & Supermarkets,3.6,3.8,3.3,3.5,3.6,2.5,3.4,209,Discover your hometown career at the Hometown ...,,Discover your hometown career at the Hometown ...,0
1935,Deckers Brands,1001 to 5000 Employees,"Goleta, CA","Department, Clothing, & Shoe Stores",3.6,4.0,3.9,3.7,3.2,3.5,3.0,293,Deckers Brands is a global leader in designing...,,Deckers Brands is a global leader in designing...,0


## Save final CSV

In [92]:
data.to_csv('data/glassdoor_ratings_main.csv', index=False)