# Cleaning CSV Files

In [1]:
import pandas as pd
import numpy as np
import string

pd.options.display.max_rows = 50

**Read in Data**

In [2]:
df1 = pd.read_csv('data/glassdoor_ratings1_62.csv')#, index_col=0) #index_col=0 to remove numeric index
df2 = pd.read_csv('data/glassdoor_ratings2_37.csv')#, index_col=0)
df3 = pd.read_csv('data/glassdoor_ratings3_199.csv')#, index_col=0)
df4 = pd.read_csv('data/glassdoor_ratings5_797.csv')#, index_col=0)
df5 = pd.read_csv('data/glassdoor_ratings6_846.csv')#, index_col=0)

#Combine data frames
data = pd.concat([df1, df2, df3, df4, df5])

print('Length of data set: ',len(data))
print('Number of duplicates: ', data.duplicated().sum())

Length of data set:  1941
Number of duplicates:  583


In [3]:
#Reset index to see all rows
data.reset_index(inplace=True)

#Check out dataframe
print(data.shape)
data.head()

(1941, 15)


Unnamed: 0,index,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,0,Google,10000+ Employees,"Mountain View, CA",Internet,4.5,4.4,4.4,4.3,4.1,4.5,4.3,"See All 18,362 Reviews",,
1,1,Microsoft,10000+ Employees,"Redmond, WA",Computer Hardware & Software,4.4,4.4,4.3,4.1,4.0,4.1,4.1,"See All 29,389 Reviews",Our drive to change the world unites us!\n\nMi...,
2,2,Apple,10000+ Employees,"Cupertino, CA",Computer Hardware & Software,4.3,4.4,4.3,3.7,3.7,4.3,3.8,"See All 20,851 Reviews",We’re a diverse collective of thinkers and doe...,
3,3,US Air Force,10000+ Employees,"Washington, DC",Federal Agencies,4.2,4.2,4.0,3.3,3.4,4.3,4.2,"See All 16,600 Reviews",The mission of the US Department of the Air Fo...,
4,4,Cisco Systems,10000+ Employees,"San Jose, CA",Computer Hardware & Software,4.2,4.3,4.3,4.2,3.8,4.1,3.9,"See All 21,804 Reviews","#WeAreCisco, where each person is unique, but ...",Mission: Cisco wants you to bring your uniquen...


In [4]:
#Drop duplicate companies by 'NAME' - drops size from 1941 to 1188
data.drop_duplicates(subset='NAME', keep='last', inplace=True)
data = data.drop(['index'], axis=1)

In [5]:
#reset index to begin at 0
data.reset_index(inplace=True)

In [6]:
data.head()

Unnamed: 0,index,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,10,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...
1,11,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",
2,12,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",
3,13,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",
4,14,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,


In [7]:
#drop extra indices
# data = data.drop(['index', 'level_0'], axis=1)
data = data.drop(['index'], axis=1)

## Cleaned indices and duplicates gone

In [8]:
print('Shape: ', data.shape)
data.head()

Shape:  (1188, 14)


Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,


In [9]:
# #Drop duplicate rows
# data.drop_duplicates(inplace=True)

# #Sanity Check
# print('Length of data set: ',len(data))
# print('Number of duplicates: ', data.duplicated().sum())

In [10]:
#Drop null values from RATING_DI
print('NaN values dropped from RATING_DI: ', data.RATING_DI.isna().sum())
data.dropna(subset=['RATING_DI'], inplace=True)

#Drop null values from DESCRIPTION
print('NaN values dropped from DESCRIPTION: ', data.DESCRIPTION.isna().sum())
data.dropna(subset=['DESCRIPTION'], inplace=True)


NaN values dropped from RATING_DI:  61
NaN values dropped from DESCRIPTION:  27


In [11]:
data.head()

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
0,McKinsey & Company,10000+ Employees,"New York, NY",Consulting,4.5,4.4,4.4,3.0,4.1,4.4,4.5,"See All 5,198 Reviews",We work with leaders across sectors to tackle ...,Mission: Our mission is two-fold: to help our ...
1,Cornell University,10000+ Employees,"Ithaca, NY",Colleges & Universities,4.5,4.3,4.4,4.0,4.1,4.1,4.2,"See All 1,765 Reviews","Cornell is a private, Ivy League university re...",
2,UC Santa Barbara,5001 to 10000 Employees,"Santa Barbara, CA",Colleges & Universities,4.5,4.2,4.3,4.4,4.1,3.7,4.0,"See All 1,049 Reviews","Dude, let's hit the beach! And then we'll hit ...",
3,UC Irvine,10000+ Employees,"Irvine, CA",Colleges & Universities,4.4,4.5,4.3,4.0,4.0,3.8,4.0,"See All 1,398 Reviews","Sun, the beach, Nobel Prize winners, and a Sou...",
4,Iowa State University,5001 to 10000 Employees,"Ames, IA",Colleges & Universities,4.4,4.2,4.2,4.2,4.0,3.8,3.9,"See All 1,203 Reviews",Attending Iowa State University of Science and...,


## Clean up NUM_REVIEWS
- pull out numbers only

In [14]:
def get_digits(s):
    #remove commas
    exclude = set(string.punctuation)
    s = ''.join(num for num in s if num not in exclude)

    #return digits only
    return s.split()[2]

In [15]:
#apply function to NUM_REVIEWS and change datatype to int
data['NUM_REVIEWS'] = data['NUM_REVIEWS'].apply(get_digits).astype(int)

**drop lower than 200 reviews**

In [52]:
#removing companies with too few reviews
df = data[(data['NUM_REVIEWS'] < 100)]
df

Unnamed: 0,NAME,SIZE,LOCATION_HQ,INDUSTRY,RATING_OVERALL,RATING_DI,RATING_CV,RATING_WL,RATING_SM,RATING_CB,RATING_CO,NUM_REVIEWS,DESCRIPTION,MISSION
470,Monarch Investment and Management Group\nHirin...,1001 to 5000 Employees,"Franktown, CO",Real Estate,4.6,4.7,4.5,4.5,4.3,4.5,4.4,162,Monarch Investment & Management Group has been...,Mission: Monarch’s mission is to be the best m...
475,Ace Info Solutions,1001 to 5000 Employees,"Reston, VA",IT Services,4.6,5.0,4.6,4.7,4.5,4.3,4.2,112,"AceInfo, a Dovel company, is a trusted partner...",
478,Advantest,1001 to 5000 Employees,"Tokyo, Japan",Electrical & Electronic Manufacturing,4.6,4.5,4.4,4.3,4.2,4.4,4.4,82,"With a long term legacy of success, innovation...",
479,BELAY\nHiring Surge,1001 to 5000 Employees,"Atlanta, GA",Staffing & Outsourcing,4.6,4.4,4.7,4.7,4.6,4.2,4.1,187,"As a leading US-based, virtual solutions compa...",
488,Salk Institute,1001 to 5000 Employees,"La Jolla, CA",Biotech & Pharmaceuticals,4.6,4.5,4.5,4.1,4.1,3.8,4.4,164,The Salk Institute conducts its biological res...,Mission: Every cure has a starting point. Like...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165,Cardinal Logistics Management,1001 to 5000 Employees,"Concord, NC",Logistics & Supply Chain,3.9,3.9,3.7,3.7,3.5,3.5,3.6,144,Cardinal Logistics Management sings a sweet so...,Mission: Cardinal’s mission is to remain the m...
1167,Jim 'N Nick's\nPart of Roark Capital,1001 to 5000 Employees,"Birmingham, AL",Casual Restaurants,3.9,4.1,4.1,3.4,3.7,3.5,3.5,134,"Hey, we love barbecue. We love preparing it, w...",
1168,Marquis Companies\nHiring Surge,1001 to 5000 Employees,"Portland, OR",Health Care Services & Hospitals,3.9,3.7,4.0,3.4,3.6,3.1,3.8,126,"Marquis Companies provides post acute rehab, l...",
1182,OshKosh B'gosh,1001 to 5000 Employees,"Atlanta, GA","Department, Clothing, & Shoe Stores",3.9,4.7,3.9,4.4,3.7,3.1,3.2,155,"Carter’s, Inc. owns the largest share of the $...",


## Checking out Rating stats 

In [None]:
# Checking out rating stats
data.describe()

In [None]:
data.RATING_DI.mean()

In [19]:
def get_digits(s):
    #remove commas
    exclude = set(string.punctuation)
    s = ''.join(num for num in s if num not in exclude)

    #returs digits
    res = [int(i) for i in s.split() if i.isdigit()]
    return res

In [40]:
reviews = 'See All 1,203 Reviews'

In [41]:
exclude = set(string.punctuation)
reviews = ''.join(num for num in reviews if num not in exclude)

In [42]:
reviews

'See All 1203 Reviews'

In [43]:
##THIS:
int(reviews.split()[2])

1203

In [44]:
##INSTEAD OF:
int(i) for i in reviews.split() if i.isdigit()

SyntaxError: invalid syntax (<ipython-input-44-e8c47c6b8413>, line 2)

In [33]:
res = [int(i) for i in reviews.split() if i.isdigit()]
res

[1203]

In [21]:
get_digits(reviews)

[1203]

In [40]:
res

[1, 203]

In [39]:
temp = re.findall(r'\d+', test_string) 
res = list(map(int, temp))