In [1]:
# Import libraries
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 400)
import requests
import matplotlib.pyplot as plt
import json
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [2]:
# Read csv from raw github link and create dataframe
url = 'https://raw.githubusercontent.com/dianeooty/datascience_salary/main/Resources/layoffs.csv'
layoffs_df = pd.read_csv(url)

In [3]:
# Read csv from raw github link and create dataframe
url = 'https://raw.githubusercontent.com/dianeooty/datascience_salary/main/Resources/Levels_Fyi_Salary_Data.csv'
salaries_df = pd.read_csv(url)

In [4]:
# View dataframe info
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                62642 non-null  object 
 1   company                  62637 non-null  object 
 2   level                    62523 non-null  object 
 3   title                    62642 non-null  object 
 4   totalyearlycompensation  62642 non-null  int64  
 5   location                 62642 non-null  object 
 6   yearsofexperience        62642 non-null  float64
 7   yearsatcompany           62642 non-null  float64
 8   tag                      61788 non-null  object 
 9   basesalary               62642 non-null  int64  
 10  stockgrantvalue          62642 non-null  float64
 11  bonus                    62642 non-null  float64
 12  gender                   43102 non-null  object 
 13  otherdetails             40137 non-null  object 
 14  cityid                

In [5]:
# Display dataframe
salaries_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education
0,6/7/2017 11:33,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,0,0,0,0,,
1,6/10/2017 17:11,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,
2,6/11/2017 14:53,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,0,0,0,0,,
3,6/17/2017 0:23,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,0,0,0,0,,
4,6/20/2017 10:58,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,


In [6]:
layoffs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2545 entries, 0 to 2544
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              2545 non-null   object 
 1   location             2545 non-null   object 
 2   industry             2543 non-null   object 
 3   total_laid_off       1746 non-null   float64
 4   percentage_laid_off  1694 non-null   float64
 5   date                 2543 non-null   object 
 6   stage                2539 non-null   object 
 7   country              2545 non-null   object 
 8   funds_raised         2297 non-null   float64
dtypes: float64(3), object(6)
memory usage: 179.1+ KB


In [7]:
# Display dataframe
layoffs_df.head()

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
0,N26,Berlin,Finance,71.0,0.04,2023-04-28,Series E,United States,1700.0
1,Providoor,Melbourne,Food,,1.0,2023-04-28,Unknown,Australia,
2,Dropbox,SF Bay Area,Other,500.0,0.16,2023-04-27,Post-IPO,United States,1700.0
3,Vroom,New York City,Transportation,120.0,0.11,2023-04-27,Post-IPO,United States,1300.0
4,Greenhouse,New York City,Recruiting,100.0,0.12,2023-04-27,Private Equity,United States,110.0


In [8]:
# Convert timestamp format
salaries_df['timestamp'] = pd.to_datetime(salaries_df['timestamp']).dt.strftime('%Y-%m-%d %H:%M:%S')
salaries_df['timestamp'] = salaries_df['timestamp'].astype('datetime64[ns]')

In [9]:
# Convert timestamp column to dates only and add a new column
salaries_df['date'] = pd.to_datetime(salaries_df['timestamp']).dt.date
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                62642 non-null  datetime64[ns]
 1   company                  62637 non-null  object        
 2   level                    62523 non-null  object        
 3   title                    62642 non-null  object        
 4   totalyearlycompensation  62642 non-null  int64         
 5   location                 62642 non-null  object        
 6   yearsofexperience        62642 non-null  float64       
 7   yearsatcompany           62642 non-null  float64       
 8   tag                      61788 non-null  object        
 9   basesalary               62642 non-null  int64         
 10  stockgrantvalue          62642 non-null  float64       
 11  bonus                    62642 non-null  float64       
 12  gender                   43102 n

In [10]:
# Convert date column to datetime format
salaries_df['date'] = salaries_df['date'].astype('datetime64[ns]')
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                62642 non-null  datetime64[ns]
 1   company                  62637 non-null  object        
 2   level                    62523 non-null  object        
 3   title                    62642 non-null  object        
 4   totalyearlycompensation  62642 non-null  int64         
 5   location                 62642 non-null  object        
 6   yearsofexperience        62642 non-null  float64       
 7   yearsatcompany           62642 non-null  float64       
 8   tag                      61788 non-null  object        
 9   basesalary               62642 non-null  int64         
 10  stockgrantvalue          62642 non-null  float64       
 11  bonus                    62642 non-null  float64       
 12  gender                   43102 n

In [11]:
# Display dataframe to confirm changes
salaries_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,date
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,0,0,0,,,2017-06-07
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,,,2017-06-10
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,0,0,0,,,2017-06-11
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,0,0,0,,,2017-06-17
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,,,2017-06-20


In [12]:
# Install opencage for API calls
pip install opencage

SyntaxError: invalid syntax (2842327624.py, line 2)

In [13]:
# Import additional libraries for opencage
from opencage.geocoder import OpenCageGeocode
from pprint import pprint

In [14]:
# Import key to access API
from key import oc_key

In [15]:
# Assign variable for API calls with key
geocoder = OpenCageGeocode(oc_key)

In [16]:
# Create a list of locations for API calls
location = []
[location.append(x) for x in salaries_df.location]
print(location)

['Redwood City, CA', 'San Francisco, CA', 'Seattle, WA', 'Sunnyvale, CA', 'Mountain View, CA', 'Seattle, WA', 'Redmond, WA', 'Seattle, WA', 'Redmond, WA', 'Seattle, WA', 'San Francisco, CA', 'Bellevue, WA', 'Redmond, WA', 'Seattle, WA', 'Seattle, WA', 'Seattle, WA', 'Menlo Park, CA', 'San Francisco, CA', 'London, EN, United Kingdom', 'Redmond, WA', 'Sunnyvale, CA', 'Bellevue, WA', 'Seattle, WA', 'Seattle, WA', 'Seattle, WA', 'Redmond, WA', 'Cupertino, CA', 'Seattle, WA', 'Mountain View, CA', 'New York, NY', 'Seattle, WA', 'Los Gatos, CA', 'Cupertino, CA', 'Dublin, DN, Ireland', 'Seattle, WA', 'Redmond, WA', 'Cupertino, CA', 'San Francisco, CA', 'Redmond, WA', 'Redmond, WA', 'San Francisco, CA', 'San Francisco, CA', 'Seattle, WA', 'Seattle, WA', 'Sunnyvale, CA', 'Baltimore, MD', 'San Francisco, CA', 'San Francisco, CA', 'Menlo Park, CA', 'Seattle, WA', 'Seattle, WA', 'Cupertino, CA', 'Seattle, WA', 'San Francisco, CA', 'Seattle, WA', 'Seattle, WA', 'Seattle, WA', 'Seattle, WA', 'Seattle

In [17]:
# Use set to get distinct locations and cast to list for ordering
distinct = set(location)
distinct_location = list(distinct)

In [18]:
# Create empty lists to hold latitude and longitude values from each city
lng = []
lat = []

# Iterate through API results to extract latitude and longitude values and append to empty lists
for x in distinct_location:
    results = geocoder.geocode(x)
    lng.append(results[0]['geometry']['lng'])
    lat.append(results[0]['geometry']['lat'])

In [19]:
# Create dataframe for cities and coordinates
coord_df = pd.DataFrame(
            {
                "city": distinct_location,
                "latitude": lat,
                "longitude": lng
            }
)

# Display Dataframe
coord_df.head()

Unnamed: 0,city,latitude,longitude
0,"Buffalo, NY",42.886717,-78.878392
1,"Hilbert, WI",44.140268,-88.163991
2,"Moscow, MC, Russia",55.64011,37.53286
3,"Jerusalem, JM, Israel",31.778824,35.225763
4,"Birmingham, EN, United Kingdom",52.523212,-1.843181


In [20]:
# View dataframe dimensions
coord_df.shape

(1050, 3)

In [21]:
# Check data types
coord_df.dtypes

city          object
latitude     float64
longitude    float64
dtype: object

In [22]:
# Save coord_df to csv
coord_df.to_csv('Resources/coordinates.csv', index=False)

In [23]:
# Display dataframe
salaries_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,date
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,0,0,0,,,2017-06-07
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,,,2017-06-10
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,0,0,0,,,2017-06-11
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,0,0,0,,,2017-06-17
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,,,2017-06-20


In [24]:
# View dataframe dimensions
salaries_df.shape

(62642, 30)

In [25]:
# Check data types
salaries_df.dtypes

timestamp                  datetime64[ns]
company                            object
level                              object
title                              object
totalyearlycompensation             int64
location                           object
yearsofexperience                 float64
yearsatcompany                    float64
tag                                object
basesalary                          int64
stockgrantvalue                   float64
bonus                             float64
gender                             object
otherdetails                       object
cityid                              int64
dmaid                             float64
rowNumber                           int64
Masters_Degree                      int64
Bachelors_Degree                    int64
Doctorate_Degree                    int64
Highschool                          int64
Some_College                        int64
Race_Asian                          int64
Race_White                        

In [26]:
# IF NEEDED: Import coordinates.csv to merge and save as coord_df
# Read csv from raw github link and create dataframe
url = 'https://raw.githubusercontent.com/dianeooty/datascience_salary/main/Resources/coordinates.csv'
coord_df = pd.read_csv(url)

In [27]:
# # Merge the dataframes using left join to add coordinates to salaries_df
new_df = pd.merge(salaries_df,coord_df,how='left',left_on='location',right_on='city')
new_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,date,city,latitude,longitude
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,,,2017-06-07,"Redwood City, CA",37.486324,-122.232523
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,,,2017-06-10,"San Francisco, CA",37.779026,-122.419906
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,,,2017-06-11,"Seattle, WA",47.603832,-122.330062
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,,,2017-06-17,"Sunnyvale, CA",37.36883,-122.036349
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,,,2017-06-20,"Mountain View, CA",37.389389,-122.08321


In [28]:
# View column names
new_df.columns

Index(['timestamp', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'yearsofexperience', 'yearsatcompany', 'tag', 'basesalary',
       'stockgrantvalue', 'bonus', 'gender', 'otherdetails', 'cityid', 'dmaid',
       'rowNumber', 'Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree',
       'Highschool', 'Some_College', 'Race_Asian', 'Race_White',
       'Race_Two_Or_More', 'Race_Black', 'Race_Hispanic', 'Race', 'Education',
       'date', 'city', 'latitude', 'longitude'],
      dtype='object')

In [29]:
# Reorder columns
new_df = new_df[['timestamp','date', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'latitude', 'longitude','yearsofexperience', 'yearsatcompany', 'tag', 'basesalary',
       'stockgrantvalue', 'bonus', 'gender', 'otherdetails', 'cityid', 'dmaid',
       'rowNumber', 'Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree',
       'Highschool', 'Some_College', 'Race_Asian', 'Race_White',
       'Race_Two_Or_More', 'Race_Black', 'Race_Hispanic', 'Race', 'Education']]

In [30]:
# Display dataframe
new_df.head()

Unnamed: 0,timestamp,date,company,level,title,totalyearlycompensation,location,latitude,longitude,yearsofexperience,...,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education
0,2017-06-07 11:33:00,2017-06-07,Oracle,L3,Product Manager,127000,"Redwood City, CA",37.486324,-122.232523,1.5,...,0,0,0,0,0,0,0,0,,
1,2017-06-10 17:11:00,2017-06-10,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",37.779026,-122.419906,5.0,...,0,0,0,0,0,0,0,0,,
2,2017-06-11 14:53:00,2017-06-11,Amazon,L7,Product Manager,310000,"Seattle, WA",47.603832,-122.330062,8.0,...,0,0,0,0,0,0,0,0,,
3,2017-06-17 00:23:00,2017-06-17,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",37.36883,-122.036349,7.0,...,0,0,0,0,0,0,0,0,,
4,2017-06-20 10:58:00,2017-06-20,Microsoft,60,Software Engineer,157000,"Mountain View, CA",37.389389,-122.08321,5.0,...,0,0,0,0,0,0,0,0,,


In [31]:
# View dataframe's info
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62642 entries, 0 to 62641
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                62642 non-null  datetime64[ns]
 1   date                     62642 non-null  datetime64[ns]
 2   company                  62637 non-null  object        
 3   level                    62523 non-null  object        
 4   title                    62642 non-null  object        
 5   totalyearlycompensation  62642 non-null  int64         
 6   location                 62642 non-null  object        
 7   latitude                 62642 non-null  float64       
 8   longitude                62642 non-null  float64       
 9   yearsofexperience        62642 non-null  float64       
 10  yearsatcompany           62642 non-null  float64       
 11  tag                      61788 non-null  object        
 12  basesalary               62642 n

In [32]:
# Replace gender column null values with unknown
new_df['gender'].fillna("Unknown", inplace = True)

In [33]:
# Replace gender column incorrect values with unknown
new_df['gender'] = new_df['gender'].str.replace("Title: Senior Software Engineer","Unknown")

In [34]:
# Check values updated
new_df['gender'].value_counts()

Male       35702
Unknown    19541
Female      6999
Other        400
Name: gender, dtype: int64

In [35]:
# Replace null values with unknown
new_df['Race'].fillna("Unknown", inplace = True)

In [36]:
# Replace null values with unknown
new_df['Education'].fillna("Unknown", inplace = True)

In [37]:
# Drop unwanted columns
new_df = new_df.drop(columns=['tag', 'otherdetails', 'cityid', 'dmaid',
       'rowNumber', 'Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree',
       'Highschool', 'Some_College', 'Race_Asian', 'Race_White',
       'Race_Two_Or_More', 'Race_Black', 'Race_Hispanic'])

In [38]:
# Convert datatypes
new_df['basesalary'] = new_df['basesalary'].astype('int')
new_df['stockgrantvalue'] = new_df['stockgrantvalue'].astype('int')
new_df['bonus'] = new_df['bonus'].astype('int')

In [39]:
# Check dataframe's info
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62642 entries, 0 to 62641
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                62642 non-null  datetime64[ns]
 1   date                     62642 non-null  datetime64[ns]
 2   company                  62637 non-null  object        
 3   level                    62523 non-null  object        
 4   title                    62642 non-null  object        
 5   totalyearlycompensation  62642 non-null  int64         
 6   location                 62642 non-null  object        
 7   latitude                 62642 non-null  float64       
 8   longitude                62642 non-null  float64       
 9   yearsofexperience        62642 non-null  float64       
 10  yearsatcompany           62642 non-null  float64       
 11  basesalary               62642 non-null  int32         
 12  stockgrantvalue          62642 n

In [40]:
# Update company names to match
new_df = new_df.replace(to_replace ='^[cC]oupa$', value = 'Coupa Software', regex = True)
new_df = new_df.replace(to_replace ='^[cC]oupa [sS]oftware$', value = 'Coupa Software', regex = True)
new_df = new_df.replace(to_replace ='^[gG]oogle$', value = 'Google', regex = True)
new_df = new_df.replace(to_replace ='^[gG]oogle LLC$', value = 'Google', regex = True)
new_df = new_df.replace(to_replace ='^[aA]didas$', value = 'Adidas', regex = True)
new_df = new_df.replace(to_replace ='^[aA]dobe$', value = 'Adobe', regex = True)
new_df = new_df.replace(to_replace ='^[aA]irbus$', value = 'Airbus', regex = True)
new_df = new_df.replace(to_replace ='^[aA]kamai$', value = 'Akamai Technologies', regex = True)
new_df = new_df.replace(to_replace ='^[aA]libaba$', value = 'Alibaba', regex = True)
new_df = new_df.replace(to_replace ='^[aA]ltran$', value = 'Altran', regex = True)
new_df = new_df.replace(to_replace ='^[aA]mazon$', value = 'Amazon', regex = True)
new_df = new_df.replace(to_replace ='^[aA]mzon$', value = 'Amazon', regex = True)
new_df = new_df.replace(to_replace ='^[aA]mazon [wW]eb [sS]ervices$', value = 'AWS', regex = True)
new_df = new_df.replace(to_replace ='^Aws$', value = 'AWS', regex = True)
new_df = new_df.replace(to_replace ='^[aA]md$', value = 'AMD', regex = True)
new_df = new_df.replace(to_replace ='^[aA]merican [aA]irlines$', value = 'American Airlines', regex = True)
new_df = new_df.replace(to_replace ='^[aA]merican [eE]xpress$', value = 'American Express', regex = True)
new_df = new_df.replace(to_replace ='^[aA]merican [fF]amily [iI]nsurance$', value = 'American Family Insurance', regex = True)
new_df = new_df.replace(to_replace ='^[aA]mgen$', value = 'Amgen', regex = True)
new_df = new_df.replace(to_replace ='^[aA]mplitude$', value = 'Amplitude', regex = True)
new_df = new_df.replace(to_replace ='^[aA]nalog [dD]evices$', value = 'Analog Devices', regex = True)
new_df = new_df.replace(to_replace ='^ANSYS$', value = 'Ansys', regex = True)
new_df = new_df.replace(to_replace ='^[aA]pp[dD]ynamics%', value = 'AppDynamics', regex = True)
new_df = new_df.replace(to_replace ='^[aA]pple$', value = 'Apple', regex = True)
new_df = new_df.replace(to_replace ='^[aA]pple [iI]nc$', value = 'Apple', regex = True)
new_df = new_df.replace(to_replace ='^[aA]pplied [mM]aterials$', value = 'Applied Materials', regex = True)
new_df = new_df.replace(to_replace ='^[aA]pptio$', value = 'Apptio', regex = True)
new_df = new_df.replace(to_replace ='^argo ai$', value = 'Argo AI', regex = True)
new_df = new_df.replace(to_replace ='^[aA]rista$', value = 'Arista Networks', regex = True)
new_df = new_df.replace(to_replace ='^[aA][rR][mM]$', value = 'Arm', regex = True)
new_df = new_df.replace(to_replace ='^ARUBA$', value = 'Aruba', regex = True)
new_df = new_df.replace(to_replace ='^Aruba networks$', value = 'Aruba Networks', regex = True)
new_df = new_df.replace(to_replace ='^Asus$', value = 'ASUS', regex = True)
new_df = new_df.replace(to_replace ='^[aA]thena[hH]ealth$', value = 'AthenaHealth', regex = True)
new_df = new_df.replace(to_replace ='^atlassian$', value = 'Atlassian', regex = True)
new_df = new_df.replace(to_replace ='^auth0$', value = 'Auth0', regex = True)
new_df = new_df.replace(to_replace ='^axon$', value = 'Axon', regex = True)
new_df = new_df.replace(to_replace ='^Bain$', value = 'Bain & Company', regex = True)
new_df = new_df.replace(to_replace ='^bandwidth$', value = 'Bandwidth', regex = True)
new_df = new_df.replace(to_replace ='^Bank of America Merrill Lynch$', value = 'Bank of America', regex = True)
new_df = new_df.replace(to_replace ='^barracuda networks$', value = 'Barracuda Networks', regex = True)
new_df = new_df.replace(to_replace ='^BazaarVoice$', value = 'Bazaarvoice', regex = True)
new_df = new_df.replace(to_replace ='^[bB][cC][gG]$', value = 'BCG Digital Ventures', regex = True)
new_df = new_df.replace(to_replace ='^Better.com$', value = 'Better Mortgage', regex = True)
new_df = new_df.replace(to_replace ='^beyond limits$', value = 'Beyond Limits', regex = True)
new_df = new_df.replace(to_replace ='^Bigcommerce$', value = 'BigCommerce', regex = True)
new_df = new_df.replace(to_replace ='^Blackberry$', value = 'BlackBerry', regex = True)
new_df = new_df.replace(to_replace ='^Blackline$', value = 'BlackLine', regex = True)
new_df = new_df.replace(to_replace ='^Blackrock$', value = 'BlackRock', regex = True)
new_df = new_df.replace(to_replace ='^blend$', value = 'Blend', regex = True)
new_df = new_df.replace(to_replace ='^[bB][lL][oO][oO][mM][bB][eE][rR][gG] [lL][pP]$', value = 'Bloomberg', regex = True)
new_df = new_df.replace(to_replace ='^BOeing$', value = 'Boeing', regex = True)
new_df = new_df.replace(to_replace ='^Bolt(EU)$', value = 'Bolt', regex = True)
new_df = new_df.replace(to_replace ='^[bB]ooking$', value = 'Booking.com', regex = True)
new_df = new_df.replace(to_replace ='^BOOKING.COM$', value = 'Booking.com', regex = True)
new_df = new_df.replace(to_replace ='^[bB]osch$', value = 'Bosch Global', regex = True)
new_df = new_df.replace(to_replace ='^box$', value = 'Box', regex = True)
new_df = new_df.replace(to_replace ='^brex$', value = 'Brex', regex = True)
new_df = new_df.replace(to_replace ='^broadcom$', value = 'Broadcom', regex = True)
new_df = new_df.replace(to_replace ='^c3.ai$', value = 'C3.ai', regex = True)
new_df = new_df.replace(to_replace ='^ca technologies$', value = 'CA Technologies', regex = True)
new_df = new_df.replace(to_replace ='^CACI$', value = 'CACI International', regex = True)
new_df = new_df.replace(to_replace ='^[cC]adence$', value = 'Cadence Design Systems', regex = True)
new_df = new_df.replace(to_replace ='^[cC]adence [dD]esign [sS]ystems$', value = 'Cadence Design Systems', regex = True)
new_df = new_df.replace(to_replace ='^Cambia health solutions$', value = 'Cambia Health Solutions', regex = True)
new_df = new_df.replace(to_replace ='^capgemini$', value = 'Capgemini', regex = True)
new_df = new_df.replace(to_replace ='^[cC]apital [oO]ne$', value = 'Capital One', regex = True)
new_df = new_df.replace(to_replace ='^capsule$', value = 'Capsule', regex = True)
new_df = new_df.replace(to_replace ='^Carmax$', value = 'CarMax', regex = True)
new_df = new_df.replace(to_replace ='^CBS interactive$', value = 'CBS Interactive', regex = True)
new_df = new_df.replace(to_replace ='^cerner$', value = 'Cerner', regex = True)
new_df = new_df.replace(to_replace ='^[cC][gG][iI]$', value = 'CGI Group', regex = True)
new_df = new_df.replace(to_replace ='^[cC][gG][iI] [gG]roup$', value = 'CGI Group', regex = True)
new_df = new_df.replace(to_replace ='^[cC]harles [sS]chwab$', value = 'Charles Schwab', regex = True)
new_df = new_df.replace(to_replace ='^chase$', value = 'Chase', regex = True)
new_df = new_df.replace(to_replace ='^chegg$', value = 'Chegg', regex = True)
new_df = new_df.replace(to_replace ='^chevron$', value = 'Chevron', regex = True)
new_df = new_df.replace(to_replace ='^chewy$', value = 'Chewy', regex = True)
new_df = new_df.replace(to_replace ='^cibc$', value = 'CIBC', regex = True)
new_df = new_df.replace(to_replace ='^[cC]isco$', value = 'Cisco', regex = True)
new_df = new_df.replace(to_replace ='^[cC]isco [sS]ystems$', value = 'Cisco', regex = True)
new_df = new_df.replace(to_replace ='^citadel$', value = 'Citadel', regex = True)
new_df = new_df.replace(to_replace ='^citi$', value = 'Citi', regex = True)
new_df = new_df.replace(to_replace ='^[cC]iti[bB]ank$', value = 'Citi', regex = True)
new_df = new_df.replace(to_replace ='^cloudera$', value = 'Cloudera', regex = True)
new_df = new_df.replace(to_replace ='^cloudflare$', value = 'Cloudflare', regex = True)
new_df = new_df.replace(to_replace ='^[cC]loud[kK]itchens$', value = 'CloudKitchens', regex = True)
new_df = new_df.replace(to_replace ='^[cC]ognizant [tT]echnology [sS]olutions$', value = 'Cognizant', regex = True)
new_df = new_df.replace(to_replace ='^Collins aerospace$', value = 'Collins Aerospace', regex = True)
new_df = new_df.replace(to_replace ='^Commscope$', value = 'CommScope', regex = True)
new_df = new_df.replace(to_replace ='^commvault$', value = 'Commvault', regex = True)
new_df = new_df.replace(to_replace ='^Costar Group$', value = 'CoStar Group', regex = True)
new_df = new_df.replace(to_replace ='^Costco Wholesale$', value = 'Costco', regex = True)
new_df = new_df.replace(to_replace ='^couchbase$', value = 'Couchbase', regex = True)
new_df = new_df.replace(to_replace ='^coupang$', value = 'Coupang', regex = True)
new_df = new_df.replace(to_replace ='^[cC]redit [kK]arma$', value = 'Credit Karma', regex = True)
new_df = new_df.replace(to_replace ='^[cC]redit [sS]uisse$', value = 'Credit Suisse', regex = True)
new_df = new_df.replace(to_replace ='^[cC]sg]$', value = 'CSG', regex = True)
new_df = new_df.replace(to_replace ='^[cC]rowd[sS]trike]$', value = 'CrowdStrike', regex = True)
new_df = new_df.replace(to_replace ='^[cC][vV][sS] [hH]ealth]$', value = 'CVS Health', regex = True)
new_df = new_df.replace(to_replace ='^[dD]ell$', value = 'Dell Technologies', regex = True)
new_df = new_df.replace(to_replace ='^[dD]eloitte [aA]dvisory$', value = 'Deloitte', regex = True)
new_df = new_df.replace(to_replace ='^[dD]eloitte [cC]onsulting$', value = 'Deloitte', regex = True)
new_df = new_df.replace(to_replace ='^[dD]eloitte [cC]onsulting LLP$', value = 'Deloitte', regex = True)
new_df = new_df.replace(to_replace ='^[dD]excom$', value = 'Dexcom', regex = True)
new_df = new_df.replace(to_replace ='^[dD]iscover$', value = 'Discover', regex = True)
new_df = new_df.replace(to_replace ='^[dD]iscovery$', value = 'Discovery', regex = True)
new_df = new_df.replace(to_replace ='^[dD][iI][sS][hH]$', value = 'DISH Network', regex = True)
new_df = new_df.replace(to_replace ='^Dish Network$', value = 'DISH Network', regex = True)
new_df = new_df.replace(to_replace ='^[dD]ocu[sS]ign$', value = 'DocuSign', regex = True)
new_df = new_df.replace(to_replace ='^[dD]octo[lL]ib$', value = 'DoctoLib', regex = True)
new_df = new_df.replace(to_replace ='^[dD]oor[dD]ash$', value = 'DoorDash', regex = True)
new_df = new_df.replace(to_replace ='^[eE][bB][aA][yY]$', value = 'eBay', regex = True)
new_df = new_df.replace(to_replace ='^[eE]*[tT]rade$', value = 'E*TRADE', regex = True)
new_df = new_df.replace(to_replace ='^[eE][pP][aA][mM]$', value = 'EPAM Systems', regex = True)
new_df = new_df.replace(to_replace ='^[eE][pP][aA][mM] Systems$', value = 'EPAM Systems', regex = True)
new_df = new_df.replace(to_replace ='^epsilon$', value = 'Epsilon', regex = True)
new_df = new_df.replace(to_replace ='^[eE]rnst and [yY]oung$', value = 'Ernst & Young', regex = True)
new_df = new_df.replace(to_replace ='^EverQoute$', value = 'Everquote', regex = True)
new_df = new_df.replace(to_replace ='^Expedia [gG]roup$', value = 'Expedia', regex = True)
new_df = new_df.replace(to_replace ='^Exxonmobil$', value = 'ExxonMobil', regex = True)
new_df = new_df.replace(to_replace ='^EY$', value = 'EY-Parthenon', regex = True)
new_df = new_df.replace(to_replace ='^facebook$', value = 'Facebook', regex = True)
new_df = new_df.replace(to_replace ='^Fanduel$', value = 'FanDuel', regex = True)
new_df = new_df.replace(to_replace ='^Fedex$', value = 'FedEx', regex = True)
new_df = new_df.replace(to_replace ='^fastly$', value = 'Fastly', regex = True)
new_df = new_df.replace(to_replace ='^[fF]ireeye$', value = 'FireEye', regex = True)
new_df = new_df.replace(to_replace ='^First Repbulic bank$', value = 'First Republic Bank', regex = True)
new_df = new_df.replace(to_replace ='^[fF]is$', value = 'FIS', regex = True)
new_df = new_df.replace(to_replace ='^fiserv$', value = 'Fiserv', regex = True)
new_df = new_df.replace(to_replace ='^Ford$', value = 'Ford Motor Company', regex = True)
new_df = new_df.replace(to_replace ='^Ford Motor$', value = 'Ford Motor Company', regex = True)
new_df = new_df.replace(to_replace ='^foursquare$', value = 'Foursquare', regex = True)
new_df = new_df.replace(to_replace ='^Freddie mac$', value = 'Freddie Mac', regex = True)
new_df = new_df.replace(to_replace ='^Fullstory$', value = 'FullStory', regex = True)
new_df = new_df.replace(to_replace ='^Ge$', value = 'GE', regex = True)
new_df = new_df.replace(to_replace ='^G[eE] [dD]igital$', value = 'GE Digital', regex = True)
new_df = new_df.replace(to_replace ='^G[eE] [aA]viation$', value = 'GE Aviation', regex = True)
new_df = new_df.replace(to_replace ='^[gG][eE] HEALTHCARE$', value = 'GE Healthcare', regex = True)
new_df = new_df.replace(to_replace ='^GEICO$', value = 'Geico', regex = True)
new_df = new_df.replace(to_replace ='^GENERAL ATOMICS$', value = 'General Atomics', regex = True)
new_df = new_df.replace(to_replace ='^General Electric$', value = 'GE', regex = True)
new_df = new_df.replace(to_replace ='^General mills$', value = 'General Mills', regex = True)
new_df = new_df.replace(to_replace ='^Gitlab$', value = 'GitLab', regex = True)
new_df = new_df.replace(to_replace ='^Global payments$', value = 'Global Payments', regex = True)
new_df = new_df.replace(to_replace ='^Globalfoundries$', value = 'GlobalFo0undries', regex = True)
new_df = new_df.replace(to_replace ='^[gG][oO][jJ][eE][kK]$', value = 'Gojek', regex = True)
new_df = new_df.replace(to_replace ='^[gG][oO][jJ][eE][kK] [tT]ech$', value = 'Gojek', regex = True)
new_df = new_df.replace(to_replace ='^grab$', value = 'Grab', regex = True)
new_df = new_df.replace(to_replace ='^[gG]rub[hH]ub$', value = 'GrubHub', regex = True)
new_df = new_df.replace(to_replace ='^Guardant health$', value = 'Guardant Health', regex = True)
new_df = new_df.replace(to_replace ='^Hellofresh$', value = 'HelloFresh', regex = True)
new_df = new_df.replace(to_replace ='^[hH][eE][rR][eE]$', value = 'Here Technologies', regex = True)
new_df = new_df.replace(to_replace ='^[hH][eE][rR][eE] [tT]echnologies$', value = 'Here Technologies', regex = True)
new_df = new_df.replace(to_replace ='^[hH][pP]$', value = 'HP Inc', regex = True)
new_df = new_df.replace(to_replace ='^[hH][pP] [iI][nN][cC]$', value = 'HP Inc', regex = True)
new_df = new_df.replace(to_replace ='^[hH][pP][eE]$', value = 'Hewlett Packard Enterprise', regex = True)
new_df = new_df.replace(to_replace ='^homeadvisor$', value = 'Homeadvisor', regex = True)
new_df = new_df.replace(to_replace ='^houzz$', value = 'Houzz', regex = True)
new_df = new_df.replace(to_replace ='^[hH]sbc$', value = 'HSBC', regex = True)
new_df = new_df.replace(to_replace ='^[hH]ubspot$', value = 'HubSpot', regex = True)
new_df = new_df.replace(to_replace ='^ibm$', value = 'IBM', regex = True)
new_df = new_df.replace(to_replace ='^icims$', value = 'iCIMS', regex = True)
new_df = new_df.replace(to_replace ='^IDEMIA$', value = 'Idemia', regex = True)
new_df = new_df.replace(to_replace ='^iheartradio$', value = 'iHeartRadio', regex = True)
new_df = new_df.replace(to_replace ='^illumina$', value = 'Illumina', regex = True)
new_df = new_df.replace(to_replace ='^infinera$', value = 'Infinera', regex = True)
new_df = new_df.replace(to_replace ='^Infosys [lL][tT][dD]$', value = 'Infosys', regex = True)
new_df = new_df.replace(to_replace ='^[iI]ntel [cC]orporation$', value = 'Intel', regex = True)
new_df = new_df.replace(to_replace ='^invest[cC]loud$', value = 'InvestCloud', regex = True)
new_df = new_df.replace(to_replace ='^intuitive [sS]urgical$', value = 'Intuitive Surgical', regex = True)
new_df = new_df.replace(to_replace ='^IRobot$', value = 'iRobot', regex = True)
new_df = new_df.replace(to_replace ='^ixl learning$', value = 'IXL Learning', regex = True)
new_df = new_df.replace(to_replace ='^Jane Street$', value = 'Jane Street Capital', regex = True)
new_df = new_df.replace(to_replace ='^Johnson and Johnson$', value = 'Johnson & Johnson', regex = True)
new_df = new_df.replace(to_replace ='^[jJ][pP] Morgan$', value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ='^[jJ][pP][mM][oO][rR][gG][aA][nN]$', value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ='^JP Morgan Chase$', value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ='^[jJ]uniper$', value = 'Juniper Networks', regex = True)
new_df = new_df.replace(to_replace ='^[jJ]uniper [nN]etworks$', value = 'Juniper Networks', regex = True)
new_df = new_df.replace(to_replace ='^Juul Labs$', value = 'JUUL Labs', regex = True)
new_df = new_df.replace(to_replace ='^Kayak$', value = 'KAYAK', regex = True)
new_df = new_df.replace(to_replace ='^[kK]eep[tT]ruckin$', value = 'KeepTruckin', regex = True)
new_df = new_df.replace(to_replace ='^keysight$', value = 'Keysight', regex = True)
new_df = new_df.replace(to_replace ='^kpmg$', value = 'KPMG', regex = True)
new_df = new_df.replace(to_replace ='^L3Harris$', value = 'L3Harris Technologies', regex = True)
new_df = new_df.replace(to_replace ='^[lL][eE][iI][dD][oO][sS]$', value = 'Leidos', regex = True)
new_df = new_df.replace(to_replace ='^Lendingclub$', value = 'LendingClub', regex = True)
new_df = new_df.replace(to_replace ='^lenovo$', value = 'Lenovo', regex = True)
new_df = new_df.replace(to_replace ='^liberty mutual$', value = 'Liberty Mutual', regex = True)
new_df = new_df.replace(to_replace ='^[lL]iberty [mM]utual [iI]nsurance$', value = 'Liberty Mutual', regex = True)
new_df = new_df.replace(to_replace ='^[lL]inked[iI]n$', value = 'LinkedIn', regex = True)
new_df = new_df.replace(to_replace ='^[lL]ive[pP]erson$', value = 'LivePerson', regex = True)
new_df = new_df.replace(to_replace ='^[lL]ive[rR]amp$', value = 'LiveRamp', regex = True)
new_df = new_df.replace(to_replace ='^[lL]oan[dD]epot$', value = 'LoanDepot', regex = True)
new_df = new_df.replace(to_replace ='^[lL]og[mM]e[iI]n$', value = 'LogMeIn', regex = True)
new_df = new_df.replace(to_replace ="^[lL]owe's$", value = 'Lowes', regex = True)
new_df = new_df.replace(to_replace ="^[mM]acy's,$", value = "Macy's", regex = True)
new_df = new_df.replace(to_replace ="^[mM]agic [lL]eap$", value = 'Magic Leap', regex = True)
new_df = new_df.replace(to_replace ="^[mM]ail.ru Group$", value = 'Mail.Ru Group', regex = True)
new_df = new_df.replace(to_replace ="^[mM]ail[cC]himp$", value = 'MailChimp', regex = True)
new_df = new_df.replace(to_replace ="^[mM]ake[mM]y[tT]rip$", value = 'MailChimp', regex = True)
new_df = new_df.replace(to_replace ="^mapbox$", value = 'Mapbox', regex = True)
new_df = new_df.replace(to_replace ="^[mM]ass[mM]utual$", value = 'MassMutual', regex = True)
new_df = new_df.replace(to_replace ="^[mM]aster[cC]ard$", value = 'Mastercard', regex = True)
new_df = new_df.replace(to_replace ="^[mM]athworks$", value = 'MathWorks', regex = True)
new_df = new_df.replace(to_replace ="^MAXAR Technologies$", value = 'Maxar Technologies', regex = True)
new_df = new_df.replace(to_replace ="^[mM]c[aA]fee$", value = 'McAfee', regex = True)
new_df = new_df.replace(to_replace ="^[mM]cKinsey$", value = 'McKinsey & Company', regex = True)
new_df = new_df.replace(to_replace ="^[mM]c[kK]insey & Company$", value = 'McKinsey & Company', regex = True)
new_df = new_df.replace(to_replace ="^[mM]edia.net$", value = 'Media.net', regex = True)
new_df = new_df.replace(to_replace ="^[mM]edia[tT]ek$", value = 'MediaTek', regex = True)
new_df = new_df.replace(to_replace ="^[mM]essage[bB]ird$", value = 'MessageBird', regex = True)
new_df = new_df.replace(to_replace ="^[mM]et[lL]ife$", value = 'MetLife', regex = True)
new_df = new_df.replace(to_replace ="^[mM]iy$", value = 'Mi', regex = True)
new_df = new_df.replace(to_replace ="^[mM]icrochip$", value = 'Microchip Technology', regex = True)
new_df = new_df.replace(to_replace ="^[mM]icron$", value = 'Micron Technology', regex = True)
new_df = new_df.replace(to_replace ="^[mM]icron [tT]echnology$", value = 'Micron Technology', regex = True)
new_df = new_df.replace(to_replace ="^MICROSOFT$", value = 'Microsoft', regex = True)
new_df = new_df.replace(to_replace ="^microsoft$", value = 'Microsoft', regex = True)
new_df = new_df.replace(to_replace ="^[mM]icrosoft [cC]orporation$", value = 'Microsoft', regex = True)
new_df = new_df.replace(to_replace ="^microstrategy$", value = 'Microstrategy', regex = True)
new_df = new_df.replace(to_replace ="^Mitre$", value = 'MITRE', regex = True)
new_df = new_df.replace(to_replace ="^MixPanel$", value = 'Mixpanel', regex = True)
new_df = new_df.replace(to_replace ="^Mobileiron$", value = 'MobileIron', regex = True)
new_df = new_df.replace(to_replace ="^Moody's$", value = "Moody's Analytics", regex = True)
new_df = new_df.replace(to_replace ="^Motorola [sS]olutions$", value = 'Motorola', regex = True)
new_df = new_df.replace(to_replace ="^Mutual Of Omaha$", value = 'Mutual of Omaha', regex = True)
new_df = new_df.replace(to_replace ="^mural$", value = 'MURAL', regex = True)
new_df = new_df.replace(to_replace ="^Msft$", value = 'MSFT', regex = True)
new_df = new_df.replace(to_replace ="^mphasis$", value = 'Mphasis', regex = True)
new_df = new_df.replace(to_replace ="^Nasdaq$", value = 'NASDAQ', regex = True)
new_df = new_df.replace(to_replace ="^Nerdwallet$", value = 'NerdWallet', regex = True)
new_df = new_df.replace(to_replace ="^Netapp$", value = 'NetApp', regex = True)
new_df = new_df.replace(to_replace ="^netskope$", value = 'Netskope', regex = True)
new_df = new_df.replace(to_replace ="^Nextera Analytics$", value = 'NextEra Analytics', regex = True)
new_df = new_df.replace(to_replace ="^Nice$", value = 'NICE', regex = True)
new_df = new_df.replace(to_replace ="^nokia$", value = 'Nokia', regex = True)
new_df = new_df.replace(to_replace ="^NORTHROP GRUMMAN$", value = 'Northrop Grumman', regex = True)
new_df = new_df.replace(to_replace ="^NTT Data$", value = 'NTT DATA', regex = True)
new_df = new_df.replace(to_replace ="^Nuance$", value = 'Nuance Communications', regex = True)
new_df = new_df.replace(to_replace ="^nuance communications$", value = 'Nuance Communications', regex = True)
new_df = new_df.replace(to_replace ="^nutanix$", value = 'Nutanix', regex = True)
new_df = new_df.replace(to_replace ="^[nN]vidia$", value = 'NVIDIA', regex = True)
new_df = new_df.replace(to_replace ="^[nN][xX][pP]$", value = 'NXP Semiconductors', regex = True)
new_df = new_df.replace(to_replace ="^[nN][xX][pP] [sS]Semiconductors$", value = 'NXP Semiconductors', regex = True)
new_df = new_df.replace(to_replace ="^ocado technology$", value = 'Ocado Technology', regex = True)
new_df = new_df.replace(to_replace ="^[oO][nN] [sS]emiconductor$", value = 'ON Semiconductor', regex = True)
new_df = new_df.replace(to_replace ="^[oO]pen [tT]ext$", value = 'OpenText', regex = True)
new_df = new_df.replace(to_replace ="^oppo$", value = 'OPPO', regex = True)
new_df = new_df.replace(to_replace ="^[oO][rR][aA][cC][lL][eE]$", value = 'Oracle', regex = True)
new_df = new_df.replace(to_replace ="^other$", value = 'Other', regex = True)
new_df = new_df.replace(to_replace ="^[oO][yY][oO]$", value = 'OYO', regex = True)

In [41]:
new_df = new_df.replace(to_replace ="^[pP]ager[dD]uty$", value = 'PagerDuty', regex = True)
new_df = new_df.replace(to_replace ="^[pP]alo [aA]lto [nN]etworks$", value = 'Palo Alto Networks', regex = True)
new_df = new_df.replace(to_replace ="^[pP]anosonic$", value = 'Panosonic', regex = True)
new_df = new_df.replace(to_replace ="^[pP]ay[pP]al$", value = 'PayPal', regex = True)
new_df = new_df.replace(to_replace ="^peloton$", value = 'Peloton', regex = True)
new_df = new_df.replace(to_replace ="^[pP]hilips$", value = 'Philips', regex = True)
new_df = new_df.replace(to_replace ="^[pP]lay[sS]tation$", value = 'PlayStation', regex = True)
new_df = new_df.replace(to_replace ="^[pP]nc$", value = 'PNC', regex = True)
new_df = new_df.replace(to_replace ="^policygenius$", value = 'Policygenius', regex = True)
new_df = new_df.replace(to_replace ="^postmates$", value = 'Postmates', regex = True)
new_df = new_df.replace(to_replace ="^Procore$", value = 'Procore Technologies', regex = True)
new_df = new_df.replace(to_replace ="^proofpoint$", value = 'Proofpoint', regex = True)
new_df = new_df.replace(to_replace ="^prudential financial$", value = 'Prudential Financial', regex = True)
new_df = new_df.replace(to_replace ="^pwc$", value = 'PwC', regex = True)
new_df = new_df.replace(to_replace ="^QUALCOMM$", value = 'Qualcomm', regex = True)
new_df = new_df.replace(to_replace ="^qualcomm$", value = 'Qualcomm', regex = True)
new_df = new_df.replace(to_replace ="^Qualcomm Inc$", value = 'Qualcomm', regex = True)
new_df = new_df.replace(to_replace ="^Quantumblack$", value = 'QuantumBlack', regex = True)
new_df = new_df.replace(to_replace ="^Quest diagnostics$", value = 'Quest Diagnostics', regex = True)
new_df = new_df.replace(to_replace ="^QUEST DIAGNOSTICS$", value = 'Quest Diagnostics', regex = True)
new_df = new_df.replace(to_replace ="^Quest diagnostics$", value = 'Quest Diagnostics', regex = True)
new_df = new_df.replace(to_replace ="^quora$", value = 'Quora', regex = True)
new_df = new_df.replace(to_replace ="^[rR]ally [hH]ealth$", value = 'Rally Health', regex = True)
new_df = new_df.replace(to_replace ="^[rR]aytheon$", value = 'Raytheon Technologies', regex = True)
new_df = new_df.replace(to_replace ="^realtek$", value = 'Realtek', regex = True)
new_df = new_df.replace(to_replace ="^realtor.com$", value = 'Realtor.com', regex = True)
new_df = new_df.replace(to_replace ="^Red hat$", value = 'Red Hat', regex = True)
new_df = new_df.replace(to_replace ="^Red ventures$", value = 'Red Ventures', regex = True)
new_df = new_df.replace(to_replace ="^Rent The Runway$", value = 'Rent the Runway', regex = True)
new_df = new_df.replace(to_replace ="^Retailmenot$", value = 'RetailMeNot', regex = True)
new_df = new_df.replace(to_replace ="^roche$", value = 'Roche', regex = True)
new_df = new_df.replace(to_replace ="^roku$", value = 'Roku', regex = True)
new_df = new_df.replace(to_replace ="^S&P GLOBAL$", value = 'S&P Global', regex = True)
new_df = new_df.replace(to_replace ="^Saic$", value = 'SAIC', regex = True)
new_df = new_df.replace(to_replace ="^salesforce$", value = 'Salesforce', regex = True)
new_df = new_df.replace(to_replace ="^samsara$", value = 'Samsara', regex = True)
new_df = new_df.replace(to_replace ="^SAMSUNG$", value = 'Samsung', regex = True)
new_df = new_df.replace(to_replace ="^sap$", value = 'SAP', regex = True)
new_df = new_df.replace(to_replace ="^[sS]ap Concur$", value = 'SAP Concur', regex = True)
new_df = new_df.replace(to_replace ="^SCHNEIDER ELECTRIC$", value = 'Schneider Electric', regex = True)
new_df = new_df.replace(to_replace ="^seagate$", value = 'Seagate', regex = True)
new_df = new_df.replace(to_replace ="^[sS]ervicenow$", value = 'ServiceNow', regex = True)
new_df = new_df.replace(to_replace ="^[sS]ervicetitan$", value = 'ServiceTitan', regex = True)
new_df = new_df.replace(to_replace ="^shopee$", value = 'Shopee', regex = True)
new_df = new_df.replace(to_replace ="^shutterstock$", value = 'Shutterstock', regex = True)
new_df = new_df.replace(to_replace ="^siemens$", value = 'Siemens', regex = True)
new_df = new_df.replace(to_replace ="^Siriusxm$", value = 'SiriusXM', regex = True)
new_df = new_df.replace(to_replace ="^smartsheet$", value = 'Smartsheet', regex = True)
new_df = new_df.replace(to_replace ="^snapchat$", value = 'Snapchat', regex = True)
new_df = new_df.replace(to_replace ="^[sS]ofi$", value = 'SoFi', regex = True)
new_df = new_df.replace(to_replace ="^sony$", value = 'Sony', regex = True)
new_df = new_df.replace(to_replace ="^Squarepoint capital$", value = 'Squarepoint Capital', regex = True)
new_df = new_df.replace(to_replace ="^Squarespace$", value = 'SquareSpace', regex = True)
new_df = new_df.replace(to_replace ="^[sS]tartup$", value = 'StartUp', regex = True)
new_df = new_df.replace(to_replace ="^STChealth$", value = 'STCHealth', regex = True)
new_df = new_df.replace(to_replace ="^[sS]tubhub$", value = 'StubHub', regex = True)
new_df = new_df.replace(to_replace ="^[sS]urvey[mM]onkey$", value = 'SurveyMonkey', regex = True)
new_df = new_df.replace(to_replace ="^swiggy$", value = 'Swiggy', regex = True)
new_df = new_df.replace(to_replace ="^synaptics$", value = 'Synaptics', regex = True)
new_df = new_df.replace(to_replace ="^synopsys$", value = 'Synopsys', regex = True)
new_df = new_df.replace(to_replace ="^SYNOPSYS$", value = 'Synopsys', regex = True)
new_df = new_df.replace(to_replace ="^tableau software$", value = 'Tableau Software', regex = True)
new_df = new_df.replace(to_replace ="^talend$", value = 'Talend', regex = True)
new_df = new_df.replace(to_replace ="^target$", value = 'Target', regex = True)
new_df = new_df.replace(to_replace ="^Taskrabbit$", value = 'TaskRabbit', regex = True)
new_df = new_df.replace(to_replace ="^TATA Consultancy Services$", value = 'Tata Consultancy Services', regex = True)
new_df = new_df.replace(to_replace ="^[tT][cC][sS]$", value = 'TCS', regex = True)
new_df = new_df.replace(to_replace ="^[tT][dD] [bB]ank$", value = 'TD Bank', regex = True)
new_df = new_df.replace(to_replace ="^Teksystems$", value = 'TEKsystems', regex = True)
new_df = new_df.replace(to_replace ="^[tT]elus$", value = 'TELUS', regex = True)
new_df = new_df.replace(to_replace ="^tencent$", value = 'Tencent', regex = True)
new_df = new_df.replace(to_replace ="^tesla$", value = 'Tesla', regex = True)
new_df = new_df.replace(to_replace ="^test$", value = 'Test', regex = True)
new_df = new_df.replace(to_replace ="^[tT]he [hH]ome [dD]epot$", value = 'The Home Depot', regex = True)
new_df = new_df.replace(to_replace ="^Thought machine$", value = 'Thought Machine', regex = True)
new_df = new_df.replace(to_replace ="^Thoughtspot$", value = 'ThoughtSpot', regex = True)
new_df = new_df.replace(to_replace ="^Thoughtworks$", value = 'ThoughtWorks', regex = True)
new_df = new_df.replace(to_replace ="^ticketmaster$", value = 'Ticketmaster', regex = True)
new_df = new_df.replace(to_replace ="^[tT]ik[tT]ok$", value = 'TikTok', regex = True)
new_df = new_df.replace(to_replace ="^tipalti$", value = 'Tipalti', regex = True)
new_df = new_df.replace(to_replace ="^T-mobile$", value = 'T-Mobile', regex = True)
new_df = new_df.replace(to_replace ="^TOAST$", value = 'Toast', regex = True)
new_df = new_df.replace(to_replace ="^top hat$", value = 'Top Hat', regex = True)
new_df = new_df.replace(to_replace ="^trinet$", value = 'Trinet', regex = True)
new_df = new_df.replace(to_replace ="^[tT]rip[aA]dvisor$", value = 'TripAdvisor', regex = True)
new_df = new_df.replace(to_replace ="^truckstop.com$", value = 'Truckstop.com', regex = True)
new_df = new_df.replace(to_replace ="^Truecar$", value = 'TrueCar', regex = True)
new_df = new_df.replace(to_replace ="^[tT]smc$", value = 'TSMC', regex = True)
new_df = new_df.replace(to_replace ="^Tunee$", value = 'TUNE', regex = True)
new_df = new_df.replace(to_replace ="^twitch$", value = 'Twitch', regex = True)
new_df = new_df.replace(to_replace ="^twitter$", value = 'Twitter', regex = True)
new_df = new_df.replace(to_replace ="^Two sigma$", value = 'Two Sigma', regex = True)
new_df = new_df.replace(to_replace ="^[uU][bB][eE][rR]$", value = 'Uber', regex = True)
new_df = new_df.replace(to_replace ="^[uU]bs$", value = 'UBS', regex = True)
new_df = new_df.replace(to_replace ="^ultimate software$", value = 'Ultimate Software', regex = True)
new_df = new_df.replace(to_replace ="^united airlines$", value = 'United Airlines', regex = True)
new_df = new_df.replace(to_replace ="^veeva systems$", value = 'Veeva Systems', regex = True)
new_df = new_df.replace(to_replace ="^Verisign$", value = 'VERISIGN', regex = True)
new_df = new_df.replace(to_replace ="^Veritas technologies$", value = 'Veritas Technologies', regex = True)
new_df = new_df.replace(to_replace ="^verizon$", value = 'Verizon', regex = True)
new_df = new_df.replace(to_replace ="^verkada$", value = 'Verkada', regex = True)
new_df = new_df.replace(to_replace ="^Virtu financial$", value = 'Virtu Financial', regex = True)
new_df = new_df.replace(to_replace ="^virtusa$", value = 'Virtusa', regex = True)
new_df = new_df.replace(to_replace ="^[vV][iI][sS][aA]$", value = 'Visa', regex = True)
new_df = new_df.replace(to_replace ="^[vV]izio$", value = 'VIZIO', regex = True)
new_df = new_df.replace(to_replace ="^[vV][mM][wW]are$", value = 'VMware', regex = True)
new_df = new_df.replace(to_replace ="^walmart$", value = 'Walmart', regex = True)
new_df = new_df.replace(to_replace ="^[wW]almart labs$", value = 'Walmart Labs', regex = True)
new_df = new_df.replace(to_replace ="^warby parker$", value = 'Warby Parker', regex = True)
new_df = new_df.replace(to_replace ="^[wW]arnermedia$", value = 'WarnerMedia', regex = True)
new_df = new_df.replace(to_replace ="^wayfair$", value = 'Wayfair', regex = True)
new_df = new_df.replace(to_replace ="^[wW]ells [fF]argo$", value = 'Wells Fargo', regex = True)
new_df = new_df.replace(to_replace ="^[wW]epay$", value = 'WePay', regex = True)
new_df = new_df.replace(to_replace ="^[wW]estern [dD]igital$", value = 'Western Digital', regex = True)
new_df = new_df.replace(to_replace ="^[wW]e[wW]ork$", value = 'WeWork', regex = True)
new_df = new_df.replace(to_replace ="^[wW]ipro$", value = 'Wipro Limited', regex = True)
new_df = new_df.replace(to_replace ="^wish$", value = 'Wish', regex = True)
new_df = new_df.replace(to_replace ="^workday$", value = 'Workday', regex = True)
new_df = new_df.replace(to_replace ="^World wide technology$", value = 'World Wide Technology', regex = True)
new_df = new_df.replace(to_replace ="^XILINX$", value = 'Xilinx', regex = True)
new_df = new_df.replace(to_replace ="^Xing$", value = 'XING', regex = True)
new_df = new_df.replace(to_replace ="^yahoo$", value = 'Yahoo', regex = True)
new_df = new_df.replace(to_replace ="^yandex$", value = 'Yandex', regex = True)
new_df = new_df.replace(to_replace ="^yelp", value = 'Yelp', regex = True)
new_df = new_df.replace(to_replace ="^zapier$", value = 'Zapier', regex = True)
new_df = new_df.replace(to_replace ="^zendesk$", value = 'Zendesk', regex = True)
new_df = new_df.replace(to_replace ="^zillow$", value = 'Zillow', regex = True)
new_df = new_df.replace(to_replace ="^[wW]estern [dD]igital$", value = 'Western Digital', regex = True)
new_df = new_df.replace(to_replace ="^zillow group$", value = 'Zillow Group', regex = True)
new_df = new_df.replace(to_replace ="^zoom$", value = 'Zoom', regex = True)
new_df = new_df.replace(to_replace ="^[zZ]oominfo$", value = 'ZoomInfo', regex = True)
new_df = new_df.replace(to_replace ="^zoox$", value = 'Zoox', regex = True)
new_df = new_df.replace(to_replace ="^3m$", value = '3M', regex = True)
new_df = new_df.replace(to_replace ="^7-eleven$", value = '7-Eleven', regex = True)
new_df = new_df.replace(to_replace ="^AMazon$", value = 'Amazon', regex = True)
new_df = new_df.replace(to_replace ="^Activecampaign$", value = 'ActiveCampaign', regex = True)
new_df = new_df.replace(to_replace ="^Alphasights$", value = 'AlphaSights', regex = True)
new_df = new_df.replace(to_replace ="^Amazon.com$", value = 'Amazon', regex = True)
new_df = new_df.replace(to_replace ="^Appfolio$", value = 'AppFolio', regex = True)
new_df = new_df.replace(to_replace ="^Apple Inc.$", value = 'Apple', regex = True)
new_df = new_df.replace(to_replace ="^Astrazeneca$", value = 'AstraZeneca', regex = True)
new_df = new_df.replace(to_replace ="^[bB]ank [oO]f [aA]merica$", value = 'Bank of America', regex = True)
new_df = new_df.replace(to_replace ="^Bny Mellon$", value = 'BNY Mellon', regex = True)
new_df = new_df.replace(to_replace ="^Browserstack$", value = 'BrowserStack', regex = True)
new_df = new_df.replace(to_replace ="^Buzzfeed$", value = 'BuzzFeed', regex = True)
new_df = new_df.replace(to_replace ="^Bytedance$", value = 'ByteDance', regex = True)
new_df = new_df.replace(to_replace ="^CIsco$", value = 'Cisco', regex = True)
new_df = new_df.replace(to_replace ="^CiSco$", value = 'Cisco', regex = True)
new_df = new_df.replace(to_replace ="^CVS health$", value = 'CSV Health', regex = True)
new_df = new_df.replace(to_replace ="^Cirrus logic$", value = 'Cirrus Logic', regex = True)
new_df = new_df.replace(to_replace ="^Constant contact$", value = 'Constant Contact', regex = True)
new_df = new_df.replace(to_replace ="^Cornerstone [oO]n[dD]emand$", value = 'Cornerstone OnDemand', regex = True)
new_df = new_df.replace(to_replace ="^Costco wholesale$", value = 'Costco', regex = True)
new_df = new_df.replace(to_replace ="^Crowdstrike$", value = 'CrowdStrike', regex = True)
new_df = new_df.replace(to_replace ="^Cure.fit$", value = 'Cure.Fit', regex = True)
new_df = new_df.replace(to_replace ="^Deutsche bank$", value = 'Deutsche Bank', regex = True)
new_df = new_df.replace(to_replace ="^Doubleverify$", value = 'DoubleVerify', regex = True)
new_df = new_df.replace(to_replace ="^E*Trade$", value = 'E*TRADE', regex = True)
new_df = new_df.replace(to_replace ="^Everquote$", value = 'EverQuote', regex = True)
new_df = new_df.replace(to_replace ="^F5 networks$", value = 'F5 Networks', regex = True)
new_df = new_df.replace(to_replace ="^Factset$", value = 'FactSet', regex = True)
new_df = new_df.replace(to_replace ="^Fannie mae$", value = 'Fannie Mae', regex = True)
new_df = new_df.replace(to_replace ="^First Republic bank$", value = 'First Republic Bank', regex = True)
new_df = new_df.replace(to_replace ="^GE healthcare$", value = 'GE Healthcare', regex = True)
new_df = new_df.replace(to_replace ="^GOogle$", value = 'Google', regex = True)
new_df = new_df.replace(to_replace ="^Globallogic$", value = 'GlobalLogic', regex = True)
new_df = new_df.replace(to_replace ="^Glu mobile$", value = 'Glu Mobile', regex = True)
new_df = new_df.replace(to_replace ="^Godaddy$", value = 'GoDaddy', regex = True)
new_df = new_df.replace(to_replace ="^Goldman sachs$", value = 'Goldman Sachs', regex = True)
new_df = new_df.replace(to_replace ="^Guidewire$", value = 'Guidewire Software', regex = True)
new_df = new_df.replace(to_replace ="^Hashicorp$", value = 'HashiCorp', regex = True)
new_df = new_df.replace(to_replace ="^Homeadvisor$", value = 'HomeAdvisor', regex = True)
new_df = new_df.replace(to_replace ="^Ibm$", value = 'IBM', regex = True)
new_df = new_df.replace(to_replace ="^JP morgan$", value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ="^JPmorgan$", value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ="^Jp morgan$", value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ="^Liberty mutual$", value = 'Liberty Mutual', regex = True)
new_df = new_df.replace(to_replace ="^MIcrosoft$", value = 'Microsoft', regex = True)
new_df = new_df.replace(to_replace ="^Microstrategy$", value = 'MicroStrategy', regex = True)
new_df = new_df.replace(to_replace ="^Moody's analytics$", value = "Moody's Analytics", regex = True)
new_df = new_df.replace(to_replace ="^National instruments$", value = 'National Instruments', regex = True)
new_df = new_df.replace(to_replace ="^Paytm$", value = 'PayTm', regex = True)
new_df = new_df.replace(to_replace ="^Phonepe$", value = 'PhonePe', regex = True)
new_df = new_df.replace(to_replace ="^Publicis sapient$", value = 'Publicis Sapient', regex = True)
new_df = new_df.replace(to_replace ="^Pure storage$", value = 'Pure Storage', regex = True)
new_df = new_df.replace(to_replace ="^Qualcomm inc$", value = 'Qualcomm', regex = True)
new_df = new_df.replace(to_replace ="^Ringcentral$", value = 'RingCentral', regex = True)
new_df = new_df.replace(to_replace ="^SAS institute$", value = 'SAS Institute', regex = True)
new_df = new_df.replace(to_replace ="^Salesforce$", value = 'SalesForce', regex = True)
new_df = new_df.replace(to_replace ="^Samsung research america$", value = 'Samsung Research America', regex = True)
new_df = new_df.replace(to_replace ="^Smartsheet$", value = 'SmartSheet', regex = True)
new_df = new_df.replace(to_replace ="^SquarePoint capital$", value = 'Squarepoint Capital', regex = True)
new_df = new_df.replace(to_replace ="^Transunion$", value = 'TransUnion', regex = True)
new_df = new_df.replace(to_replace ="^Verizon Media$", value = 'Verizon', regex = True)
new_df = new_df.replace(to_replace ="^Visa [iI]nc$", value = 'Visa', regex = True)
new_df = new_df.replace(to_replace ="^Volkswagen$", value = 'VolksWagen', regex = True)
new_df = new_df.replace(to_replace ="^Western DIgital$", value = 'Western Digital', regex = True)
new_df = new_df.replace(to_replace ="^Willis towers watson$", value = 'Willis Towers Watson', regex = True)
new_df = new_df.replace(to_replace ="^ZS$", value = 'ZS Associates', regex = True)
new_df = new_df.replace(to_replace ="^ZS associates$", value = 'ZS Associates', regex = True)
new_df = new_df.replace(to_replace ="^agoda$", value = 'Agoda', regex = True)
new_df = new_df.replace(to_replace ="^airbnb$", value = 'Airbnb', regex = True)
new_df = new_df.replace(to_replace ="^bytedance$", value = 'ByteDance', regex = True)
new_df = new_df.replace(to_replace ="^bank of america$", value = 'Bank of America', regex = True)
new_df = new_df.replace(to_replace ="^canva$", value = 'Canva', regex = True)
new_df = new_df.replace(to_replace ="^cohesity$", value = 'Cohesity', regex = True)
new_df = new_df.replace(to_replace ="^comcast$", value = 'Comcast', regex = True)
new_df = new_df.replace(to_replace ="^cvs health$", value = 'CVS Health', regex = True)
new_df = new_df.replace(to_replace ="^deloitte$", value = 'Deloitte', regex = True)
new_df = new_df.replace(to_replace ="^fico", value = 'FICO', regex = True)
new_df = new_df.replace(to_replace ="^comcast$", value = 'Comcast', regex = True)
new_df = new_df.replace(to_replace ="^fortinet$", value = 'Fortinet', regex = True)
new_df = new_df.replace(to_replace ="^ge healthcare$", value = 'GE Healthcare', regex = True)
new_df = new_df.replace(to_replace ="^globalfoundries$", value = 'GlobalFoundries', regex = True)
new_df = new_df.replace(to_replace ="^goPuff$", value = 'GoPuff', regex = True)
new_df = new_df.replace(to_replace ="^goldman sachs$", value = 'Goldman Sachs', regex = True)
new_df = new_df.replace(to_replace ="^google llc$", value = 'Google', regex = True)
new_df = new_df.replace(to_replace ="^hitachi$", value = 'Hitachi', regex = True)
new_df = new_df.replace(to_replace ="^iCIMS$", value = 'Icims', regex = True)
new_df = new_df.replace(to_replace ="^iHeartRadio$", value = 'IHeartRadio', regex = True)
new_df = new_df.replace(to_replace ="^instacart$", value = 'Instacart', regex = True)
new_df = new_df.replace(to_replace ="^intel$", value = 'Intel', regex = True)
new_df = new_df.replace(to_replace ="^intuit$", value = 'Intuit', regex = True)
new_df = new_df.replace(to_replace ="^jp morgan$", value = 'JPMorgan Chase', regex = True)
new_df = new_df.replace(to_replace ="^kroger$", value = 'Kroger', regex = True)
new_df = new_df.replace(to_replace ="^lockheed martin$", value = 'Lockheed Martin', regex = True)
new_df = new_df.replace(to_replace ="^lowes$", value = 'Lowes', regex = True)
new_df = new_df.replace(to_replace ="^lyft$", value = 'Lyft', regex = True)
new_df = new_df.replace(to_replace ="^macy's$", value = "Macy's", regex = True)
new_df = new_df.replace(to_replace ="^mi$", value = 'Mi', regex = True)
new_df = new_df.replace(to_replace ="^national instruments$", value = 'National Instruments', regex = True)
new_df = new_df.replace(to_replace ="^nxp semiconductors$", value = 'NXP Semiconductors', regex = True)
new_df = new_df.replace(to_replace ="^Nxp Semiconductors$", value = 'NXP Semiconductors', regex = True)
new_df = new_df.replace(to_replace ="^opentext$", value = 'OpenText', regex = True)
new_df = new_df.replace(to_replace ="^panasonic$", value = 'Panasonic', regex = True)
new_df = new_df.replace(to_replace ="^pandora$", value = 'Pandora', regex = True)
new_df = new_df.replace(to_replace ="^rapid7$", value = 'Rapid7', regex = True)
new_df = new_df.replace(to_replace ="^samsung$", value = 'Samsung', regex = True)
new_df = new_df.replace(to_replace ="^schlumberger$", value = 'Sclumberger', regex = True)
new_df = new_df.replace(to_replace ="^sony interactive entertainment$", value = 'Sony Interactive Entertainment', regex = True)
new_df = new_df.replace(to_replace ="^taskrabbit$", value = 'TaskRabbit', regex = True)
new_df = new_df.replace(to_replace ="^udaan.com$", value = 'Udaan.com', regex = True)
new_df = new_df.replace(to_replace ="^veritas technologies$", value = 'Veritas Technologies', regex = True)
new_df = new_df.replace(to_replace ="^vrbo$", value = 'Vrbo', regex = True)
new_df = new_df.replace(to_replace ="^zynga$", value = 'Zynga', regex = True)
new_df = new_df.replace(to_replace ="^ Google$", value = 'Google', regex = True)
new_df = new_df.replace(to_replace ="^idemia$", value = 'Idemia', regex = True)
new_df = new_df.replace(to_replace ="^indeed$", value = 'Indeed', regex = True)

In [42]:
# Save new_df to csv
new_df.to_csv('Resources/salaries_cleaned.csv')

In [43]:
# Display dataframe
new_df.head()

Unnamed: 0,timestamp,date,company,level,title,totalyearlycompensation,location,latitude,longitude,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,gender,Race,Education
0,2017-06-07 11:33:00,2017-06-07,Oracle,L3,Product Manager,127000,"Redwood City, CA",37.486324,-122.232523,1.5,1.5,107000,20000,10000,Unknown,Unknown,Unknown
1,2017-06-10 17:11:00,2017-06-10,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",37.779026,-122.419906,5.0,3.0,0,0,0,Unknown,Unknown,Unknown
2,2017-06-11 14:53:00,2017-06-11,Amazon,L7,Product Manager,310000,"Seattle, WA",47.603832,-122.330062,8.0,0.0,155000,0,0,Unknown,Unknown,Unknown
3,2017-06-17 00:23:00,2017-06-17,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",37.36883,-122.036349,7.0,5.0,157000,180000,35000,Unknown,Unknown,Unknown
4,2017-06-20 10:58:00,2017-06-20,Microsoft,60,Software Engineer,157000,"Mountain View, CA",37.389389,-122.08321,5.0,3.0,0,0,0,Unknown,Unknown,Unknown


In [44]:
# Display dataframe
layoffs_df.head()

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
0,N26,Berlin,Finance,71.0,0.04,2023-04-28,Series E,United States,1700.0
1,Providoor,Melbourne,Food,,1.0,2023-04-28,Unknown,Australia,
2,Dropbox,SF Bay Area,Other,500.0,0.16,2023-04-27,Post-IPO,United States,1700.0
3,Vroom,New York City,Transportation,120.0,0.11,2023-04-27,Post-IPO,United States,1300.0
4,Greenhouse,New York City,Recruiting,100.0,0.12,2023-04-27,Private Equity,United States,110.0


In [45]:
# Check dataframe's info
layoffs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2545 entries, 0 to 2544
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              2545 non-null   object 
 1   location             2545 non-null   object 
 2   industry             2543 non-null   object 
 3   total_laid_off       1746 non-null   float64
 4   percentage_laid_off  1694 non-null   float64
 5   date                 2543 non-null   object 
 6   stage                2539 non-null   object 
 7   country              2545 non-null   object 
 8   funds_raised         2297 non-null   float64
dtypes: float64(3), object(6)
memory usage: 179.1+ KB


In [46]:
# Replace null values with 0
layoffs_df['total_laid_off'].fillna(0, inplace = True)

In [47]:
# Convert datatypes
layoffs_df['date'] = layoffs_df['date'].astype('datetime64[ns]')

In [48]:
# Convert to percentage amount
layoffs_df['percentage_laid_off'] = layoffs_df['percentage_laid_off']*100

In [49]:
# Convert datatypes
layoffs_df['total_laid_off'] = layoffs_df['total_laid_off'].astype('int')

In [50]:
# Display dataframe
layoffs_df.head(30)

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country,funds_raised
0,N26,Berlin,Finance,71,4.0,2023-04-28,Series E,United States,1700.0
1,Providoor,Melbourne,Food,0,100.0,2023-04-28,Unknown,Australia,
2,Dropbox,SF Bay Area,Other,500,16.0,2023-04-27,Post-IPO,United States,1700.0
3,Vroom,New York City,Transportation,120,11.0,2023-04-27,Post-IPO,United States,1300.0
4,Greenhouse,New York City,Recruiting,100,12.0,2023-04-27,Private Equity,United States,110.0
5,Rebellion Defense,Washington D.C.,Data,90,,2023-04-27,Series B,United States,150.0
6,Poppulo,Denver,HR,85,,2023-04-27,Acquired,United States,30.0
7,Megaport,Brisbane,Infrastructure,50,16.0,2023-04-27,Post-IPO,Australia,98.0
8,Airtasker,Sydney,Retail,45,20.0,2023-04-27,Series C,Australia,26.0
9,Chief,New York City,Other,43,14.0,2023-04-27,Series B,United States,140.0


In [51]:
# Check dataframe's info
layoffs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2545 entries, 0 to 2544
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   company              2545 non-null   object        
 1   location             2545 non-null   object        
 2   industry             2543 non-null   object        
 3   total_laid_off       2545 non-null   int32         
 4   percentage_laid_off  1694 non-null   float64       
 5   date                 2543 non-null   datetime64[ns]
 6   stage                2539 non-null   object        
 7   country              2545 non-null   object        
 8   funds_raised         2297 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int32(1), object(5)
memory usage: 169.1+ KB


In [52]:
# Drop unwanted columns
layoffs_df = layoffs_df.drop(columns=['funds_raised'])

In [53]:
# Save layoffs_df to csv
layoffs_df.to_csv('Resources/layoffs_cleaned.csv')

In [54]:
# Save new_df to json
new_df.to_json('Resources/salaries.json',orient='records',indent=4)

In [55]:
# Save layoffs_df to json
layoffs_df.to_json('Resources/layoffs.json',orient='records',indent=4)

In [56]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint

In [57]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

* Import the `salaries.json` file with the following line in Terminal:
    mongoimport --type json -d datascience -c salaries --drop --jsonArray salaries.json
    
* Import the `layoffs.json` file with the following line in Terminal:
    mongoimport --type json -d datascience -c salaries --drop --jsonArray layoffs.json

In [58]:
# Assign the datascience database to a variable name
db = mongo['datascience']
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'datascience')

In [59]:
# assign the salaries collection to a variable
salaries = db['salaries']  

In [60]:
# assign the salaries collection to a variable
layoffs = db['layoffs']  

In [61]:
pprint(db.salaries.find_one())

{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdadbb7'),
 'basesalary': 107000,
 'bonus': 10000,
 'company': 'Oracle',
 'date': 1496793600000.0,
 'gender': 'Unknown',
 'latitude': 37.4863239,
 'level': 'L3',
 'location': 'Redwood City, CA',
 'longitude': -122.232523,
 'stockgrantvalue': 20000,
 'timestamp': 1496835180000.0,
 'title': 'Product Manager',
 'totalyearlycompensation': 127000,
 'yearsatcompany': 1.5,
 'yearsofexperience': 1.5}


In [62]:
# Check for documents with the company name Amazon in salaries collection
query = {"company":"Amazon"}

results = salaries.find(query)

for result in results:
    pprint(result)

{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdadbb9'),
 'basesalary': 155000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1497139200000.0,
 'gender': 'Unknown',
 'latitude': 47.6038321,
 'level': 'L7',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 0,
 'timestamp': 1497192780000.0,
 'title': 'Product Manager',
 'totalyearlycompensation': 310000,
 'yearsatcompany': 0,
 'yearsofexperience': 8}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdadbc5'),
 'basesalary': 160000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1499990400000.0,
 'gender': 'Unknown',
 'latitude': 47.6038321,
 'level': 'L6',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 0,
 'timestamp': 1500068160000.0,
 'title': 'Software Engineering Manager',
 'totalyearlycompensation': 287000,
 'yearsatcompany': 1,
 'yearsofexperience': 12}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a

 'location': 'Santa Clara, CA',
 'longitude': -121.6846349,
 'stockgrantvalue': 0,
 'timestamp': 1554565140000.0,
 'title': 'Software Engineering Manager',
 'totalyearlycompensation': 430000,
 'yearsatcompany': 2,
 'yearsofexperience': 20}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdaf17b'),
 'basesalary': 108000,
 'bonus': 24000,
 'company': 'Amazon',
 'date': 1554508800000.0,
 'gender': 'Male',
 'latitude': 37.7790262,
 'level': 'L4',
 'location': 'San Francisco, CA',
 'longitude': -122.419906,
 'stockgrantvalue': 15000,
 'timestamp': 1554566940000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 137000,
 'yearsatcompany': 0,
 'yearsofexperience': 0}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdaf17d'),
 'basesalary': 0,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1554508800000.0,
 'gender': 'Unknown',
 'latitude': 37.4443293,
 'level': 'L5',
 'location': 'Palo Alto, CA',
 'longitude': -122.159846

{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdaf290'),
 'basesalary': 155000,
 'bonus': 135000,
 'company': 'Amazon',
 'date': 1555459200000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'Principal Product Manager',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 40000,
 'timestamp': 1555484940000.0,
 'title': 'Product Manager',
 'totalyearlycompensation': 330000,
 'yearsatcompany': 1,
 'yearsofexperience': 8}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b68e892bf4ecdaf298'),
 'basesalary': 146000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1555459200000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'SDE III',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 100000,
 'timestamp': 1555503000000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 246000,
 'yearsatcompany': 5,
 'yearsofexperience': 5}
{'Education': 'Unknown',
 'Race': 'Unknown',

 'date': 1579824000000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'L4',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 2000,
 'timestamp': 1579846140000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 142000,
 'yearsatcompany': 1,
 'yearsofexperience': 1}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b78e892bf4ecdb184a'),
 'basesalary': 103000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1579824000000.0,
 'gender': 'Male',
 'latitude': 49.2608724,
 'level': 'L5',
 'location': 'Vancouver, BC, Canada',
 'longitude': -123.113952,
 'stockgrantvalue': 42000,
 'timestamp': 1579851120000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 145000,
 'yearsatcompany': 0,
 'yearsofexperience': 10}
{'Education': 'Unknown',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b78e892bf4ecdb184b'),
 'basesalary': 36000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1579824000000.0,
 'gender': 'Male',
 'latitude': 12.976793

{'Education': "Master's Degree",
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b78e892bf4ecdb1950'),
 'basesalary': 142000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1580169600000.0,
 'gender': 'Male',
 'latitude': 47.6144219,
 'level': 'L5',
 'location': 'Bellevue, WA',
 'longitude': -122.192337,
 'stockgrantvalue': 52000,
 'timestamp': 1580248080000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 194000,
 'yearsatcompany': 2,
 'yearsofexperience': 4}
{'Education': 'PhD',
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b78e892bf4ecdb1951'),
 'basesalary': 180000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1580169600000.0,
 'gender': 'Male',
 'latitude': 37.3361663,
 'level': 'L6',
 'location': 'San Jose, CA',
 'longitude': -121.890591,
 'stockgrantvalue': 170000,
 'timestamp': 1580248380000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 352000,
 'yearsatcompany': 0,
 'yearsofexperience': 13}
{'Education': "Master's Degree",
 'Race': 'Unknown',
 '_id': ObjectI

 'date': 1596240000000.0,
 'gender': 'Unknown',
 'latitude': 34.0536909,
 'level': 'L7',
 'location': 'Los Angeles, CA',
 'longitude': -118.242766,
 'stockgrantvalue': 150000,
 'timestamp': 1596311460000.0,
 'title': 'Sales',
 'totalyearlycompensation': 310000,
 'yearsatcompany': 7,
 'yearsofexperience': 20}
{'Education': "Master's Degree",
 'Race': 'Unknown',
 '_id': ObjectId('6465a8b88e892bf4ecdb3f58'),
 'basesalary': 47000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1596240000000.0,
 'gender': 'Female',
 'latitude': 12.9767936,
 'level': 'L5',
 'location': 'Bangalore, KA, India',
 'longitude': 77.590082,
 'stockgrantvalue': 25000,
 'timestamp': 1596314460000.0,
 'title': 'Technical Program Manager',
 'totalyearlycompensation': 72000,
 'yearsatcompany': 4,
 'yearsofexperience': 9}
{'Education': "Master's Degree",
 'Race': 'Asian',
 '_id': ObjectId('6465a8b88e892bf4ecdb3f60'),
 'basesalary': 138000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1596240000000.0,
 'gender': 'Female',
 'l

 'latitude': 33.7500378,
 'level': 'L5',
 'location': 'Orange, CA',
 'longitude': -117.8704931,
 'stockgrantvalue': 10000,
 'timestamp': 1596905460000.0,
 'title': 'Solution Architect',
 'totalyearlycompensation': 210000,
 'yearsatcompany': 0,
 'yearsofexperience': 22}
{'Education': "Bachelor's Degree",
 'Race': 'White',
 '_id': ObjectId('6465a8b88e892bf4ecdb4175'),
 'basesalary': 145000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1596844800000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'SDE II',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 65000,
 'timestamp': 1596908760000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 210000,
 'yearsatcompany': 1,
 'yearsofexperience': 5}
{'Education': "Master's Degree",
 'Race': 'Asian',
 '_id': ObjectId('6465a8b88e892bf4ecdb417e'),
 'basesalary': 155000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1596844800000.0,
 'gender': 'Male',
 'latitude': 41.8755616,
 'level': 'L6',
 'location': 'C

 'yearsofexperience': 3}
{'Education': "Bachelor's Degree",
 'Race': 'White',
 '_id': ObjectId('6465a8b98e892bf4ecdb7089'),
 'basesalary': 144000,
 'bonus': 11000,
 'company': 'Amazon',
 'date': 1608854400000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'L5',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 18000,
 'timestamp': 1608864720000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 173000,
 'yearsatcompany': 2,
 'yearsofexperience': 2}
{'Education': "Master's Degree",
 'Race': 'Asian',
 '_id': ObjectId('6465a8b98e892bf4ecdb7092'),
 'basesalary': 140000,
 'bonus': 0,
 'company': 'Amazon',
 'date': 1608854400000.0,
 'gender': 'Male',
 'latitude': 47.6038321,
 'level': 'L5',
 'location': 'Seattle, WA',
 'longitude': -122.330062,
 'stockgrantvalue': 80000,
 'timestamp': 1608886080000.0,
 'title': 'Software Engineer',
 'totalyearlycompensation': 220000,
 'yearsatcompany': 7,
 'yearsofexperience': 12}
{'Education': 'Unknown',
 'Race'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [63]:
# Check for documents with the stage "Post-IPO" in layoffs collection
query = {"stage":"Post-IPO"}

results = layoffs.find(query)

for result in results:
    pprint(result)

{'_id': ObjectId('6465a8d58e892bf4ecdbd06c'),
 'company': 'Dropbox',
 'country': 'United States',
 'date': 1682553600000.0,
 'industry': 'Other',
 'location': 'SF Bay Area',
 'percentage_laid_off': 16,
 'stage': 'Post-IPO',
 'total_laid_off': 500}
{'_id': ObjectId('6465a8d58e892bf4ecdbd06d'),
 'company': 'Vroom',
 'country': 'United States',
 'date': 1682553600000.0,
 'industry': 'Transportation',
 'location': 'New York City',
 'percentage_laid_off': 11,
 'stage': 'Post-IPO',
 'total_laid_off': 120}
{'_id': ObjectId('6465a8d58e892bf4ecdbd071'),
 'company': 'Megaport',
 'country': 'Australia',
 'date': 1682553600000.0,
 'industry': 'Infrastructure',
 'location': 'Brisbane',
 'percentage_laid_off': 16,
 'stage': 'Post-IPO',
 'total_laid_off': 50}
{'_id': ObjectId('6465a8d58e892bf4ecdbd075'),
 'company': 'Alteryx',
 'country': 'United States',
 'date': 1682553600000.0,
 'industry': 'Data',
 'location': 'Los Angeles',
 'percentage_laid_off': 11,
 'stage': 'Post-IPO',
 'total_laid_off': 0}


In [64]:
# Groupby companies for list of levels
levels_df = new_df.groupby("company")["level"].unique().to_frame()
levels_df

Unnamed: 0_level_0,level
company,Unnamed: 1_level_1
10x Genomics,"[L3, Senior, L2, l2]"
23andMe,"[Software Engineer, L1, Software Engineer I, M1, L4]"
2U,"[Software Engineer III, 2, L1, L3, 3, Senior]"
3M,"[Job Grade 9, T3, T2, 10, L8, L4, T4A, Grade 11, 14, T5A, T4, T1, Sales]"
7-Eleven,"[Senior, Software Engineer II, L3, 2]"
...,...
fuboTV,"[L4, Software Engineer, SE1, L3]"
iHerb,"[Director, L3, Junior Developer 1, Senior Software Developer I, Senior Software Developer]"
iManage,"[Junior, Senior, Senior Specialist, L3]"
iRobot,"[Principal, UX Designer, Software Engineer (2nd level), L1, L5, Senior Software Engineer, L2]"


In [65]:
# Save levels_df to csv
levels_df.to_csv('Resources/level.csv')