In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pathlib import Path 

In [30]:
# load file
file_load = "Resources/Salary_Data.csv"

In [31]:
# read file and store in Pandas DataFrame
salary_data_df = pd.read_csv(file_load)
salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education
0,6/7/17 11:33,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,0,0,0,0,,
1,6/10/17 17:11,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,
2,6/11/17 14:53,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,0,0,0,0,,
3,6/17/17 0:23,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,0,0,0,0,,
4,6/20/17 10:58,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,


In [32]:
# see columns
salary_data_df.columns


Index(['timestamp', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'yearsofexperience', 'yearsatcompany', 'tag', 'basesalary',
       'stockgrantvalue', 'bonus', 'gender', 'otherdetails', 'cityid', 'dmaid',
       'rowNumber', 'Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree',
       'Highschool', 'Some_College', 'Race_Asian', 'Race_White',
       'Race_Two_Or_More', 'Race_Black', 'Race_Hispanic', 'Race', 'Education'],
      dtype='object')

In [33]:
# check data types
salary_data_df.dtypes

timestamp                   object
company                     object
level                       object
title                       object
totalyearlycompensation      int64
location                    object
yearsofexperience          float64
yearsatcompany             float64
tag                         object
basesalary                   int64
stockgrantvalue            float64
bonus                      float64
gender                      object
otherdetails                object
cityid                       int64
dmaid                      float64
rowNumber                    int64
Masters_Degree               int64
Bachelors_Degree             int64
Doctorate_Degree             int64
Highschool                   int64
Some_College                 int64
Race_Asian                   int64
Race_White                   int64
Race_Two_Or_More             int64
Race_Black                   int64
Race_Hispanic                int64
Race                        object
Education           

In [34]:
# get columns and rows that ar not null
salary_data_df.count()

timestamp                  62642
company                    62637
level                      62523
title                      62642
totalyearlycompensation    62642
location                   62642
yearsofexperience          62642
yearsatcompany             62642
tag                        61788
basesalary                 62642
stockgrantvalue            62642
bonus                      62642
gender                     43102
otherdetails               40137
cityid                     62642
dmaid                      62640
rowNumber                  62642
Masters_Degree             62642
Bachelors_Degree           62642
Doctorate_Degree           62642
Highschool                 62642
Some_College               62642
Race_Asian                 62642
Race_White                 62642
Race_Two_Or_More           62642
Race_Black                 62642
Race_Hispanic              62642
Race                       22427
Education                  30370
dtype: int64

In [35]:
# get sum on columns and rows that are not null
salary_data_df.isnull().sum()

timestamp                      0
company                        5
level                        119
title                          0
totalyearlycompensation        0
location                       0
yearsofexperience              0
yearsatcompany                 0
tag                          854
basesalary                     0
stockgrantvalue                0
bonus                          0
gender                     19540
otherdetails               22505
cityid                         0
dmaid                          2
rowNumber                      0
Masters_Degree                 0
Bachelors_Degree               0
Doctorate_Degree               0
Highschool                     0
Some_College                   0
Race_Asian                     0
Race_White                     0
Race_Two_Or_More               0
Race_Black                     0
Race_Hispanic                  0
Race                       40215
Education                  32272
dtype: int64

In [73]:
# change null values in 'Race' 'Education' and 'gender' columns to "UNKNOWN"
salary_data_df[['gender', 'Race', 'Education']] = salary_data_df[['gender', 'Race', 'Education']].fillna('Unknown')

Want to get rid of columns that are not usefull: 'level', 'tag', 'otherdetails', 
We can also delete the rows that have null values in 'company' and 'dmaid'

In [74]:
# drop columns 'level', 'tag', 'otherdetails', 
clean_salary_data_df = salary_data_df.drop(['tag', 'otherdetails', 'cityid', 'dmaid'], axis=1)

In [75]:
# see clean columns
clean_salary_data_df.columns

Index(['timestamp', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'yearsofexperience', 'yearsatcompany', 'basesalary',
       'stockgrantvalue', 'bonus', 'gender', 'rowNumber', 'Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College',
       'Race_Asian', 'Race_White', 'Race_Two_Or_More', 'Race_Black',
       'Race_Hispanic', 'Race', 'Education'],
      dtype='object')

In [77]:
#drop rows that have null values
clean_salary_data_df = clean_salary_data_df.dropna()

In [78]:
# get sum on columns and rows that are not null
clean_salary_data_df.isnull().sum()

timestamp                  0
company                    0
level                      0
title                      0
totalyearlycompensation    0
location                   0
yearsofexperience          0
yearsatcompany             0
basesalary                 0
stockgrantvalue            0
bonus                      0
gender                     0
rowNumber                  0
Masters_Degree             0
Bachelors_Degree           0
Doctorate_Degree           0
Highschool                 0
Some_College               0
Race_Asian                 0
Race_White                 0
Race_Two_Or_More           0
Race_Black                 0
Race_Hispanic              0
Race                       0
Education                  0
dtype: int64

In [79]:
clean_salary_data_df.count()

timestamp                  62518
company                    62518
level                      62518
title                      62518
totalyearlycompensation    62518
location                   62518
yearsofexperience          62518
yearsatcompany             62518
basesalary                 62518
stockgrantvalue            62518
bonus                      62518
gender                     62518
rowNumber                  62518
Masters_Degree             62518
Bachelors_Degree           62518
Doctorate_Degree           62518
Highschool                 62518
Some_College               62518
Race_Asian                 62518
Race_White                 62518
Race_Two_Or_More           62518
Race_Black                 62518
Race_Hispanic              62518
Race                       62518
Education                  62518
dtype: int64

In [80]:
# change the type of timestamp column to_datetime
clean_salary_data_df['timestamp'] = pd.to_datetime(clean_salary_data_df['timestamp'])

In [81]:
clean_salary_data_df.dtypes

timestamp                  datetime64[ns]
company                            object
level                              object
title                              object
totalyearlycompensation             int64
location                           object
yearsofexperience                 float64
yearsatcompany                    float64
basesalary                          int64
stockgrantvalue                   float64
bonus                             float64
gender                             object
rowNumber                           int64
Masters_Degree                      int64
Bachelors_Degree                    int64
Doctorate_Degree                    int64
Highschool                          int64
Some_College                        int64
Race_Asian                          int64
Race_White                          int64
Race_Two_Or_More                    int64
Race_Black                          int64
Race_Hispanic                       int64
Race                              

## Cleaning country column 

In [82]:
# Splitting 'location' column into 4 columns (city, state, country, extra)
clean_salary_data_df[['city', 'state', 'country', 'extra']] = clean_salary_data_df.location.str.split(",", expand=True)
clean_salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,...,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state,country,extra
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,107000,20000.0,...,0,0,0,0,Unknown,Unknown,Redwood City,CA,,
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,0,0.0,...,0,0,0,0,Unknown,Unknown,San Francisco,CA,,
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,155000,0.0,...,0,0,0,0,Unknown,Unknown,Seattle,WA,,
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,157000,180000.0,...,0,0,0,0,Unknown,Unknown,Sunnyvale,CA,,
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,0,0.0,...,0,0,0,0,Unknown,Unknown,Mountain View,CA,,


In [83]:
# Unique names of 'country' column

clean_salary_data_df.country.unique()

array([None, ' United Kingdom', ' Ireland', ' India', ' Belarus',
       ' Canada', ' Russia', ' Netherlands', ' Switzerland', ' Singapore',
       ' Germany', ' Japan', ' Sweden', ' Australia', ' United States',
       ' Israel', ' Poland', ' China', ' Austria', ' Luxembourg',
       ' Czech Republic', ' France', ' Pakistan', ' New Zealand',
       ' Denmark', ' Hong Kong (SAR)', ' South Africa', ' Spain',
       ' United Arab Emirates', ' Hungary', ' Brazil', ' Bulgaria',
       ' Philippines', ' Indonesia', ' Puerto Rico', ' Taiwan',
       ' Romania', ' Mexico', ' Costa Rica', ' Marshall Islands',
       ' Vietnam', ' Panama', ' Argentina', ' Norway', ' Moldova',
       ' Estonia', ' Kenya', ' Turkey', ' Italy', ' Lithuania',
       ' Nigeria', ' Korea', ' Ukraine', ' Jordan', ' Thailand',
       ' Colombia', ' Serbia', ' Portugal', ' Guatemala', ' Yugoslavia',
       ' Uruguay', ' Slovakia', ' Bangladesh', ' Finland', ' Chile',
       ' Malaysia', ' Latvia', ' Saudi Arabia', ' Per

In [84]:
# Unique names of 'extra' column

clean_salary_data_df.extra.unique()

array([None, ' South'], dtype=object)

In [85]:
# Unique names of 'country' columns 
# Deleted all names of countries except 'None' and 'United States'

array = [None, ' United States']

clean_salary_data_df = clean_salary_data_df.loc[clean_salary_data_df.country.isin(array)]
clean_salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,...,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state,country,extra
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,107000,20000.0,...,0,0,0,0,Unknown,Unknown,Redwood City,CA,,
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,0,0.0,...,0,0,0,0,Unknown,Unknown,San Francisco,CA,,
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,155000,0.0,...,0,0,0,0,Unknown,Unknown,Seattle,WA,,
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,157000,180000.0,...,0,0,0,0,Unknown,Unknown,Sunnyvale,CA,,
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,0,0.0,...,0,0,0,0,Unknown,Unknown,Mountain View,CA,,


In [86]:
clean_salary_data_df.state.unique()

array([' CA', ' WA', ' NY', ' MD', ' OR', ' DC', ' TX', ' MA', ' LA',
       ' PA', ' SC', ' VA', ' CO', ' NE', ' IN', ' WI', ' MN', ' IL',
       ' NJ', ' AZ', ' OH', ' NC', ' FL', ' GA', ' MO', ' RI', ' UT',
       ' MI', ' CT', ' NM', ' AR', ' VT', ' IA', ' KS', ' NH', ' ID',
       ' TN', ' DE', ' AL', ' NV', ' KY', ' Israel', ' WV', ' OK', ' MS',
       ' ME', ' MT', ' ND', ' HI', ' WY'], dtype=object)

In [87]:
clean_salary_data_df= clean_salary_data_df[clean_salary_data_df.state != ' Israel']
clean_salary_data_df.state.unique()

array([' CA', ' WA', ' NY', ' MD', ' OR', ' DC', ' TX', ' MA', ' LA',
       ' PA', ' SC', ' VA', ' CO', ' NE', ' IN', ' WI', ' MN', ' IL',
       ' NJ', ' AZ', ' OH', ' NC', ' FL', ' GA', ' MO', ' RI', ' UT',
       ' MI', ' CT', ' NM', ' AR', ' VT', ' IA', ' KS', ' NH', ' ID',
       ' TN', ' DE', ' AL', ' NV', ' KY', ' WV', ' OK', ' MS', ' ME',
       ' MT', ' ND', ' HI', ' WY'], dtype=object)

In [88]:
clean_salary_data_df.drop(['location', 'country', 'extra'], axis=1, inplace=True)

In [89]:
clean_salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,...,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state
0,2017-06-07 11:33:00,Oracle,L3,Product Manager,127000,1.5,1.5,107000,20000.0,10000.0,...,0,0,0,0,0,0,Unknown,Unknown,Redwood City,CA
1,2017-06-10 17:11:00,eBay,SE 2,Software Engineer,100000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,San Francisco,CA
2,2017-06-11 14:53:00,Amazon,L7,Product Manager,310000,8.0,0.0,155000,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
3,2017-06-17 00:23:00,Apple,M1,Software Engineering Manager,372000,7.0,5.0,157000,180000.0,35000.0,...,0,0,0,0,0,0,Unknown,Unknown,Sunnyvale,CA
4,2017-06-20 10:58:00,Microsoft,60,Software Engineer,157000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Mountain View,CA


In [90]:
clean_salary_data_df.count()

timestamp                  52746
company                    52746
level                      52746
title                      52746
totalyearlycompensation    52746
yearsofexperience          52746
yearsatcompany             52746
basesalary                 52746
stockgrantvalue            52746
bonus                      52746
gender                     52746
rowNumber                  52746
Masters_Degree             52746
Bachelors_Degree           52746
Doctorate_Degree           52746
Highschool                 52746
Some_College               52746
Race_Asian                 52746
Race_White                 52746
Race_Two_Or_More           52746
Race_Black                 52746
Race_Hispanic              52746
Race                       52746
Education                  52746
city                       52746
state                      52746
dtype: int64

## Cleaning company column


In [91]:
# Show comany column
clean_salary_data_df['company']

0            Oracle
1              eBay
2            Amazon
3             Apple
4         Microsoft
            ...    
62637        Google
62638     Microsoft
62639          MSFT
62640    Salesforce
62641         apple
Name: company, Length: 52746, dtype: object

In [92]:
# Check how many unique values are in the original company column
clean_salary_data_df['company'].nunique()

1463

In [93]:
# Converts all lowercase characters into uppercase characters
clean_salary_data_df['company'] = clean_salary_data_df['company'].str.strip().str.upper()
clean_salary_data_df['company'].head(20)

0         ORACLE
1           EBAY
2         AMAZON
3          APPLE
4      MICROSOFT
5      MICROSOFT
6      MICROSOFT
7      MICROSOFT
8      MICROSOFT
9      MICROSOFT
10    SALESFORCE
11     MICROSOFT
12     MICROSOFT
13     MICROSOFT
14        AMAZON
15        AMAZON
16      FACEBOOK
17          UBER
19     MICROSOFT
20          OATH
Name: company, dtype: object

In [94]:
# Remove any extra spaces
clean_salary_data_df['company']= clean_salary_data_df['company'].replace(r'\s+', ' ', regex=True)
clean_salary_data_df['company'].head(20)

0         ORACLE
1           EBAY
2         AMAZON
3          APPLE
4      MICROSOFT
5      MICROSOFT
6      MICROSOFT
7      MICROSOFT
8      MICROSOFT
9      MICROSOFT
10    SALESFORCE
11     MICROSOFT
12     MICROSOFT
13     MICROSOFT
14        AMAZON
15        AMAZON
16      FACEBOOK
17          UBER
19     MICROSOFT
20          OATH
Name: company, dtype: object

In [95]:
##
clean_salary_data_df['company'] = clean_salary_data_df['company'].str.replace(' LLC', '').str.replace('.ORG', '').str.replace(' LTD', '').str.replace(' CORPORATION', '').str.replace(' INC', '')
clean_salary_data_df['company'] = clean_salary_data_df['company'].str.replace(' MEDIA', '').str.replace(' GROUP', '').str.replace(' TECHNOLOGY', '').str.replace(' TECHNOLOGIES', '').str.strip()
clean_salary_data_df.head(20)

  


Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,...,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state
0,2017-06-07 11:33:00,ORACLE,L3,Product Manager,127000,1.5,1.5,107000,20000.0,10000.0,...,0,0,0,0,0,0,Unknown,Unknown,Redwood City,CA
1,2017-06-10 17:11:00,EBAY,SE 2,Software Engineer,100000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,San Francisco,CA
2,2017-06-11 14:53:00,AMAZON,L7,Product Manager,310000,8.0,0.0,155000,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
3,2017-06-17 00:23:00,APPLE,M1,Software Engineering Manager,372000,7.0,5.0,157000,180000.0,35000.0,...,0,0,0,0,0,0,Unknown,Unknown,Sunnyvale,CA
4,2017-06-20 10:58:00,MICROSOFT,60,Software Engineer,157000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Mountain View,CA
5,2017-06-21 17:27:00,MICROSOFT,63,Software Engineer,208000,8.5,8.5,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
6,2017-06-22 12:37:00,MICROSOFT,65,Software Engineering Manager,300000,15.0,11.0,180000,65000.0,55000.0,...,0,0,0,0,0,0,Unknown,Unknown,Redmond,WA
7,2017-06-22 13:55:00,MICROSOFT,62,Software Engineer,156000,4.0,4.0,135000,8000.0,13000.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
8,2017-06-22 23:08:00,MICROSOFT,59,Software Engineer,120000,3.0,1.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Redmond,WA
9,2017-06-26 21:25:00,MICROSOFT,63,Software Engineer,201000,12.0,6.0,157000,26000.0,28000.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA


In [96]:
# Looking at the raw data file there are examples wher the same company is spelled in different ways. 
# For example, "AMAZON", shows as "AMZON", "AWS", "AMAZON WEB SERVICES".

# Clean company names
def clean_company_names(name):
     try: 
         if name.startswith('AMAZON'): final_name = 'AMAZON'
         elif name.startswith('AMZON'): final_name = 'AMAZON'
         elif name.startswith('AWS'): final_name = 'AMAZON'
         elif name.startswith('AMAZON.COM'): final_name = 'AMAZON'
         elif name.startswith('AKAMI'): final_name = 'AKAMI'
         elif name.startswith('BAIN'): final_name = 'BAIN & COMPANY'
         elif name.startswith('APPLE'): final_name = 'APPLE'
         elif name.startswith('ARISTA'): final_name = 'ARISTA'
         elif name.startswith('ARUBA'): final_name = 'ARUBA'
         elif name.startswith('BLOOMBERG'): final_name = 'BLOOMBERG'
         elif name.startswith('BOOKING'): final_name = 'BOOKING.COM'
         elif name.startswith('BOSCH'): final_name = 'BOSCH'
         elif name.startswith('CACI'): final_name = 'CACI'
         elif name.startswith('CADENCE'): final_name = 'CADENCE'
         elif name.startswith('CGI'): final_name = 'CGI'
         elif name.startswith('CISCO'): final_name = 'CISCO'   
         elif name.startswith('COGNIZANT'): final_name = 'COGNIZANT'
         elif name.startswith('COSTCO'): final_name = 'COSTCO'
         elif name.startswith('COUPA'): final_name = 'COUPA'
         elif name.startswith('DELL'): final_name = 'DELL'
         elif name.startswith('DELOITTE'): final_name = 'DELOITTE'
         elif name.startswith('DISCOVER'): final_name = 'DISCOVER'
         elif name.startswith('DISH'): final_name = 'DISH'
         elif name.startswith('DISNEY'): final_name = 'DISNEY'   
         elif name.startswith('EPAM'): final_name = 'EPAM'
         elif name.startswith('ERNST'): final_name = 'ERNST & YOUNG'
         elif name.startswith('EXPEDIA'): final_name = 'EXPEDIA'
         elif name.startswith('FORD'): final_name = 'FORD'
         elif name.startswith('GE'): final_name = 'GE'
         elif name.startswith('GENERAL ELECTRIC'): final_name = 'EXPEDIA'
         elif name.startswith('GOOGLE'): final_name = 'GOOGLE'
         elif name.startswith('GUIDEWARE'): final_name = 'GUIDEWARE'
         elif name.startswith('HERE'): final_name = 'HERE'
         elif name.startswith('INTUITIVE'): final_name = 'INTUITIVE'
         elif name.startswith('JANE STREET'): final_name = 'JANE STREET'
         elif name.startswith('JOHNSON'): final_name = 'JOHNSON'
         elif name.startswith('JPMORGAN'): final_name = 'JP MORGAN'
         elif name.startswith('JP'): final_name = 'JP MORGAN'
         elif name.startswith('JUNIPER'): final_name = 'JUNIPER'
         elif name.startswith('L3HARRIS'): final_name = 'L3HARRIS'
         elif name.startswith('LIBERTY MUTUAL'): final_name = 'LIBERTY MUTUAL'
         elif name.startswith('MACY'): final_name = "MACY'S"
         elif name.startswith('MCKINSEY'): final_name = 'MCKINSEY & COMPANY'
         elif name.startswith('MICROCHIP'): final_name = 'MICROCHIP'
         elif name.startswith('MICRON'): final_name = 'MICRON'
         elif name.startswith('MICROSOFT'): final_name = 'MICROSOFT'
         elif name.startswith('MSFT'): final_name = 'MICROSOFT'
         elif name.startswith("MOODY'S"): final_name = "MOODY'S"                
         elif name.startswith('MOTOROLA'): final_name = 'MOTOROLA'
         elif name.startswith('NUANCE'): final_name = 'NUANCE'
         elif name.startswith('NXP'): final_name = 'NXP'
         elif name.startswith('PANASONIC'): final_name = 'PANASONIC'
         elif name.startswith('PROCORE'): final_name = 'PROCORE'
         elif name.startswith('QUALCOMM'): final_name = 'QUALCOMM'
         elif name.startswith('RAYTHEON'): final_name = 'RAYTHEON'
         elif name.startswith('SAMSUNG'): final_name = 'SAMSUNG'
         elif name.startswith('SAP'): final_name = 'SAP'
         elif name.startswith('SAS'): final_name = 'SAS'
         elif name.startswith('SONY'): final_name = 'SONY'          
         elif name.startswith('TOYOTA'): final_name = 'TOYOTA'
         elif name.startswith('VERIZON'): final_name = 'VERIZON'
         elif name.startswith('VISA'): final_name = 'VISA'      
         elif name.startswith('WALMART'): final_name = 'WALMART'
         elif name.startswith('WILPRO'): final_name = 'WILPRO'
         elif name.startswith('ZILLOW'): final_name = 'ZILLOW'
         elif name.startswith('ZS ASSOCIATES'): final_name = 'ZS'    
         else: final_name = name
     except:
         final_name = name
     return final_name

clean_salary_data_df['company'] = clean_salary_data_df['company'].apply(lambda name: clean_company_names(name))
clean_salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,...,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state
0,2017-06-07 11:33:00,ORACLE,L3,Product Manager,127000,1.5,1.5,107000,20000.0,10000.0,...,0,0,0,0,0,0,Unknown,Unknown,Redwood City,CA
1,2017-06-10 17:11:00,EBAY,SE 2,Software Engineer,100000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,San Francisco,CA
2,2017-06-11 14:53:00,AMAZON,L7,Product Manager,310000,8.0,0.0,155000,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
3,2017-06-17 00:23:00,APPLE,M1,Software Engineering Manager,372000,7.0,5.0,157000,180000.0,35000.0,...,0,0,0,0,0,0,Unknown,Unknown,Sunnyvale,CA
4,2017-06-20 10:58:00,MICROSOFT,60,Software Engineer,157000,5.0,3.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,Mountain View,CA


In [97]:
# Check how many unique values are in the clean company column
clean_salary_data_df['company'].nunique()

927

## Cleaning Job Title Column

In [98]:
# view all titles
clean_salary_data_df['title']


0                     Product Manager
1                   Software Engineer
2                     Product Manager
3        Software Engineering Manager
4                   Software Engineer
                     ...             
62637               Software Engineer
62638               Software Engineer
62639               Software Engineer
62640               Software Engineer
62641               Software Engineer
Name: title, Length: 52746, dtype: object

In [101]:
# how many of each title
clean_salary_data_df['title'].value_counts()

Software Engineer               34201
Product Manager                  4170
Software Engineering Manager     3041
Data Scientist                   2221
Hardware Engineer                1968
Product Designer                 1344
Technical Program Manager        1231
Solution Architect                893
Management Consultant             814
Business Analyst                  731
Marketing                         625
Mechanical Engineer               455
Recruiter                         403
Sales                             337
Human Resources                   312
Name: title, dtype: int64

In [102]:
# Filter to only jobs related to Data Scientist and Analyst
clean_salary_data_df = clean_salary_data_df[(clean_salary_data_df.title == "Data Scientist") | (clean_salary_data_df.title == 'Business Analyst')]
clean_salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,...,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education,city,state
419,2018-06-05 14:06:00,LINKEDIN,Senior,Data Scientist,233000,4.0,0.0,162000,220000.0,10000.0,...,0,0,0,0,0,0,Unknown,Unknown,San Francisco,CA
440,2018-06-08 09:49:00,MICROSOFT,64,Data Scientist,218000,11.0,11.0,165000,28000.0,23000.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA
444,2018-06-08 17:55:00,EBAY,26,Data Scientist,180000,10.0,5.0,0,0.0,0.0,...,0,0,0,0,0,0,Unknown,Unknown,San Jose,CA
454,2018-06-10 19:39:00,TWITTER,Staff,Data Scientist,500000,4.0,4.0,200000,280000.0,20000.0,...,0,0,0,0,0,0,Unknown,Unknown,San Francisco,CA
495,2018-06-17 11:39:00,FACEBOOK,5,Data Scientist,370000,8.0,3.0,190000,140000.0,40000.0,...,0,0,0,0,0,0,Unknown,Unknown,Seattle,WA


In [104]:
# Check current information
clean_salary_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2952 entries, 419 to 62623
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                2952 non-null   datetime64[ns]
 1   company                  2952 non-null   object        
 2   level                    2952 non-null   object        
 3   title                    2952 non-null   object        
 4   totalyearlycompensation  2952 non-null   int64         
 5   yearsofexperience        2952 non-null   float64       
 6   yearsatcompany           2952 non-null   float64       
 7   basesalary               2952 non-null   int64         
 8   stockgrantvalue          2952 non-null   float64       
 9   bonus                    2952 non-null   float64       
 10  gender                   2952 non-null   object        
 11  rowNumber                2952 non-null   int64         
 12  Masters_Degree           2952 n

## Export dataframe to Excel file

In [105]:
filepath = Path('Resources/clean_salary_data.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
clean_salary_data_df.to_csv(filepath)