In [1]:
import pandas as pd
import numpy as np
from pathlib import Path 

In [2]:
# relative path of the raw salary CSV; the file itself is read in the next cell
file_load = "Resources/Salary_Data.csv"

In [3]:
# read the raw CSV into a pandas DataFrame, then preview its first rows
salary_data_df = pd.read_csv(file_load)
salary_data_df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,...,Doctorate_Degree,Highschool,Some_College,Race_Asian,Race_White,Race_Two_Or_More,Race_Black,Race_Hispanic,Race,Education
0,6/7/17 11:33,Oracle,L3,Product Manager,127000,"Redwood City, CA",1.5,1.5,,107000,...,0,0,0,0,0,0,0,0,,
1,6/10/17 17:11,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,
2,6/11/17 14:53,Amazon,L7,Product Manager,310000,"Seattle, WA",8.0,0.0,,155000,...,0,0,0,0,0,0,0,0,,
3,6/17/17 0:23,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",7.0,5.0,,157000,...,0,0,0,0,0,0,0,0,,
4,6/20/17 10:58,Microsoft,60,Software Engineer,157000,"Mountain View, CA",5.0,3.0,,0,...,0,0,0,0,0,0,0,0,,


In [4]:
# list the column labels of the raw DataFrame
salary_data_df.columns


Index(['timestamp', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'yearsofexperience', 'yearsatcompany', 'tag', 'basesalary',
       'stockgrantvalue', 'bonus', 'gender', 'otherdetails', 'cityid', 'dmaid',
       'rowNumber', 'Masters_Degree', 'Bachelors_Degree', 'Doctorate_Degree',
       'Highschool', 'Some_College', 'Race_Asian', 'Race_White',
       'Race_Two_Or_More', 'Race_Black', 'Race_Hispanic', 'Race', 'Education'],
      dtype='object')

In [5]:
# check the dtype pandas inferred for each column
salary_data_df.dtypes

timestamp                   object
company                     object
level                       object
title                       object
totalyearlycompensation      int64
location                    object
yearsofexperience          float64
yearsatcompany             float64
tag                         object
basesalary                   int64
stockgrantvalue            float64
bonus                      float64
gender                      object
otherdetails                object
cityid                       int64
dmaid                      float64
rowNumber                    int64
Masters_Degree               int64
Bachelors_Degree             int64
Doctorate_Degree             int64
Highschool                   int64
Some_College                 int64
Race_Asian                   int64
Race_White                   int64
Race_Two_Or_More             int64
Race_Black                   int64
Race_Hispanic                int64
Race                        object
Education           

In [6]:
# count the non-null values in each column
salary_data_df.count()

timestamp                  62642
company                    62637
level                      62523
title                      62642
totalyearlycompensation    62642
location                   62642
yearsofexperience          62642
yearsatcompany             62642
tag                        61788
basesalary                 62642
stockgrantvalue            62642
bonus                      62642
gender                     43102
otherdetails               40137
cityid                     62642
dmaid                      62640
rowNumber                  62642
Masters_Degree             62642
Bachelors_Degree           62642
Doctorate_Degree           62642
Highschool                 62642
Some_College               62642
Race_Asian                 62642
Race_White                 62642
Race_Two_Or_More           62642
Race_Black                 62642
Race_Hispanic              62642
Race                       22427
Education                  30370
dtype: int64

In [7]:
# count the missing (null) values in each column
salary_data_df.isna().sum()

timestamp                      0
company                        5
level                        119
title                          0
totalyearlycompensation        0
location                       0
yearsofexperience              0
yearsatcompany                 0
tag                          854
basesalary                     0
stockgrantvalue                0
bonus                          0
gender                     19540
otherdetails               22505
cityid                         0
dmaid                          2
rowNumber                      0
Masters_Degree                 0
Bachelors_Degree               0
Doctorate_Degree               0
Highschool                     0
Some_College                   0
Race_Asian                     0
Race_White                     0
Race_Two_Or_More               0
Race_Black                     0
Race_Hispanic                  0
Race                       40215
Education                  32272
dtype: int64

We want to get rid of the columns that have a high number of nulls: 'tag', 'gender', 'otherdetails', 'Race' and 'Education'. The 'cityid' and 'dmaid' columns are dropped as well.
We can then delete the rows that have null values in the remaining columns ('company' and 'level').

In [8]:
# drop the sparsely populated columns ('tag', 'gender', 'otherdetails', 'Race',
# 'Education') together with 'cityid' and 'dmaid'; 'level' is kept
clean_salary_data_df = salary_data_df.drop(
    columns=['tag', 'gender', 'otherdetails', 'Race', 'Education', 'cityid', 'dmaid']
)

In [9]:
# list the columns that remain after the drop
clean_salary_data_df.columns

Index(['timestamp', 'company', 'level', 'title', 'totalyearlycompensation',
       'location', 'yearsofexperience', 'yearsatcompany', 'basesalary',
       'stockgrantvalue', 'bonus', 'rowNumber', 'Masters_Degree',
       'Bachelors_Degree', 'Doctorate_Degree', 'Highschool', 'Some_College',
       'Race_Asian', 'Race_White', 'Race_Two_Or_More', 'Race_Black',
       'Race_Hispanic'],
      dtype='object')

In [10]:
# drop every row that still has a missing value in any of the remaining columns
clean_salary_data_df = clean_salary_data_df.dropna()

In [11]:
# count the missing (null) values left in each column of the cleaned frame
clean_salary_data_df.isna().sum()

timestamp                  0
company                    0
level                      0
title                      0
totalyearlycompensation    0
location                   0
yearsofexperience          0
yearsatcompany             0
basesalary                 0
stockgrantvalue            0
bonus                      0
rowNumber                  0
Masters_Degree             0
Bachelors_Degree           0
Doctorate_Degree           0
Highschool                 0
Some_College               0
Race_Asian                 0
Race_White                 0
Race_Two_Or_More           0
Race_Black                 0
Race_Hispanic              0
dtype: int64

In [12]:
# count the non-null values in each column of the cleaned frame
clean_salary_data_df.count()

timestamp                  62518
company                    62518
level                      62518
title                      62518
totalyearlycompensation    62518
location                   62518
yearsofexperience          62518
yearsatcompany             62518
basesalary                 62518
stockgrantvalue            62518
bonus                      62518
rowNumber                  62518
Masters_Degree             62518
Bachelors_Degree           62518
Doctorate_Degree           62518
Highschool                 62518
Some_College               62518
Race_Asian                 62518
Race_White                 62518
Race_Two_Or_More           62518
Race_Black                 62518
Race_Hispanic              62518
dtype: int64

In [13]:
# parse the 'timestamp' strings into pandas datetimes (no explicit format is
# given, so pandas infers it from the data)
clean_salary_data_df['timestamp'] = pd.to_datetime(clean_salary_data_df['timestamp'])

In [14]:
# check the column dtypes after the timestamp conversion
clean_salary_data_df.dtypes

timestamp                  datetime64[ns]
company                            object
level                              object
title                              object
totalyearlycompensation             int64
location                           object
yearsofexperience                 float64
yearsatcompany                    float64
basesalary                          int64
stockgrantvalue                   float64
bonus                             float64
rowNumber                           int64
Masters_Degree                      int64
Bachelors_Degree                    int64
Doctorate_Degree                    int64
Highschool                          int64
Some_College                        int64
Race_Asian                          int64
Race_White                          int64
Race_Two_Or_More                    int64
Race_Black                          int64
Race_Hispanic                       int64
dtype: object

In [17]:
# write the cleaned data to CSV, creating the target folder if it does not exist
filepath = Path('Resources/clean_salary_data.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not data (and has gaps after dropna); writing it
# would add an unnamed first column that comes back as 'Unnamed: 0' on re-read
clean_salary_data_df.to_csv(filepath, index=False)