In [1]:
# Import libraries
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 400)
import requests
import matplotlib.pyplot as plt
import json
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [2]:
# Load the cleaned layoffs dataset directly from the project's raw GitHub URL.
# pandas fetches the file over HTTP, so this cell needs network access.
url = 'https://raw.githubusercontent.com/dianeooty/datascience_salary/main/Resources/layoffs_cleaned.csv'
layoffs_df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Load the cleaned salaries dataset directly from the project's raw GitHub URL.
# Note: `url` is rebound here, so from this point on it refers to the salaries file.
url = 'https://raw.githubusercontent.com/dianeooty/datascience_salary/main/Resources/salaries_cleaned.csv'
salaries_df = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Summarize the salaries frame: row count, column names, non-null counts and dtypes.
# Useful here to spot columns with missing values (e.g. `company` and `level`
# report fewer non-null entries than the total row count).
salaries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62642 entries, 0 to 62641
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               62642 non-null  int64  
 1   timestamp                62642 non-null  object 
 2   date                     62642 non-null  object 
 3   company                  62637 non-null  object 
 4   level                    62523 non-null  object 
 5   title                    62642 non-null  object 
 6   totalyearlycompensation  62642 non-null  int64  
 7   location                 62642 non-null  object 
 8   latitude                 62642 non-null  float64
 9   longitude                62642 non-null  float64
 10  yearsofexperience        62642 non-null  float64
 11  yearsatcompany           62642 non-null  float64
 12  basesalary               62642 non-null  int64  
 13  stockgrantvalue          62642 non-null  int64  
 14  bonus                 

In [5]:
# Remove the leftover CSV index column ('Unnamed: 0'), then preview the first rows.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
salaries_df = salaries_df.drop(columns='Unnamed: 0', errors='ignore')
salaries_df.head()

Unnamed: 0,timestamp,date,company,level,title,totalyearlycompensation,location,latitude,longitude,yearsofexperience,yearsatcompany,basesalary,stockgrantvalue,bonus,gender,Race,Education
0,2017-06-07 11:33:00,2017-06-07 00:00:00,Oracle,L3,Product Manager,127000,"Redwood City, CA",37.486324,-122.232523,1.5,1.5,107000,20000,10000,Unknown,Unknown,Unknown
1,2017-06-10 17:11:00,2017-06-10 00:00:00,eBay,SE 2,Software Engineer,100000,"San Francisco, CA",37.779026,-122.419906,5.0,3.0,0,0,0,Unknown,Unknown,Unknown
2,2017-06-11 14:53:00,2017-06-11 00:00:00,Amazon,L7,Product Manager,310000,"Seattle, WA",47.603832,-122.330062,8.0,0.0,155000,0,0,Unknown,Unknown,Unknown
3,2017-06-17 00:23:00,2017-06-17 00:00:00,Apple,M1,Software Engineering Manager,372000,"Sunnyvale, CA",37.36883,-122.036349,7.0,5.0,157000,180000,35000,Unknown,Unknown,Unknown
4,2017-06-20 10:58:00,2017-06-20 00:00:00,Microsoft,60,Software Engineer,157000,"Mountain View, CA",37.389389,-122.08321,5.0,3.0,0,0,0,Unknown,Unknown,Unknown


In [6]:
# Summarize the layoffs frame: row count, column names, non-null counts and dtypes.
# `percentage_laid_off` in particular has far fewer non-null values than the
# total row count, so it cannot be assumed complete downstream.
layoffs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2545 entries, 0 to 2544
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           2545 non-null   int64  
 1   company              2545 non-null   object 
 2   location             2545 non-null   object 
 3   industry             2543 non-null   object 
 4   total_laid_off       2545 non-null   int64  
 5   percentage_laid_off  1694 non-null   float64
 6   date                 2543 non-null   object 
 7   stage                2539 non-null   object 
 8   country              2545 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 179.1+ KB


In [7]:
# Remove the leftover CSV index column ('Unnamed: 0'), then preview the first rows.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
layoffs_df = layoffs_df.drop(columns='Unnamed: 0', errors='ignore')
layoffs_df.head()

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country
0,N26,Berlin,Finance,71,4.0,2023-04-28,Series E,United States
1,Providoor,Melbourne,Food,0,100.0,2023-04-28,Unknown,Australia
2,Dropbox,SF Bay Area,Other,500,16.0,2023-04-27,Post-IPO,United States
3,Vroom,New York City,Transportation,120,11.0,2023-04-27,Post-IPO,United States
4,Greenhouse,New York City,Recruiting,100,12.0,2023-04-27,Private Equity,United States


In [8]:
# Take an independent copy of layoffs_df to work on.
# Plain assignment would only bind a second name to the same DataFrame, so any
# later in-place change through either name would silently affect both.
copy_layoff_df = layoffs_df.copy()

In [9]:
# Keep only the layoff records whose company name also occurs in salaries_df.
# The match is an exact string comparison on the `company` column, so names that
# differ in case or spacing between the two datasets will not be matched.
stem_df = copy_layoff_df.loc[
    copy_layoff_df['company'].isin(salaries_df['company'])
]

In [10]:
# Show (rows, columns) of the filtered frame to see how many layoff records matched.
stem_df.shape

(517, 8)

In [11]:
# Preview the first rows of the filtered frame; the original row labels from
# the layoffs frame are retained (the index is not reset by the filter).
stem_df.head()

Unnamed: 0,company,location,industry,total_laid_off,percentage_laid_off,date,stage,country
0,N26,Berlin,Finance,71,4.0,2023-04-28,Series E,United States
2,Dropbox,SF Bay Area,Other,500,16.0,2023-04-27,Post-IPO,United States
11,Alteryx,Los Angeles,Data,0,11.0,2023-04-27,Post-IPO,United States
19,Red Hat,Raleigh,Other,760,4.0,2023-04-24,Acquired,United States
21,Lyft,SF Bay Area,Transportation,1072,26.0,2023-04-21,Post-IPO,United States


In [12]:
# List the company name of every matched layoff record, in row order.
# There is one entry per record, so a company with several layoff events
# appears more than once.
stem_df['company'].tolist()

['N26',
 'Dropbox',
 'Alteryx',
 'Red Hat',
 'Lyft',
 'Benchling',
 'Pluralsight',
 'BuzzFeed',
 'Lenovo',
 'Opendoor',
 'Viasat',
 'Medtronic',
 'Redfin',
 'Amplitude',
 'Apple',
 'Hulu',
 'Roku',
 'LendingTree',
 'Spotify',
 'CoverMyMeds',
 'Electronic Arts',
 'Seagate',
 'Lucid Motors',
 'Rackspace',
 'Cimpress',
 'Indeed',
 'Logitech',
 'Glassdoor',
 'Just Eat',
 'Marvell',
 'Workhuman',
 'Xing',
 'Expedia',
 'Amazon',
 'Course Hero',
 'Freshworks',
 'Klaviyo',
 'Samsung',
 'Xero',
 'Shopee',
 'Gopuff',
 'Wattpad',
 'Zulily',
 'Atlassian',
 'SiriusXM',
 'UKG',
 'Airbnb',
 'Indigo',
 'Zscaler',
 'Flipkart',
 'Truckstop.com',
 'Thoughtworks',
 'Waymo',
 'Sonder',
 'Electronic Arts',
 'Eventbrite',
 'Palantir',
 'Twitter',
 'Ericsson',
 'Medallia',
 'Poshmark',
 'Bolt',
 'Criteo',
 'Zalando',
 'HP',
 'Micron',
 'Tencent',
 'Evernote',
 'DocuSign',
 'Smartsheet',
 'Convoy',
 'Wix',
 'ServiceTitan',
 'DigitalOcean',
 'Sprinklr',
 'Betterment',
 'Udemy',
 'Twilio',
 'iRobot',
 'Collectiv