# Background

In [3]:
import platform
display(platform.system())
import os
file_download_link = 'https://data.ca.gov/dataset/3f08b68e-1d1a-4ba4-a07d-1ec3392ed191/resource/78a9d6ee-ec9a-4c25-ae34-5bac44010cb2/download/qcew_2016-2019.csv'
if os.name == 'nt':
    print('Please download your dataset here:', file_download_link)
else:
    # If on another OS, use wget to download the CSV file directly
    !wget -O qcew_2016-2019.csv "$file_download_link" -o /dev/null
    print("File downloaded successfully as qcew_2016-2019.csv")

'Darwin'

File downloaded successfully as qcew_2016-2019.csv


In [None]:
#Only run if using Google Colab, do not run in VSCode
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [5]:
import sys
!{sys.executable} -m pip install imbalanced-learn delayed



In [4]:
import pandas as pd
# If put the data(.csv) under the same folder, you could use
df = pd.read_csv('./qcew_2016-2019.csv')
print(df.shape)
df.head()

(1009955, 15)


Unnamed: 0,Area Type,Area Name,Year,Quarter,Ownership,NAICS Level,NAICS Code,Industry Name,Establishments,Average Monthly Employment,1st Month Emp,2nd Month Emp,3rd Month Emp,Total Wages (All Workers),Average Weekly Wages
0,County,Alameda County,2016,1st Qtr,Federal Government,2,1023,Financial Activities,1,10,10,10,10,359690.0,2767.0
1,County,Alameda County,2016,1st Qtr,Federal Government,2,1028,Public Administration,72,5174,5193,5143,5188,136068970.0,2023.0
2,County,Alameda County,2016,1st Qtr,Private,5,11121,Vegetable and Melon Farming,7,29,31,27,29,210934.0,560.0
3,County,Alameda County,2016,1st Qtr,Private,4,1114,Greenhouse and Nursery Production,7,52,61,64,31,748616.0,1107.0
4,County,Alameda County,2016,1st Qtr,Private,5,11142,Nursery and Floriculture Production,7,52,61,64,31,748616.0,1107.0


In [6]:
#Lots of data points, maybe we can drop the older years
print("Number of datapoints from 2018-2019: ", df.loc[df['Year'] == 2019].shape[0] + df.loc[df['Year'] == 2018].shape[0])

Number of datapoints from 2018-2019:  506913


In [7]:
#500K datapoints should be sufficient, and it should also be more accurate since its newer data
df = df[df['Year'] != 2016]
df = df[df['Year'] != 2017]
print(df.shape)

(506913, 15)


In [8]:
df.columns

Index(['Area Type', 'Area Name', 'Year', 'Quarter', 'Ownership', 'NAICS Level',
       'NAICS Code', 'Industry Name', 'Establishments',
       'Average Monthly Employment', '1st Month Emp', '2nd Month Emp',
       '3rd Month Emp', 'Total Wages (All Workers)', 'Average Weekly Wages'],
      dtype='object')

In [9]:
#Since the NAICS code just represents the industry, and we already have indsutry name, lets drop NAICS codes and NAICS levels
df.drop(['NAICS Level','NAICS Code'],axis=1,inplace=True)
df.head()

Unnamed: 0,Area Type,Area Name,Year,Quarter,Ownership,Industry Name,Establishments,Average Monthly Employment,1st Month Emp,2nd Month Emp,3rd Month Emp,Total Wages (All Workers),Average Weekly Wages
147,County,Marin County,2019,Annual,Private,Used Car Dealers,6,46,0,0,0,3170232.0,1340.0
148,County,Marin County,2019,Annual,Private,Department Stores,10,819,0,0,0,28355846.0,666.0
149,County,Marin County,2019,Annual,Private,Travel Agencies,19,189,0,0,0,10466322.0,1065.0
150,County,Modoc County,2019,Annual,Private,Logging,4,19,0,0,0,1008116.0,1048.0
151,County,Modoc County,2019,Annual,Private,Logging,4,19,0,0,0,1008116.0,1048.0


In [10]:
#We don't need to know the total wages for all of the workers, who owns the business, or the individual months of employement
df.drop(['Ownership','1st Month Emp','2nd Month Emp','3rd Month Emp','Total Wages (All Workers)'],axis=1,inplace=True)
df.head()

Unnamed: 0,Area Type,Area Name,Year,Quarter,Industry Name,Establishments,Average Monthly Employment,Average Weekly Wages
147,County,Marin County,2019,Annual,Used Car Dealers,6,46,1340.0
148,County,Marin County,2019,Annual,Department Stores,10,819,666.0
149,County,Marin County,2019,Annual,Travel Agencies,19,189,1065.0
150,County,Modoc County,2019,Annual,Logging,4,19,1048.0
151,County,Modoc County,2019,Annual,Logging,4,19,1048.0


In [11]:
#Checking what Area Types we have
df['Area Type'].unique()

array(['County', 'California - Statewide', 'United States'], dtype=object)

In [12]:
#We want to focus on the county level, lets ensure there are enough datapoints to drop the rest
print(df.loc[df['Area Type'] == 'County'].shape)
df = df[df['Area Type'] == 'County']

(441540, 8)


In [13]:
#Since there may be overlap between Quarters or the Annual datapoints, lets only use Annual
print(df.loc[df['Quarter'] == 'Annual'].shape)
df = df[df['Quarter'] == 'Annual']

(89255, 8)


In [14]:
#Now we can drop 'Area Type' and 'Quarter'
df.drop(['Area Type','Quarter'],axis=1,inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Area Name,Year,Industry Name,Establishments,Average Monthly Employment,Average Weekly Wages
0,Marin County,2019,Used Car Dealers,6,46,1340.0
1,Marin County,2019,Department Stores,10,819,666.0
2,Marin County,2019,Travel Agencies,19,189,1065.0
3,Modoc County,2019,Logging,4,19,1048.0
4,Modoc County,2019,Logging,4,19,1048.0


In [15]:
df.shape

(89255, 6)

In [16]:
#Check for missing values
df.isnull().sum()

Area Name                     0
Year                          0
Industry Name                 0
Establishments                0
Average Monthly Employment    0
Average Weekly Wages          0
dtype: int64

In [17]:
#Drop outliers using IQR
def dropOutliers(df, colName):
    [Q1, Q2, Q3] = df[colName].quantile([0.25, 0.5, 0.75])
    IQR = Q3 - Q1
    return df[(df[colName] >= (Q1 - (1.5 * IQR))) & (df[colName] <= (Q3 + (1.5 * IQR)))]
df = dropOutliers(df, 'Establishments')
df = dropOutliers(df, 'Average Monthly Employment')
df = dropOutliers(df, 'Average Weekly Wages')
print(df.shape)

(65689, 6)
