In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [35]:
df = pd.DataFrame(pd.read_csv('scraped_and_full_data.csv'))

In [36]:
df.columns

Index(['Unnamed: 0', 'date', 'companyId', 'jobId', 'country', 'stateProvince',
       'city', 'avgOverallRating', 'numReviews', 'industry', 'normTitle',
       'normTitleCategory', 'descriptionCharacterLength',
       'descriptionWordCount', 'experienceRequired', 'estimatedSalary',
       'salaryCurrency', 'jobLanguage', 'supervisingJob', 'licenseRequiredJob',
       'educationRequirements', 'jobAgeDays', 'clicks', 'localClicks', 'State',
       'Number of Residents in Millions', '2017rate', 'Index', 'Grocery',
       'Housing', 'Utilities', 'Transportation', 'Heath', 'Misc.'],
      dtype='object')

Get rid of unneeded columns, and drop NA values.

In [37]:
unneeded = ['Unnamed: 0', 'date', 'companyId', 'country', 'avgOverallRating', 'numReviews', 'descriptionCharacterLength',
       'descriptionWordCount', 'experienceRequired','salaryCurrency', 'jobLanguage', 'supervisingJob', 'licenseRequiredJob',
       'jobAgeDays', 'Index', 'Grocery', "industry", "city", "estimatedSalary"
       'Housing', 'Utilities', 'Transportation', 'Heath', 'Misc.', 'normTitle',
       'normTitleCategory']

In [45]:
df_important = df.drop([col for col in unneeded if col in df], axis = 1)
df_important.head()

Unnamed: 0,jobId,stateProvince,estimatedSalary,educationRequirements,clicks,localClicks,State,Number of Residents in Millions,2017rate,Housing
0,job0000002,AZ,22800,High School,12,2,Arizona,7.02,4.9,89.5
1,job0000416,AZ,27000,,24,2,Arizona,7.02,4.9,89.5
2,job0000420,AZ,43700,,20,2,Arizona,7.02,4.9,89.5
3,job0000420,AZ,43700,,12,1,Arizona,7.02,4.9,89.5
4,job0000420,AZ,43700,,17,1,Arizona,7.02,4.9,89.5


In [46]:
df_important = df_important.dropna(subset=["educationRequirements"])
df_important.head()

Unnamed: 0,jobId,stateProvince,estimatedSalary,educationRequirements,clicks,localClicks,State,Number of Residents in Millions,2017rate,Housing
0,job0000002,AZ,22800,High School,12,2,Arizona,7.02,4.9,89.5
1,job0000416,AZ,27000,,24,2,Arizona,7.02,4.9,89.5
2,job0000420,AZ,43700,,20,2,Arizona,7.02,4.9,89.5
3,job0000420,AZ,43700,,12,1,Arizona,7.02,4.9,89.5
4,job0000420,AZ,43700,,17,1,Arizona,7.02,4.9,89.5


Here we will create a new dataframe with columns based on calculations from the data from the total clicks dataframe. 

In [56]:
df_total_clicks = pd.DataFrame(None)
df_total_clicks["Total Job Clicks"] = df_important.groupby(["jobId"])["clicks"].sum()
df_total_clicks["Total Job Local Clicks"] = df_important.groupby(["jobId"])["localClicks"].sum()

In [57]:
df_total_clicks = df_total_clicks.reset_index()
df_total_clicks["localClicks/totalClicks"] = df_total_clicks["Total Job Local Clicks"]/df_total_clicks["Total Job Clicks"]


Unnamed: 0,jobId,Total Job Clicks,Total Job Local Clicks,localClicks/totalClicks
0,job0000002,12,2,0.166667
1,job0000003,15,3,0.2
2,job0000005,25,8,0.32
3,job0000006,33,1,0.030303
4,job0000008,87,9,0.103448


In [59]:
unemployment = pd.DataFrame(df_important.groupby(["stateProvince"])["2017rate"].mean()).reset_index()
unemployment.head()

Unnamed: 0,stateProvince,2017rate
0,AK,7.2
1,AL,4.4
2,AR,3.7
3,AZ,4.9
4,CA,4.8
5,CO,2.8
6,CT,4.7
7,DE,4.6
8,FL,4.2
9,GA,4.7


We will then create dataframes for the educational requirements of job types in every state, as well as the attention given to different jobs in each state scaled by state unemployment.

In [89]:
job_state = pd.DataFrame(None)
job_state["Ones"] = df_important.groupby(["jobId"])["stateProvince"].value_counts()
job_state = job_state.reset_index()
job_state = job_state.drop(['Ones'], axis = 1)

In [94]:
edu = pd.DataFrame(None)
edu["Ones"] = df_important.groupby(["jobId"])["educationRequirements"].value_counts()
edu = edu.reset_index()
edu = edu.drop(["Ones"], axis = 1)

In [99]:
merge1 = job_state.merge(unemployment)
merge2 = merge1.merge(edu)
df_individual_jobs = merge2.merge(df_total_clicks)

In [107]:
df_edu_clicks_by_state = pd.DataFrame(None)
df_edu_clicks_by_state["Average Local:Total Click Ratio"] = df_individual_jobs.groupby(["stateProvince", "educationRequirements"])["localClicks/totalClicks"].mean()
df_edu_clicks_by_state = df_edu_clicks_by_state.reset_index()
df_edu_clicks_by_state = df_edu_clicks_by_state.merge(unemployment)
df_edu_clicks_by_state["Average Local:Total Click Ratio/State Unemployment"] = df_edu_clicks_by_state["Average Local:Total Click Ratio"]/df_edu_clicks_by_state["2017rate"]
df_edu_clicks_by_state

Unnamed: 0,stateProvince,educationRequirements,Average Local:Total Click Ratio,2017rate,Average Local:Total Click Ratio/State Unemployment
0,AK,High School,0.184546,7.2,0.025631
1,AK,Higher Education,0.176235,7.2,0.024477
2,AK,,0.184500,7.2,0.025625
3,AL,High School,0.170290,4.4,0.038702
4,AL,Higher Education,0.165083,4.4,0.037519
5,AL,,0.170260,4.4,0.038695
6,AR,High School,0.163654,3.7,0.044231
7,AR,Higher Education,0.143603,3.7,0.038812
8,AR,,0.160872,3.7,0.043479
9,AZ,High School,0.193026,4.9,0.039393


In [108]:
df_edu_clicks_by_state.to_csv("edu_click_ratio_by_state.csv")