### Import Libraries

In [37]:
# import libraries and set max column width as 100 to display 

import plotly.plotly as py
import pandas as pd
from pivottablejs import pivot_ui
import os
import qgrid

pd.set_option("display.max_colwidth", 40)
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

### Get Data

In [38]:
# Importing Data and display fill 

data_h1 = pd.read_csv("H-1B_Disclosure_Data_FY17.csv", low_memory=False, index_col=0)
print("%s MB" %(os.path.getsize('H-1B_Disclosure_Data_FY17.csv') / 1000000))

251.614924 MB


### Data Exploration

In [39]:
# display all column Names

print("Number of features: %d\n" %len(data_h1.columns))
print("##### All features ######")
for col in data_h1.columns:
    print(col)

Number of features: 52

##### All features ######
CASE_NUMBER
CASE_STATUS
CASE_SUBMITTED
DECISION_DATE
VISA_CLASS
EMPLOYMENT_START_DATE
EMPLOYMENT_END_DATE
EMPLOYER_NAME
EMPLOYER_BUSINESS_DBA
EMPLOYER_ADDRESS
EMPLOYER_CITY
EMPLOYER_STATE
EMPLOYER_POSTAL_CODE
EMPLOYER_COUNTRY
EMPLOYER_PROVINCE
EMPLOYER_PHONE
EMPLOYER_PHONE_EXT
AGENT_REPRESENTING_EMPLOYER
AGENT_ATTORNEY_NAME
AGENT_ATTORNEY_CITY
AGENT_ATTORNEY_STATE
JOB_TITLE
SOC_CODE
SOC_NAME
NAICS_CODE
TOTAL_WORKERS
NEW_EMPLOYMENT
CONTINUED_EMPLOYMENT
CHANGE_PREVIOUS_EMPLOYMENT
NEW_CONCURRENT_EMPLOYMENT
CHANGE_EMPLOYER
AMENDED_PETITION
FULL_TIME_POSITION
PREVAILING_WAGE
PW_UNIT_OF_PAY
PW_WAGE_LEVEL
PW_SOURCE
PW_SOURCE_YEAR
PW_SOURCE_OTHER
WAGE_RATE_OF_PAY_FROM
WAGE_RATE_OF_PAY_TO
WAGE_UNIT_OF_PAY
H1B_DEPENDENT
WILLFUL_VIOLATOR
SUPPORT_H1B
LABOR_CON_AGREE
PUBLIC_DISCLOSURE_LOCATION
WORKSITE_CITY
WORKSITE_COUNTY
WORKSITE_STATE
WORKSITE_POSTAL_CODE
ORIGINAL_CERT_DATE


In [40]:
# Proportions of all available Visa types in the data set 

data_h1.VISA_CLASS.value_counts(normalize = True) * 100

H-1B               97.703354
E-3 Australian      1.946210
H-1B1 Singapore     0.200752
H-1B1 Chile         0.149684
Name: VISA_CLASS, dtype: float64

In [41]:
# Getting rid of non h1b visa and checking top 10 states based on h1b visa counts

data_h1 = data_h1[data_h1["VISA_CLASS"] == "H-1B"]
pd.DataFrame(data_h1.EMPLOYER_STATE.value_counts(normalize = True)).head(10) * 100 

Unnamed: 0,EMPLOYER_STATE
CA,16.706757
TX,14.21137
NJ,12.357157
IL,6.687848
NY,6.622633
MI,4.431365
PA,4.140518
MA,3.757583
MD,3.488364
WA,3.278627


In [42]:
# Most popular job in a state

most_popular_job_title = data_h1[["EMPLOYER_STATE","JOB_TITLE"]]
most_popular_job_title = most_popular_job_title.groupby(["EMPLOYER_STATE","JOB_TITLE"]).size().reset_index()
most_popular_job_title.columns = ['EMPLOYER_STATE', 'JOB_TITLE', "COUNT"]
test = most_popular_job_title.groupby(['EMPLOYER_STATE', 'JOB_TITLE']).agg({'COUNT':sum})
test = test['COUNT'].groupby(level=0, group_keys=False)
res = test.apply(lambda x: x.sort_values(ascending=False).head(1))
res = pd.DataFrame(res).reset_index()

tot = pd.DataFrame(data_h1["EMPLOYER_STATE"].value_counts()).reset_index()
tot.columns = ["EMPLOYER_STATE","TOTAL"]
res = pd.merge(res, tot, on = "EMPLOYER_STATE", how = "left")
res["PERCENTAGE"] = res["COUNT"] * 100 /res["TOTAL"]

In [43]:
res.head()

Unnamed: 0,EMPLOYER_STATE,JOB_TITLE,COUNT,TOTAL,PERCENTAGE
0,AK,MEDICAL TECHNOLOGIST,6,75,8.0
1,AL,ASSISTANT PROFESSOR,144,1174,12.265758
2,AR,PROGRAMMER ANALYST,228,2173,10.492407
3,AS,ASSOCIATE VETERINARIAN,1,1,100.0
4,AZ,SOFTWARE ENGINEER,281,4032,6.969246


In [44]:
# Companies in a particular state Sponsoring

def H1Bcompanies_in_state(state_two_letter, top = None):
    companies_in_ut = data_h1[data_h1["EMPLOYER_STATE"] == state_two_letter][["EMPLOYER_STATE","EMPLOYER_NAME"]]["EMPLOYER_NAME"].value_counts()
#     for c, n in zip(companies_in_ut.index, companies_in_ut):
#         print(c, n)
    if top == None:result = companies_in_ut 
    else:result = companies_in_ut.head(top)
    return result

In [45]:
# Top 10 coompanies in Utah Sponsoring H1-B

H1Bcompanies_in_state("UT", top = 10)

CONNVERTEX TECHNOLOGIES INC.     234
UNIVERSITY OF UTAH               203
OVERSTOCK.COM, INC.              100
INNOVECTURE                       69
IM FLASH TECHNOLOGIES, LLC        57
NITYA SOFTWARE SOLUTIONS, INC     44
FATPIPE TECHNOLOGIES INC.         42
ANCESTRY.COM OPERATIONS, INC.     40
ZB, N.A.                          39
VAREX IMAGING CORPORATION         30
Name: EMPLOYER_NAME, dtype: int64

In [46]:
# Top 10 companies in Georgia Sponsoring H1-B

H1Bcompanies_in_state("GA",top = 10)

EMORY UNIVERSITY                     288
CHARTER GLOBAL, INC.                 286
NIIT TECHNOLOGIES LIMITED            267
SOFTVISION, LLC                      238
PROFICIENT BUSINESS SYSTEMS, INC.    207
SOFTPATH SYSTEM LLC                  182
ADROIX CORP                          178
DEVISE SOFT, INC                     158
IDHASOFT, INC.                       158
VALUELABS, INC                       157
Name: EMPLOYER_NAME, dtype: int64

In [47]:
# Levels share

data_h1["PW_WAGE_LEVEL"].value_counts(normalize = True) * 100

Level I      44.538002
Level II     35.991126
Level III    12.417337
Level IV      7.053536
Name: PW_WAGE_LEVEL, dtype: float64

In [48]:
# Top 10 H1B job titles in USA

title = pd.DataFrame(data_h1.JOB_TITLE.value_counts()).reset_index()
title["Percentage"] = pd.DataFrame(data_h1.JOB_TITLE.value_counts
                                   (normalize = True) * 100).reset_index()["JOB_TITLE"]
title.columns = ["title","count","perc"]
title.head(10)

Unnamed: 0,title,count,perc
0,PROGRAMMER ANALYST,47007,7.70229
1,SOFTWARE ENGINEER,27636,4.528272
2,SOFTWARE DEVELOPER,16019,2.624779
3,SYSTEMS ANALYST,10751,1.761596
4,BUSINESS ANALYST,7349,1.204164
5,COMPUTER PROGRAMMER,7277,1.192366
6,SENIOR SOFTWARE ENGINEER,6855,1.12322
7,COMPUTER SYSTEMS ANALYST,6522,1.068657
8,DEVELOPER,5824,0.954286
9,ASSISTANT PROFESSOR,4934,0.808456


In [49]:
# Where does Data Science and related title Stand here?

print("Percentage of Data Science and Related Titles %f"
      %(100 * title[title["title"].str.contains("DATA SCIENC")]["perc"].sum()))

Percentage of Data Science and Related Titles 5.833206


In [131]:
# Checking available "PW_UNIT_OF_PAY"

data_h1["PW_UNIT_OF_PAY"].value_counts(normalize = True)

Year         0.938129
Hour         0.061128
Month        0.000483
Week         0.000185
Bi-Weekly    0.000074
Name: PW_UNIT_OF_PAY, dtype: float64

In [132]:
# Creating "ACTUAL_SALARY" variable to normalize all the salaries  

# unitpay_to_num = {"Year":1, "Hour": 2080, "Month": 12, "Bi-Weekly": 24}
# data_h1["MULTIPLIER"] = data_h1["PW_UNIT_OF_PAY"].map(unitpay_to_num)
# data_h1["ACTUAL_SALARY"] = data_h1["WAGE_RATE_OF_PAY_FROM"] * data_h1["MULTIPLIER"]

# retaining only yearly pays
data_h1 = data_h1[data_h1["PW_UNIT_OF_PAY"] == "Year"]

In [133]:
# What are associate data scientist are being paid in US?

ass_ds = data_h1[data_h1["JOB_TITLE"] == "ASSOCIATE DATA SCIENTIST"][["WAGE_RATE_OF_PAY_FROM","PW_UNIT_OF_PAY","EMPLOYER_NAME","EMPLOYER_CITY","EMPLOYER_STATE"]]
ass_ds = ass_ds.reset_index(drop = True)

In [134]:
# Visualization of Associate Data Scientist 

viz = ass_ds.sort_values(by = "WAGE_RATE_OF_PAY_FROM", ascending= False)

In [135]:
#viz = viz.drop("text", axis = 1)

viz["WAGE_RATE_OF_PAY_FROM"] = viz["WAGE_RATE_OF_PAY_FROM"].astype(float)
final_viz = pd.DataFrame(viz.groupby(['EMPLOYER_STATE'])['WAGE_RATE_OF_PAY_FROM'].mean()).reset_index()

In [136]:
# fill missing job titles with "not available"

data_h1["JOB_TITLE"] = data_h1["JOB_TITLE"].fillna("not available")

### Only Data Science related jobs

In [142]:
def dataviz_job_salary_dist(data, job_title_list, salary_column):
    df_list = [data_h1[data_h1["JOB_TITLE"].str.contains(jt)] for jt in job_title_list]
    result_df = pd.concat(df_list, axis = 1)
    return ( result_df, pd.DataFrame(result_df.groupby(['EMPLOYER_STATE'])[salary_column].mean()).reset_index())

In [156]:
result_df, all_final_viz = dataviz_job_salary_dist(data_h1[data_h1["NEW_EMPLOYMENT"] == 1], ["BUSINESS ANALYST"],"WAGE_RATE_OF_PAY_FROM")

In [158]:
result_df

Unnamed: 0,CASE_NUMBER,CASE_STATUS,CASE_SUBMITTED,DECISION_DATE,VISA_CLASS,EMPLOYMENT_START_DATE,EMPLOYMENT_END_DATE,EMPLOYER_NAME,EMPLOYER_BUSINESS_DBA,EMPLOYER_ADDRESS,EMPLOYER_CITY,EMPLOYER_STATE,EMPLOYER_POSTAL_CODE,EMPLOYER_COUNTRY,EMPLOYER_PROVINCE,EMPLOYER_PHONE,EMPLOYER_PHONE_EXT,AGENT_REPRESENTING_EMPLOYER,AGENT_ATTORNEY_NAME,AGENT_ATTORNEY_CITY,AGENT_ATTORNEY_STATE,JOB_TITLE,SOC_CODE,SOC_NAME,NAICS_CODE,TOTAL_WORKERS,NEW_EMPLOYMENT,CONTINUED_EMPLOYMENT,CHANGE_PREVIOUS_EMPLOYMENT,NEW_CONCURRENT_EMPLOYMENT,CHANGE_EMPLOYER,AMENDED_PETITION,FULL_TIME_POSITION,PREVAILING_WAGE,PW_UNIT_OF_PAY,PW_WAGE_LEVEL,PW_SOURCE,PW_SOURCE_YEAR,PW_SOURCE_OTHER,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_UNIT_OF_PAY,H1B_DEPENDENT,WILLFUL_VIOLATOR,SUPPORT_H1B,LABOR_CON_AGREE,PUBLIC_DISCLOSURE_LOCATION,WORKSITE_CITY,WORKSITE_COUNTY,WORKSITE_STATE,WORKSITE_POSTAL_CODE,ORIGINAL_CERT_DATE,MULTIPLIER,ACTUAL_SALARY
11,I-200-16083-623465,CERTIFIED-WITHDRAWN,2016-03-23,2016-10-02,H-1B,2016-09-22,2019-09-20,ADVANCED TECHNOLOGY GROUP USA INC,,666 PLAINSBORO RD,PLAINSBORO,NJ,08536,UNITED STATES OF AMERICA,,6092695555,,N,",",,,BUSINESS ANALYST,15-1131,COMPUTER PROGRAMMERS,541511,1,1,0,0,0,0,0,Y,63170.0,Year,Level I,Other,2015.0,ONLINE WAGE LIBRARY,63200.00,0.0,Year,Y,N,Y,Y,,PRINCETON,MERCER,NJ,08540,2016-03-29,1.0,63200.00
253,I-200-16271-901057,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-09-27,2019-09-26,GENOME INTERNATIONAL CORPORATION,,8000 EXCELSIOR DRIVE,MADISON,WI,53717,UNITED STATES OF AMERICA,,6088335855,12,N,",",,,BUSINESS ANALYST,15-1121,COMPUTER SYSTEMS ANALYSTS,541511,1,0,0,0,0,0,1,Y,76835.0,Year,Level I,OES,2016.0,OFLC ONLINE DATA CENTER,77000.00,0.0,Year,Y,N,Y,Y,,SOUTH SAN FRANCISCO,SAN MATEO,CA,94080,,1.0,77000.00
283,I-200-16271-441484,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-10-03,2019-10-02,"ONCORRE, INC.",,672 U.S. 202/206 NORTH,BRIDGEWATER,NJ,08807,UNITED STATES OF AMERICA,,9084507575,,Y,"MALIK, JITESH",NANUET,NY,BUSINESS ANALYST,15-1132,"SOFTWARE DEVELOPERS, APPLICATIONS",541511,1,1,0,0,0,0,0,Y,65083.0,Year,Level I,OES,2016.0,OFLC ONLINE DATA CENTER,65083.00,0.0,Year,Y,N,Y,,,BRIDGEWATER,SOMERSET,NJ,08807,,1.0,65083.00
346,I-200-16271-038781,CERTIFIED,2016-09-27,2016-10-03,H-1B,2017-03-27,2020-01-29,ICERTIS INC.,,"14711 NE 29TH PLACE, SUITE 100",BELLEVUE,WA,98007,UNITED STATES OF AMERICA,,4258697649,,Y,"CHEETHAM, JANET",SEATTLE,WA,BUSINESS ANALYST-CONTRACT MANAGEMENT...,15-1121,COMPUTER SYSTEMS ANALYSTS,541512,1,0,1,0,0,0,0,Y,115773.0,Year,Level IV,OES,2016.0,OFLC ONLINE DATA CENTER,120000.00,135000.0,Year,N,N,,,,BELLEVUE,KING,WA,98007,,1.0,120000.00
373,I-200-16271-263893,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-10-01,2019-09-30,"AMERICAN TECHNOLOGY CONSULTING, LLC",,12951 UNIVERSITY AVE.,CLIVE,IA,50325,UNITED STATES OF AMERICA,,5159745619,,Y,"HARDING, RICHARD",CALABASAS HILLS,CA,BUSINESS ANALYST,15-1121,COMPUTER SYSTEMS ANALYSTS,541511,1,0,0,0,0,0,1,Y,55411.0,Year,Level I,Other,2016.0,OFLC ONLINE DATA CENTER,60000.00,70000.0,Year,Y,N,Y,,,RIVERSIDE,RIVERSIDE,CA,92521,,1.0,60000.00
376,I-200-16271-721943,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-10-01,2019-09-30,"AMERICAN TECHNOLOGY CONSULTING, LLC",,12951 UNIVERSITY AVE.,CLIVE,IA,50325,UNITED STATES OF AMERICA,,5159745619,,Y,"HARDING, RICHARD",CALABASAS HILLS,CA,IT BUSINESS ANALYST,15-1121,COMPUTER SYSTEMS ANALYSTS,541511,1,0,0,0,0,0,1,Y,60195.0,Year,Level I,Other,2016.0,OFLC ONLINE DATA CENTER,61000.00,81000.0,Year,Y,N,Y,,,DES MOINES,POLK,IA,50392,,1.0,61000.00
443,I-200-16270-804958,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-10-01,2019-10-01,HITACHI CONSULTING CORPORATION,,14643 DALLAS PARKWAY,DALLAS,TX,75254,UNITED STATES OF AMERICA,,2146656830,,N,",",,,SENIOR BUSINESS ANALYST,15-1121,COMPUTER SYSTEMS ANALYSTS,541512,1,1,0,0,0,0,0,Y,78957.0,Year,Level II,OES,2016.0,OFLC ONLINE DATA CENTER,78957.00,0.0,Year,N,N,,Y,,DALLAS,DALLAS,TX,75254,,1.0,78957.00
475,I-200-16271-695245,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-10-01,2019-10-01,MYTHRI CONSULTING LLC,,1721 W PLANO PKWY,PLANO,TX,75075,UNITED STATES OF AMERICA,,5104745326,,Y,"ALEMU, SAMUEL",SCHAUMBURG,IL,BUSINESS ANALYST,13-1111,MANAGEMENT ANALYSTS,541511,1,0,0,0,0,0,1,Y,55931.0,Year,Level I,OES,2016.0,OFLC ONLINE DATA CENTER,66789.00,0.0,Year,N,N,,Y,,PLANO,COLLIN,TX,75075,,1.0,66789.00
530,I-200-16271-824311,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-09-27,2019-09-26,"TECH MAHINDRA (AMERICAS),INC.",,4965 PRESTON PARK BOULEVARD,PLANO,TX,75093,UNITED STATES OF AMERICA,,9729912900,,N,",",,,BUSINESS ANALYST,13-1111,MANAGEMENT ANALYSTS,541511,1,0,0,0,0,0,1,Y,60757.0,Year,Level II,OES,2016.0,OFLC ONLINE DATA CENTER,60757.00,0.0,Year,Y,N,Y,Y,,LOUISVILLE,JEFFERSON,KY,40225,,1.0,60757.00
582,I-200-16271-573039,CERTIFIED,2016-09-27,2016-10-03,H-1B,2016-09-27,2019-09-26,"KRIDDHA TECHNOLOGIES, INC.",,5776 STONERIDGE MALL ROAD,PLEASANTON,CA,94588,UNITED STATES OF AMERICA,,4087588658,,Y,"SHEPARD, MICHELLE",WOBURN,MA,IT BUSINESS ANALYST,15-1121,COMPUTER SYSTEMS ANALYSTS,541511,1,0,0,0,0,1,0,Y,75608.0,Year,Level II,Other,2016.0,OFLC ONLINE DATA CENTER,75608.00,0.0,Year,Y,N,Y,,,CARY,WAKE,NC,27513,,1.0,75608.00


#### PLOTLY

In [157]:
df = all_final_viz
for col in df.columns:
    df[col] = df[col].astype(str)

scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
            [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

df['text'] = df['EMPLOYER_STATE'] + '<br>' +'Salary '+ df['WAGE_RATE_OF_PAY_FROM']

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = df['EMPLOYER_STATE'],
        z = df['WAGE_RATE_OF_PAY_FROM'].astype(float),
        locationmode = 'USA-states',
        text = df['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "Millions USD")
        ) ]

layout = dict(
        title = 'H1 B Heat Map',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~jonamjar/0 or inside your plot.ly account where it is named 'd3-cloropleth-map'


In [61]:
test = data_h1[data_h1["EMPLOYER_NAME"] == "OVERSTOCK.COM, INC."]
test[test["JOB_TITLE"] =="ASSOCIATE DATA SCIENTIST"].T

Unnamed: 0,210109
CASE_NUMBER,I-200-17052-877782
CASE_STATUS,CERTIFIED
CASE_SUBMITTED,2017-02-27
DECISION_DATE,2017-03-03
VISA_CLASS,H-1B
EMPLOYMENT_START_DATE,2017-08-21
EMPLOYMENT_END_DATE,2020-08-20
EMPLOYER_NAME,"OVERSTOCK.COM, INC."
EMPLOYER_BUSINESS_DBA,
EMPLOYER_ADDRESS,799 WEST COLISEUM WAY


In [62]:
data_h1[data_h1["CHANGE_PREVIOUS_EMPLOYMENT"] != 0]["CASE_STATUS"].value_counts(normalize = True) * 100

CERTIFIED              91.299209
CERTIFIED-WITHDRAWN     4.889081
WITHDRAWN               2.782071
DENIED                  1.029639
Name: CASE_STATUS, dtype: float64

In [63]:
data_h1.columns

Index(['CASE_NUMBER', 'CASE_STATUS', 'CASE_SUBMITTED', 'DECISION_DATE',
       'VISA_CLASS', 'EMPLOYMENT_START_DATE', 'EMPLOYMENT_END_DATE',
       'EMPLOYER_NAME', 'EMPLOYER_BUSINESS_DBA', 'EMPLOYER_ADDRESS',
       'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE',
       'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'AGENT_REPRESENTING_EMPLOYER',
       'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE',
       'JOB_TITLE', 'SOC_CODE', 'SOC_NAME', 'NAICS_CODE', 'TOTAL_WORKERS',
       'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT', 'CHANGE_PREVIOUS_EMPLOYMENT',
       'NEW_CONCURRENT_EMPLOYMENT', 'CHANGE_EMPLOYER', 'AMENDED_PETITION',
       'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY',
       'PW_WAGE_LEVEL', 'PW_SOURCE', 'PW_SOURCE_YEAR', 'PW_SOURCE_OTHER',
       'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_UNIT_OF_PAY',
       'H1B_DEPENDENT', 'WILLFUL_VIOLATOR', 'SUPPORT_H1B', 'LABOR_CON_AGREE',

In [64]:
data_h1[data_h1["EMPLOYER_NAME"] == "OVERSTOCK.COM, INC."]["CASE_STATUS"].value_counts()

CERTIFIED              70
CERTIFIED-WITHDRAWN    18
WITHDRAWN              12
Name: CASE_STATUS, dtype: int64

In [65]:
data_h1.columns

Index(['CASE_NUMBER', 'CASE_STATUS', 'CASE_SUBMITTED', 'DECISION_DATE',
       'VISA_CLASS', 'EMPLOYMENT_START_DATE', 'EMPLOYMENT_END_DATE',
       'EMPLOYER_NAME', 'EMPLOYER_BUSINESS_DBA', 'EMPLOYER_ADDRESS',
       'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE',
       'EMPLOYER_COUNTRY', 'EMPLOYER_PROVINCE', 'EMPLOYER_PHONE',
       'EMPLOYER_PHONE_EXT', 'AGENT_REPRESENTING_EMPLOYER',
       'AGENT_ATTORNEY_NAME', 'AGENT_ATTORNEY_CITY', 'AGENT_ATTORNEY_STATE',
       'JOB_TITLE', 'SOC_CODE', 'SOC_NAME', 'NAICS_CODE', 'TOTAL_WORKERS',
       'NEW_EMPLOYMENT', 'CONTINUED_EMPLOYMENT', 'CHANGE_PREVIOUS_EMPLOYMENT',
       'NEW_CONCURRENT_EMPLOYMENT', 'CHANGE_EMPLOYER', 'AMENDED_PETITION',
       'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'PW_UNIT_OF_PAY',
       'PW_WAGE_LEVEL', 'PW_SOURCE', 'PW_SOURCE_YEAR', 'PW_SOURCE_OTHER',
       'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_UNIT_OF_PAY',
       'H1B_DEPENDENT', 'WILLFUL_VIOLATOR', 'SUPPORT_H1B', 'LABOR_CON_AGREE',

In [78]:
data_h1[(data_h1["EMPLOYER_NAME"] == "OVERSTOCK.COM, INC.")]["ACTUAL_SALARY"].mean()

101204.17

In [76]:
data_h1[(data_h1["EMPLOYER_NAME"] == "ANCESTRY.COM OPERATIONS, INC.")| 
            (data_h1["EMPLOYER_NAME"] == "ANCESTRY.COM OPERATIONS, INC.")]["ACTUAL_SALARY"].mean()

122370.0

In [87]:
data_h1[(data_h1["EMPLOYER_NAME"] == "ANCESTRY.COM OPERATIONS, INC.")| 
            (data_h1["EMPLOYER_NAME"] == "ANCESTRY.COM OPERATIONS, INC.")][["JOB_TITLE","ACTUAL_SALARY"]]

Unnamed: 0,JOB_TITLE,ACTUAL_SALARY
13573,SOFTWARE ENGINEER,100000.0
47168,SENIOR ANALYTICS ENGINEER,160000.0
76023,SOFTWARE DEVELOPER,95000.0
92464,SENIOR DATA SCIENTIST,160000.0
143514,SENIOR DATA ANALYST,155000.0
224205,SEGMENTATION ANALYST,81350.0
224214,SYSTEMS ANALYST,56091.0
224216,SOFTWARE DEVELOPER,96518.0
224219,DATA SCIENTIST,96900.0
227173,SOFTWARE ENGINEER,108211.0


In [67]:
data_h1[data_h1["EMPLOYER_NAME"].str.contains("ANCESTRY")][["JOB_TITLE","WAGE_RATE_OF_PAY_FROM","EMPLOYMENT_START_DATE"]]

ValueError: cannot index with vector containing NA / NaN values

In [None]:
# # from pivottablejs import pivot_ui
# # pivot_ui(data_h1.head(10))

# qgrid_widget = qgrid.show_grid(data_h1.head(100), show_toolbar=True)
# qgrid_widget