# Scraping the Data from Indeed

In [1]:
# Load required scripts
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd

In [63]:
# List various salary bandings for data scientist jobs in California
base_url_ca_60 = 'https://www.indeed.com/jobs?q=Data+Scientist+$60,000&l=California&radius=50&jt=fulltime&sort='
base_url_ca_80 = 'https://www.indeed.com/jobs?q=Data+Scientist+$80,000&l=California&radius=50&jt=fulltime&sort='
base_url_ca_95 = 'https://www.indeed.com/jobs?q=Data+Scientist+$95,000&l=California&radius=50&jt=fulltime&sort='
base_url_ca_110 = 'https://www.indeed.com/jobs?q=Data+Scientist+$110,000&l=California&radius=50&jt=fulltime&sort='

# Sort data by date and by start page number (to append later)
sort_by = 'date'          
start_from = '&start='    

# Remove the column limit for pandas
pd.set_option('max_colwidth',500)   

# Pre-establish the database
df = pd.DataFrame()

In [64]:
def scrape(df, base_url, salary_estimate) :
    """ Takes in a dataframe and then scrapes Indeed.com for jobs using the base URL
    provided by the user. For user's own reference, include the salary estimate parameter
    used for the job search. """

    # Scrape page 1 to 100 (last accessible page is 100)
    for page in range(1,101):
        
        # Multiple by 10 as the numbers follow number of jobs listed per page
        page = (page-1) * 10  
        
        # Create full URL
        url = "%s%s%s%d" % (base_url, sort_by, start_from, page)
        
        # Scrape
        target = Soup(urllib.urlopen(url), "lxml") 

        # Get a job from each row
        targetElements = target.findAll('div', attrs={'class':" row result"})
    
        # Try to get each specific job information
        for elem in targetElements: 
            try:
                comp_name = elem.find('span', attrs={'class':'company'}).getText().strip()
            except: 
                comp_name = None
                
            try:
                job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
            except:
                job_title = None
            
            try:
                listed_job_salary = elem.find('span', attrs={'class': "no-wrap"}).getText()
            except:
                listed_job_salary = None
            
            try:
                job_addr = elem.find('span', attrs={'class':'location'}).getText()
            except:
                job_addr = None
            
            try:
                job_summary = elem.find('span', attrs={'class': 'summary'}).getText()
            except:
                job_summary = None


            # Add job info to the data frame
            df = df.append({'comp_name': comp_name, 'job_title': job_title, 
                            'salary_estimated': salary_estimate,'job_summary' : job_summary,
                            'job_location': job_addr, 'listed_job_salary' : listed_job_salary
                           }, ignore_index=True)
    return df

In [None]:
df = scrape(df, base_url_ca_60, 60000)

In [None]:
df = scrape(df, base_url_ca_80, 80000)

In [None]:
df = scrape(df, base_url_ca_95, 95000)

In [None]:
df = scrape(df, base_url_ca_110, 110000)

In [None]:
df = df.replace('\n','', regex=True)

In [None]:
# Save the result to CSV
df.to_csv('../indeed-results.csv', encoding='utf-8')

## Read in CSV so that we don't have to scrape again

In [2]:
df_read = pd.read_csv('../indeed-results.csv')

In [3]:
df_read["listed_job_salary"].value_counts()

$150,000 a year                                 17
$140,000 - $165,000 a year                      13
$120,000 - $150,000 a year                      12
$150,000 - $180,000 a year                      11
$180,000 a year                                 10
$100,000 - $180,000 a year                       9
$130,000 - $150,000 a year                       9
$150,000 - $200,000 a year                       8
$160,000 - $170,000 a year                       7
$140,000 - $160,000 a year                       7
$140,000 - $200,000 a year                       6
$100,000 - $160,000 a year                       6
$140,000 - $180,000 a year                       6
$125,000 - $155,000 a year                       5
$180,000 - $250,000 a year                       5
$180,000 - $200,000 a year                       5
$120,000 - $140,000 a year                       5
$130,000 - $195,000 a year                       5
$180,000 - $210,000 a year                       5
$160,000 - $180,000 a year     

Do some light cleaning here. Since we don't have a lot of jobs that have the listed salary (as expected), we will remove that column. We also suspect duplicates in jobs, so taking the job_summary column, we drop any duplicates found as it is unlikely that two different jobs will have identical, word-for-word descriptions.

In [4]:
df_read.drop(["Unnamed: 0", "listed_job_salary"], axis=1, inplace=True)

In [5]:
df_read.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated
0,Walmart eCommerce,"San Bruno, CA 94066","Data scientists, front and back-end engineers,...","Director, Retail Learning & Development",60000.0
1,Facebook,"Menlo Park, CA","Work with engineering, data science, and desig...","Product Manager, Advanced Network Planning",60000.0
2,Kaiser Permanente,"Oakland, CA","He or she will manage a team of analysts, data...",Senior Manager Decision Support,60000.0
3,PaxVax,"San Diego, CA",Good data anaylsis skills. Is seeking a Scient...,"Scientist, Process Development - Upstream",60000.0
4,The Aerospace Corporation,"El Segundo, CA 90245",Our state-of-the-art laboratory facilities are...,Associate Software Engineer,60000.0


In [6]:
df_read.drop_duplicates(subset='job_summary', inplace=True)

In [7]:
df_read.shape

(1527, 5)

In [8]:
df_read.salary_estimated.value_counts()

60000.0     725
110000.0    294
80000.0     286
95000.0     222
Name: salary_estimated, dtype: int64

# Question 1

To predict salary you will be building either a classification or regression model, using features like the location, title, and summary of the job. If framing this as a regression problem, you will be estimating the listed salary amounts. You may instead choose to frame this as a classification problem, in which case you will create labels from these salaries (high vs. low salary, for example) according to thresholds (such as median salary).

You have learned a variety of new skills and models that may be useful for this problem:
- NLP
- Unsupervised learning and dimensionality reduction techniques (PCA, clustering)
- Ensemble methods and decision tree models
- SVM models

Whatever you decide to use, the most important thing is to justify your choices and interpret your results. *Communication of your process is key.* Note that most listings **DO NOT** come with salary information. You'll need to able to extrapolate or predict the expected salaries for these listings.

## City Count Vectorizer

In [9]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [10]:
# 60 and 80k will be considered low salary
# 95 and 110k will be considered high salary

df_read['high_salary'] = [1 if a > 80000 else 0 for a in df_read.salary_estimated]

In [32]:
df_read.high_salary.value_counts()

0    1011
1     516
Name: high_salary, dtype: int64

In [33]:
city_dummies = pd.get_dummies(df_read.job_location)

X_city = city_dummies
y_city = df_read.high_salary

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_city, y_city, test_size=0.3)

In [35]:
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
print "Accuracy Score:", acc.round(3)

s = cross_val_score(rfc, X_city, y_city, cv=10, n_jobs=-1)
print "Cross Validation Score:\t{:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3))

Accuracy Score: 0.647
Cross Validation Score:	0.635 ± 0.024


In [36]:
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_city.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

feature_medians = []
for i in X_city.columns:
    feature_medians.append(np.median(df_read[df_read.job_location == i].salary_estimated))

feature_importances['median_salary'] = feature_medians
feature_importances['high_or_low'] = [1 if i > 80000 else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)

Unnamed: 0,feature,importance,median_salary,high_or_low
95,"Mountain View, CA",0.053793,95000.0,1
161,"San Francisco, CA",0.043845,80000.0,0
175,"San Jose, CA",0.025797,80000.0,0
163,"San Francisco, CA 94103 (South Of Market area)",0.024933,95000.0,1
193,"Santa Clara, CA",0.022534,60000.0,0
197,"Santa Clara, CA 95054",0.021547,95000.0,1
34,"Cupertino, CA 95014",0.019346,110000.0,1
209,"South San Francisco, CA 94080",0.015348,60000.0,0
214,"Sunnyvale, CA",0.014948,80000.0,0
33,"Cupertino, CA",0.014594,110000.0,1


In [37]:
feature_importances.shape

(231, 4)

## Summary Count Vectorizer

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [39]:
salaries_w_desc = df_read.copy(deep=False)

X_summ = salaries_w_desc['job_summary']
y_summ = salaries_w_desc['high_salary']

In [40]:
cv = CountVectorizer(stop_words="english")
cv.fit(X_summ)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [41]:
X_summ_trans = pd.DataFrame(cv.transform(X_summ).todense(), columns=cv.get_feature_names())

In [42]:
X_train, X_test, y_train, y_test = train_test_split(np.asmatrix(X_summ_trans), y_summ, test_size=0.3,
                                                    stratify=y_summ)

In [43]:
word_counts = X_summ_trans.sum(axis=0)
word_counts.sort_values(ascending = False).head(25)

data           2540
scientists      539
scientist       431
experience      328
team            258
analysis        227
engineers       205
learning        190
science         178
work            175
machine         173
product         147
research        131
analytics       130
read            122
looking         116
working         108
large           104
big             104
design          104
senior          103
development      93
sets             86
management       86
algorithms       82
dtype: int64

In [44]:
rfc = RandomForestClassifier(300)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
print "Accuracy Score:", acc.round(3)

s = cross_val_score(rfc, X_summ_trans.as_matrix(), y_summ.as_matrix(), cv=10, n_jobs=-1)
print "Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3))

Accuracy Score: 0.662
Cross Validation Score: 0.656 ± 0.035


In [45]:
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_summ_trans.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

feature_medians = []
feature_means = []
for i in X_summ_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[salaries_w_desc['job_summary'].str.lower().str.contains(i)].salary_estimated))
    feature_means.append(np.mean(salaries_w_desc[salaries_w_desc['job_summary'].str.lower().str.contains(i)].salary_estimated))


feature_importances['median_salary'] = feature_medians
feature_importances['mean_salary'] = feature_means
feature_importances['over_or_under'] = [1 if i > 80000 else 0 for i in feature_importances['median_salary']]

feature_importances.sort_values('importance', ascending=False).head(20)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,feature,importance,median_salary,mean_salary,over_or_under
2436,read,0.021229,80000.0,85664.0625,0
779,data,0.012882,80000.0,78854.166667,0
172,analysts,0.007846,95000.0,89000.0,1
1122,experience,0.006452,80000.0,80729.483283,0
2630,science,0.006213,80000.0,81445.783133,0
1765,machine,0.00589,80000.0,83151.515152,0
2639,scientists,0.005753,80000.0,79591.633466,0
2635,scientist,0.005721,80000.0,78763.570567,0
412,building,0.005375,95000.0,91818.181818,1
2959,team,0.005331,80000.0,80300.0,0


In [46]:
feature_importances.shape

(3231, 5)

## Title Count Vectorizer

In [47]:
salaries_w_desc = df_read.copy(deep=False)

X_title = salaries_w_desc['job_title']
y_title = salaries_w_desc['high_salary']

In [48]:
cv = CountVectorizer(stop_words="english")
cv.fit(X_title)

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [49]:
X_title_trans = pd.DataFrame(cv.transform(X_title).todense(), columns=cv.get_feature_names())

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_title_trans, y_title, test_size=0.3)

In [51]:
rfc = RandomForestClassifier(300)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
print "Accuracy Score:", acc.round(3)

s = cross_val_score(rfc, X_title_trans.as_matrix(), y_title.as_matrix(), cv=10, n_jobs=-1)
print "Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3))

Accuracy Score: 0.603
Cross Validation Score: 0.614 ± 0.051


In [52]:
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_title_trans.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

feature_medians = []
feature_means = []
for i in X_title_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[salaries_w_desc["job_title"].str.lower().str.contains(i)].salary_estimated))
    feature_means.append(np.mean(salaries_w_desc[salaries_w_desc["job_title"].str.lower().str.contains(i)].salary_estimated))


feature_importances['median_salary'] = feature_medians
feature_importances['mean_salary'] = feature_means
feature_importances['over_or_under'] = [1 if i > 80000 else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)

Unnamed: 0,feature,importance,median_salary,mean_salary,over_or_under
227,data,0.040998,80000.0,82282.157676,0
765,senior,0.029808,80000.0,80948.275862,0
301,engineer,0.025954,80000.0,82097.560976,0
751,scientist,0.024905,80000.0,78033.613445,0
791,software,0.02152,80000.0,82544.91018,0
725,research,0.016775,60000.0,74444.444444,0
813,sr,0.01674,80000.0,81170.212766,0
489,learning,0.015385,95000.0,84803.370787,1
669,principal,0.014197,80000.0,81515.151515,0
54,analytics,0.013746,80000.0,81811.594203,0


In [53]:
feature_importances.shape

(930, 5)

## Combining Title CV, Summary CV, and Location CV

In [54]:
salaries_w_desc = df_read.copy(deep=False).reset_index(drop=True)
city_dummies = pd.get_dummies(df_read.job_location)

In [55]:
X = pd.concat([city_dummies.reset_index(drop=True), 
               X_title_trans.reset_index(drop=True), 
               X_summ_trans.reset_index(drop=True)], axis=1)
y = salaries_w_desc.high_salary

In [56]:
print X.shape
print y.shape

(1527, 4392)
(1527,)


In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [58]:
rfc = RandomForestClassifier(300)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
print "Accuracy Score:", acc.round(6)

s = cross_val_score(rfc, X, y, cv=10, n_jobs=-1)
print "Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3))

Accuracy Score: 0.675381
Cross Validation Score: 0.671 ± 0.038


In [59]:
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X.columns).reset_index()
feature_importances.columns = ['feature', 'importance']
df_read.reset_index(drop=True, inplace=True)
salaries_w_desc.reset_index(drop=True, inplace=True)
feature_importances

Unnamed: 0,feature,importance
0,"Agoura Hills, CA 91301",0.000000
1,"Alameda Harbor, CA",0.000000
2,"Alhambra, CA 91803",0.000368
3,"Aliso Viejo, CA",0.000044
4,"Bakersfield, CA",0.000000
5,"Belmont, CA",0.000024
6,"Berkeley, CA",0.000095
7,"Berkeley, CA 94709",0.000002
8,"Berkeley, CA 94710",0.000014
9,"Beverly Hills, CA",0.000427


In [60]:
feature_medians = []
for i in city_dummies.columns:
    feature_medians.append(np.median(salaries_w_desc[salaries_w_desc.job_location == i].salary_estimated))
    
for i in X_title_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[salaries_w_desc["job_title"].str.lower().str.contains(i)].salary_estimated))

for i in X_summ_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[salaries_w_desc['job_summary'].str.lower().str.contains(i)].salary_estimated))

feature_importances['median_salary'] = feature_medians
feature_importances['over_or_under'] = [1 if i > 80000 else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)

Unnamed: 0,feature,importance,median_salary,over_or_under
3597,read,0.025304,80000.0,0
458,data,0.011343,80000.0,0
1940,data,0.009683,80000.0,0
161,"San Francisco, CA",0.006886,80000.0,0
1333,analysts,0.00631,95000.0,1
734,machine,0.005982,95000.0,1
3791,science,0.005831,80000.0,0
720,learning,0.005527,95000.0,1
1571,build,0.005147,95000.0,1
996,senior,0.004947,80000.0,0


## Regression Model

In [11]:
df_read.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated,high_salary
0,Walmart eCommerce,"San Bruno, CA 94066","Data scientists, front and back-end engineers,...","Director, Retail Learning & Development",60000.0,0
1,Facebook,"Menlo Park, CA","Work with engineering, data science, and desig...","Product Manager, Advanced Network Planning",60000.0,0
2,Kaiser Permanente,"Oakland, CA","He or she will manage a team of analysts, data...",Senior Manager Decision Support,60000.0,0
3,PaxVax,"San Diego, CA",Good data anaylsis skills. Is seeking a Scient...,"Scientist, Process Development - Upstream",60000.0,0
4,The Aerospace Corporation,"El Segundo, CA 90245",Our state-of-the-art laboratory facilities are...,Associate Software Engineer,60000.0,0


In [12]:
df_rmodel = df_read[["comp_name", "job_location", "salary_estimated", "high_salary"]]
df_rmodel.reset_index(drop=True, inplace=True)

In [13]:
df_rmodel.shape

(1527, 4)

In [14]:
cali_index = df_rmodel[df_rmodel["job_location"] == "California"].index

In [15]:
df_rmodel.drop(df_rmodel.index[cali_index], inplace=True)
df_rmodel.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,comp_name,job_location,salary_estimated,high_salary
0,Walmart eCommerce,"San Bruno, CA 94066",60000.0,0
1,Facebook,"Menlo Park, CA",60000.0,0
2,Kaiser Permanente,"Oakland, CA",60000.0,0
3,PaxVax,"San Diego, CA",60000.0,0
4,The Aerospace Corporation,"El Segundo, CA 90245",60000.0,0
5,Walmart eCommerce,"San Bruno, CA 94066",60000.0,0
6,BeiGene,"San Francisco, CA",60000.0,0
7,Ascent Services Group,"South San Francisco, CA",60000.0,0
8,Remind,"San Francisco, CA",60000.0,0
9,First American,"Agoura Hills, CA 91301",60000.0,0


In [16]:
loc_clean = [a[0:a.find(', CA')] for a in df_rmodel.job_location]

In [17]:
df_rmodel["job_location"] = loc_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_rmodel.head()

Unnamed: 0,comp_name,job_location,salary_estimated,high_salary
0,Walmart eCommerce,San Bruno,60000.0,0
1,Facebook,Menlo Park,60000.0,0
2,Kaiser Permanente,Oakland,60000.0,0
3,PaxVax,San Diego,60000.0,0
4,The Aerospace Corporation,El Segundo,60000.0,0


In [19]:
df_rmodel["job_location"].value_counts()

San Francisco             337
San Diego                 112
San Jose                   87
Mountain View              69
Santa Clara                66
Palo Alto                  64
South San Francisco        60
Sunnyvale                  58
Los Angeles                56
El Segundo                 48
Redwood City               44
Irvine                     32
Menlo Park                 27
Emeryville                 25
San Mateo                  21
Livermore                  19
San Bruno                  19
Fremont                    16
Pleasanton                 16
Los Gatos                  15
San Ramon                  14
Foster City                13
Thousand Oaks              13
Santa Monica               13
San Francisco Bay Area     10
Pasadena                   10
Stanford                    9
Oakland                     9
Cupertino                   9
Hayward                     8
                         ... 
San Bernardino              1
Hollywood                   1
Redding   

In [20]:
X = df_rmodel[["comp_name", "job_location"]]
y = df_rmodel["high_salary"]

In [21]:
X = pd.get_dummies(X, drop_first=True)

In [22]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
Xs = ss.fit_transform(X)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3)

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [24]:
gs_params = {
    'penalty':['l1','l2'],
    'solver':['liblinear'],
    'C':np.logspace(-5,0,100)
}

In [25]:
lr = LogisticRegression()

lr_gridsearch = GridSearchCV(lr, gs_params, cv=5, verbose=1, n_jobs=-1)

In [27]:
lr_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   29.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-05,   1.12332e-05, ...,   8.90215e-01,   1.00000e+00]), 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [28]:
print lr_gridsearch.best_score_

0.671388101983


In [29]:
lr_gridsearch.best_params_

{'C': 0.030538555088334154, 'penalty': 'l1', 'solver': 'liblinear'}

In [30]:
best_gs = lr_gridsearch.best_estimator_
best_gs.score(X_test, y_test)

0.66740088105726869

The base line was binary (high or low), so this is slightly better than 50%. 
How does the model perform when given salary bands instead of high or low?

In [26]:
y = df_rmodel["salary_estimated"]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3)

In [28]:
lr_gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   36.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-05,   1.12332e-05, ...,   8.90215e-01,   1.00000e+00]), 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [29]:
print lr_gridsearch.best_score_

0.470254957507


In [30]:
lr_gridsearch.best_params_

{'C': 1.0000000000000001e-05, 'penalty': 'l1', 'solver': 'liblinear'}

In [31]:
best_gs = lr_gridsearch.best_estimator_
best_gs.score(X_test, y_test)

0.48458149779735682

Given that there are 4 salary bands, the base is 25%. This is not bad.

# Question 2

Using the job postings you scraped for part 1 (or potentially new job postings from a second round of scraping), identify features in the data related to job postings that can distinguish job titles from each other. There are a variety of interesting ways you can frame the target variable, for example:
- What components of a job posting distinguish data scientists from other data jobs?
- What features are important for distinguishing junior vs. senior positions?
- Do the requirements for titles vary significantly with industry (e.g. healthcare vs. government)?

You may end up making multiple classification models to tackle different questions. Be sure to clearly explain your hypotheses and framing, any feature engineering, and what your target variables are. The type of classification model you choose is up to you. Be sure to interpret your results and evaluate your models' performance.

## Second round of Scraping

In [61]:
# List various salary bandings for exact match "data scientist" jobs in California
url_ca_exact_60 = 'https://www.indeed.com/jobs?q=%22Data+Scientist%22+$60,000&l=California&radius=50&jt=fulltime&sort='
url_ca_exact_80 = 'https://www.indeed.com/jobs?q=%22Data+Scientist%22+$80,000&l=California&radius=50&jt=fulltime&sort='
url_ca_exact_95 = 'https://www.indeed.com/jobs?q=%22Data+Scientist%22+$95,000&l=California&radius=50&jt=fulltime&sort='
url_ca_exact_110 = 'https://www.indeed.com/jobs?q=%22Data+Scientist%22+$110,000&l=California&radius=50&jt=fulltime&sort='

# Pre-establish the database
df2 = pd.DataFrame()

In [67]:
df2

Unnamed: 0,comp_name,job_location,job_summary,job_title,listed_job_salary,salary_estimated
0,CLARA analytics,"Santa Clara, CA","\nThe Data Scientist role involves working on all stages of the data science pipeline, from acquiring and assessing data, selecting appropriate models and...",Data Scientist,,60000.0
1,Shutterfly,"Redwood City, CA",\nThe Data Scientist will be responsible for designing and directing experiments and observational studies to optimize our customer acquisition and engagement...,Data Scientist,,60000.0
2,Lam Research,"Fremont, CA 94538 (Irvington area)","\nDefine data structures, evaluate data quality, perform appropriate data analyses using software such as Python and MATLAB....",Data Scientist 4,,60000.0
3,"GRAIL, Inc.","Menlo Park, CA",\nParticipate in data quality review activities and efforts to resolve data quality issues. Develop and manage interactive data visualization and analytic tools...,Senior Staff Clinical Data Scientist,,60000.0
4,Chatham Group,"San Francisco, CA 94104 (Financial District area)","\nDo you like data? Experience in data analysis, gaming, mobile applications, consulting, or business/financial analysis....",Senior Data Scientist,"\n$70,000 a year",60000.0
5,Petco,"San Diego, CA",\nThis role will supervise the assistant merchant managers and work very closely with the customer data scientist and manager business analytics....,Services Manager,,60000.0
6,Workbridge Associates,"San Jose, CA 95113 (Downtown area)","\nA well-established retail company located in Silicon Valley is looking for a contract Senior Data Scientist to take on a role providing modeling, analysis, and...",Senior Data Scientist (Contract),,60000.0
7,Jobspring Partners,"Palo Alto, CA",\nA Series C Healthcare Startup Located in Palo Alto is on the seeking for a bold Mid-Level Data Scientist to join to the team....,Mid-level Data Scientist,"\n$130,000 - $165,000 a year",60000.0
8,Remind,"San Francisco, CA","\nAnalyze data to identify trends and opportunities, surface actionable insights, and help teams set goals, forecasts and prioritization of initiatives....",Data Scientist,,60000.0
9,FullDeck,"Los Angeles, CA","\n2+ years of experience in data mining and/or data science. A flourishing digital media agency with high profile entertainment clients, has an immediate need for...",Data Scientist,,60000.0


In [65]:
df2 = scrape(df2, url_ca_exact_60, 60000)

In [68]:
df2 = scrape(df2, url_ca_exact_80, 80000)

In [69]:
df2 = scrape(df2, url_ca_exact_95, 95000)

In [70]:
df2 = scrape(df2, url_ca_exact_110, 110000)

In [71]:
df2 = df2.replace('\n','', regex=True)

In [72]:
# Save the result to CSV
df2.to_csv('../indeed-results-exact-ds.csv', encoding='utf-8')

## Read in again

In [73]:
df_exact = pd.read_csv('../indeed-results-exact-ds.csv')

In [74]:
df_exact.drop(["Unnamed: 0", "listed_job_salary"], axis=1, inplace=True)

In [75]:
df_exact.drop_duplicates(subset='job_summary', inplace=True)

In [76]:
df_exact.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated
0,CLARA analytics,"Santa Clara, CA","The Data Scientist role involves working on all stages of the data science pipeline, from acquiring and assessing data, selecting appropriate models and...",Data Scientist,60000.0
1,Shutterfly,"Redwood City, CA",The Data Scientist will be responsible for designing and directing experiments and observational studies to optimize our customer acquisition and engagement...,Data Scientist,60000.0
2,Lam Research,"Fremont, CA 94538 (Irvington area)","Define data structures, evaluate data quality, perform appropriate data analyses using software such as Python and MATLAB....",Data Scientist 4,60000.0
3,"GRAIL, Inc.","Menlo Park, CA",Participate in data quality review activities and efforts to resolve data quality issues. Develop and manage interactive data visualization and analytic tools...,Senior Staff Clinical Data Scientist,60000.0
4,Chatham Group,"San Francisco, CA 94104 (Financial District area)","Do you like data? Experience in data analysis, gaming, mobile applications, consulting, or business/financial analysis....",Senior Data Scientist,60000.0


In [77]:
df_exact.shape

(851, 5)

In [99]:
df_exact["job_title"] = df_exact.job_title.str.lower()

In [100]:
df_exact.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated
0,CLARA analytics,"Santa Clara, CA","The Data Scientist role involves working on all stages of the data science pipeline, from acquiring and assessing data, selecting appropriate models and...",data scientist,60000.0
1,Shutterfly,"Redwood City, CA",The Data Scientist will be responsible for designing and directing experiments and observational studies to optimize our customer acquisition and engagement...,data scientist,60000.0
2,Lam Research,"Fremont, CA 94538 (Irvington area)","Define data structures, evaluate data quality, perform appropriate data analyses using software such as Python and MATLAB....",data scientist 4,60000.0
3,"GRAIL, Inc.","Menlo Park, CA",Participate in data quality review activities and efforts to resolve data quality issues. Develop and manage interactive data visualization and analytic tools...,senior staff clinical data scientist,60000.0
4,Chatham Group,"San Francisco, CA 94104 (Financial District area)","Do you like data? Experience in data analysis, gaming, mobile applications, consulting, or business/financial analysis....",senior data scientist,60000.0


In [119]:
"data" in df_exact["job_title"][0] 

True

In [127]:
df_exact["job_title"][0]

'data scientist'

In [129]:
df_exact.reset_index(drop=True, inplace=True)

In [144]:
jobs = []

for a in range(len(df_exact["job_title"])) :
    if "data scientist" in df_exact["job_title"][a] :
        jobs.append(a)

In [145]:
df_ds = df_exact.iloc[jobs]

In [146]:
df_ds.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated
0,CLARA analytics,"Santa Clara, CA","The Data Scientist role involves working on all stages of the data science pipeline, from acquiring and assessing data, selecting appropriate models and...",data scientist,60000.0
1,Shutterfly,"Redwood City, CA",The Data Scientist will be responsible for designing and directing experiments and observational studies to optimize our customer acquisition and engagement...,data scientist,60000.0
2,Lam Research,"Fremont, CA 94538 (Irvington area)","Define data structures, evaluate data quality, perform appropriate data analyses using software such as Python and MATLAB....",data scientist 4,60000.0
3,"GRAIL, Inc.","Menlo Park, CA",Participate in data quality review activities and efforts to resolve data quality issues. Develop and manage interactive data visualization and analytic tools...,senior staff clinical data scientist,60000.0
4,Chatham Group,"San Francisco, CA 94104 (Financial District area)","Do you like data? Experience in data analysis, gaming, mobile applications, consulting, or business/financial analysis....",senior data scientist,60000.0


In [147]:
df_ds.shape

(691, 5)

In [159]:
searchfor = ['senior', 'lead', 'sr', 'principal']

In [177]:
#[1 if a.str.contains('|'.join(searchfor) else 0 for a in df_ds.job_title)]


tf_vector = df_ds.job_title.str.contains('|'.join(searchfor))

senior_not_senior = [1 if a == True else 0 for a in tf_vector]
df_ds["senior_not_senior"] = senior_not_senior

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [178]:
df_ds.head()

Unnamed: 0,comp_name,job_location,job_summary,job_title,salary_estimated,senior_not_senior
0,CLARA analytics,"Santa Clara, CA","The Data Scientist role involves working on all stages of the data science pipeline, from acquiring and assessing data, selecting appropriate models and...",data scientist,60000.0,0
1,Shutterfly,"Redwood City, CA",The Data Scientist will be responsible for designing and directing experiments and observational studies to optimize our customer acquisition and engagement...,data scientist,60000.0,0
2,Lam Research,"Fremont, CA 94538 (Irvington area)","Define data structures, evaluate data quality, perform appropriate data analyses using software such as Python and MATLAB....",data scientist 4,60000.0,0
3,"GRAIL, Inc.","Menlo Park, CA",Participate in data quality review activities and efforts to resolve data quality issues. Develop and manage interactive data visualization and analytic tools...,senior staff clinical data scientist,60000.0,1
4,Chatham Group,"San Francisco, CA 94104 (Financial District area)","Do you like data? Experience in data analysis, gaming, mobile applications, consulting, or business/financial analysis....",senior data scientist,60000.0,1
