In [1]:
import urllib2
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import patsy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.feature_selection import SelectKBest

In [2]:
# Scrape indeed.com for data scientist jobs in select cities

url_template = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l={}&start={}"
max_results_per_city = 500

results = []

for city in ['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Seattle', 
    'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 
    'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami']:
    print city
    for start in range(0, max_results_per_city, 10):
        try:
            url_ = 'http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l='+city+'&start='+str(start)
            soupy = (BeautifulSoup(urllib2.urlopen(url_).read(), 'lxml'))
            results.append(soupy.find_all('div', class_=' row result'))
        except:
            print 'Did not work'

New+York
Did not work
Chicago
San+Francisco
Austin
Seattle
Los+Angeles
Philadelphia
Did not work
Atlanta
Dallas
Did not work
Pittsburgh
Did not work
Did not work
Portland
Phoenix
Denver
Houston
Miami


In [3]:
# Create functions to extract location, company, job title, and salary from each posting

def extract_loc(result):
    dicty = {}
    for idx, div in enumerate(result):
        for idx2, j in enumerate(div.find_all('span', class_='location')):
            dicty[idx2] = j.text
    return dicty

def extract_comp(result):
    dicty = {}
    for idx, div in enumerate(result):
        for idx2, j in enumerate(div.find_all('span', class_='company')):
            dicty[idx2] = j.text
    return dicty
        
def extract_job(result):
    dicty = {}
    for idx, div in enumerate(result):
        for idx2, h in enumerate(div.find_all('h2')):
            dicty[idx2] = h.text
    return dicty

def extract_sal(result):
    dicty = {}
    for idx, div in enumerate(result):
        for idx2, i in enumerate(div.find_all('td', class_='snip')):
            for idx3, j in enumerate(i.find_all('nobr')):
                dicty[idx2] = j.text
    return dicty


In [4]:
# Call each of the above functions to create a pandas dataframe

listy = []
for i in results:
    listy.append(extract_loc(i))
df = pd.DataFrame(listy)

listy = []
for i in results:
    listy.append(extract_job(i))
df['job'] = pd.DataFrame(listy)

listy = []
for i in results:
    listy.append(extract_sal(i))
df['sal'] = pd.DataFrame(listy)

listy = []
for i in results:
    listy.append(extract_comp(i))
df['comp'] = pd.DataFrame(listy)

df.columns = ['city', 'job', 'salary', 'company']
df.head(3)

Unnamed: 0,city,job,salary,company
0,"New York, NY","\nLead Data Scientist, Digital Intelligence\n",,\n\n\n JPMorgan Chase\n
1,"New York, NY 10012 (Little Italy area)",\nQuantitative Scientist\n,"$90,000 - $100,000 a year",\n\n\n Flatiron Health\n
2,"New York, NY 10011 (Chelsea area)",\nAccenture Analytics - Data Science Consultant\n,"$100,000 a year",\n\n\n Accenture\n


In [5]:
# Create separate dataframe of positions with no salary listed

no_sal = df[df['salary'].isnull()]
no_sal.head(3)

Unnamed: 0,city,job,salary,company
0,"New York, NY","\nLead Data Scientist, Digital Intelligence\n",,\n\n\n JPMorgan Chase\n
4,"New York, NY 10154 (Midtown area)",\nCognitive AI Data Scientist\n,,\n\n\n KPMG\n
5,"New York, NY 10011 (Chelsea area)","\nMachine Learning Practice Lead, Google Cloud...",,\n\n\n Google\n


In [6]:
# Drop positions with no salary and duplicate entries in df

df.dropna(how='any', axis=0, inplace=True)
df.drop_duplicates(inplace=True)
print df.shape
df.head()

(190, 4)


Unnamed: 0,city,job,salary,company
1,"New York, NY 10012 (Little Italy area)",\nQuantitative Scientist\n,"$90,000 - $100,000 a year",\n\n\n Flatiron Health\n
2,"New York, NY 10011 (Chelsea area)",\nAccenture Analytics - Data Science Consultant\n,"$100,000 a year",\n\n\n Accenture\n
3,"New York, NY",\nResearch Scientist\n,"$175,000 a year","\n\n\n Amazon Web Services, Inc.\n"
6,"New York, NY 10010 (Gramercy area)",\nPart Time Online Course Mentor: Data Analysis\n,"$80,000 - $100,000 a year",\n\n\n General Assembly\n
8,"New York, NY",\nResearch Analyst - Digital Transformation\n,"$140,000 a year",\n\n Lux Research\n


In [7]:
# Clean job column

listy = []
for i in df['job']:
    listy.append(str(i.encode('ascii', 'ignore')).split())
dicty = {}
for idx, i in enumerate(listy):
    for j in i:
        if '\\n' in j:
            continue
        elif idx not in dicty:
            dicty[idx] = str(j.encode('ascii', 'ignore'))
        else:
            dicty[idx] += ' ' + str(j)
listyy = []
for i in range(len(dicty)):
    listyy.append(dicty[i])
df['job'] = listyy
df.head(3)

Unnamed: 0,city,job,salary,company
1,"New York, NY 10012 (Little Italy area)",Quantitative Scientist,"$90,000 - $100,000 a year",\n\n\n Flatiron Health\n
2,"New York, NY 10011 (Chelsea area)",Accenture Analytics - Data Science Consultant,"$100,000 a year",\n\n\n Accenture\n
3,"New York, NY",Research Scientist,"$175,000 a year","\n\n\n Amazon Web Services, Inc.\n"


In [10]:
# Clean company column

listy = []
for i in df['company']:
    listy.append(str(i.encode('ascii', 'ignore')).split())
dicty = {}
for idx, i in enumerate(listy):
    for j in i:
        if '\\n' in j:
            continue
        elif idx not in dicty:
            dicty[idx] = str(j)
        else:
            dicty[idx] += ' ' + str(j)
listyy = []
for i in range(len(dicty)):
    listyy.append(dicty[i])
df['company'] = listyy
df.head(3)

Unnamed: 0,city,job,salary,company
1,"New York, NY 10012 (Little Italy area)",Quantitative Scientist,"$90,000 - $100,000 a year",Flatiron Health
2,"New York, NY 10011 (Chelsea area)",Accenture Analytics - Data Science Consultant,"$100,000 a year",Accenture
3,"New York, NY",Research Scientist,"$175,000 a year","Amazon Web Services, Inc."


In [11]:
# Clean city column

listy = []
for i in df['city']:
    listy.append(str(i).split(',')[0])
df['city'] = listy
df.head(3)

Unnamed: 0,city,job,salary,company
1,New York,Quantitative Scientist,"$90,000 - $100,000 a year",Flatiron Health
2,New York,Accenture Analytics - Data Science Consultant,"$100,000 a year",Accenture
3,New York,Research Scientist,"$175,000 a year","Amazon Web Services, Inc."


In [12]:
# Clean salary column

listy = []
for i in df['salary']:
    listy.append(str(i).replace('$', '').replace(',', '').replace('-', '').replace(' a year', '')
                .replace('a week', '').replace('a month', '').replace('an hour', ''))

listy = [i.split() for i in listy]
num_list = []
for idx, i in enumerate(listy):
    for j in i:
        num_list += [(idx, float(j))]
num_list
        
dicty = {}
for j, k in num_list:
    if j not in dicty:
        dicty[j] = k
    else:
        dicty[j] += k
        dicty[j] /= 2
dicty
for i in dicty:
    if dicty[i] < 100:
        dicty[i] *= 2080
    elif dicty[i] < 3000:
        dicty[i] *= 52
    elif dicty[i] < 10000:
        dicty[i] *= 12
sals = []
for i in range(len(dicty)):
    sals.append(dicty[i])
    
df['salary'] = sals
df.head(3)

ValueError: could not convert string to float: a

In [248]:
# Create highness column, which splits 

frst_third = df['salary'].quantile(.33)
scnd_third = df['salary'].quantile(.66)
highness = []
for i in df['salary']:
    if i < frst_third:
        highness.append('low')
    elif i < scnd_third:
        highness.append('average')
    else:
        highness.append('high')
df['highness'] = highness
df.head()

listy = []
for i in df['salary']:
    if i < df['salary'].quantile(.5):
        listy.append(1)
    else:
        listy.append(0)
df['is_high'] = listy

In [249]:
df.head(3)

Unnamed: 0,city,job,salary,company,highness,is_high
0,New York,"Data Scientist, Digital Strategy",135000.0,Morgan Stanley,high,0
1,New York,Applied Scientist,76941.5,"Amazon Web Services, Inc.",average,1
2,New York,Quantitative Scientist,95000.0,Flatiron Health,average,0


In [250]:
# Check the baseline of model

val_cnts = df['highness'].value_counts()
print 'The baseline accuracy is', float(max(val_cnts)) / float(sum(val_cnts)) 

The baseline accuracy is 0.35632183908


In [389]:
# Split data into train and test groups

X = df.drop(['highness', 'is_high', 'salary'], axis=1)
y = df['highness']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [390]:
# Initialize TF-IDF vectorizer

vec = CountVectorizer(stop_words='english',ngram_range=(1, 3), max_df=0.2, min_df=2, max_features=5)
vec.fit(X['job'])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.2, max_features=5, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [391]:
# Create function to prepare X_train and X_test for classification models

def prep(X):
    # Convert city column to numerical values
    le = LabelEncoder()
    X['city'] = le.fit_transform(X['city'])
    
    # Vectorize job titles column to create new features
    X_vec = pd.DataFrame(vec.transform(X['job']).todense())
    
    # Concatenate new features to X_train
    X.set_index([range(len(X['job']))], inplace=True)
    X_vec.columns = vec.get_feature_names()
    X = pd.concat([X, X_vec], axis=1)
    
    # Drop job and company columns from X
    X.drop(['job', 'company'], axis=1, inplace=True)
    
    return X

In [392]:
X_train = prep(X_train)
X_test  = prep(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [393]:
skb = SelectKBest(k=5)
skb.fit(X_train, y_train)
mask = skb.get_support

In [394]:
thing = skb.transform(X_train)
print thing.shape

(69, 5)


In [395]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [396]:
gs = GridSearchCV(lr, {'C':[0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000], 'penalty':['l1','l2']}, cv=3)
gs.fit(X_train, y_train)
gs.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [397]:
lr = LogisticRegression(C=0.0001, penalty='l2')
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.44444444444444442

In [398]:
# Trying out random forest classifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [400]:
gs = GridSearchCV(rf, {'max_features':[1, 2, 3, 4, 5, None], 'max_depth':[1,2,3,4,5,None],
                      'n_estimators':[5, 10, 15, 20]}, cv=3)
gs.fit(X_train, y_train)
gs.best_params_

{'max_depth': 1, 'max_features': None, 'n_estimators': 10}

In [402]:
rf = RandomForestClassifier(max_depth=3, max_features=1, n_estimators=5, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.44444444444444442