In [1]:
import pandas as pd 
import numpy as np

# Raise the pandas display limits so that up to 70 rows and 70 columns are
# printed before the output gets truncated.
# source: http://stackoverflow.com/questions/19124601/is-there-a-way-to-pretty-print-the-entire-pandas-series-dataframe
pd.options.display.max_columns = 70
pd.options.display.max_rows = 70

# Read the job postings CSV into the frame every later cell works on.
jobs = pd.read_csv('data/stackoverflow_jobs.csv')

# Exploring data

In [2]:
# Show the column labels of the loaded frame.
jobs.columns

Index([u'jobid', u'title', u'employer', u'location', u'salary', u'description',
       u'tags', u'url', u'date', u'weeknum'],
      dtype='object')

In [3]:
# Show the dtype pandas inferred for each column.
jobs.dtypes

jobid           int64
title          object
employer       object
location       object
salary         object
description    object
tags           object
url            object
date           object
weeknum         int64
dtype: object

In [4]:
# Summary statistics; only the numeric columns (jobid, weeknum) are included.
jobs.describe()

Unnamed: 0,jobid,weeknum
count,5691.0,5691.0
mean,118535.378141,35.626428
std,11499.058975,3.898488
min,15018.0,30.0
25%,117665.0,32.0
50%,122131.0,35.0
75%,124383.5,40.0
max,127068.0,42.0


In [5]:
# Peek at the first five rows.
jobs.head()

Unnamed: 0,jobid,title,employer,location,salary,description,tags,url,date,weeknum
0,113232,Software Engineer: iOS,Postmates Inc.,"San Francisco, CA","$120,000 - 170,000\r\n\r\n ...",Postmates runs the largest on-demand delivery ...,"[""ios"",""objective-c"",""swift""]",/jobs/113232/software-engineer-ios-postmates-inc,2016-07-25,30
1,121583,Care Coach - Spanish Bilingual,Health Dialog,"Bakersfield, CA",,racking CodeCRCCHSPBKRSFDCA1162Job Description...,"[""cold-fusion""]",/jobs/121583/care-coach-spanish-bilingual-heal...,2016-07-25,30
2,121582,Drupal Developer,C4Media,"Sibiu, Romania",,You are an experienced Drupal developer and we...,"[""ubuntu-lamp"",""drupal""]",/jobs/121582/drupal-developer-c4media,2016-07-25,30
3,109825,Database Administrator - DevOps - SQL,Spot Trading LLC,"Chicago, IL",,Summary\r\nAs Spot Trading continues to assert...,"[""database"",""sql-server"",""nosql"",""cassandra"",""...",/jobs/109825/database-administrator-devops-sql...,2016-07-25,30
4,105355,QA Engineer,TeleTracking Technologies,"Pittsburgh, PA",,The primary purpose of the Quality Assurance E...,"[""c#"",""asp.net"",""qa""]",/jobs/105355/qa-engineer-teletracking-technolo...,2016-07-25,30


In [6]:
# Peek at the last five rows.
jobs.tail()

Unnamed: 0,jobid,title,employer,location,salary,description,tags,url,date,weeknum
5686,123217,Senior Back End Developer - Big Data,INFARE,"Copenhagen, Denmark",,"At INFARE, we are not happy if we simply achie...","[""java"",""scala"",""c#"","".net"",""bigdata""]",/jobs/123217/senior-back-end-developer-big-dat...,2016-10-18,42
5687,111740,Developer C/C++ (Network 2020),Samsung R&D Institute,"Warsaw, Poland",,Who we are:\r\n\r\nWe implement GSMA standards...,"[""c"",""c++"",""git"",""tcp"",""rcs""]",/jobs/111740/developer-c-c-plus-plus-network-2...,2016-10-18,42
5688,116619,QA Automation Engineer (m/f) wanted!,Project A Ventures,"Berlin, Deutschland",,We are looking for cutting-edge QA talents who...,"[""automated-tests"",""performance-testing"",""jira...",/jobs/116619/qa-automation-engineer-m-f-wanted...,2016-10-18,42
5689,104969,AI Programmer,Splash Damage,"Bromley, UK",,As a member of Splash Damage’s AI Programming ...,"[""c++"",""unreal-engine4"",""artificial-intelligen...",/jobs/104969/ai-programmer-splash-damage,2016-10-18,42
5690,123218,Conversion Optimization Specialist (f/m),simplesurance GmbH,"Berlin, Germany",,Are you passionate about improving conversion ...,"[""html"",""css"",""adobe""]",/jobs/123218/conversion-optimization-specialis...,2016-10-18,42


# Salary


## Replacing NA values with empty strings in the salary column

In [49]:
# Missing salaries become '' so every value in the column is a string.
jobs['salary'] = jobs['salary'].fillna('')

## Extracting equity  

In [50]:
# Flag postings whose salary text mentions equity. The phrase is matched
# literally (regex=False) and a missing salary counts as "no equity"
# (na=False), so the column holds only True/False values.
jobs['equity'] = jobs['salary'].str.contains('Provides Equity', regex=False, na=False)

## Extracting currency and high - low salary


Need to extract currency, salary_low and salary_high from salary field and copy it to their own columns.

Using regex here to capture parts of the salary field into three columns: 
    - currency captures zero or more non-digit characters
    - number_low captures one or more digits and separators (currently only the comma is used)
    - number_high captures the digits and separators that follow the dash, up to the end of the string

In [51]:
# Inspect the raw salary strings before parsing them.
jobs.salary

0       $120,000 - 170,000\r\n\r\n                    ...
1                                                        
2                                                        
3                                                        
4                                                        
5                                                        
6                                                        
7                                                        
8                                                        
9                                                        
10                                                       
11                                                       
12                                                       
13      $80,000 - 105,000\r\n\r\n                     ...
14                                                       
15                                                       
16                                     $120,000 - 170,000
17            

In [52]:
# Strip the equity marker and any slashes, then trim whitespace, so only the
# "<currency><low> - <high>" part of the salary text (if any) remains.
# Requires every salary value to be a string (no NaN).
salary = jobs.salary.map(lambda x: x.replace("Provides Equity","").replace("/","").strip())

# Raw string literal: '\d' inside a plain string is an invalid escape sequence
# and triggers a warning on modern Python versions.
#   currency    - zero or more non-digit characters
#   number_low  - digits and commas before " - "
#   number_high - digits and commas after " - ", up to the end of the string
salary_pattern = r'(?P<currency>[^\d]*)(?P<number_low>[\d,]+) - (?P<number_high>[\d,]+$)'
sal = salary.str.extract(salary_pattern)

# Rows without a parsable range get 0 / '' instead of NaN.
# NOTE(review): matched numbers stay strings with thousands separators
# (e.g. '120,000'), so the two number columns mix str and int values.
sal = sal.fillna({'number_low': 0, 'number_high': 0, 'currency': ''})

# mapping the new columns back
jobs['currency'] = sal['currency']
jobs['salary_low'] = sal['number_low']
jobs['salary_high'] = sal['number_high']


# Location 

We need better location information, so we can do analysis by countries and cities. For this we need to extract country, state and city out of location column. But first let's remove the __na__ values from location column. 

Then use a lambda to split the location into individual fields.  

In [53]:
# Some postings have no location at all; use '' so the split works on every row.
jobs['location'] = jobs['location'].fillna('')

# Split the comma-separated location text into one column per part using the
# vectorised string accessor. Rows with fewer parts are padded with missing
# values. The first three parts are named; any further parts keep their
# integer column labels.
locations = (
    jobs['location']
    .str.split(',', expand=True)
    .rename(columns={0: 'city', 1: 'location_1', 2: 'location_2'})
)

## Fixing US locations

US locations seem to be special. They are in the form _city, state_, but we need them in the form _city, state, country_, so let's fix this first.

If we have a US state in _location_1_ column then put _US_ in _location_2_. 


In [54]:
# Two-letter codes of the 50 US states plus DC.
us_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "ID",
    "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO",
    "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA",
    "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

# Trim the whitespace left over from the comma split, then set the country
# part to "US" for every row whose second location part is a state code.
locations['location_1'] = locations['location_1'].str.strip()
is_us_state = locations['location_1'].isin(us_states)
locations.loc[is_us_state, 'location_2'] = "US"

## Filling the state and country columns

If location_2 is null in a row, then location_1 contains the country of that location. If location_2 is not null, then location_2 is the country and location_1 contains the state.

In [55]:
# When a third location part exists it is the country and location_1 is the
# state; otherwise location_1 itself is the country and there is no state.
has_third_part = locations['location_2'].notnull()
jobs['country'] = np.where(has_third_part, locations['location_2'], locations['location_1'])
jobs['state'] = np.where(has_third_part, locations['location_1'], '')

# Missing countries become ''; country and city are trimmed of spaces.
jobs['country'] = jobs['country'].fillna('').str.strip()
jobs['city'] = locations['city'].str.strip()

Now we can see which countries are posting the most jobs. It seems that the US, Deutschland, Germany and the UK are the top countries. But wait: aren't Germany and Deutschland the same country? Let's fix this, along with some other countries listed under their native names.

In [56]:
# Native-language country names and the English name that replaces them.
# Applied in order; any country value containing the native name is overwritten.
native_to_english = [
    ('Deutschland', 'Germany'),
    ('Österreich', 'Austria'),
    ('Suisse', 'Switzerland'),
    ('Schweiz', 'Switzerland'),
    ('Espagne', 'Spain'),
    ('République tchèque', 'Czech Republic'),
    ('Niederlande', 'Netherlands'),
]
for native_name, english_name in native_to_english:
    jobs.loc[jobs['country'].str.contains(native_name), 'country'] = english_name

# Five countries with the most postings.
jobs['country'].value_counts().head()

US             2094
Germany         546
UK              331
Canada          154
Netherlands     138
Name: country, dtype: int64

In [57]:
# Number of postings per city, most frequent first.
jobs['city'].value_counts()

New York              238
London                186
San Francisco         181
Berlin                166
Seattle               103
München                84
No office location     72
Chicago                69
Toronto                67
Philadelphia           63
Washington             61
Amsterdam              59
Hamburg                50
Boston                 43
Los Angeles            43
Austin                 40
Dublin                 38
Cambridge              37
Atlanta                35
Bellevue               35
Mountain View          34
Sydney                 33
Denver                 32
Vancouver              30
Singapore              30
San Diego              29
Portland               25
Stockholm              24
Helsinki               21
Melbourne              21
Santa Monica           20
Düsseldorf             19
Zürich                 19
Barcelona              18
Bengaluru              18
                     ... 
Grapevine               1
Newport Beach           1
Bury        