# Loading data

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import time 

# Load the job postings CSV. thousands=',' tells pandas to treat ',' as a
# digit-group separator when parsing numeric columns.
jobs = pd.read_csv(
    '../data/stackoverflow_jobs_enhanced.csv',
    thousands=',',
)

# Filtering for important columns & splitting 

In [2]:
# Narrow the postings down to the attributes this notebook works with.
tmp = jobs[[
    'jobid', 'city', 'state', 'country', 'tags', 'weeknum',
    'salary', 'salary_low', 'salary_high', 'currency', 'equity',
]]


def tag_split(x):
    """Split one raw tag string into a Series holding one tag per position.

    The first and last characters of `x` are discarded (presumably the
    enclosing brackets of a list literal -- confirm against the CSV) and the
    remainder is split on commas. Pieces are NOT trimmed here.
    """
    return pd.Series(x[1:-1].split(','))


# One column per tag position; postings with fewer tags than the widest row
# are padded with '' instead of NaN.
tag_splitted = tmp['tags'].apply(tag_split).fillna('')

In [3]:
# Build a long table with one row per (posting, tag) pair.
#
# Every posting attribute listed below is copied unchanged onto each of that
# posting's tag rows; the tag itself is stored under 'tech'.
carried_cols = ['jobid', 'city', 'state', 'country', 'weeknum',
                'salary_low', 'salary_high', 'equity', 'currency']

t = []

# Walk both frames once, in lockstep, as plain tuples. This avoids calling
# `.iloc[i]` for every field of every tag, which materialises a new row
# Series on each lookup and dominates the runtime of a nested loop.
job_rows = tmp[carried_cols].itertuples(index=False, name=None)
tag_rows = tag_splitted.itertuples(index=False, name=None)

for job_values, tag_values in zip(job_rows, tag_rows):
    for tech in tag_values:
        # '' marks the padding added by fillna(''): nothing useful follows,
        # so stop scanning this posting's tags.
        if tech == '':
            break
        # creating new row (same keys, same insertion order as before)
        new_row = dict(zip(carried_cols, job_values))
        new_row['tech'] = tech
        # adding
        t.append(new_row)

technologies = pd.DataFrame(t)

In [4]:
# Clean up the raw tag text: first trim spaces from both ends, then trim
# double quotes from both ends (in that order).
technologies['tech'] = (
    technologies['tech']
    .str.strip(' ')
    .str.strip('"')
)

In [5]:
# Midpoint of the advertised salary band, computed row by row
# (NaN bounds are skipped by DataFrame.mean's default skipna behaviour).
technologies['salary_mean'] = (
    technologies.loc[:, ['salary_high', 'salary_low']].mean(axis='columns')
)

# Replace every remaining missing value with an empty string, in place.
# NOTE(review): this applies to all columns, so any numeric column that
# contains NaN will hold '' afterwards and stop being purely numeric.
technologies.fillna(value='', inplace=True)

# Top 10 technologies in London 

In [6]:
# Ten tags with the most rows among postings whose city is 'London'.
(
    technologies.loc[technologies['city'] == 'London']
    .groupby(['city', 'tech'])['jobid']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

city    tech               
London  javascript             273
        java                   259
        python                 181
        amazon-web-services    134
        c#                     130
        angularjs               97
        linux                   88
        php                     83
        node.js                 81
        html                    81
Name: jobid, dtype: int64

# ... and Berlin

In [7]:
# Ten tags with the most rows among postings whose city is 'Berlin'.
(
    technologies.loc[technologies['city'] == 'Berlin']
    .groupby(['city', 'tech'])['jobid']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

city    tech      
Berlin  java          311
        javascript    288
        php           159
        mysql         139
        python        123
        angularjs     111
        sql           104
        css            96
        html           82
        linux          78
Name: jobid, dtype: int64

# Top 10 technologies in California (home of Silicon Valley)

In [8]:
# Ten tags with the most rows among postings whose state is 'CA'.
(
    technologies.loc[technologies['state'] == 'CA']
    .groupby('tech')['jobid']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

tech
javascript             538
java                   530
python                 527
c++                    278
linux                  230
amazon-web-services    210
sql                    196
angularjs              177
c#                     165
ruby-on-rails          165
Name: jobid, dtype: int64

# ... and Texas

In [9]:
# Ten tags with the most rows among postings whose state is 'TX'.
(
    technologies.loc[technologies['state'] == 'TX']
    .groupby('tech')['jobid']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

tech
javascript             143
java                    93
c#                      86
python                  70
angularjs               51
mysql                   45
amazon-web-services     42
linux                   39
sql                     38
.net                    37
Name: jobid, dtype: int64

# Best place to do machine learning... 

In [10]:
# Ten cities with the most rows whose tag starts with 'machine-learning'.
# na=False treats a missing tag as "no match" instead of propagating NaN
# into the boolean mask.
(
    technologies.loc[
        technologies['tech'].str.startswith('machine-learning', na=False)
    ]
    .groupby('city')['city']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

city
San Francisco    17
Rotterdam        14
London           10
Berlin           10
Palo Alto         9
München           8
Seattle           6
Cambridge         6
Hamburg           5
New York          5
Name: city, dtype: int64

# ... or Apache Spark

In [11]:
# Ten cities with the most rows whose tag starts with 'apache-spark'.
# na=False treats a missing tag as "no match" instead of propagating NaN
# into the boolean mask.
(
    technologies.loc[
        technologies['tech'].str.startswith('apache-spark', na=False)
    ]
    .groupby('city')['city']
    .count()
    .sort_values(ascending=False)
    .nlargest(10)
)

city
London           19
San Francisco    18
Berlin           14
New York         12
Seattle           7
Stockport         6
Hamburg           6
Toronto           6
Ottawa            5
Singapore         5
Name: city, dtype: int64

In [12]:
# Save the derived one-row-per-(posting, tag) table to CSV.
# `technologies` is written here, not the raw `jobs` frame: the target file is
# technologies.csv, and `jobs` is never modified after it is loaded, so
# exporting it would only duplicate the input data.
technologies.to_csv('../data/technologies.csv', index=False)