#### Get the JSON data

In [1]:
# Read the raw job dump. The context manager closes the file handle as soon
# as reading finishes (or fails) instead of leaving it open for the life of
# the kernel. JSON text is UTF-8 by specification, so decode it explicitly
# rather than relying on the platform default encoding.
with open('data/jobs.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Peek at the start of the first line to confirm the payload looks like JSON.
lines[0][:200]



'[{"id": "132873", "guidislink": false, "links": [{"rel": "alternate", "type": "text/html", "href": "https://stackoverflow.com/jobs/132873/developpeur-web-python-senior-h-f-ooreka?a=IyOladG2aAM"}], "li'

In [2]:
# Stitch the lines back into one string so it can be parsed as a single JSON document.
job_data = ''.join(lines)

In [3]:
import json

# Decode the joined text into Python objects; the cells below index the result
# as a list of job dicts.
jobs = json.loads(job_data)

#### Structure of a job

In [4]:
# Inspect one record to see which fields a job entry carries.
jobs[0]

{'author': 'Ooreka',
 'author_detail': {'name': 'Ooreka'},
 'authors': [{'name': 'Ooreka'}],
 'guidislink': False,
 'id': '132873',
 'link': 'https://stackoverflow.com/jobs/132873/developpeur-web-python-senior-h-f-ooreka?a=IyOladG2aAM',
 'links': [{'href': 'https://stackoverflow.com/jobs/132873/developpeur-web-python-senior-h-f-ooreka?a=IyOladG2aAM',
   'rel': 'alternate',
   'type': 'text/html'}],
 'location': 'Boulogne-Billancourt, France',
 'published': 'Thu, 11 May 2017 13:42:06 Z',
 'published_parsed': [2017, 5, 11, 13, 42, 6, 3, 131, 0],
 'summary': '<p><strong>Ooreka : la start-up qui monte !</strong><br>Avec 10 millions de visiteurs uniques par mois, Ooreka touche un internaute fran&ccedil;ais sur 5, et est ainsi rentr&eacute; dans le top 50 des sites internet fran&ccedil;ais incontournables.<br>Ooreka r&eacute;pond &agrave; un besoin essentiel en offrant un site o&ugrave; de v&eacute;ritables experts apportent des r&eacute;ponses claires &agrave; toutes les questions de la vie

In [5]:
# Tags are stored as a list of dicts; the tag name itself lives under 'term'.
jobs[0]['tags']

[{'label': None, 'scheme': None, 'term': 'python'},
 {'label': None, 'scheme': None, 'term': 'django'}]

#### Get all tags

In [6]:
# Keep only the jobs that actually carry a 'tags' entry.
tagged_jobs = [job for job in jobs if 'tags' in job]

In [7]:
# Flatten the per-job tag lists into one list of tag dicts, preserving job order.
flat_tags = [
    tag_entry
    for job in tagged_jobs
    for tag_entry in job['tags']
]

In [8]:
# Preview the first ten flattened tag dicts.
flat_tags[:10]

[{'label': None, 'scheme': None, 'term': 'python'},
 {'label': None, 'scheme': None, 'term': 'django'},
 {'label': None, 'scheme': None, 'term': 'python'},
 {'label': None, 'scheme': None, 'term': 'javascript'},
 {'label': None, 'scheme': None, 'term': 'linux'},
 {'label': None, 'scheme': None, 'term': 'python'},
 {'label': None, 'scheme': None, 'term': 'sql'},
 {'label': None, 'scheme': None, 'term': 'r'},
 {'label': None, 'scheme': None, 'term': 'testing'},
 {'label': None, 'scheme': None, 'term': 'python'}]

In [9]:
# Reduce each tag dict to its name; duplicates are kept so they can be counted later.
tags = [tag_entry['term'] for tag_entry in flat_tags]

In [10]:
# Preview the first 25 tag names (repeats are still present at this point).
tags[:25]

['python',
 'django',
 'python',
 'javascript',
 'linux',
 'python',
 'sql',
 'r',
 'testing',
 'python',
 'api',
 'jenkins',
 'bash',
 'testing',
 'python',
 'php',
 'jenkins',
 'ansible',
 'python',
 'django',
 'postgresql',
 'linux',
 'c++',
 'javascript',
 'single-page-application']

In [11]:
# Collapse the tag names to the distinct values.
unique_tags = set(tags)

In [12]:
# Number of distinct tags across all tagged jobs.
len(unique_tags)

705

#### Use a counter to find the most popular tags

In [13]:
from collections import Counter

In [14]:
# Counter that will hold how many times each tag occurs; it starts out empty.
tag_counter = Counter()

In [15]:
# Build the tally directly from the tag list. Rebinding the name (instead of
# incrementing an existing Counter in a loop) keeps this cell idempotent:
# re-running it yields the same counts rather than doubling them.
tag_counter = Counter(tags)

In [16]:
# With no argument, most_common() returns every (tag, count) pair, highest count first.
tag_counter.most_common()

[('javascript', 275),
 ('python', 259),
 ('linux', 157),
 ('amazon-web-services', 126),
 ('php', 119),
 ('c#', 117),
 ('docker', 107),
 ('node.js', 103),
 ('reactjs', 103),
 ('c++', 99),
 ('mysql', 86),
 ('sysadmin', 82),
 ('git', 69),
 ('sql', 66),
 ('.net', 63),
 ('angularjs', 59),
 ('ruby-on-rails', 56),
 ('css', 56),
 ('ruby', 53),
 ('html', 48),
 ('jenkins', 44),
 ('go', 41),
 ('postgresql', 39),
 ('html5', 39),
 ('agile', 34),
 ('devops', 33),
 ('scala', 32),
 ('django', 31),
 ('c', 30),
 ('tdd', 29),
 ('mongodb', 28),
 ('puppet', 26),
 ('r', 25),
 ('css3', 25),
 ('continuous-integration', 24),
 ('rest', 24),
 ('asp.net-mvc', 24),
 ('testing', 23),
 ('machine-learning', 22),
 ('oop', 22),
 ('sql-server', 22),
 ('ansible', 21),
 ('hadoop', 21),
 ('elasticsearch', 21),
 ('jquery', 21),
 ('cloud', 20),
 ('azure', 19),
 ('microservices', 17),
 ('nosql', 17),
 ('typescript', 17),
 ('chef', 16),
 ('rabbitmq', 15),
 ('cassandra', 15),
 ('api', 14),
 ('unix', 14),
 ('jira', 14),
 ('kuber

In [17]:
# IPython-only introspection: the trailing `?` shows the docstring of most_common.
# NOTE(review): this is not valid plain Python and is a development leftover;
# consider removing it before sharing the notebook.
tag_counter.most_common?

In [18]:
# The 25 most frequent tags overall.
tag_counter.most_common(25)

[('javascript', 275),
 ('python', 259),
 ('linux', 157),
 ('amazon-web-services', 126),
 ('php', 119),
 ('c#', 117),
 ('docker', 107),
 ('node.js', 103),
 ('reactjs', 103),
 ('c++', 99),
 ('mysql', 86),
 ('sysadmin', 82),
 ('git', 69),
 ('sql', 66),
 ('.net', 63),
 ('angularjs', 59),
 ('ruby-on-rails', 56),
 ('css', 56),
 ('ruby', 53),
 ('html', 48),
 ('jenkins', 44),
 ('go', 41),
 ('postgresql', 39),
 ('html5', 39),
 ('agile', 34)]

#### Get the most common skills associated with `python`

In [19]:
# Map each job id to the plain list of tag names attached to that job.
jobs_dict = {
    job['id']: [tag_entry['term'] for tag_entry in job['tags']]
    for job in tagged_jobs
}

In [20]:
# Preview ten (job id, tag names) pairs.
list(jobs_dict.items())[:10]

[('132873', ['python', 'django']),
 ('144259', ['python', 'javascript', 'linux']),
 ('144421', ['python', 'sql', 'r']),
 ('144255', ['testing', 'python', 'api', 'jenkins', 'bash']),
 ('141725', ['testing', 'python', 'php', 'jenkins', 'ansible']),
 ('140321', ['python', 'django', 'postgresql', 'linux', 'c++']),
 ('143404', ['javascript', 'single-page-application', 'https', 'dns', 'ajax']),
 ('136002', ['cmake', 'teamcity', 'jenkins', 'python', 'linux']),
 ('143630', ['linux', 'security', 'tcp', 'ip']),
 ('129481', ['python', 'angularjs', 'javascript', 'postgresql', 'html'])]

In [21]:
# Tag lists of every job that mentions 'python', in the same order as jobs_dict.
python_jobs = [job_tags for job_tags in jobs_dict.values() if 'python' in job_tags]

In [22]:
# Counter for tags that appear alongside 'python'; it starts out empty.
python_tag_counter = Counter()

In [23]:
# Tally every tag that shares a job with 'python', skipping 'python' itself.
# Building a fresh Counter (rather than incrementing an existing one) keeps
# this cell idempotent: re-running it does not double the counts.
python_tag_counter = Counter(
    tag
    for job_tags in python_jobs
    for tag in job_tags
    if tag != 'python'
)

In [24]:
# The 25 tags that most often accompany 'python'.
python_tag_counter.most_common(25)

[('c++', 55),
 ('javascript', 51),
 ('linux', 49),
 ('amazon-web-services', 41),
 ('postgresql', 28),
 ('docker', 28),
 ('django', 27),
 ('sysadmin', 25),
 ('sql', 23),
 ('r', 18),
 ('php', 18),
 ('ruby', 18),
 ('go', 18),
 ('mysql', 17),
 ('node.js', 17),
 ('c', 17),
 ('devops', 14),
 ('c#', 14),
 ('angularjs', 13),
 ('hadoop', 13),
 ('machine-learning', 12),
 ('.net', 10),
 ('testing', 9),
 ('jenkins', 9),
 ('scala', 9)]

In [25]:
def get_related_skills(skill, n, jobs=None):
    """Return the tags that most often appear in the same job as `skill`.

    Parameters
    ----------
    skill : str
        Tag to look up, e.g. 'python'.
    n : int or None
        Maximum number of related tags to return; None returns all of them.
    jobs : dict, optional
        Mapping of job id -> list of tag names. When omitted, the
        notebook-level `jobs_dict` is used, so two-argument calls behave
        exactly as before.

    Returns
    -------
    list of str
        Co-occurring tag names, most frequent first. `skill` itself is never
        included. The list is empty when no job mentions `skill`.
    """
    if jobs is None:
        # Resolved at call time so the function always sees the current jobs_dict.
        jobs = jobs_dict

    # Count every other tag on each job that lists `skill`.
    related_counts = Counter(
        tag
        for job_tags in jobs.values()
        if skill in job_tags
        for tag in job_tags
        if tag != skill
    )

    return [tag for tag, _count in related_counts.most_common(n)]

In [26]:
# Five tags that most often accompany 'linux'.
get_related_skills('linux', 5)

['python', 'sysadmin', 'c++', 'docker', 'amazon-web-services']

In [27]:
# Ten tags that most often accompany 'docker'.
get_related_skills('docker', 10)

['amazon-web-services',
 'linux',
 'python',
 'sysadmin',
 'jenkins',
 'puppet',
 'node.js',
 'ansible',
 'kubernetes',
 'devops']