In [11]:
import os, sys
# Put the parent of the current working directory on sys.path (once) before the
# project import below; presumably this is what makes `skills_crawling`
# importable when the notebook runs from a sub-folder -- TODO confirm layout.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# Keep this import after the sys.path adjustment above.
from skills_crawling.settings import MONGO_URI

import pymongo

# Connection parameters: the URI comes from the project settings module, the
# database and collection names are fixed for this notebook.
mongo_uri = MONGO_URI
mongo_db = 'data-analysis'
mongo_collection = 'skills-crawling'

# `collection` is the handle the later cells query and update.
client = pymongo.MongoClient(mongo_uri)
db = client[mongo_db]
collection = db[mongo_collection]

In [12]:
# Raw values gathered per skill category, concatenated across all documents.
# The category order below is also the order later cells iterate in.
skill_set = {
    category: []
    for category in ("programming", "BI", "major", "degree", "DB", "cloud")
}

documents = collection.find({})

# `count` ends up as the number of documents seen (stays 0 for an empty cursor).
count = 0
for count, doc in enumerate(documents, start=1):
    for category, collected in skill_set.items():
        # Documents that lack a category simply contribute nothing to it.
        if category in doc:
            collected.extend(doc[category])

print(f"Total number of documents retrieved: {count}")

Total number of documents retrieved: 1156


In [13]:
from collections import Counter

# For each category, print its 20 most frequent values on a single line, each
# followed by its occurrence count divided by the number of documents.
for category, raw_values in skill_set.items():
    top_values = Counter(raw_values).most_common(20)
    pieces = [f"{category}: --"]
    pieces.extend(f"{value} ({hits/count:.2%})" for value, hits in top_values)
    # Every piece is followed by one space, so the line ends with a trailing
    # space before the newline.
    print(" ".join(pieces), end=" \n")

programming: -- Python (58.39%) SQL (51.73%) R (26.47%) Java (10.90%) SAS (8.74%) C++ (8.65%) Scala (5.62%) C (4.84%) JavaScript (4.15%) C# (3.46%) VBA (3.03%) MATLAB (2.42%) HTML (1.82%) .NET (1.64%) XML (1.38%) CSS (1.12%) PHP (0.95%) Perl (0.87%) Julia (0.78%) Bash (0.69%) 
BI: -- Tableau (20.67%) Power BI (20.07%) Excel (18.60%) Looker (4.76%) Qlik (2.77%) Cognos (1.73%) Alteryx (1.56%) Google Data Studio (1.21%) MicroStrategy (0.69%) Spotfire (0.61%) 
major: -- Computer Science (45.59%) Engineering (36.33%) Statistics (29.07%) Mathematics (28.20%) Data Science (17.82%) Economics (10.90%) Science (9.08%) Information Technology (5.62%) Finance (4.93%) Business (4.76%) Information Systems (3.37%) Physics (3.20%) Marketing (2.68%) Business Administration (2.34%) Bioinformatics (1.21%) Operations Research (0.95%) Information Management (0.69%) Information Science (0.61%) Epidemiology (0.43%) Financial engineering (0.43%) 
degree: -- Bachelor (38.15%) Master (22.84%) PhD (7.18%) Diploma

In [8]:
import json

# Mapping of raw value -> cleaned value, one dict per skill category.
skill_clean = {key: {} for key in skill_set.keys()}

# Load the existing (possibly hand-edited) mapping from skill_clean.json.
# Only a missing file is tolerated (first run). Any other failure -- malformed
# JSON after a manual edit, a decoding error, an unexpected structure -- must
# surface here: the file is rewritten at the end of this cell, so silently
# ignoring the error would overwrite the manual edits with identity mappings.
try:
    # Read with the same encoding the file is written with below.
    with open("skill_clean.json", encoding="utf-8") as f:
        _sc = json.load(f)
except FileNotFoundError:
    _sc = {}

for k1, v1 in _sc.items():
    # setdefault keeps categories that are in the file but not in skill_set,
    # so they are written back unchanged instead of aborting the load.
    skill_clean.setdefault(k1, {}).update(v1)

# Add an identity mapping for each of the 30 most common raw values of every
# category that has no mapping yet; existing entries are left untouched.
for key in skill_set.keys():
    for raw_value, _ in Counter(skill_set[key]).most_common(30):
        skill_clean[key].setdefault(raw_value, raw_value)

# ensure_ascii=False writes non-ASCII characters verbatim, hence the explicit
# UTF-8 encoding on both the read above and the write here.
with open('skill_clean.json', 'w', encoding='utf-8') as f:
    json.dump(skill_clean, f, ensure_ascii=False, indent=4)


In [10]:
# After manually editing the mapping in skill_clean.json, update the database.

# The mapping file is written as UTF-8 with non-ASCII characters kept verbatim,
# so read it back as UTF-8 rather than with the platform default encoding.
with open("skill_clean.json", encoding="utf-8") as f:
    sc = json.load(f)

documents = collection.find({})

for item in documents:
    # Documents flagged by an earlier run are left as they are.
    if item.get("cleaned", False):
        continue
    for key in sc.keys():
        if key not in item:
            continue
        mapping = sc[key]
        # Translate raw values through the mapping; values without a mapping
        # entry are dropped. dict.fromkeys removes duplicates while keeping
        # first-seen order, so the stored list is the same on every run
        # (a set of strings has no stable order between interpreter runs).
        cleaned_values = list(
            dict.fromkeys(mapping[s] for s in item[key] if s in mapping)
        )
        if cleaned_values:
            item[key] = cleaned_values
        else:
            # Nothing survived the mapping: remove the field entirely.
            item.pop(key)
    item["cleaned"] = True

    # Write the whole rewritten document back under its original _id.
    collection.replace_one(
        {"_id": item["_id"]},
        item
    )