In [1]:
import os, sys
# Put the parent of the current working directory on sys.path (once) so the
# `skills_crawling` package imported below can be resolved.
# NOTE(review): presumably the package lives one level above this notebook — confirm.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from skills_crawling.settings import MONGO_URI

import pymongo

# Connection settings: the URI is taken from the project settings module,
# while the database and collection names are fixed for this notebook.
mongo_uri, mongo_db, mongo_collection = MONGO_URI, 'data-analysis', 'skills-crawling'

# Open the client, then resolve the database and the collection handle that
# the remaining cells read from and write to.
client = pymongo.MongoClient(mongo_uri)
db = client.get_database(mongo_db)
collection = db.get_collection(mongo_collection)

In [2]:
# One list of raw values per category; each category name is also the
# document field the values are read from.
skill_set = {
    category: []
    for category in ("programming", "BI", "major", "degree", "DB", "cloud")
}

# Single pass over every document: keep a running document count and pool the
# values of each category that the document actually carries.
documents = collection.find({})
count = 0
for count, item in enumerate(documents, start=1):
    for key, bucket in skill_set.items():
        if key in item:
            bucket.extend(item[key])

print(f"Total number of documents retrieved: {count}")

Total number of documents retrieved: 1353


In [3]:
from collections import Counter

# For every category, print its 20 most frequent raw values on one line, each
# followed by its occurrence count divided by the number of documents.
for category, raw_values in skill_set.items():
    top_values = Counter(raw_values).most_common(20)
    print(f"{category}: --", end=" ")
    for value, occurrences in top_values:
        print(f"{value} ({occurrences/count:.2%})", end=" ")
    print("")

programming: -- Python (49.89%) SQL (44.20%) R (22.62%) Java (9.31%) SAS (7.46%) C++ (7.39%) sql (6.87%) Scala (4.80%) python (4.29%) C (4.14%) JavaScript (3.55%) C# (2.96%) r (2.81%) VBA (2.59%) MATLAB (2.07%) HTML (1.55%) .NET (1.40%) XML (1.18%) sas (1.18%) CSS (0.96%) 
BI: -- Tableau (17.66%) Power BI (17.15%) Excel (15.89%) excel (5.91%) Looker (4.07%) tableau (3.99%) power bi (3.62%) Qlik (2.37%) Cognos (1.48%) Alteryx (1.33%) powerbi (1.26%) Google Data Studio (1.03%) looker (0.67%) MicroStrategy (0.59%) Spotfire (0.52%) une (0.37%) de (0.37%) à (0.37%) google (0.30%) 2021 (0.30%) 
major: -- Computer Science (38.95%) Engineering (31.04%) Statistics (24.83%) Mathematics (24.09%) Data Science (15.23%) Economics (9.31%) Science (7.76%) Information Technology (4.80%) Finance (4.21%) Business (4.07%) computer science (3.92%) Information Systems (2.88%) statistics (2.81%) Physics (2.73%) Marketing (2.29%) Business Administration (2.00%) engineering (1.85%) mathematics (1.63%) economic

In [8]:
import json

# Mapping of raw value -> cleaned value, one sub-dict per category of skill_set.
skill_clean = {key: {} for key in skill_set.keys()}

# Load the existing (possibly hand-edited) mapping from skill_clean.json.
# The file is written below as UTF-8 with ensure_ascii=False, so it must be
# read back as UTF-8 too; relying on the platform default encoding can fail on
# non-ASCII entries.
try:
    with open("skill_clean.json", encoding="utf-8") as f:
        _sc = json.load(f)
except FileNotFoundError:
    # First run: there is no mapping yet, start from an empty one.
    # Any other failure (unreadable or malformed file) is deliberately NOT
    # swallowed: continuing would overwrite the file below and silently
    # discard the manual edits it contains.
    _sc = {}

# Merge the stored entries; categories present in the file but absent from
# skill_set are kept rather than dropped.
for k1, v1 in _sc.items():
    skill_clean.setdefault(k1, {}).update(v1)

# Seed an identity mapping for each of the 30 most common raw values per
# category that does not have an entry yet; existing entries are left as-is.
for key in skill_set.keys():
    for raw_value, _ in Counter(skill_set[key]).most_common(30):
        skill_clean[key].setdefault(raw_value, raw_value)

with open('skill_clean.json', 'w', encoding='utf-8') as f:
    json.dump(skill_clean, f, ensure_ascii=False, indent=4)


In [10]:
# After manually editing the mapping in skill_clean.json, apply it to the database.

# The mapping file is written as UTF-8 (with non-ASCII characters kept as-is),
# so read it with the same encoding instead of the platform default.
with open("skill_clean.json", encoding="utf-8") as f:
    sc = json.load(f)

documents = collection.find({})

for item in documents:
    # Documents already flagged as cleaned are left untouched, so the cell can
    # be re-run without processing them again.
    if item.get("cleaned", False):
        continue
    for key, mapping in sc.items():
        if key not in item:
            continue
        # Translate every raw value that has a mapping entry (values without
        # one are dropped) and de-duplicate. dict.fromkeys keeps first-seen
        # order, so the stored list is deterministic, unlike list(set(...)).
        cleaned_values = list(
            dict.fromkeys(mapping[s] for s in item[key] if s in mapping)
        )
        if cleaned_values:
            item[key] = cleaned_values
        else:
            # Nothing survived the mapping: remove the field altogether.
            item.pop(key)
    item["cleaned"] = True

    # Write the whole rewritten document back, matched by its _id.
    collection.replace_one(
        {"_id": item["_id"]},
        item
    )