# Import 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import ast
import os

from IPython.display import display, Markdown
from pymongo.mongo_client import MongoClient
from bson.objectid import ObjectId

# %pip install deep-translator
from deep_translator import GoogleTranslator

In [2]:
# Define the directory containing the CSV files
directory = "csv_backup"

# Initialize an empty list to store DataFrames
dataframes = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):  # Check if the file is a CSV
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        ### DROP DUPLICATES
        df = df.drop_duplicates()
        dataframes.append(df)

# Combine all DataFrames into a single DataFrame (optional)
data = pd.concat(dataframes)
data.head(3)

Unnamed: 0,ten_cong_viec,ten_cong_ty,muc_luong,dia_chi,ngay_dang,nganh_nghe,quy_mo_cong_ty,quoc_tich_cong_ty,nam_kinh_nghiem,cap_bac,loai_hinh,loai_hop_dong,cong_nghe_su_dung,quy_trinh_phong_van,mo_ta_cong_viec,thong_tin_cong_ty,url,thoi_gian_hien_tai,url_cong_ty
0,['Automotive Test Engineer'],['Công ty TNHH Yura Corporation Bắc Ninh_Chi n...,[],"['Tầng 10, tòa nhà Richy, tổ 44, Phường Yên Ho...",['Đăng 5 giờ trước'],"['Software, Phần Mềm']",['Hơn 1000 Nhân viên'],['Korea'],['Từ 1 năm'],"['Junior, Middle']",['In Office'],['Fulltime'],"['Tester', 'Test', 'Automotive']","['Vòng 1: Phỏng vấn qua điện thoại', 'Vòng 2: ...","[""Trách nhiệm công việc\nTest cases generation...",Về chúng tôi\nWe at YURA Corporation are recru...,https://topdev.vn/viec-lam/automotive-test-eng...,2024-11-28 20:55:17.541738,https://topdev.vn/vi/nha-tuyen-dung/cong-ty-tn...
1,['.NET Developer'],['CÔNG TY TNHH SAMSUNG SDS VIỆT NAM'],['Thương lượng'],"['Tòa nhà PVI, số 1 Phạm Văn Bạch, Phường Yên ...",['Đăng 5 giờ trước'],['Dịch vụ IT'],['Hơn 1000 Nhân viên'],['South-Korea'],['Từ 6 năm'],"['Middle, Senior']",['In Office'],['Fulltime'],"['ASP.NET', 'C#', 'Git']",['Vòng 1: CV phù hợp sẽ được liên hệ trong 15 ...,['Trách nhiệm công việc\nWork for projects to ...,Về chúng tôi\nSamsung SDS được thành lập năm 1...,https://topdev.vn/viec-lam/net-developer-cong-...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/cong-ty-tnhh-...
2,['Full-stack Developer (Java/.NET/PHP - 3 YOE)'],['Allexceed Việt Nam'],['Thương lượng'],"['Tầng 15, Opal Tower, 92 Nguyễn Hữu Cảnh, Phư...",['Đăng 5 giờ trước'],['Phần Mềm'],['25-99 Nhân viên'],['Japan'],['Từ 3 năm'],"['Middle, Senior']",['In Office'],['Fulltime'],"['PHP', 'Java', '.NET']",['Vòng 1: Phỏng vấn kỹ thuật với team phát tri...,['Top 3 reasons to join us\nCơ hội làm việc (n...,Về chúng tôi\nALLEXCEED VIETNAM (thành lập 201...,https://topdev.vn/viec-lam/full-stack-develope...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/allexceed-vie...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10469 entries, 0 to 505
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ten_cong_viec        10469 non-null  object
 1   ten_cong_ty          10469 non-null  object
 2   muc_luong            10469 non-null  object
 3   dia_chi              10469 non-null  object
 4   ngay_dang            10469 non-null  object
 5   nganh_nghe           10469 non-null  object
 6   quy_mo_cong_ty       10469 non-null  object
 7   quoc_tich_cong_ty    10469 non-null  object
 8   nam_kinh_nghiem      10469 non-null  object
 9   cap_bac              10469 non-null  object
 10  loai_hinh            10469 non-null  object
 11  loai_hop_dong        10469 non-null  object
 12  cong_nghe_su_dung    10469 non-null  object
 13  quy_trinh_phong_van  10469 non-null  object
 14  mo_ta_cong_viec      10469 non-null  object
 15  thong_tin_cong_ty    10457 non-null  object
 16  url        

# Preprocess

In [4]:
data["muc_luong"] = data["muc_luong"].replace("[]", "['Thương lượng']")
data["thoi_gian_hien_tai"] = data["thoi_gian_hien_tai"].apply(pd.to_datetime)
for col in data.columns:
    if col in ["thong_tin_cong_ty", "url", "url_cong_ty", "thoi_gian_hien_tai"]:
        continue
    data[col] = data[col].apply(ast.literal_eval)


def getOnlyElement(series):
    return series[0]


for col in data.columns:
    if col in ["thong_tin_cong_ty", "url", "url_cong_ty", "thoi_gian_hien_tai"]:
        continue
    if col in [
        "dia_chi",
        "loai_hinh",
        "loai_hop_dong",
        "cong_nghe_su_dung",
        "quy_trinh_phong_van",
    ]:
        continue
    data[col] = data[col].apply(getOnlyElement)

In [5]:
def split_(series):
    return series.split(", ")

def castUSA(series):
    return series.replace("USA", "United States")

data["cap_bac"] = data.cap_bac.apply(split_)
data["quoc_tich_cong_ty"] = data.quoc_tich_cong_ty.apply(castUSA).apply(split_)


In [6]:
def split_domain(series):
    lst = series.split(", ")
    if "Phần Mềm" in lst or "Software" in lst:
        lst = [x for x in lst if x!="Phần Mềm" and x!="Software"]   
        lst.append("Phần Mềm")     
    return lst
data["nganh_nghe"] = data.nganh_nghe.apply(split_domain)

In [7]:
def experience(series):
    num = re.findall(r"\d+", series)
    if len(num) == 0:
        return 0
    num = num[0]
    if "tháng" in series:
        return int(num) * 1.0 / 12
    if "năm" in series:
        return int(num)
    return pd.NA


data["nam_kinh_nghiem"] = data.nam_kinh_nghiem.apply(experience)

In [8]:
def size(series):
    if "." in series:
        series = series.replace(".", "")
    num = re.findall(r"\d+", series)
    if len(num) == 0:
        return None
    res = int(num[0])
    if res < 100:
        return "Nhỏ"
    if res < 1000:
        return "Vừa"
    return "Lớn"

data["quy_mo_cong_ty"] = data.quy_mo_cong_ty.apply(size)

In [9]:
def extract_salary_range(salary):
    if salary == "Thương lượng":
        return [0, 0]
    numbers = re.findall(r"[\d.,]+", salary)
    if numbers:
        numbers = [float(num.replace(".", "").strip()) for num in numbers]
        if len(numbers) == 1:
            if numbers[0] > 1_000_000:
                numbers[0] = numbers[0] / 24_000
            return [0, round(numbers[0], 2)]
        for i in range(len(numbers)):
            if numbers[i] > 1_000_000:
                numbers[i] = numbers[i] / 24_000

        # Lấy giá trị nhỏ nhất và lớn nhất
        min_value = min(numbers)
        max_value = max(numbers)

        return [round(min_value, 2), round(max_value, 2)]
    return [0, 0]


data["muc_luong"] = data["muc_luong"].apply(extract_salary_range)

In [10]:
data.replace("", pd.NA, inplace=True)
data.replace(pd.NA, "Khong co thong tin", inplace=True)

## Translate the job title into English

In [11]:
# 2. Hàm dịch tự động sử dụng Deep Translator
def translate_to_english(text):
    try:
        # Sử dụng Google Translate qua Deep Translator
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        return translated
    except Exception as e:
        print(f"Translation failed for: {text}, Error: {e}")
        return text  # Trả về văn bản gốc nếu dịch lỗi
map_data = data.ten_cong_viec.value_counts().to_frame().reset_index().drop(columns=["count"])
vectorized_func = np.vectorize(translate_to_english)
map_res = vectorized_func(map_data["ten_cong_viec"].to_numpy())

In [12]:
map_data["translated"] = map_res
map_data = map_data.set_index("ten_cong_viec")
map_dict = map_data["translated"].to_dict()

In [13]:
def map_translate_result(text):
    try:
        return map_dict[text]
    except:
        print("Error mapping")
        return ""

# 6. Áp dụng hàm vào cột "Job Title"
data['ten_cong_viec'] = data['ten_cong_viec'].apply(map_translate_result)

# 7. In kết quả
data['ten_cong_viec']

0                               Automotive Test Engineer
1                                         .NET Developer
2           Full-stack Developer (Java/.NET/PHP - 3 YOE)
3                                    Fullstack Developer
4      Platform Solution Specialist - Platform Soluti...
                             ...                        
501                           Senior Front-End Developer
502                           Senior Fullstack Developer
503    [Urgent] SOFTWARE ENGINEER (C#, Prism Library ...
504                       Mobile App Developer (Flutter)
505                                      software tester
Name: ten_cong_viec, Length: 10469, dtype: object

## Cluster the jobs from job titles

In [14]:
# Reusing job group keywords for classification
job_groups = {
    "Management": ["manager", "lead", "coordinator", "scrum", "product owner","administrator","management"],
    "Software/Web/Mobile Development": ["programmer", "coder", "software","web","frontend","backend","game","development","fullstack","dev", "mobile", "ios", "android", "flutter", "react native"],
    "Data & AI": ["data", "ml", "machine learning", "big data", "scientist","ai engineer","ai technical"],
    "Business Analysis":["business analyst","business analysis","business system analyst"],
    "IT Solution & Consulting":["solution","consultant"],
    "QA & Testing": ["qa", "test", "testing", "automation","assurance"],
    "Infrastructure & DevOps": ["devops", "cloud", "system", "administrator", "infrastructure","architect"],
    "Cybersecurity": ["security", "cyber", "pen tester", "forensic", "vulnerability"],
    "Design & UI/UX": ["designer", "ui", "ux", "graphic", "product designer","2D","3D"],
    "Support": ["support", "helpdesk", "service desk"],
    "Other": []
}

# Classify jobs into groups based on keywords
def classify_job(title):
    title_lower = title.lower()
    matched_groups = []
    for group, keywords in job_groups.items():
        if any(keyword in title_lower for keyword in keywords):
            matched_groups.append(group)
    return matched_groups if matched_groups else ["Other"]

job_titles_sample = data["ten_cong_viec"]
# Apply classification to the sample titles
classified_jobs = {title: classify_job(title) for title in job_titles_sample}
data["nhom_cong_viec"] = classified_jobs

In [15]:
data["nhom_cong_viec"] = data["ten_cong_viec"].apply(classify_job)
data.nhom_cong_viec.explode().value_counts()

nhom_cong_viec
Software/Web/Mobile Development    5302
Management                         1752
Infrastructure & DevOps            1427
QA & Testing                       1095
Data & AI                          1049
Cybersecurity                       555
IT Solution & Consulting            516
Business Analysis                   467
Other                               448
Support                             332
Design & UI/UX                      182
Name: count, dtype: int64

In [16]:
data.head()

Unnamed: 0,ten_cong_viec,ten_cong_ty,muc_luong,dia_chi,ngay_dang,nganh_nghe,quy_mo_cong_ty,quoc_tich_cong_ty,nam_kinh_nghiem,cap_bac,loai_hinh,loai_hop_dong,cong_nghe_su_dung,quy_trinh_phong_van,mo_ta_cong_viec,thong_tin_cong_ty,url,thoi_gian_hien_tai,url_cong_ty,nhom_cong_viec
0,Automotive Test Engineer,Công ty TNHH Yura Corporation Bắc Ninh_Chi nhá...,"[0, 0]","[Tầng 10, tòa nhà Richy, tổ 44, Phường Yên Hoà...",Đăng 5 giờ trước,[Phần Mềm],Lớn,[Korea],1.0,"[Junior, Middle]",[In Office],[Fulltime],"[Tester, Test, Automotive]","[Vòng 1: Phỏng vấn qua điện thoại, Vòng 2: Phỏ...",Trách nhiệm công việc\nTest cases generation b...,Về chúng tôi\nWe at YURA Corporation are recru...,https://topdev.vn/viec-lam/automotive-test-eng...,2024-11-28 20:55:17.541738,https://topdev.vn/vi/nha-tuyen-dung/cong-ty-tn...,[QA & Testing]
1,.NET Developer,CÔNG TY TNHH SAMSUNG SDS VIỆT NAM,"[0, 0]","[Tòa nhà PVI, số 1 Phạm Văn Bạch, Phường Yên H...",Đăng 5 giờ trước,[Dịch vụ IT],Lớn,[South-Korea],6.0,"[Middle, Senior]",[In Office],[Fulltime],"[ASP.NET, C#, Git]",[Vòng 1: CV phù hợp sẽ được liên hệ trong 15 n...,Trách nhiệm công việc\nWork for projects to de...,Về chúng tôi\nSamsung SDS được thành lập năm 1...,https://topdev.vn/viec-lam/net-developer-cong-...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/cong-ty-tnhh-...,[Software/Web/Mobile Development]
2,Full-stack Developer (Java/.NET/PHP - 3 YOE),Allexceed Việt Nam,"[0, 0]","[Tầng 15, Opal Tower, 92 Nguyễn Hữu Cảnh, Phườ...",Đăng 5 giờ trước,[Phần Mềm],Nhỏ,[Japan],3.0,"[Middle, Senior]",[In Office],[Fulltime],"[PHP, Java, .NET]",[Vòng 1: Phỏng vấn kỹ thuật với team phát triể...,Top 3 reasons to join us\nCơ hội làm việc (ngắ...,Về chúng tôi\nALLEXCEED VIETNAM (thành lập 201...,https://topdev.vn/viec-lam/full-stack-develope...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/allexceed-vie...,[Software/Web/Mobile Development]
3,Fullstack Developer,CÔNG TY TNHH SAMSUNG SDS VIỆT NAM,"[0, 0]","[Tòa nhà PVI, số 1 Phạm Văn Bạch, Phường Yên H...",Đăng 6 giờ trước,[Dịch vụ IT],Lớn,[South-Korea],4.0,[Middle],[In Office],[Fulltime],"[Full-Stack, VueJS, Java Core]",[Vòng 1: CV phù hợp sẽ được liên hệ trong 15 n...,Trách nhiệm công việc\nTham gia phát triển và ...,Về chúng tôi\nSamsung SDS được thành lập năm 1...,https://topdev.vn/viec-lam/fullstack-developer...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/cong-ty-tnhh-...,[Software/Web/Mobile Development]
4,Platform Solution Specialist - Platform Soluti...,MBBANK,"[0, 0]","[MB Tower, số 18 Lê Văn Lương, Phường Trung Ho...",Đăng 7 giờ trước,[Ngân Hàng],Lớn,[Vietnam],0.0,"[Junior, Middle, Senior]",[In Office],[Fulltime],"[Oracle, Database, Kafka]",[],Trách nhiệm công việc\n Thực hiện việc nghiên ...,"Về chúng tôi\nĐược thành lập từ năm 1994, với ...",https://topdev.vn/viec-lam/chuyen-vien-giai-ph...,2024-11-28 20:55:17.541738,https://topdev.vn/nha-tuyen-dung/mbbank-94346?...,[IT Solution & Consulting]


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10469 entries, 0 to 505
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ten_cong_viec        10469 non-null  object        
 1   ten_cong_ty          10469 non-null  object        
 2   muc_luong            10469 non-null  object        
 3   dia_chi              10469 non-null  object        
 4   ngay_dang            10469 non-null  object        
 5   nganh_nghe           10469 non-null  object        
 6   quy_mo_cong_ty       10469 non-null  object        
 7   quoc_tich_cong_ty    10469 non-null  object        
 8   nam_kinh_nghiem      10469 non-null  float64       
 9   cap_bac              10469 non-null  object        
 10  loai_hinh            10469 non-null  object        
 11  loai_hop_dong        10469 non-null  object        
 12  cong_nghe_su_dung    10469 non-null  object        
 13  quy_trinh_phong_van  10469 non-null  o

# Push to MongoDB Atlas (Cloud Database)

In [18]:
uri = "mongodb+srv://endgame:endgame@cluster0.rkdhc.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"

# Create a new client and connect to the server
client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

db = client["udptdltm"]
collection = db["data"]

Pinged your deployment. You successfully connected to MongoDB!


> Check the len of the database before

In [19]:
object_ids = [doc["_id"] for doc in collection.find({}, {"_id": 1})]
len(object_ids)

0

> Push to cloud

In [20]:
tmp = []
for i in range(data.shape[0]):
    tmp.append(data.iloc[i].to_dict())

collection.insert_many(tmp)

InsertManyResult([ObjectId('6750d6c517a8543ef33f8828'), ObjectId('6750d6c517a8543ef33f8829'), ObjectId('6750d6c517a8543ef33f882a'), ObjectId('6750d6c517a8543ef33f882b'), ObjectId('6750d6c517a8543ef33f882c'), ObjectId('6750d6c517a8543ef33f882d'), ObjectId('6750d6c517a8543ef33f882e'), ObjectId('6750d6c517a8543ef33f882f'), ObjectId('6750d6c517a8543ef33f8830'), ObjectId('6750d6c517a8543ef33f8831'), ObjectId('6750d6c517a8543ef33f8832'), ObjectId('6750d6c517a8543ef33f8833'), ObjectId('6750d6c517a8543ef33f8834'), ObjectId('6750d6c517a8543ef33f8835'), ObjectId('6750d6c517a8543ef33f8836'), ObjectId('6750d6c517a8543ef33f8837'), ObjectId('6750d6c517a8543ef33f8838'), ObjectId('6750d6c517a8543ef33f8839'), ObjectId('6750d6c517a8543ef33f883a'), ObjectId('6750d6c517a8543ef33f883b'), ObjectId('6750d6c517a8543ef33f883c'), ObjectId('6750d6c517a8543ef33f883d'), ObjectId('6750d6c517a8543ef33f883e'), ObjectId('6750d6c517a8543ef33f883f'), ObjectId('6750d6c517a8543ef33f8840'), ObjectId('6750d6c517a8543ef33f88

> Check the len of the database after

In [21]:
object_ids = [doc["_id"] for doc in collection.find({}, {"_id": 1})]
len(object_ids)

10469

In [22]:
collection.distinct("thoi_gian_hien_tai")

[datetime.datetime(2024, 10, 29, 23, 49, 40, 410000),
 datetime.datetime(2024, 10, 30, 21, 12, 27, 898000),
 datetime.datetime(2024, 10, 31, 22, 31, 43, 598000),
 datetime.datetime(2024, 11, 1, 18, 57, 33, 986000),
 datetime.datetime(2024, 11, 2, 21, 31, 49, 226000),
 datetime.datetime(2024, 11, 3, 21, 38, 30),
 datetime.datetime(2024, 11, 4, 22, 16, 5, 607000),
 datetime.datetime(2024, 11, 5, 21, 36, 12, 594000),
 datetime.datetime(2024, 11, 6, 22, 21, 3, 196000),
 datetime.datetime(2024, 11, 7, 21, 16, 27, 26000),
 datetime.datetime(2024, 11, 8, 21, 29, 37, 990000),
 datetime.datetime(2024, 11, 9, 22, 2, 25, 48000),
 datetime.datetime(2024, 11, 10, 20, 31, 45, 149000),
 datetime.datetime(2024, 11, 11, 14, 20, 13, 537000),
 datetime.datetime(2024, 11, 12, 21, 37, 52, 656000),
 datetime.datetime(2024, 11, 13, 13, 39, 32, 143000),
 datetime.datetime(2024, 11, 14, 14, 45, 15, 444000),
 datetime.datetime(2024, 11, 15, 21, 20, 17, 292000),
 datetime.datetime(2024, 11, 16, 20, 54, 43, 49500