In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [140]:
# Load the raw job-postings table; the cells below rename and filter its columns.
jobs = pd.read_csv(filepath_or_buffer='jobs.csv')

In [141]:
# English headers, assigned positionally: the order here must match the column
# order of the CSV exactly, and the count must equal the number of columns
# (pandas raises on a length mismatch).
english_headers = [
    'Vacancy Category',
    'Position Category',
    'Position',
    'County City',
    'Region',
    'Number of Supply and Demand',
    'Company Name',
    'Vacancy Name',
    'Job Content',
    'Job Category',
    'Job Benefits',
    'Work Nature',
    'Working Location',
    'Management Responsibilities',
    'Working Hours',
    'Number of People Required',
    'Working Experience',
    'Education Requirements',
    'Department requirements',
    'Proficiency in tools',
    'Work skills',
    'Other conditions',
    'Capital amount',
    'Number of employees',
    'Company label',
]
jobs.columns = english_headers

In [142]:
from googletrans import Translator

# One shared translation client, reused by the translation cell further down.
translator = Translator()

In [143]:
# Show the cardinality of every column (a missing value counts as one distinct
# value) to help decide which columns are worth keeping and translating.
for name, values in jobs.items():
    distinct_values = values.unique()
    print(name, ":", len(distinct_values))

Vacancy Category : 2
Position Category : 3
Position : 21
County City : 23
Region : 221
Number of Supply and Demand : 5
Company Name : 6736
Vacancy Name : 25514
Job Content : 26049
Job Category : 4906
Job Benefits : 1346
Work Nature : 64
Working Location : 9758
Management Responsibilities : 337
Working Hours : 1335
Number of People Required : 96
Working Experience : 12
Education Requirements : 28
Department requirements : 1345
Proficiency in tools : 9319
Work skills : 4556
Other conditions : 16999
Capital amount : 1506
Number of employees : 482
Company label : 44


In [144]:
# Columns retained for the rest of the notebook; every other column is dropped.
columns_to_keep = [
    "Position",
    "County City",
    "Region",
    "Company Name",
    "Job Benefits",
    "Work Nature",
    "Number of People Required",
    "Working Experience",
    "Education Requirements",
    "Proficiency in tools",
    "Number of employees",
]
jobs = jobs.loc[:, columns_to_keep]

In [145]:
# Low-cardinality columns (fewer than 100 distinct values, a missing value
# counting as one) are the ones cheap enough to translate value by value.
short_columns = [
    name
    for name, values in jobs.items()
    if values.nunique(dropna=False) < 100
]

print(short_columns)

['Position', 'County City', 'Work Nature', 'Number of People Required', 'Working Experience', 'Education Requirements']


In [146]:
# 'Work Nature' labels with at most this many postings are treated as rare.
RARE_WORK_NATURE_MAX_COUNT = 5

# Frequency table with explicit column names. Naming the index and the value
# column here (instead of relying on what reset_index() happens to produce)
# keeps the 'Work Nature' / 'count' lookups below valid on pandas 1.x as well
# as 2.x, where the default names differ.
work_nature_count = (
    jobs['Work Nature']
    .value_counts()
    .rename_axis('Work Nature')
    .reset_index(name='count')
)

# Labels frequent enough to be kept as their own category.
work_natures = work_nature_count.loc[
    work_nature_count['count'] > RARE_WORK_NATURE_MAX_COUNT, 'Work Nature'
].tolist()

# Everything else is pooled into a single bucket. Missing values are pooled
# too: value_counts() skips them by default, so they are never in work_natures.
jobs.loc[~jobs['Work Nature'].isin(work_natures), 'Work Nature'] = 'Not Specified'
jobs['Work Nature'].value_counts()

Work Nature
全職                               57616
兼職                                 463
兼職 - 長期工讀                          326
Not Specified                      187
兼職 - 長期工讀、短期工讀                      76
兼職 - 短期工讀                           45
兼職 - 長期工讀、短期工讀、寒假工讀、暑假工讀            26
兼職 - 長期工讀、暑假工讀                      24
兼職 - 長期工讀、短期工讀、假日工讀、寒假工讀、暑假工讀       19
兼職 - 長期工讀、短期工讀、假日工讀                 18
兼職 - 長期工讀、短期工讀、暑假工讀                 15
兼職 - 長期工讀、假日工讀                      12
兼職 - 長期工讀、短期工讀、寒假工讀                  6
Name: count, dtype: int64

In [147]:
# Names of the columns whose values have already been translated. Kept in its
# own cell so the translation cell can be re-run without redoing finished columns.
already_translated = list()

In [149]:
# Source text -> English text. Shared across columns so that a value appearing
# in several columns is only sent to the translation service once.
translated_values = {}

# 'Region' was excluded by the cardinality filter, so it is added by hand.
# The guard keeps a re-run of this cell from appending it a second time.
if 'Region' not in short_columns:
    short_columns.append('Region')

for column in short_columns:
    print(column)
    if column in already_translated:
        # Finished during an earlier run of this cell: just show the values.
        print(jobs[column].unique())
        continue
    try:
        # Translate every distinct value of the column (missing values
        # included, exactly as before) ...
        for word in jobs[column].unique():
            if word not in translated_values:
                translated_values[word] = translator.translate(word, src='zh-tw', dest='en').text
        # ... then rewrite the column once. Doing this after the loop means a
        # failure part-way through leaves the column untouched rather than
        # half-translated, and avoids one full-column replace per word.
        jobs[column] = jobs[column].replace(translated_values)
    except Exception as error:
        # Best effort: report which column failed and why, then stop. The
        # column is not recorded in already_translated, so re-running the cell
        # retries it. KeyboardInterrupt is deliberately not caught.
        print('Error occurred while translating column', repr(column), ':', repr(error))
        break
    print(jobs[column].unique())
    already_translated.append(column)

Position
['Software Director' 'Nan' 'E -commerce technology supervisor'
 'Communication software engineer' 'Software design engineer'
 'Holler design engineer' 'Internet designer' 'System analyst'
 'Video game designer' 'Other information professionals'
 'Information Assistant' 'BIOS engineer' 'Algorithm development engineer'
 'MIS_ 主 m m m' 'Database manager' 'MIS programmer' 'MES engineer'
 'Internet management engineer' 'System Maintenance'
 'Information equipment control staff' 'Internet security analyst']
County City
['Taipei City' 'Nan' 'New Taipei City' 'Yilan County' 'Taoyuan City'
 'Hsinchu City' 'Hsinchu County' 'Miaoli County' 'Taichung City'
 'Changhua County' 'Nantou County' 'Chiayi City' 'Tainan City' 'Kaohsiung'
 'Pingtung County' 'Hualien County' 'Keelung City' 'Yunlin County'
 'Chiayi County' 'Taitung County' 'Penghu County' 'Kinmen County'
 'Lianjiang County']
Work Nature
['full time' 'Not specify' 'part time' 'Part -time -long -term work'
 'Part -time -long -term wor

In [150]:
# Give the two plain employment types their hyphenated display labels.
for raw_label, tidy_label in (('full time', 'Full-time'), ('part time', 'Part-time')):
    matches_raw = jobs['Work Nature'].eq(raw_label)
    jobs.loc[matches_raw, 'Work Nature'] = tidy_label

# Collapse every "no value" placeholder, anywhere in the frame, into one label.
for placeholder in ('不拘', 'Nan', 'Not specify'):
    jobs.replace(placeholder, 'Not specified', inplace=True)

In [151]:
# Keep only the first run of digits from each entry; entries without any digit
# become NaN. The result is still text, not a numeric dtype.
# The pattern is a raw string so that `\d` is not parsed as an (invalid) string
# escape, and expand=False returns a Series rather than a one-column DataFrame.
jobs['Number of employees'] = jobs['Number of employees'].str.extract(r'(\d+)', expand=False)

In [152]:
# Persist the cleaned, translated table; the row index is not written out.
jobs.to_csv(path_or_buf='jobs_translated.csv', index=False)