In [1]:
import pandas as pd
from gensim import models

# fastText model in Facebook's binary format (file name suggests the pretrained
# Korean cc.ko.300 vectors); the similarity helpers below read `ko_model.wv`.
ko_model = models.fasttext.load_facebook_model('data/cc.ko.300.bin.gz')

# These three tables keep the column names found in their own header rows.
df_job_info = pd.read_csv('data/job_data.csv', encoding='utf-8')
df_job_major_subject = pd.read_csv('data/job_major_subject.csv', encoding='utf-8')
df_major_info = pd.read_csv('data/major_info.csv', encoding='utf-8')

# header=0 together with `names` replaces the file's header row by the names
# given here, so downstream code can rely on `subject_name`, `description`
# and `subject_grade`.
df_subject_info = pd.read_csv(
    'data/subject_info.csv',
    encoding='utf-8',
    header=0,
    names=['index', 'subject_name', 'description', 'subject_type', 'subject_grade'],
)


In [4]:

def similarity_function(method, text1, text2):
    """Score how similar two texts are using the loaded embedding model.

    Both accepted method names, 'fasttext' and 'SBERT', are currently scored
    the same way: each text is split on whitespace and the two token lists
    are handed to ``ko_model.wv.n_similarity``.

    Args:
        method: Name of the similarity backend.
        text1: First text (must support ``.split()``).
        text2: Second text (must support ``.split()``).

    Returns:
        Whatever ``n_similarity`` returns for the two token lists, or ``None``
        (after printing 'No method') when ``method`` is not recognised.
    """
    # Guard clause: unknown backends are reported and yield None.
    if method not in ('fasttext', 'SBERT'):
        print('No method')
        return None

    first_tokens = text1.split()
    second_tokens = text2.split()
    return ko_model.wv.n_similarity(first_tokens, second_tokens)


def _parse_subject_names(raw_subjects):
    """Turn one major's raw subject string into a cleaned list of subject names.

    Args:
        raw_subjects: Text in which subject names are separated by ',' or ':'.

    Returns:
        list[str]: Subject names in their original order, with '~교과' noise
        entries dropped, spaces removed, and bare one-character suffixes
        expanded from the preceding name.
    """
    tokens = raw_subjects.replace(':', ',').split(',')

    # Remove noise entries written as '~교과'. A new list is built on purpose:
    # removing from the list while iterating over it skips the element that
    # follows each removal, so consecutive noise entries used to survive.
    tokens = [token for token in tokens if '교과' not in token]
    tokens = [token.replace(' ', '') for token in tokens]

    # A subject written only as its one-character suffix (e.g. 'Ⅱ' instead of
    # '수학Ⅱ') borrows its stem from the immediately preceding subject name.
    # Iteration starts at 1 because the first entry has no predecessor; it is
    # left unchanged rather than wrapping around to the last entry.
    for pos in range(1, len(tokens)):
        if len(tokens[pos]) == 1:
            tokens[pos] = tokens[pos - 1][:-1] + tokens[pos]
    return tokens


def job_major_subject_matching(job_name):
    """Collect the majors and subjects associated with a job.

    Reads the module-level ``df_job_major_subject`` frame (columns ``job``,
    ``major`` and ``subject_details``).

    Args:
        job_name: Value to match against the ``job`` column.

    Returns:
        tuple: ``(job_major, major_subject, job_subject)`` where
            * ``job_major`` is ``{job_name: [major, ...]}``,
            * ``major_subject`` is ``{major: [subject, ...]}`` (duplicates
              within one major are kept),
            * ``job_subject`` is ``{job_name: [subject, ...]}`` holding every
              distinct subject across the job's majors in first-seen order.
        When no row matches the job, the lists are empty and
        ``major_subject`` is an empty dict.
    """
    job_rows = df_job_major_subject[df_job_major_subject['job'] == job_name]

    # job-major matching
    job_major = {job_name: list(job_rows['major'].unique())}

    # major-subject matching: concatenate all subject strings of each major
    subject_details = job_rows.groupby(
        'major', group_keys=False)['subject_details'].apply(lambda parts: ','.join(parts))

    major_subject = {
        major: _parse_subject_names(raw_subjects)
        for major, raw_subjects in subject_details.items()
    }

    # De-duplicate once, after all majors are parsed. dict.fromkeys keeps the
    # first-seen order so the result is reproducible across kernel restarts
    # (iterating a set of strings is not).
    unique_subjects = dict.fromkeys(
        subject for subjects in major_subject.values() for subject in subjects)
    job_subject = {job_name: list(unique_subjects)}

    return job_major, major_subject, job_subject
# from job_major, get major list and calculate similarity between job and major


def get_job_major_similarity(df_job_info, df_major_info, job_major, sim_method):
    """Score each major of a job against the job's description.

    Args:
        df_job_info: Frame with ``job`` and ``job_summary`` columns.
        df_major_info: Frame with ``major`` and ``major_summary`` columns.
        job_major: Mapping ``{job: [major, ...]}``.
        sim_method: Method name forwarded to ``similarity_function``.

    Returns:
        dict: ``{major: similarity}``. A major listed under several jobs keeps
        the value computed last.

    Raises:
        IndexError: If a major or job has no matching row in its frame.
    """
    similarities = {}
    for job, majors in job_major.items():
        for major in majors:
            # First matching row supplies the major's summary text.
            major_rows = df_major_info[df_major_info['major'] == major]
            major_text = major_rows.major_summary.values[0]

            # First matching row supplies the job's summary text.
            job_rows = df_job_info[df_job_info['job'] == job]
            job_text = job_rows.job_summary.values[0]

            similarities[major] = similarity_function(
                sim_method, major_text, job_text)
    return similarities
# from major_subject, get subject list and calculate similarity between major description and subject description


def get_major_subject_similarity(df_major_info, df_subject_info, major_subject, sim_method):
    """Score every subject of every major against the major's description.

    Args:
        df_major_info: Frame with ``major`` and ``major_summary`` columns.
        df_subject_info: Frame with ``subject_name`` and ``description``
            columns. Subject names are matched after removing spaces.
        major_subject: Mapping ``{major: [subject, ...]}`` whose subject names
            contain no spaces.
        sim_method: Method name forwarded to ``similarity_function``.

    Returns:
        dict: ``{major: {subject: similarity}}``. Subjects that cannot be
        resolved or scored are left out; every major keeps an entry, possibly
        an empty dict.

    Raises:
        IndexError: If a major that has at least one subject is missing from
            ``df_major_info``.
    """
    # Space-stripped subject names, computed once. Previously the whole frame
    # went through a regex replace for every single subject lookup.
    normalized_names = df_subject_info['subject_name'].replace(' ', '', regex=True)

    major_subject_sim = dict()
    for major, subjects in major_subject.items():
        one_major_sim = dict()
        major_subject_sim[major] = one_major_sim
        if not subjects:
            # Nothing to score, so the major description is never needed.
            continue

        # The major description does not depend on the subject: fetch it once.
        major_desc = df_major_info[df_major_info['major']
                                   == major].major_summary.values[0]

        for subject in subjects:
            try:
                subject_desc = df_subject_info[normalized_names
                                               == subject].description.values[0]
                one_major_sim[subject] = similarity_function(
                    sim_method, major_desc, subject_desc)
            except Exception:
                # Deliberately best-effort: a subject without a matching row
                # (IndexError) or one whose similarity cannot be computed is
                # skipped instead of aborting the whole run.
                continue
    return major_subject_sim


def get_job_subject_similarity_1(df_job_info, df_subject_info, job_subject, sim_method):
    """Score each subject of a job directly against the job's description.

    Args:
        df_job_info: Frame with ``job`` and ``job_summary`` columns.
        df_subject_info: Frame with ``subject_name`` and ``description``
            columns. Subject names are matched after removing spaces.
        job_subject: Mapping ``{job: [subject, ...]}`` whose subject names
            contain no spaces.
        sim_method: Method name forwarded to ``similarity_function``.

    Returns:
        dict: ``{subject: similarity}``. Subjects that cannot be resolved or
        scored are left out.

    Raises:
        IndexError: If a job that has at least one subject is missing from
            ``df_job_info``.
    """
    # Space-stripped subject names, computed once. Previously the whole frame
    # went through a regex replace for every single subject lookup.
    normalized_names = df_subject_info['subject_name'].replace(' ', '', regex=True)

    job_subject_sim = dict()
    for job, subjects in job_subject.items():
        if not subjects:
            # Nothing to score, so the job description is never needed.
            continue

        # The job description does not depend on the subject: fetch it once.
        job_desc = df_job_info[df_job_info['job']
                               == job].job_summary.values[0]

        for subject in subjects:
            try:
                subject_desc = df_subject_info[normalized_names
                                               == subject].description.values[0]
                # similarity between the job description and the subject description
                job_subject_sim[subject] = similarity_function(
                    sim_method, job_desc, subject_desc)
            except Exception:
                # Deliberately best-effort: a subject without a matching row
                # (IndexError) or one whose similarity cannot be computed is
                # skipped instead of aborting the whole run.
                continue
    return job_subject_sim


def get_job_subject_similarity_2(job_major_sim, major_subject_sim):
    """Combine job-major and major-subject scores into job-subject scores.

    For every major, each of its subject scores is weighted by the major's
    own score; the weighted values are summed per subject across majors.

    Args:
        job_major_sim: Mapping ``{major: similarity}``.
        major_subject_sim: Mapping ``{major: {subject: similarity}}``.

    Returns:
        dict: ``{subject: sum over majors of major_sim * subject_sim}``.
        Majors absent from ``major_subject_sim`` contribute nothing.
    """
    job_subject_sim_2 = dict()
    for major, majorsim in job_major_sim.items():
        # `.get(major)` alone returns None for an unknown major, which made
        # the following `.items()` call fail; fall back to "no subjects".
        subject_sims = major_subject_sim.get(major) or {}
        for subject, subjectsim in subject_sims.items():
            weighted = majorsim * subjectsim
            # Explicit membership test instead of catching every exception.
            if subject in job_subject_sim_2:
                job_subject_sim_2[subject] += weighted
            else:
                job_subject_sim_2[subject] = weighted
    return job_subject_sim_2


def get_subject_subject_similarity(job_subject_sim, df_subject_info, threshold, sim_method):
    """Find pairs of subjects whose descriptions are more similar than a threshold.

    Args:
        job_subject_sim: Mapping whose keys are the subject names to compare
            (names without spaces); the values are not used.
        df_subject_info: Frame with ``subject_name`` and ``description``
            columns. Subject names are matched after removing spaces.
        threshold: A pair is kept only when its similarity is strictly greater.
        sim_method: Method name forwarded to ``similarity_function``.

    Returns:
        dict: ``{(subject1, subject2): similarity}`` where ``subject1`` comes
        before ``subject2`` in the key order of ``job_subject_sim``; each
        unordered pair is evaluated once.

    Raises:
        IndexError: If a subject has no matching row in ``df_subject_info``.
    """
    subject_list = list(job_subject_sim.keys())

    normalized_names = df_subject_info['subject_name'].replace(' ', '', regex=True)

    # Resolve every description once up front instead of repeating the same
    # frame lookup for each pair in the inner loop.
    descriptions = [
        df_subject_info[normalized_names == subject].description.values[0]
        for subject in subject_list
    ]

    subject_dict = dict()
    for idx1, subject1 in enumerate(subject_list):
        # Only idx2 > idx1: skips self-pairs and mirrored duplicates.
        for idx2 in range(idx1 + 1, len(subject_list)):
            similarity = similarity_function(
                sim_method, descriptions[idx1], descriptions[idx2])
            if similarity > threshold:
                subject_dict[(subject1, subject_list[idx2])] = similarity
    return subject_dict
def subject_mask(df, subject_grade):
    """Restrict a subject frame to a single grade.

    Args:
        df: Frame with a ``subject_grade`` column.
        subject_grade: Grade to keep, or ``None`` to disable filtering.

    Returns:
        ``df`` itself when ``subject_grade`` is ``None``; otherwise the rows
        whose ``subject_grade`` equals the requested value.
    """
    if subject_grade is not None:
        return df[df.subject_grade == subject_grade]
    return df


def similarity_all(job_name, sim_method, threshold_subject=0.98, grade=None):
    """Run the whole similarity pipeline for one job.

    Uses the module-level ``df_job_info``, ``df_major_info`` and
    ``df_subject_info`` frames.

    Args:
        job_name: Job to analyse.
        sim_method: Method name forwarded to the similarity helpers.
        threshold_subject: Threshold passed to
            ``get_subject_subject_similarity``.
        grade: Passed to ``subject_mask``; ``None`` keeps all subjects.

    Returns:
        tuple: ``(job_major_sim, major_subject_sim, job_subject_sim_1,
        job_subject_sim_2, subject_subject_sim)``, i.e. the results of the
        five similarity helpers in call order.
    """
    subjects_for_grade = subject_mask(df_subject_info, grade)
    job_major, major_subject, job_subject = job_major_subject_matching(job_name)

    major_scores = get_job_major_similarity(
        df_job_info, df_major_info, job_major, sim_method)
    subject_scores_by_major = get_major_subject_similarity(
        df_major_info, subjects_for_grade, major_subject, sim_method)
    direct_subject_scores = get_job_subject_similarity_1(
        df_job_info, subjects_for_grade, job_subject, sim_method)
    weighted_subject_scores = get_job_subject_similarity_2(
        major_scores, subject_scores_by_major)
    # Subject pairs are taken from the subjects that were scored directly.
    similar_subject_pairs = get_subject_subject_similarity(
        direct_subject_scores, subjects_for_grade, threshold_subject, sim_method)

    return (
        major_scores,
        subject_scores_by_major,
        direct_subject_scores,
        weighted_subject_scores,
        similar_subject_pairs,
    )



In [5]:
# Parameters for this run.
job_name = "통계학연구원"
sim_method = 'fasttext'
threshold_subject = 0.98
grade = 1

# Same steps as similarity_all(), run one by one so that every intermediate
# result stays available as a notebook variable for inspection in later cells.
df_subject_info_masked = subject_mask(df_subject_info, grade)
job_major, major_subject, job_subject = job_major_subject_matching(job_name)

job_major_sim = get_job_major_similarity(
    df_job_info, df_major_info, job_major, sim_method)
major_subject_sim = get_major_subject_similarity(
    df_major_info, df_subject_info_masked, major_subject, sim_method)
job_subject_sim_1 = get_job_subject_similarity_1(
    df_job_info, df_subject_info_masked, job_subject, sim_method)
job_subject_sim_2 = get_job_subject_similarity_2(
    job_major_sim, major_subject_sim)
subject_subject_sim = get_subject_subject_similarity(
    job_subject_sim_1, df_subject_info_masked, threshold_subject, sim_method)


In [8]:
df_subject_info_masked

Unnamed: 0,index,subject_name,description,subject_type,subject_grade
0,0,국어,국어는 대한민국의 공용어로서 사고와 의사소통의 도구이자 문화 창조와 전승의 기반이다...,common,1
1,1,통합과학,"‘통합과학’은 자연 현상을 통합적으로 이해하고, 이를 기반으로 자연 현상과 인간의 ...",common,1
2,2,과학탐구실험,고등학교 ‘과학탐구실험’은 9학년까지의 ‘과학’을 학습한 학생들을 대상으로 하여 과...,common,1
3,3,한국사,‘한국사’는 우리 역사가 형성·발전되어 온 과정을 이해하여 역사적으로 사고하고 현대...,common,1
4,4,수학,"수학과는 수학의 개념, 원리, 법칙을 이해하고 기능을 습득하여 주변의 여러 가지 현...",common,1
5,5,영어,공통 과목을 포함한 선택 과목으로서의 고등학교 영어 교과는 영어로 의사소통할 수...,common,1
6,6,통합사회,"통합사회는 인간, 사회, 국가, 지구 공동체 및 환경을 개별 학문의 경계를 넘어 통...",common,1


In [7]:
major_subject

{'수학과': ['수학',
  '사회',
  '정보',
  '과학',
  '수학Ⅰ',
  '수학Ⅱ',
  '미적분',
  '확률과통계',
  '사회문화',
  '경제',
  '생활과윤리',
  '정보',
  '논리학',
  '철학',
  '논술',
  '실용경제',
  '교육학',
  '실용수학',
  '기하',
  '경제수학',
  '수학과제탐구',
  '물리학Ⅱ',
  '심화수학Ⅰ',
  '심화수학Ⅱ',
  '고급수학Ⅰ',
  '고급수학Ⅱ',
  '정보과학',
  '정보처리와관리'],
 '통계학과': ['수학',
  '사회',
  '기술가정',
  '생활ㆍ교양',
  '수학Ⅰ',
  '수학Ⅱ',
  '미적분',
  '확률과통계',
  '경제',
  '사회문화',
  '생활과윤리',
  '정보',
  '실용경제',
  '실용수학',
  '기하',
  '경제수학',
  '수학과제탐구',
  '사회문제탐구',
  '심화수학Ⅰ',
  '정보과학',
  '국제경제',
  '국제법',
  '지역이해',
  '세계문제와미래사회',
  '국제관계와국제기구',
  '사회탐구방법및사회과제연구',
  '기업자원통합관리',
  '금융일반',
  '보험일반',
  '예산ㆍ자금']}