In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt
from pdftotextByPage import mining


<H2>문장들로 이뤄진 데이터 불러오기

In [2]:


train_data = pd.read_table('txtoutput4.txt')

if (train_data.isnull().values.any() == 'True'):
    train_data = train_data.dropna(how = 'any') # Null 값이 존재하는 행 제거

train_data.columns


Index(['Chapter 2:  Operating-System Structures (구조)  '], dtype='object')

In [3]:
train_data

Unnamed: 0,Chapter 2: Operating-System Structures (구조)
0,Chapter 2: Operating-System Structures
1,운영체제 서비스 (Operating System Services)
2,운영체제 사용자 인터페이스 (User Operating System Interface)
3,시스템 콜 (System Calls)
4,시스템 콜 유형 (Types of System Calls)
...,...
399,ROM 또는 EPROM(Erasable Programmable ROM)에 있는 bo...
400,두 단계로 나눠지는 경우도 있음- ROM의 특정한 위치에 있는부트블록(boot bl...
401,GRUB(GRand Unified Bootloader) : Linux 시스템을 위...
402,커널이 적재되어 시스템이 실행(running)되게 된다 1.632.63


<h3> 열이름 지정

In [4]:

train_data.columns=['Document'] 
train_data[:5]

Unnamed: 0,Document
0,Chapter 2: Operating-System Structures
1,운영체제 서비스 (Operating System Services)
2,운영체제 사용자 인터페이스 (User Operating System Interface)
3,시스템 콜 (System Calls)
4,시스템 콜 유형 (Types of System Calls)


<h3>문장에서 명사 추출

In [5]:
from konlpy.tag import Okt

nounsLine = []
nouns=[]
Okt = Okt()

for i in range (len(train_data['Document'])):
                line = train_data['Document'][i]# 한 페이지 씩 읽어 옴
                nouns = Okt.nouns(line) #한 페이지에 있는 명사들 모두 nouns에 저장 (리스트)

                if len(nouns) == 0 : 
                    nounsLine.append(' ')
                    continue #명사 없을 경우 ' ' 저장 후 pass
                line= ' '.join(map(str, nouns))
                nounsLine.append(line)
nounsLine
train_data['tokenizedDocument'] = nounsLine

<h3>결측값 제거

In [6]:
indexing = train_data[train_data['tokenizedDocument'] == ' '].index
train_data = train_data.drop(indexing)

<h2>토큰화

In [7]:
from tqdm import tqdm
from konlpy.tag import Okt

okt = Okt()

tokenized_data = []
for sentence in tqdm(train_data['tokenizedDocument']):
    tokenized_sentence = okt.morphs(sentence, stem=True) # 토큰화
    tokenized_data.append(tokenized_sentence)

100%|██████████| 346/346 [00:00<00:00, 3088.74it/s]


In [8]:
#토큰 화 된 전체 단어 수 확인
count = len(tokenized_data)
for i in range(len(tokenized_data)):
    count+=len(tokenized_data[i])
count

2101

In [9]:
tokenized_data

[['운영체제', '서비스'],
 ['운영체제', '사용자', '인터페이스'],
 ['시스템', '콜'],
 ['시스템', '콜', '유형'],
 ['시스템', '프로그램'],
 ['운영체제', '설계', '및', '구현'],
 ['운영체제', '구조'],
 ['운영체제', '디버깅'],
 ['운영체제', '생'],
 ['시스템', '부팅'],
 ['목표'],
 ['운영체제', '사용자', '프로세스', '다른', '시스템', '제공', '서비스', '설명'],
 ['운영체제', '구성', '방법', '대해', '논의'],
 ['운영체제', '설치', '맞춤화', '과정', '부팅', '방법'],
 ['운영체제', '서비스'],
 ['운영체제', '사용자', '프로그램', '프로그램', '실행', '환경', '제공'],
 ['사용자', '프로그램', '실행', '시스템', '효율', '운용', '담당'],
 ['운영체제', '서비스'],
 ['사용자', '업무', '수행', '데', '기능', '제공', '운영체제', '서비스'],
 ['사용자', '인터페이스'],
 ['명령어', '라인', '인터페이스', '문자열', '명령', '입력'],
 ['배치', '인터페이스', '명령어', '명령어', '집합', '파일', '제공'],
 ['운영체제', '서비스'],
 ['사용자', '업무', '수행', '데', '기능', '제공', '운영체제', '서비스'],
 ['프로그램', '실행'],
 ['프로그램', '메모리', '적재', '실행', '실행', '종료', '정상', '비정상'],
 ['입출력', '연산'],
 ['프로그램', '실행', '요구', '파일', '입출력', '장치', '등', '입출력', '서비스'],
 ['운영체제', '서비스'],
 ['사용자', '업무', '수행', '데', '기능', '제공', '운영체제', '서비스'],
 ['파일', '시스템'],
 ['파일', '디렉토리', '생', '성과', '삭제'],
 ['파일', '디렉토리',

<h1> 사전 학습된 Word2Vec 모델 사용 </h1>
<h6>1.size = 워드 벡터의 특징 값. 즉, 임베딩 된 벡터의 차원.</h6>
<h6>2.window = 컨텍스트 윈도우 크기</h6>
<h6>3.min_count = 단어 최소 빈도 수 제한 (빈도가 적은 단어들은 학습하지 않는다.)</h6>
<h6>4.workers = 학습을 위한 프로세스 수</h6>
<h6>5.sg = 0은 CBOW, 1은 Skip-gram.</h6>

In [10]:
from gensim.models import Word2Vec

model = Word2Vec.load(".\model\Model_Wikipedia\kor_wikipedia_w2v.model")
model.wv.vectors.shape

(305743, 100)

In [11]:
model.wv.has_index_for('구조')

True

In [12]:
#영벡터 처리 위해 만듦
blankVec = []
for i in range(100):
    blankVec.append(0)
blankVec

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

<h1>문장 벡터 계산

In [13]:
senVecs = [] # 문장 벡터 저장할 리스트
for i in range(len(tokenized_data)): # 데이터 길이 (문장 갯수 만큼 반복)
    senVecs.append(0) # 리스트에 0을 추가하고
    for j in range(len(tokenized_data[i])): # 각각 단어 벡터를 이용해 평균 벡터 추출, 문장 벡터 저장할 리스트에 가져옴
        if(model.wv.has_index_for(tokenized_data[i][j])): #모델에 있는 단어인지 검사
            senVecs[i] += (model.wv.get_vector(tokenized_data[i][j]))/len(tokenized_data[i])

<h3> 영벡터 처리 작업

In [14]:
#senVecs에 배열 형태로 저장이 된다.
# 정수형 타입의 값인 0이 저장된 것을 영벡터로 처리하는 과정이다.

veclist = [] #리스트로 변환된 벡터들 여기에 담음
for i in range(len(senVecs)):
    if (type(senVecs[i]) == int):
        veclist.append(blankVec) #영벡터 처리
        continue
    veclist.append(list(senVecs[i]))

In [15]:
len(veclist[0]) # 차원 수와 같음

100

In [16]:
len(veclist) # 문장 수와 같음

346

<h3>dataframe에 벡터들 담을 열 생성 (100차원이므로 100개 생성)

In [17]:
dmCol = []    #차원 100개 맞춰 생성

for i in range(100):
    dmCol.append('dm%d'%(i+1)) # append로 요소 추가
 
dmCol

['dm1',
 'dm2',
 'dm3',
 'dm4',
 'dm5',
 'dm6',
 'dm7',
 'dm8',
 'dm9',
 'dm10',
 'dm11',
 'dm12',
 'dm13',
 'dm14',
 'dm15',
 'dm16',
 'dm17',
 'dm18',
 'dm19',
 'dm20',
 'dm21',
 'dm22',
 'dm23',
 'dm24',
 'dm25',
 'dm26',
 'dm27',
 'dm28',
 'dm29',
 'dm30',
 'dm31',
 'dm32',
 'dm33',
 'dm34',
 'dm35',
 'dm36',
 'dm37',
 'dm38',
 'dm39',
 'dm40',
 'dm41',
 'dm42',
 'dm43',
 'dm44',
 'dm45',
 'dm46',
 'dm47',
 'dm48',
 'dm49',
 'dm50',
 'dm51',
 'dm52',
 'dm53',
 'dm54',
 'dm55',
 'dm56',
 'dm57',
 'dm58',
 'dm59',
 'dm60',
 'dm61',
 'dm62',
 'dm63',
 'dm64',
 'dm65',
 'dm66',
 'dm67',
 'dm68',
 'dm69',
 'dm70',
 'dm71',
 'dm72',
 'dm73',
 'dm74',
 'dm75',
 'dm76',
 'dm77',
 'dm78',
 'dm79',
 'dm80',
 'dm81',
 'dm82',
 'dm83',
 'dm84',
 'dm85',
 'dm86',
 'dm87',
 'dm88',
 'dm89',
 'dm90',
 'dm91',
 'dm92',
 'dm93',
 'dm94',
 'dm95',
 'dm96',
 'dm97',
 'dm98',
 'dm99',
 'dm100']

In [18]:
df = pd.DataFrame(veclist, columns = dmCol) #dataframe으로 만듦
df

Unnamed: 0,dm1,dm2,dm3,dm4,dm5,dm6,dm7,dm8,dm9,dm10,...,dm91,dm92,dm93,dm94,dm95,dm96,dm97,dm98,dm99,dm100
0,0.714455,-0.531385,2.488015,-0.770630,0.112286,0.774671,2.070887,1.770209,-2.353306,1.783039,...,-0.610353,-1.074397,-0.708204,-0.606940,-1.245999,1.037042,0.395951,0.475926,0.371759,-1.622231
1,1.373194,-2.268541,2.041227,-1.548548,0.202811,0.251074,0.367272,1.672797,-2.467688,1.628929,...,-0.636907,-0.851541,-2.177040,-1.377887,0.282307,1.705836,1.317941,1.302224,-1.182694,-0.189527
2,0.011394,-0.826875,2.973761,-0.366313,-0.839046,0.789231,0.961213,1.684802,-3.608681,0.754566,...,-0.595384,0.178099,1.141711,-1.204290,-0.019782,0.544513,2.611695,1.183234,-0.109376,-0.170399
3,-0.314897,-1.808564,2.901775,-0.102677,-0.380242,0.683322,0.860649,1.900799,-3.395546,0.877328,...,0.153475,0.073037,1.136598,-1.001038,0.101517,0.996975,2.293803,1.101085,-0.827427,0.046568
4,0.597797,-4.119634,4.071386,-0.778798,-0.830832,0.947624,2.868076,3.423932,-5.139441,0.825764,...,1.820633,-0.196410,-0.282522,-2.117036,1.165344,1.074218,3.273731,1.062482,0.584410,-1.640707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,0.222149,-3.372326,3.010398,-1.379068,-1.122083,-1.308177,-0.303563,1.678100,-2.084328,0.348584,...,0.933680,1.303830,-1.493170,-2.017200,0.774388,0.170071,0.344846,2.664122,-0.876150,-2.341106
342,0.212965,-1.827281,3.537088,-1.536552,-0.451717,-1.553486,0.283955,1.621245,-2.918727,0.658755,...,1.060881,-0.256200,-2.471534,-1.501794,-0.710068,0.662465,1.444199,-0.119418,-0.345229,-0.273893
343,0.028034,-1.855329,1.726132,-0.514380,0.055517,-0.897894,-0.397566,1.820389,-1.111108,1.315148,...,1.477395,-0.121360,-0.036039,-0.523707,-0.609615,0.579980,0.493013,0.585409,-0.329471,-0.318402
344,0.375210,-1.566365,1.840266,0.172384,-0.653272,0.393887,0.515321,1.490309,-2.052484,1.067986,...,2.143717,-0.657411,-1.112525,0.238495,0.045189,0.705270,1.014119,-0.178653,0.173723,-0.018322


<h3> 문장과 해당 벡터를 파악할수 있도록 문장정보,벡터 한 데이터 프레임에 합치기

In [19]:

result3 = pd.concat([train_data.reset_index(drop=True),df.reset_index(drop=True)],axis=1) 
# train_Data(본 문장과 문장에서 토큰화 된 단어들 저장), df(벡터 저장) 
# reset_index(drop=True) - 병합 위해 각 데이터 프레임에서의 인덱스 초기화.
result3

Unnamed: 0,Document,tokenizedDocument,dm1,dm2,dm3,dm4,dm5,dm6,dm7,dm8,...,dm91,dm92,dm93,dm94,dm95,dm96,dm97,dm98,dm99,dm100
0,운영체제 서비스 (Operating System Services),운영체제 서비스,0.714455,-0.531385,2.488015,-0.770630,0.112286,0.774671,2.070887,1.770209,...,-0.610353,-1.074397,-0.708204,-0.606940,-1.245999,1.037042,0.395951,0.475926,0.371759,-1.622231
1,운영체제 사용자 인터페이스 (User Operating System Interface),운영체제 사용자 인터페이스,1.373194,-2.268541,2.041227,-1.548548,0.202811,0.251074,0.367272,1.672797,...,-0.636907,-0.851541,-2.177040,-1.377887,0.282307,1.705836,1.317941,1.302224,-1.182694,-0.189527
2,시스템 콜 (System Calls),시스템 콜,0.011394,-0.826875,2.973761,-0.366313,-0.839046,0.789231,0.961213,1.684802,...,-0.595384,0.178099,1.141711,-1.204290,-0.019782,0.544513,2.611695,1.183234,-0.109376,-0.170399
3,시스템 콜 유형 (Types of System Calls),시스템 콜 유형,-0.314897,-1.808564,2.901775,-0.102677,-0.380242,0.683322,0.860649,1.900799,...,0.153475,0.073037,1.136598,-1.001038,0.101517,0.996975,2.293803,1.101085,-0.827427,0.046568
4,시스템 프로그램 (System Programs),시스템 프로그램,0.597797,-4.119634,4.071386,-0.778798,-0.830832,0.947624,2.868076,3.423932,...,1.820633,-0.196410,-0.282522,-2.117036,1.165344,1.074218,3.273731,1.062482,0.584410,-1.640707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,운영체제를 하드웨어가 시작할 수 있도록 하드웨어에게 알려야 한다,운영체제 하드웨어 시작 수 하드웨어,0.222149,-3.372326,3.010398,-1.379068,-1.122083,-1.308177,-0.303563,1.678100,...,0.933680,1.303830,-1.493170,-2.017200,0.774388,0.170071,0.344846,2.664122,-0.876150,-2.341106
342,ROM 또는 EPROM(Erasable Programmable ROM)에 있는 bo...,코드 커널 메모리 적재 커널 구동,0.212965,-1.827281,3.537088,-1.536552,-0.451717,-1.553486,0.283955,1.621245,...,1.060881,-0.256200,-2.471534,-1.501794,-0.710068,0.662465,1.444199,-0.119418,-0.345229,-0.273893
343,두 단계로 나눠지는 경우도 있음- ROM의 특정한 위치에 있는부트블록(boot bl...,두 단계 경우 의 위치 부트 블록 이 메모리 로드 것 디스크 부트스트랩 로더 메모리 적재,0.028034,-1.855329,1.726132,-0.514380,0.055517,-0.897894,-0.397566,1.820389,...,1.477395,-0.121360,-0.036039,-0.523707,-0.609615,0.579980,0.493013,0.585409,-0.329471,-0.318402
344,GRUB(GRand Unified Bootloader) : Linux 시스템을 위...,시스템 위 오픈소스 부트스트랩 프로그램,0.375210,-1.566365,1.840266,0.172384,-0.653272,0.393887,0.515321,1.490309,...,2.143717,-0.657411,-1.112525,0.238495,0.045189,0.705270,1.014119,-0.178653,0.173723,-0.018322


<h1>결과 출력

In [20]:
result3.to_csv('./modelVec.csv',encoding="utf-8-sig")