### data load

In [1]:
import pandas as pd
import numpy as np

news_data = pd.read_csv("computer_science.csv")
news_data.head(5)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
0,web developer,1400201.202,예산군 '기록관' 전국 자치단체 롤-모델 부각,2020-09-10T11:27:44.000+09:00,http://www.daejonilbo.com/news/newsitem.asp?pk...
1,web developer,1400601.202,'건강한 일터 자가진단 모형' 개발,2020-09-10T21:19:32.000+09:00,http://www.ccdailynews.com/news/articleView.ht...
2,web developer,8100201.202,신천지·발레학원이 지식산업?…이상한 '지식산업센터',2020-09-10T20:55:57.000+09:00,https://imnews.imbc.com/replay/2020/nwdesk/art...
3,web developer,1601001.202,산림조합 전북본부 2020 벌초도우미서비스 실시,2020-09-10T20:11:08.000+09:00,http://www.domin.co.kr/news/articleView.html?i...
4,web developer,7101201.202,올해의 천문연구원에 안영숙 박사,2020-09-10T18:59:11.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...


In [3]:
len(news_data)

68618

In [2]:
news_data['section'].unique()

array(['web developer', 'mobile developer', 'embedded developer',
       'data developer', 'AI developer', 'SIS developer', 'SW developer'],
      dtype=object)

### 데이터 전처리

In [3]:
news_data['title'] = news_data['title'].str.replace("[^\w | \s]", "")

In [5]:
news_data.head(5)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
0,web developer,1400201.202,예산군 기록관 전국 자치단체 롤모델 부각,2020-09-10T11:27:44.000+09:00,http://www.daejonilbo.com/news/newsitem.asp?pk...
1,web developer,1400601.202,건강한 일터 자가진단 모형 개발,2020-09-10T21:19:32.000+09:00,http://www.ccdailynews.com/news/articleView.ht...
2,web developer,8100201.202,신천지발레학원이 지식산업이상한 지식산업센터,2020-09-10T20:55:57.000+09:00,https://imnews.imbc.com/replay/2020/nwdesk/art...
3,web developer,1601001.202,산림조합 전북본부 2020 벌초도우미서비스 실시,2020-09-10T20:11:08.000+09:00,http://www.domin.co.kr/news/articleView.html?i...
4,web developer,7101201.202,올해의 천문연구원에 안영숙 박사,2020-09-10T18:59:11.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...


### WPM (WordPiece Model) tokenizaton

In [6]:
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    news_data['title'], target_vocab_size=2**13)

In [7]:
print(tokenizer.subwords[:100])

['에_', '의_', '도_', '로_', '한_', '는_', '이_', '은_', '다', '이', '서_', '으로_', '과_', '와_', '코로나_', '가_', '출시', '지', '서비스_', '원_', 'AI_', '리', '수', '고_', '고', '을_', '대', '대_', '코로나19_', '기_', '전', '사업_', '자_', '시장_', '등_', '자', '정', '시', '사', '아', '어', '일', '원', '기', '스', '디지털_', '비', '성', '기업_', '부', '장', '인', '온라인_', '나', '유', '가', '도', '시스템_', '주', '미', '개최', '조', '로', '재', '한', '지원', '플랫폼_', '상', '클라우드_', '선정', '비대면_', '기술_', '세', '오', '소', '하는_', '라', '신', '스_', '까지_', '장_', '시대_', '첫_', '서', '동', '것', '수_', '보', '트', '한다', '화', '명', '무', '구축', '시_', '2020_', '구', '한국', '형_', '마']


In [9]:
print('Tokenized sample question: {}'.format(tokenizer.encode(news_data['title'][0])))

Tokenized sample question: [1287, 301, 1178, 248, 293, 1899, 1970, 1968, 771, 50, 470]


In [18]:
tokenizer.encode(news_data['title'][0])

[1287, 301, 1178, 248, 293, 1899, 1970, 1968, 771, 50, 470]

In [21]:
for idx in range(len(news_data)):
    news_data['title'][idx] = [tokenizer.decode([num]) for num in tokenizer.encode(news_data['title'][idx])]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [50]:
news_data.head(6)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
0,web developer,1400201.202,"[예산, 군 , 기록, 관 , 전국 , 자치, 단체 , 롤, 모델 , 부, 각]",2020-09-10T11:27:44.000+09:00,http://www.daejonilbo.com/news/newsitem.asp?pk...
1,web developer,1400601.202,"[건강한 , 일, 터 , 자가진단 , 모, 형 , 개발]",2020-09-10T21:19:32.000+09:00,http://www.ccdailynews.com/news/articleView.ht...
2,web developer,8100201.202,"[신, 천지, 발, 레, 학, 원, 이 , 지식, 산업, 이상, 한 , 지식산업센터]",2020-09-10T20:55:57.000+09:00,https://imnews.imbc.com/replay/2020/nwdesk/art...
3,web developer,1601001.202,"[산림, 조합 , 전북, 본부 , 2020 , 벌초, 도, 우, 미, 서비스 , 실시]",2020-09-10T20:11:08.000+09:00,http://www.domin.co.kr/news/articleView.html?i...
4,web developer,7101201.202,"[올해의 , 천, 문, 연구원, 에 , 안, 영, 숙 , 박사]",2020-09-10T18:59:11.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...
5,web developer,7101201.202,"[SW, 명, 장 , 창업, 에 , 도전, 하다 , 계약, 부터 , 고객, 관리, ...",2020-09-10T18:59:12.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...


In [72]:
news_data['news_id'].unique()

array([1400201.202, 1400601.202, 8100201.202, 1601001.202, 7101201.202,
       7100501.202, 2100601.202, 2100201.202, 2100311.202, 2100851.202,
       1101001.202, 2100801.202, 1400401.202, 1100501.202, 1100701.202,
       1300201.202, 1400701.202, 1400351.202, 2100501.202, 1500701.202,
       1400501.202, 1600501.202, 2100701.202, 1100611.202, 1100201.202,
       8100401.202, 1500301.202, 1500401.202, 1500901.202, 1100901.202,
       1100401.202, 1400551.202, 1500801.202, 1600301.202, 1300101.202,
       1700101.202, 1200101.202, 1600201.202, 1700201.202, 1100101.202,
       1600801.202, 1200201.202, 1500601.202, 1100301.202, 1500501.202,
       1500151.202, 1601101.202, 1500051.202, 8100301.202, 8200101.202,
       8100101.202])

### data preprocessing

In [81]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_data = [TaggedDocument(words=[i.lower() for i in _d], 
                              tags=[idx]) for idx, _d in enumerate(news_data['title'])]

In [82]:
tagged_data[:5]

[TaggedDocument(words=['예산', '군 ', '기록', '관 ', '전국 ', '자치', '단체 ', '롤', '모델 ', '부', '각'], tags=[0]),
 TaggedDocument(words=['건강한 ', '일', '터 ', '자가진단 ', '모', '형 ', '개발'], tags=[1]),
 TaggedDocument(words=['신', '천지', '발', '레', '학', '원', '이 ', '지식', '산업', '이상', '한 ', '지식산업센터'], tags=[2]),
 TaggedDocument(words=['산림', '조합 ', '전북', '본부 ', '2020 ', '벌초', '도', '우', '미', '서비스 ', '실시'], tags=[3]),
 TaggedDocument(words=['올해의 ', '천', '문', '연구원', '에 ', '안', '영', '숙 ', '박사'], tags=[4])]

In [83]:
len(tagged_data)

68618

### Model train

In [84]:
max_epochs = 100
vec_size = 20
alpha = 0.025

d2v_model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
d2v_model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    d2v_model.train(tagged_data,
                total_examples=d2v_model.corpus_count,
                epochs=d2v_model.iter)
    # decrease the learning rate
    d2v_model.alpha -= 0.0002
    # fix the learning rate, no decay
    d2v_model.min_alpha = d2v_model.alpha

d2v_model.save("d2v.model")
print("Model Saved")



iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [91]:
# tags

d2v_model.docvecs.offset2doctag

[]

In [89]:
# vetors

d2v_model.docvecs.doctag_syn0

  """Entry point for launching an IPython kernel.


array([[ 0.62977177, -6.0020404 , -3.0430608 , ..., -5.310898  ,
         0.26875967, -2.408109  ],
       [ 1.7644455 ,  0.976511  , -1.3608624 , ...,  2.2232146 ,
        -7.3721538 ,  4.118886  ],
       [ 4.5147076 ,  2.2187421 , -2.5592365 , ..., -4.35489   ,
         3.7881103 , -4.6801043 ],
       ...,
       [-2.3975663 , -1.0016649 ,  0.13753901, ...,  2.1369393 ,
         4.0569606 ,  1.6593772 ],
       [-2.9675379 ,  2.806499  , -5.6495695 , ...,  5.55664   ,
         5.066531  ,  5.2266197 ],
       [ 5.094768  ,  1.9410326 , -3.2910101 , ..., -4.487082  ,
        -1.598795  , -1.1193295 ]], dtype=float32)

### title => vetor

In [92]:
for idx, v in enumerate(d2v_model.docvecs.doctag_syn0):
    news_data['title'][idx] = v

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [93]:
news_data.head(5)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
0,web developer,1400201.202,"[0.62977177, -6.0020404, -3.0430608, 7.746626,...",2020-09-10T11:27:44.000+09:00,http://www.daejonilbo.com/news/newsitem.asp?pk...
1,web developer,1400601.202,"[1.7644455, 0.976511, -1.3608624, 1.514463, 0....",2020-09-10T21:19:32.000+09:00,http://www.ccdailynews.com/news/articleView.ht...
2,web developer,8100201.202,"[4.5147076, 2.2187421, -2.5592365, 1.9661222, ...",2020-09-10T20:55:57.000+09:00,https://imnews.imbc.com/replay/2020/nwdesk/art...
3,web developer,1601001.202,"[0.9904615, -0.093818046, 1.7925162, 0.8770222...",2020-09-10T20:11:08.000+09:00,http://www.domin.co.kr/news/articleView.html?i...
4,web developer,7101201.202,"[0.09731135, 5.122567, 0.17382027, 1.8286847, ...",2020-09-10T18:59:11.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...


### section one_hot_vector

In [108]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(news_data['section'])

LabelEncoder()

In [109]:
list(le.classes_)

['AI developer',
 'SIS developer',
 'SW developer',
 'data developer',
 'embedded developer',
 'mobile developer',
 'web developer']

In [111]:
news_data['section'] = le.transform(news_data['section'])

In [112]:
news_data.head(5)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
0,6,1400201.202,"[0.62977177, -6.0020404, -3.0430608, 7.746626,...",2020-09-10T11:27:44.000+09:00,http://www.daejonilbo.com/news/newsitem.asp?pk...
1,6,1400601.202,"[1.7644455, 0.976511, -1.3608624, 1.514463, 0....",2020-09-10T21:19:32.000+09:00,http://www.ccdailynews.com/news/articleView.ht...
2,6,8100201.202,"[4.5147076, 2.2187421, -2.5592365, 1.9661222, ...",2020-09-10T20:55:57.000+09:00,https://imnews.imbc.com/replay/2020/nwdesk/art...
3,6,1601001.202,"[0.9904615, -0.093818046, 1.7925162, 0.8770222...",2020-09-10T20:11:08.000+09:00,http://www.domin.co.kr/news/articleView.html?i...
4,6,7101201.202,"[0.09731135, 5.122567, 0.17382027, 1.8286847, ...",2020-09-10T18:59:11.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...


In [113]:
news_data.tail(5)

Unnamed: 0,section,news_id,title,dateline,provider_link_page
68613,2,7101201.202,"[-0.91330165, -1.7853624, -5.9192033, 2.767834...",2020-08-26T15:20:03.000+09:00,http://www.dt.co.kr/contents.html?article_no=2...
68614,2,1100701.202,"[5.983479, -4.434872, -4.701555, 1.8551528, -0...",2020-08-26T15:30:30.000+09:00,http://www.segye.com/content/html/2020/08/26/2...
68615,2,1100101.202,"[-2.3975663, -1.0016649, 0.13753901, 4.9917946...",2020-08-26T15:30:13.000+09:00,http://news.khan.co.kr/kh_news/khan_art_view.h...
68616,2,1101001.202,"[-2.9675379, 2.806499, -5.6495695, 2.1811996, ...",2020-08-26T15:48:52.000+09:00,http://www.hani.co.kr/arti/area/chungcheong/95...
68617,2,1100611.202,"[5.094768, 1.9410326, -3.2910101, -5.011853, 0...",2020-08-26T15:22:55.000+09:00,http://www.seoul.co.kr/news/newsView.php?id=20...


In [None]:
# list(le.inverse_transform())

### train_test_split

In [298]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(news_data['title'], news_data['section'], shuffle=True, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((51463,), (17155,), (51463,), (17155,))

In [299]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

In [270]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((51463, 1), (17155, 1), (51463,), (17155,))

In [300]:
X_train.shape

(51463,)

In [None]:
X_train.reshape()

### CNN

In [297]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Flatten

model = Sequential()
model.add(Conv1D(256, 3, padding='valid', activation='relu', input_shape=))
model.add(GlobalMaxPooling1D())
model.add(Flatten())
model.add(Dense(7, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrix=['acc'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32)





ValueError: Error when checking input: expected conv1d_31_input to have 3 dimensions, but got array with shape (51463, 1)