In [1]:
"""
- https://wikidocs.net/24603
"""
import pandas as pd
import numpy as np
import re
from io import StringIO

from repia_search_engine import load_rdata, load_di_u_conf, preprocessing, remove_stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words vectorizer; every option is passed by keyword.
vectorizer = CountVectorizer(
    analyzer='word',       # build features from word tokens
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    min_df=2,              # drop terms that occur in fewer than 2 documents
    max_features=2000,     # cap the vocabulary at the 2000 most frequent terms
    tokenizer=None,        # None: keep scikit-learn's built-in tokenizer
    preprocessor=None,     # None: keep scikit-learn's built-in preprocessing
    stop_words=None,       # None: no stop-word filtering
)


# Input locations (relative paths, resolved against the current working directory).
file_path = '../data/nc_doc_1_1.1'
di_file_path = '../data/di_u.conf'

# Earlier call shape, kept for reference:
#df = pd.DataFrame(load_rdata(file_path), columns=load_di_u_conf(di_file_path))
loaded = load_rdata(di_file_path, file_path)
df = pd.DataFrame(loaded)

# Independent copy of the first two rows, so edits to `data` never touch `df`.
data = df.head(2).copy()

In [6]:
data['ntt_sj']

0    후반기 담임장학 결과 보고회 및 교육과정 컨설팅단 협의회
1                  신규교사 협력학습 역량강화 연수
Name: ntt_sj, dtype: object

In [7]:
#load_rdata(di_file_path, file_path)

In [10]:
# Cast the ntt_sn column to integers, store it back, then display the column.
ntt_sn_as_int = data['ntt_sn'].astype(int)
data['ntt_sn'] = ntt_sn_as_int
data['ntt_sn']

0    1349
1    1350
Name: ntt_sn, dtype: int64

In [3]:
data

Unnamed: 0,ntt_sn,ntt_sj,ntt_cn,reg_dt,sys_id,sys_name,bbs_id,menu_navi,menu_url,div_ty,div_nm,bbs_dc,ntt_code
0,1349,후반기 담임장학 결과 보고회 및 교육과정 컨설팅단 협의회,sdfsdfsdf,2018-11-23 20:31:16.000,dgnbe,대구광역시남부교육지원청,1901,직원마당 > 회의실사용신청,https://www.dge.go.kr/dgnbe/na/ntt/selectNttIn...,rnffd,교육지원청,,D1349
1,1350,신규교사 협력학습 역량강화 연수,sdfsdafsdf,2018-11-23 20:35:39.000,dgnbe,대구광역시남부교육지원청,1901,직원마당 > 회의실사용신청,https://www.dge.go.kr/dgnbe/na/ntt/selectNttIn...,rnffd,교육지원청,,D1350


In [8]:
#data['ntt_sj'] = data['ntt_sj'].str.lower()

# Convert the title column of the full frame to str and store it back.
titles_as_str = df['ntt_sj'].astype('str')
df['ntt_sj'] = titles_as_str

# Show only the first entry as a quick sanity check.
print(titles_as_str[:1])

#df['ntt_sj'] = df['ntt_sj'].str.lower()

0    후반기 담임장학 결과 보고회 및 교육과정 컨설팅단 협의회
Name: ntt_sj, dtype: object


In [12]:
# Round-trip the sample through a CSV file in the current directory.
csv_path = './data_df_2.csv'  # file path, file name

data.to_csv(
    csv_path,
    sep=',',        # separator / delimiter
    na_rep='NaN',   # text written in place of missing values
    index=False,    # do not write the row index as a column
)

# Reading the file back lets pandas re-infer every column's dtype.
data = pd.read_csv(csv_path)

In [4]:
print(data.dtypes)

ntt_sn       object
ntt_sj       object
ntt_cn       object
reg_dt       object
sys_id       object
sys_name     object
bbs_id       object
menu_navi    object
menu_url     object
div_ty       object
div_nm       object
bbs_dc       object
ntt_code     object
dtype: object


In [15]:
#data['ntt_sn'] = data['ntt_sn'].apply(lambda x: x[0])
# Use ntt_sn as the row index; rebinding the name instead of mutating in place.
data = data.set_index('ntt_sn')



In [16]:
data.head()

Unnamed: 0_level_0,ntt_sj,ntt_cn,reg_dt,sys_id,sys_name,bbs_id,menu_navi,menu_url,div_ty,div_nm,bbs_dc,ntt_code
ntt_sn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1349,후반기 담임장학 결과 보고회 및 교육과정 컨설팅단 협의회,sdfsdfsdf,2018-11-23 20:31:16.000,dgnbe,대구광역시남부교육지원청,1901,직원마당 > 회의실사용신청,https://www.dge.go.kr/dgnbe/na/ntt/selectNttIn...,rnffd,교육지원청,,D1349
1350,신규교사 협력학습 역량강화 연수,sdfsdafsdf,2018-11-23 20:35:39.000,dgnbe,대구광역시남부교육지원청,1901,직원마당 > 회의실사용신청,https://www.dge.go.kr/dgnbe/na/ntt/selectNttIn...,rnffd,교육지원청,,D1350


In [62]:
data.dtypes


ntt_sn        int64
ntt_sj       object
ntt_cn       object
reg_dt       object
sys_id       object
sys_name     object
bbs_id       object
menu_navi    object
menu_url     object
div_ty       object
div_nm       object
bbs_dc       object
ntt_code     object
dtype: object

In [17]:
# A missing title is NaN here; astype(str) alone would turn it into the literal
# text 'nan', which a text vectorizer would then treat as a real word.
# Replace missing titles with an empty string first so they become empty documents.
data['ntt_sj'] = data['ntt_sj'].fillna('').astype(str)

In [18]:
tfidf = TfidfVectorizer()


In [19]:
# 정규 표현식을 적용하는 함수 수정
def apply_regex(name):
    """Return ``name`` if it is made up only of ASCII letters, else NaN.

    Args:
        name: Any value. Only ``str`` instances are tested against the pattern.

    Returns:
        The matched text when ``name`` is a string consisting solely of the
        letters A-Z / a-z (``$`` also tolerates one trailing newline, which is
        not part of the returned text); ``np.nan`` for every other input,
        including the empty string and non-string values.
    """
    # Non-strings (numbers, None, NaN, ...) can never match.
    if not isinstance(name, str):
        return np.nan

    found = re.match(r'^[A-Za-z]+$', name)
    if found is None:
        return np.nan
    return found.group()

In [20]:
#print(data['ntt_sj'])
#print(data['ntt_sj'].describe())

# Run apply_regex over every title. Its pattern accepts only strings made up
# entirely of ASCII letters, so any title containing spaces or Hangul maps to NaN.
data['ntt_sj_preprocessing'] = data['ntt_sj'].map(apply_regex)

In [21]:
tfidf_matrix = tfidf.fit_transform(data['ntt_sj'])

In [22]:
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)

TF-IDF 행렬의 크기(shape) : (2, 11)


In [None]:
# NOTE(review): `tfidf` is already created and fitted in earlier cells; running this
# line rebinds the name to a brand-new TfidfVectorizer, so the fitted instance is no
# longer reachable through `tfidf` — confirm that this reset is intended.
tfidf = TfidfVectorizer()
#print(data['ntt_sj'])
#print(data['ntt_sj'].describe())

#data['ntt_sj_preprocessing'] = data['ntt_sj'].apply(preprocessing)

#data['ntt_sj_preprocessing'] = data['ntt_sj'].apply(preprocessing)
#data['ntt_sj_preprocessed'] = data['ntt_sj_preprocessing'].apply(remove_stopwords)

#title_feature_vector = vectorizer.fit_transform(data['ntt_sj_preprocessed'])
#tfidf_matrix = tfidf.fit_transform(data['ntt_sj_preprocessed'])
#print(tfidf_matrix.shape)


#tfidf_matrix = tfidf.fit_transform(data['ntt_sj'])

#print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)


