## 1. d2v_tfidf : tfidf를 이용한 문서벡터 (모든 단어 포함)

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

def d2v_tfidf(filepath,keywords_list):
    cleaned_df=pd.read_csv(filepath, delimiter='\t')
    #cleaned_df=cleaned_df.iloc[:20]
    #1. split data into train, test (DataFrame)
    train, test = train_test_split(cleaned_df, test_size=0.2, stratify=cleaned_df['smk_status'])
    train.reset_index(drop=True)
    test.reset_index(drop=True) 
    
    X_train_doc_list=[eval(doc_str) for doc_str in train['doc_txt'].tolist()]
    X_test_doc_list=[eval(doc_str) for doc_str in test['doc_txt'].tolist()]

    X_train_doc_list=[" ".join(doc) for doc in X_train_doc_list]
    X_test_doc_list=[" ".join(doc) for doc in X_test_doc_list]

    y_train=train['smk_status'].tolist()
    y_test=test['smk_status'].tolist()
    
    #2. generate doc vector by keyword
    tfidf_vectorizer=TfidfVectorizer(encoding='utf-8', ngram_range=(1,2),stop_words=None,lowercase=False,token_pattern='[\S]+',norm='l2',sublinear_tf=True)
    train_matrix=tfidf_vectorizer.fit_transform(X_train_doc_list).toarray()
    test_matrix=tfidf_vectorizer.transform(X_test_doc_list).toarray()
    w2idx=tfidf_vectorizer.vocabulary_
    features=tfidf_vectorizer.get_feature_names()
    
    ##키워드가 아닌 단어들
    words=[word for word in features if word not in keywords_list]
    params=[{'doc':X_train_doc_list,'matrix':train_matrix},{'doc':X_test_doc_list,'matrix':test_matrix}]
    N=[]
    for sep in range(2):
        result=[]
        arr=params[sep]['matrix']
        for i in range(len(params[sep]['doc'])):
            result.append([])
            d = params[sep]['doc'][i]
            for keyword in keywords_list:
                result[-1].append(arr[i][w2idx[keyword]])
            for word in words:
                result[-1].append(arr[i][w2idx[word]])
        N.append(result)
        
    X_train=N[0]
    X_test=N[1]
    #3.linear SVM  
    svc = svm.SVC(kernel='linear',C=10,decision_function_shape ='ovo', random_state=8)
    svc.fit(X_train,y_train)
    #4.predict the labels on validation dataset
    y_pred = svc.predict(X_test)
    y_pred2=svc.predict(X_train)
    print(metrics.classification_report(y_test,y_pred,digits=4))
        
    #return X_train, X_test,words

- test results for 20 docs (mixed_test_data.csv)

In [15]:
keywords_list=['흡연','smk','흡연 negative']
train,test,words=d2v_tfidf('../data/mixed_test_data.csv', keywords_list)

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         1
           3     0.0000    0.0000    0.0000         1
           4     0.5000    1.0000    0.6667         2

    accuracy                         0.5000         4
   macro avg     0.1667    0.3333    0.2222         4
weighted avg     0.2500    0.5000    0.3333         4



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
tfidf_ = pd.DataFrame(train, columns = keywords_list+words)
tfidf_.tail()

Unnamed: 0,흡연,smk,흡연 negative,a,a c,a copd,a flutter,a negative,a wnl,ab,...,회정도,회정도 멧돼지,횟수,횟수 줄임,후,후 내원,후 첫방문,후로,후로 드리겠습니다,흡연 positive
11,0.065845,0.0,0.075608,0.054968,0.036855,0.0,0.075608,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.035912,0.040768,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.16209,0.0,0.026672,0.0,0.062117,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.046995,0.053349,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
tfidf_ = pd.DataFrame(test, columns = keywords_list+words)
tfidf_

Unnamed: 0,흡연,smk,흡연 negative,a,a c,a copd,a flutter,a negative,a wnl,ab,...,회정도,회정도 멧돼지,횟수,횟수 줄임,후,후 내원,후 첫방문,후로,후로 드리겠습니다,흡연 positive
0,0.0,0.0,0.0,0.068942,0.078265,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.122436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.106627,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.102837,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.141737,0.0,0.0,0.0,0.0,0.0


## 2.d2v_count : tf 로 문서벡터 표현 (키워드 외 단어 각각 존재)

In [46]:
def d2v_count(filepath,keywords_list):
    from sklearn.feature_extraction.text import CountVectorizer

    cleaned_df=pd.read_csv(filepath, delimiter='\t')
    #cleaned_df=cleaned_df.iloc[:20]
    #1. split data into train, test (DataFrame)
    train, test = train_test_split(cleaned_df, test_size=0.2, stratify=cleaned_df['smk_status'])
    train.reset_index(drop=True)
    test.reset_index(drop=True) 
    
    X_train_doc_list=[eval(doc_str) for doc_str in train['doc_txt'].tolist()]
    X_test_doc_list=[eval(doc_str) for doc_str in test['doc_txt'].tolist()]

    X_train_doc_list=[" ".join(doc) for doc in X_train_doc_list]
    X_test_doc_list=[" ".join(doc) for doc in X_test_doc_list]

    y_train=train['smk_status'].tolist()
    y_test=test['smk_status'].tolist()
    
    #2. generate doc vector by keyword
    count_vectorizer=CountVectorizer(encoding='utf-8', ngram_range=(1,2),stop_words=None,lowercase=False,token_pattern='[\S]+')
    train_matrix=count_vectorizer.fit_transform(X_train_doc_list).toarray()
    test_matrix=count_vectorizer.transform(X_test_doc_list).toarray()
    w2idx=count_vectorizer.vocabulary_
    features=count_vectorizer.get_feature_names()
    
    ##키워드가 아닌 단어들
    words=[word for word in features if word not in keywords_list]
    params=[{'doc':X_train_doc_list,'matrix':train_matrix},{'doc':X_test_doc_list,'matrix':test_matrix}]
    N=[]
    for sep in range(2):
        result=[]
        arr=params[sep]['matrix']
        for i in range(len(params[sep]['doc'])):
            result.append([])
            d = params[sep]['doc'][i]
            for keyword in keywords_list:
                result[-1].append(arr[i][w2idx[keyword]])
            for word in words:
                result[-1].append(arr[i][w2idx[word]])
        N.append(result)
        
    X_train=N[0]
    X_test=N[1]
    #3.linear SVM  
    svc = svm.SVC(kernel='linear',C=10,decision_function_shape ='ovo', random_state=8)
    svc.fit(X_train,y_train)
    #4.predict the labels on validation dataset
    y_pred = svc.predict(X_test)
    y_pred2=svc.predict(X_train)
    print(metrics.classification_report(y_test,y_pred,digits=4))
        
    #return X_train, X_test,words,count_vectorizer

In [48]:
keywords_list=['흡연','smk','흡연 negative']
train,test,words,cvec=d2v_count('../data/mixed_test_data.csv', keywords_list)

              precision    recall  f1-score   support

           1     0.5000    1.0000    0.6667         1
           2     0.0000    0.0000    0.0000         1
           4     1.0000    1.0000    1.0000         2

    accuracy                         0.7500         4
   macro avg     0.5000    0.6667    0.5556         4
weighted avg     0.6250    0.7500    0.6667         4



  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
tfidf_ = pd.DataFrame(train, columns = keywords_list+words)
tfidf_.tail()

Unnamed: 0,흡연,smk,흡연 negative,a,a bo,a c,a copd,a erosive,a negative,a positive,...,회 와인,후,후 재,후 전화상담,후 접종,후 침,흉터,흉터 관련하여,히,히 했다
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,1,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,0,1,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,1,1,0,0
14,1,0,1,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
15,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
tfidf_ = pd.DataFrame(test, columns = keywords_list+words)
tfidf_

Unnamed: 0,흡연,smk,흡연 negative,a,a bo,a c,a copd,a erosive,a negative,a positive,...,회 와인,후,후 재,후 전화상담,후 접종,후 침,흉터,흉터 관련하여,히,히 했다
0,0,0,0,2,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,3,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## 3. d2v_count_de : tf 로 문서벡터 표현 (키워드 외 단어는 통합)

In [166]:
def d2v_count_de(filepath,keywords_list):
    cleaned_df=pd.read_csv('../data/mixed_all_data.csv', delimiter='\t')
    cleaned_df=cleaned_df.iloc[:20]
    #1. split data into train, test (DataFrame)
    train, test = train_test_split(cleaned_df, test_size=0.2, stratify=cleaned_df['smk_status'])
    train.reset_index(drop=True)
    test.reset_index(drop=True) 

    X_train_doc_list=[eval(doc_str) for doc_str in train['doc_txt'].tolist()]
    X_test_doc_list=[eval(doc_str) for doc_str in test['doc_txt'].tolist()]

    X_train_doc_list=[" ".join(doc) for doc in X_train_doc_list]
    X_test_doc_list=[" ".join(doc) for doc in X_test_doc_list]

    y_train=train['smk_status'].tolist()
    y_test=test['smk_status'].tolist()
    #2. generate doc vector by keyword
    count_vectorizer=CountVectorizer(encoding='utf-8', ngram_range=(1,2),stop_words=None,lowercase=False,token_pattern='[\S]+')
    train_matrix=count_vectorizer.fit_transform(X_train_doc_list).toarray()
    test_matrix=count_vectorizer.transform(X_test_doc_list).toarray()
    w2idx=count_vectorizer.vocabulary_
    features=count_vectorizer.get_feature_names()
    words=[word for word in features if word not in keywords_list]
    params=[{'doc':X_train_doc_list,'matrix':train_matrix},{'doc':X_test_doc_list,'matrix':test_matrix}]
    N=[]
    for sep in range(2):
        result=[]
        arr=params[sep]['matrix']
        for i in range(len(params[sep]['doc'])):
            result.append([])
            d = params[sep]['doc'][i]
            for keyword in keywords_list:
                result[-1].append(arr[i][w2idx[keyword]])

            result[-1].append(0)
            for word in words:
                result[-1][len(keywords_list)]+=arr[i][w2idx[word]]
        N.append(result)
    X_train=N[0]
    X_test=N[1]
    #3.linear SVM  
    svc = svm.SVC(kernel='linear',C=10,decision_function_shape ='ovo', random_state=8)
    svc.fit(X_train,y_train)
    #4.predict the labels on validation dataset
    y_pred = svc.predict(X_test)
    y_pred2=svc.predict(X_train)
    print(metrics.classification_report(y_test,y_pred,digits=4))
    return X_train, X_test,words,count_vectorizer


In [167]:
keywords_list=['금연','흡연','smk','positive','negative']
train,test,words,cvec=d2v_count_de('../data/mixed_test_data.csv', keywords_list)

              precision    recall  f1-score   support

           1     0.0000    0.0000    0.0000         1
           2     0.0000    0.0000    0.0000         1
           3     0.0000    0.0000    0.0000         0
           4     0.6667    1.0000    0.8000         2

    accuracy                         0.5000         4
   macro avg     0.1667    0.2500    0.2000         4
weighted avg     0.3333    0.5000    0.4000         4



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [171]:
tfidf_ = pd.DataFrame(train, columns = keywords_list+['others'])
tfidf_.tail()

Unnamed: 0,금연,흡연,smk,positive,negative,others
11,0,0,0,1,15,349
12,0,0,0,6,14,511
13,0,1,0,2,12,206
14,0,0,0,0,2,73
15,2,0,0,0,6,281


In [172]:
tfidf_ = pd.DataFrame(test, columns = keywords_list+['others'])
tfidf_.tail()

Unnamed: 0,금연,흡연,smk,positive,negative,others
0,0,1,0,7,11,139
1,0,0,0,5,7,108
2,0,0,0,4,9,221
3,0,0,0,0,0,55
