In [106]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import pandas as pd

In [107]:
# Load the tab-separated train/test splits from the shared data directory.
train_csv_path = '../data/mixed_train_data.csv'
test_csv_path = '../data/mixed_test_data.csv'
cleaned_train_df = pd.read_csv(train_csv_path, sep='\t')
cleaned_test_df = pd.read_csv(test_csv_path, sep='\t')
# Show the training frame as the cell output.
cleaned_train_df

Unnamed: 0,doc_num,doc_txt,smk_status,dep
0,426,"['wks', 'f', 'u', 'kn', 'hypertension', 'diabe...",4,fam
1,1520,"['bp', 'ht', 'bwt', 'p', 'i', 'pmhx', 'diabete...",1,fam
2,1658,"['대진', '아드님', '다음', '주', '신경과', '진료', '예정', '인...",4,fam
3,981,"['bp', 'wt', 'kilogram', 'kilogram', 'diabetes...",4,fam
4,3578,"['s', 'so', 'so', '기침', '이', '조금', '증가', 'o', ...",4,pul
...,...,...,...,...
3763,2716,"['bp', '구정자', '환자', '배우자', '로', '함께', '본원', 'f...",2,fam
3764,3917,"['ct', 'abnl', 'ya', 'mmrc', 'ya', 'vbs', 's',...",3,pul
3765,2343,"['bp', 'ht', 'centimeter', 'bwt', 'kilogram', ...",4,fam
3766,3055,"['hypertension', 'diabetes', 'mellitus', 'hypo...",4,fam


In [108]:
# Show the test frame as the cell output.
cleaned_test_df

Unnamed: 0,doc_num,doc_txt,smk_status,dep
0,4542,"['worse', 'hemoptysis', 'cat', 'act', 'x', 'mm...",4,pul
1,469,"['bp', 'bwt', 'kilogram', 'kilogram', 'obesity...",1,fam
2,2139,"['s', '식사', '조절', '거의', '못함', '술', '거의', '매일',...",4,fam
3,3737,"['cough', 'ma', 'exertional', 'positive', 'col...",2,pul
4,1871,"['bp', 'wt', '체지방률', 'kilogram', 'kilogram', '...",1,fam
...,...,...,...,...
938,2573,"['bp', 's', 'p', 'agc', 'diabetes', 'mellitus'...",1,fam
939,554,"['uacr', 'hpdp', '폐', '대장', '포함', '당뇨', '는', '...",3,fam
940,3025,"['smk', 'positive', 'p', 'per', 'day', 'yr', '...",3,fam
941,2472,"['s', '지난', '한달', '동안', '크루즈', '여행을', '다녀오느라',...",4,fam


## Preparing the documents for TF-IDF transformation

In [109]:
import ast

# The 'doc_txt' column stores each document as the string form of a Python
# list of tokens (e.g. "['bp', 'ht', ...]"). ast.literal_eval parses such
# literals without executing arbitrary code, unlike eval().
features_train_doc_list = [ast.literal_eval(doc_str) for doc_str in cleaned_train_df['doc_txt'].tolist()]
features_test_doc_list = [ast.literal_eval(doc_str) for doc_str in cleaned_test_df['doc_txt'].tolist()]

# TfidfVectorizer consumes one string per document, so join the tokens
# back together with single spaces.
tdidf_features_train_doc_list = [" ".join(doc) for doc in features_train_doc_list]
tdidf_features_test_doc_list = [" ".join(doc) for doc in features_test_doc_list]

# Class labels come straight from the 'smk_status' column.
labels_train_doc_list = cleaned_train_df['smk_status'].tolist()
labels_test_doc_list = cleaned_test_df['smk_status'].tolist()

In [110]:
# Pair each joined training document with its label, keyed by document number.
train_doc_ids = cleaned_train_df['doc_num'].tolist()
train_df = pd.DataFrame(
    {"features": tdidf_features_train_doc_list, "label": labels_train_doc_list},
    index=train_doc_ids,
)
train_df

Unnamed: 0,features,label
426,wks f u kn hypertension diabetes mellitus on m...,4
1520,bp ht bwt p i pmhx diabetes mellitus hypertens...,1
1658,대진 아드님 다음 주 신경과 진료 예정 인데 약 부족하여 내원 함,4
981,bp wt kilogram kilogram diabetes mellitus on m...,4
3578,s so so 기침 이 조금 증가 o b wt kilogram spo pr min ...,4
...,...,...
2716,bp 구정자 환자 배우자 로 함께 본원 follow up 원하 여 내원 hx of ...,2
3917,ct abnl ya mmrc ya vbs s c w ct emphysema posi...,3
2343,bp ht centimeter bwt kilogram kilogram kilogra...,4
3055,hypertension diabetes mellitus hypothy on med ...,4


In [111]:
# Pair each joined test document with its label, keyed by document number.
test_doc_ids = cleaned_test_df['doc_num'].tolist()
test_df = pd.DataFrame(
    {"features": tdidf_features_test_doc_list, "label": labels_test_doc_list},
    index=test_doc_ids,
)
test_df

Unnamed: 0,features,label
4542,worse hemoptysis cat act x mmrc ae whz positiv...,4
469,bp bwt kilogram kilogram obesity abdominal obe...,1
2139,s 식사 조절 거의 못함 술 거의 매일 반병정도 운동 공단검진 검사 결과 안저검사 ...,4
3737,cough ma exertional positive cold exposure pos...,2
1871,bp wt 체지방률 kilogram kilogram kilogram hyperten...,1
...,...,...
2573,bp s p agc diabetes mellitus c management hype...,1
554,uacr hpdp 폐 대장 포함 당뇨 는 약을 병합하는게 좋습니다 lsm 하면서 m...,3
3025,smk positive p per day yr alc positive 매일 소주 병...,3
2472,s 지난 한달 동안 크루즈 여행을 다녀오느라 늦게 왔다 여행 다니면서 식사 조절 을...,4


#### TF-IDF Vectorizer 생성

In [224]:
min_df_list = [0.01]              # drop terms that appear in too few documents
max_df_list = [1., 0.9]           # drop terms that appear in too many documents
max_features_list = [3000, 4500]  # None would mean no limit

# Only one vectorizer is used downstream. Looping over every combination
# would un-pickle several objects and keep just the last one, so pick that
# configuration explicitly: the last min_df / max_df candidates and a
# vocabulary cap of 3000 features.
min_df = min_df_list[-1]
max_df = max_df_list[-1]
max_features = 3000

vectorizer_path = "./pickle/tfidf_unibi_max%.2f_min%.2f_features%i.pkl" % (max_df, min_df, max_features)

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by a trusted run of this project.
# The context manager closes the file handle once the object is loaded.
with open(vectorizer_path, "rb") as vectorizer_file:
    tdidf_vectorizer = pickle.load(vectorizer_file)

In [225]:
# Fit the vectorizer on the training documents, then map the test documents
# onto the vocabulary learned from the training set.
features_train = tdidf_vectorizer.fit_transform(tdidf_features_train_doc_list)
features_test = tdidf_vectorizer.transform(tdidf_features_test_doc_list)
labels_train, labels_test = labels_train_doc_list, labels_test_doc_list

# Report the vectorizer settings and the shape of both feature matrices.
for summary in (tdidf_vectorizer, features_train.shape, features_test.shape):
    print(summary)

TfidfVectorizer(lowercase=False, max_df=0.9, max_features=3000, min_df=0.01,
                ngram_range=(1, 2), sublinear_tf=True)
(3768, 2287)
(943, 2287)


   - X_train : features_train 
   - Y_train : labels_train    
   - x_test  : features_test  
   - y_test  : labels_test    

## Cross-Validation (SVM)

In [None]:
# Candidate values for each SVC hyperparameter.
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

# One sub-grid per kernel so that kernel-specific parameters (degree for
# poly, gamma for rbf) are only searched where they are listed.
param_grid = [
    dict(C=C, kernel=['linear'], probability=probability),
    dict(C=C, kernel=['poly'], degree=degree, probability=probability),
    dict(C=C, kernel=['rbf'], gamma=gamma, probability=probability),
]

# Base estimator with a fixed seed.
svc = svm.SVC(random_state=8)

# Build the CV splits by hand so their random_state can be fixed
# (GridSearchCV itself takes no such argument).
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

# Accuracy-scored search over the grid, using all available cores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the TF-IDF training matrix.
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.3min


In [None]:
# Print the winning parameter set and its mean cross-validated accuracy.
for line in (
    "The best hyperparameters from Grid Search are:",
    grid_search.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters is:",
    grid_search.best_score_,
):
    print(line)

In [None]:
# Disabled snippet kept as a bare string literal (it is not executed): when
# run as code it would print the mean test score, twice its standard
# deviation, and the parameter set for every grid-search candidate.
'''
 그리드 서치 전체 결과 조회
 
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
'''

In [None]:
# Take the estimator selected by the grid search and show it as the cell output.
best_svc = grid_search.best_estimator_
best_svc

In [None]:
# Fit the selected estimator on the full training matrix.
# NOTE(review): GridSearchCV already refits best_estimator_ on the data passed
# to fit() when refit=True (its default, which the search above does not
# override), so this call looks redundant — confirm before removing.
best_svc.fit(features_train, labels_train)

In [None]:
# Predict labels for the test matrix; reused by the evaluation cells below.
svc_pred = best_svc.predict(features_test)

In [None]:
# Compare accuracy on the training data with accuracy on the test data.
print("Training accuracy:")
train_predictions = best_svc.predict(features_train)
print(accuracy_score(labels_train, train_predictions))

print("Test accuracy:")
test_accuracy = accuracy_score(labels_test, svc_pred)
print(test_accuracy)

print(best_svc)
# To list every candidate parameter set, uncomment:
# print(grid_search.cv_results_['params'])

In [None]:
# Per-class precision, recall and F1 of the tuned SVC on the test labels.
print("Classification report")
svc_report = classification_report(labels_test, svc_pred)
print(svc_report)

In [None]:
from sklearn.metrics import f1_score

# Baseline for comparison: an SVC with default hyperparameters (only the
# seed is fixed), trained and scored on the same matrices.
base_model = svm.SVC(random_state=8)
base_model.fit(features_train, labels_train)
base_predictions = base_model.predict(features_test)
accuracy_score_bm = accuracy_score(labels_test, base_predictions)
print(accuracy_score_bm)
print(base_model)


In [None]:
# One-row summary of the tuned SVC: the fitted estimator, its train/test
# accuracy, the vectorizer that produced the features, and the macro-averaged
# F1 score on the test labels.
d = {
    'Model': best_svc,
    'Training Set Accuracy': accuracy_score(labels_train, best_svc.predict(features_train)),
    'Test Set Accuracy': accuracy_score(labels_test, svc_pred),
    'TF-IDF Model': tdidf_vectorizer,
    'F1 score:': f1_score(labels_test, svc_pred, average="macro"),
}
idx = 1
# vectorizer_df = pd.DataFrame(tdidf_vectorizer.get_params())
df_models_svc = pd.DataFrame(d, index=[idx])
idx += 1

# Winning hyperparameters; used to name the pickle files when saving.
bestsvc_dict = grid_search.best_params_

# Only one model has been summarised so far, so the running total is just
# this row. (A previous pd.concat of the row with itself was overwritten
# immediately and has been removed.)
df_models_total = df_models_svc

In [None]:
# 'degree' is only present in best_params_ when the poly sub-grid wins; for
# the linear and rbf sub-grids, indexing it would raise KeyError. Fall back
# to the degree stored on the fitted estimator so the file name can always
# be built.
best_degree = bestsvc_dict.get('degree', best_svc.get_params()['degree'])
model_tag = 'unibi_C(%.4f)_deg(%i)_K(%s)' % (bestsvc_dict['C'], best_degree, bestsvc_dict['kernel'])

# Persist the tuned estimator.
with open('./svm_model/best_svc_%s.pickle' % model_tag, 'wb') as output:
    pickle.dump(best_svc, output)

# Persist the one-row summary frame alongside it.
with open('./svm_model/df_models_svc_%s.pickle' % model_tag, 'wb') as output:
    pickle.dump(df_models_svc, output)

### >> Cross-Validation (TF-IDF & SVM)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

# Joint search over the TF-IDF settings and the SVC hyperparameters: the
# vectorizer is the first pipeline step, so it is refitted inside every CV
# split.
tfidf = TfidfVectorizer()
clf = svm.SVC()

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', clf),
])

min_df_list = [0.01, 0.05, 0.1, 0.2]  # drop terms that appear in too few documents
max_df_list = [1., 0.9, 0.8]          # drop terms that appear in too many documents
max_features_list = [3000, 4500]
C = [.0001, .001, .01, .1]
degree = [3, 4, 5]
gamma = [1, 10, 100]
probability = [True]

# Vectorizer settings searched for every kernel.
tfidf_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': min_df_list,
    'tfidf__max_df': max_df_list,
    'tfidf__max_features': max_features_list,
}
# SVC settings searched for every kernel.
svc_grid = {
    'clf__C': C,
    'clf__probability': probability,
}

# One sub-grid per kernel so that degree is only searched for poly and gamma
# only for rbf.
param_grid = [
    {**tfidf_grid, **svc_grid, 'clf__kernel': ['linear']},
    {**tfidf_grid, **svc_grid, 'clf__kernel': ['poly'], 'clf__degree': degree},
    {**tfidf_grid, **svc_grid, 'clf__kernel': ['rbf'], 'clf__gamma': gamma},
]

grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid, n_jobs=4)

# The pipeline begins with a TfidfVectorizer, so it must be given the raw
# space-joined document strings; passing the already-vectorised
# features_train matrix would fail inside the vectorizer step.
grid_search.fit(tdidf_features_train_doc_list, labels_train)