In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
# Load the three pre-cleaned splits. Use the *_VAL pickles: per the original
# author's note, each document's word list there contains no repeated words.
# WARNING: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
# NOTE(review): the training file is named "Textual" while the test file is
# named "Intuitive" -- confirm that mixing the two is intentional.
train_df, test_df, val_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "clean_Train_Textual_VAL.pk",
        "clean_Test_Intuitive_VAL.pk",
        "clean_Validation_VAL.pk",
    )
)
val_df.head()

Unnamed: 0,Filename,clean_words
0,ID_1159.txt,"[inderal, great, year, abdomen, history, assoc..."
1,ID_1160.txt,"[leg, sigmoid, inflammatory, suggest, hand, ne..."
2,ID_1162.txt,"[appointment, disposition, episode, cva, levo,..."
3,ID_1167.txt,"[urologist, dilate, diuretic, holosystolic, ye..."
4,ID_1168.txt,"[therapeutic, dilate, adriamycin, great, year,..."


# 建立權重列表想法:
## 1. obesity、obese為肥胖重點單字，所以給予50分評價，如果這篇文章有出現一個肥胖字(總分>=50)就是肥胖

## 2. morbidly、morbid、hyperlipidemia和肥胖相關字最相近，所以給予28分權重，只要出現兩個相近字(總分>=50)就是肥胖

## 3. asthma、htn和肥胖的關聯較弱，所以給予20分權重；由於 *_VAL 資料中字詞不重複，這兩個字同時出現也只有40分，必須再搭配其他關鍵字(總分>=50)才會判定為肥胖

In [4]:
# Keyword -> weight table used by the scoring cells, grouped by weight tier.
# Insertion order is: obesity, obese, morbidly, morbid, hyperlipidemia,
# asthma, htn.
# A candidate entry "elderly": 20 is currently disabled (not in the table).
keyWords = {
    # Direct obesity terms: a single hit already reaches a score of 50.
    **dict.fromkeys(("obesity", "obese"), 50),
    # Closely related terms: 28 points each.
    **dict.fromkeys(("morbidly", "morbid", "hyperlipidemia"), 28),
    # More loosely related terms: 20 points each.
    **dict.fromkeys(("asthma", "htn"), 20),
}

# 測試集資料結果

In [None]:
# Test split: score every document with the keyword weights, then evaluate.
# A document's score is the sum of the keyWords weights of the words found in
# its clean_words list; words that are not in keyWords contribute nothing.
points = [
    sum(keyWords[word] for word in clean_words if word in keyWords)
    for clean_words in test_df.clean_words
]
# A total score of 50 or more is predicted as obese (1), otherwise 0.
preds = [1 if point >= 50 else 0 for point in points]

test_df["points"] = points
test_df["preds"] = preds
test_df.to_csv("point_test.csv")  # save the per-document scores and predictions

pred = preds
true = list(test_df.y)

# Confusion matrix first, then the summary metrics against the `y` labels.
c = confusion_matrix(true, pred)
print("confusion_matrix: \n", c)

# All metrics are computed before any of them is printed.
precision = precision_score(true, pred)
recall = recall_score(true, pred)
accuracy = accuracy_score(true, pred)
f1 = f1_score(true, pred)

print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# 訓練集驗證結果

In [None]:
# Training split: score every document with the keyword weights, then evaluate.
# A document's score is the sum of the keyWords weights of the words found in
# its clean_words list; words that are not in keyWords contribute nothing.
points = [
    sum(keyWords[word] for word in clean_words if word in keyWords)
    for clean_words in train_df.clean_words
]
# A total score of 50 or more is predicted as obese (1), otherwise 0.
preds = [1 if point >= 50 else 0 for point in points]

train_df["points"] = points
train_df["preds"] = preds
train_df.to_csv("point_train.csv")  # save the per-document scores and predictions

pred = preds
true = list(train_df.y)

# Confusion matrix first, then the summary metrics against the `y` labels.
c = confusion_matrix(true, pred)
print("confusion_matrix: \n", c)

# All metrics are computed before any of them is printed.
precision = precision_score(true, pred)
recall = recall_score(true, pred)
accuracy = accuracy_score(true, pred)
f1 = f1_score(true, pred)

print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# 驗證集結果，儲存 >> sample_submission.csv  此方法只有0.48分

In [5]:
# Validation split: score every document with the keyword weights and write
# the predictions to the submission file.
# A document's score is the sum of the keyWords weights of the words found in
# its clean_words list; words that are not in keyWords contribute nothing.
points = [
    sum(keyWords[word] for word in clean_words if word in keyWords)
    for clean_words in val_df.clean_words
]
# A total score of 50 or more is predicted as obese (1), otherwise 0.
preds = [1 if point >= 50 else 0 for point in points]

val_df["Obesity"] = preds

# Drop the text column only from the frame that is written out. Rebinding
# val_df to the dropped frame would remove clean_words for good, so a second
# run of this cell would fail on val_df.clean_words. Keeping val_df intact
# makes the cell safe to re-run; the CSV content is unchanged.
val_df.drop(columns=["clean_words"]).to_csv("sample_submission.csv", index=False)