In [34]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [43]:
# Location of the saved Word2Vec model, relative to the notebook's working directory.
W2V_MODEL_PATH = "w2v_model/word2vec_WordNumBig2.model"

# Restore the pre-trained gensim Word2Vec model; later cells read embeddings via model.wv.
model = gensim.models.Word2Vec.load(W2V_MODEL_PATH)

In [44]:
# Dimensionality of a single word embedding; the document vectors built in later
# cells are initialised with np.zeros(vector_size) and therefore share this length.
embeddings = model.wv
vector_size = embeddings.vector_size
vector_size

100

# 驗證集

In [45]:
# Build the validation feature matrix: one averaged word-embedding vector per document.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load trusted files.
val_df = pd.read_pickle("clean_Validation.pk")

# NOTE(review): this is a plain (unweighted) average, whereas the training features
# built in a later cell scale each word through keyWords. A classifier must be given
# validation features built the same way as its training features.
text2vec = []
for text in val_df.clean_words:
    wordNumber = 0                         # number of in-vocabulary words seen so far
    totalWordVec = np.zeros(vector_size)   # running sum of their embeddings
    for word in text:
        try:
            totalWordVec += model.wv[word]
        except KeyError:
            # Word is not in the Word2Vec vocabulary: skip it. Any other error is
            # a real problem and is allowed to surface instead of being swallowed.
            continue
        wordNumber += 1
    # A document without any in-vocabulary word keeps the all-zero vector; dividing
    # by zero here would fill the row with NaN and break the classifiers downstream.
    text2vec.append(totalWordVec / wordNumber if wordNumber else totalWordVec)
x_val = np.array(text2vec)

# 訓練集

In [46]:
# Per-word multipliers used when document vectors are built: the embedding of a
# listed word is scaled by its value here. Words are grouped by weight; the
# resulting insertion order is the same as listing them one by one.
keyWords = {
    **dict.fromkeys(("obesity", "obese"), 4),
    **dict.fromkeys(("morbidly", "morbid", "hyperlipidemia"), 2),
    **dict.fromkeys(("asthma", "htn"), 1),
}

In [47]:
# Load the two pre-cleaned datasets (test first, then train) from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
test_df, train_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("clean_Test_Intuitive_VAL.pk", "clean_Train_Textual.pk")
)

In [48]:
# Weight applied to the embedding of any word that has no entry in keyWords.
DEFAULT_WORD_WEIGHT = 0.1
# Set to True to also use the documents and labels of train_df for training.
USE_TRAIN_TEXTS = False


def weighted_text_vector(words, weights, default_weight=DEFAULT_WORD_WEIGHT):
    """Return the weighted average embedding of one tokenised document.

    Each in-vocabulary word contributes ``weight * model.wv[word]``, where the
    weight is looked up in ``weights`` and falls back to ``default_weight``.
    The sum is divided by the number of in-vocabulary words (not by the sum of
    the weights). Words missing from the Word2Vec vocabulary are skipped.

    Args:
        words: iterable of tokens of a single document.
        weights: mapping from word to its multiplier.
        default_weight: multiplier for words absent from ``weights``.

    Returns:
        A 1-D numpy array of length ``vector_size``. A document with no
        in-vocabulary word yields the all-zero vector instead of NaN.
    """
    total = np.zeros(vector_size)
    known_words = 0
    for word in words:
        try:
            vector = model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: ignore it. Other errors are not swallowed.
            continue
        total += weights.get(word, default_weight) * vector
        known_words += 1
    return total / known_words if known_words else total


# Training features and labels come from test_df, optionally extended with train_df.
text2vec = [weighted_text_vector(text, keyWords) for text in test_df.clean_words]
y = test_df.y
if USE_TRAIN_TEXTS:
    text2vec.extend(weighted_text_vector(text, keyWords) for text in train_df.clean_words)
    y = np.concatenate((test_df.y, train_df.y), axis=0)
X = np.array(text2vec)

# The validation features computed in the earlier cell are plain averages, i.e. on
# a different scale from X. Rebuild them with the same weighting so the classifiers
# predict on inputs constructed exactly like their training data.
x_val = np.array([weighted_text_vector(text, keyWords) for text in val_df.clean_words])

# 分割資料成 X_train, X_test, y_train, y_test

In [49]:
# Hold out 10% of the samples for testing. The fixed seed makes the split identical
# on every run, so the metrics of the model cells below refer to the same test rows.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)

# Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the document vectors; the seed makes the fitted forest (and
# therefore the reported metrics and the submission) reproducible across runs.
clf = RandomForestClassifier(max_depth=300, n_estimators=400, random_state=42)
clf.fit(X_train, y_train)

# Test split: confusion matrix and summary metrics.
pred = clf.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# Validation set: predict and write the submission file.
# Build a new frame instead of aliasing val_df, so val_df itself is not modified.
# Note: the other model cells write to the same path, so the file ends up holding
# the predictions of whichever cell ran last.
pred = clf.predict(x_val)
sample = val_df.drop(["clean_words"], axis=1).assign(Obesity=pred)
sample.to_csv("sample_submission.csv", index=False)

confusion_matrix: 
 [[30  3]
 [ 9 38]]
precison: 0.926829268292683
recall: 0.8085106382978723
accuracy: 0.85
f1: 0.8636363636363636


# Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the document vectors.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Test split: confusion matrix and summary metrics.
pred = gnb.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)


# Validation set: predict and write the submission file.
# Build a new frame instead of aliasing val_df, so val_df itself is not modified.
# Note: the other model cells write to the same path, so the file ends up holding
# the predictions of whichever cell ran last.
pred = gnb.predict(x_val)
sample = val_df.drop(["clean_words"], axis=1).assign(Obesity=pred)
sample.to_csv("sample_submission.csv", index=False)

confusion_matrix: 
 [[21  0]
 [ 1 18]]
precison: 1.0
recall: 0.9473684210526315
accuracy: 0.975
f1: 0.972972972972973


# XGBoost

In [12]:
from xgboost.sklearn import XGBClassifier

# Gradient-boosted trees on the document vectors.
xgb = XGBClassifier(
    n_estimators=200,    # number of trees
    learning_rate=0.3,   # step size applied to each tree's contribution
    max_depth=50,        # tree depth; larger values overfit more easily
)
xgb.fit(X_train, y_train)

# Test split: confusion matrix and summary metrics.
pred = xgb.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)


# Validation set: predict and write the submission file.
# Build a new frame instead of aliasing val_df, so val_df itself is not modified.
# Note: the other model cells write to the same path, so the file ends up holding
# the predictions of whichever cell ran last.
pred = xgb.predict(x_val)
sample = val_df.drop(["clean_words"], axis=1).assign(Obesity=pred)
sample.to_csv("sample_submission.csv", index=False)




confusion_matrix: 
 [[21  0]
 [ 0 19]]
precison: 1.0
recall: 1.0
accuracy: 1.0
f1: 1.0
