In [1]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the pre-trained Word2Vec model from disk. Its `wv` attribute supplies
# the per-word embedding vectors used by the feature-building cells below.
model = gensim.models.Word2Vec.load("w2v_model/word2vec_WordNumBig2.model")

In [3]:
# Dimensionality of the word embeddings; used to size the per-document
# accumulator vectors in the feature-building loops.
vector_size = model.wv.vector_size
vector_size

100

# 驗證集

In [4]:
# Build one feature vector per validation document by averaging the Word2Vec
# embeddings of its in-vocabulary words.
# NOTE(review): the training features built further down scale the words listed
# in `keyWords` by a weight, but no weighting is applied here -- confirm whether
# the validation features are meant to differ from the training features.
val_df = pd.read_pickle("clean_Validation.pk")  # pickle: only load trusted files
text2vec = []
for text in val_df.clean_words:
    wordNumber = 0                         # in-vocabulary words seen so far
    totalWordVec = np.zeros(vector_size)   # running sum of their embeddings
    for word in text:
        try:
            totalWordVec += model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: it has no embedding, so skip it.
            # Any other error is a real problem and is allowed to surface.
            continue
        wordNumber += 1
    # A document with no in-vocabulary words would otherwise produce 0/0 = NaN
    # in every dimension; keep the all-zero vector so the row stays finite.
    text2vec.append(totalWordVec / wordNumber if wordNumber else totalWordVec)
x_val = np.array(text2vec)

# 訓練集

In [5]:
# Multipliers applied to the embeddings of selected terms when the training
# feature vectors are summed; any word not listed here is added unweighted.
keyWords = dict(
    obesity=4,
    obese=4,
    morbidly=2,
    morbid=2,
    hyperlipidemia=2,
)

In [6]:
# Load the labelled data used for model fitting and hold-out evaluation.
# NOTE(review): despite the `test_df` name and the file name, this frame is what
# the classifiers are fitted on (after the train/test split) -- confirm naming.
test_df = pd.read_pickle("clean_Test_Intuitive_VAL.pk")

In [7]:
# Build one feature vector per labelled document: sum the Word2Vec embeddings
# of its in-vocabulary words, scaling the words listed in `keyWords` by their
# weight, then divide by the number of in-vocabulary words.
# NOTE: the divisor is the plain word count, not the sum of the weights, so the
# result is not a normalised weighted mean.
text2vec = []
for text in test_df.clean_words:
    wordNumber = 0                         # in-vocabulary words seen so far
    totalWordVec = np.zeros(vector_size)   # running (weighted) sum of embeddings
    for word in text:
        try:
            wordVec = model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: it has no embedding, so skip it.
            # Any other error is a real problem and is allowed to surface.
            continue
        # Keyword weighting: words not listed in keyWords get a weight of 1.
        totalWordVec += keyWords.get(word, 1) * wordVec
        wordNumber += 1
    # A document with no in-vocabulary words would otherwise produce 0/0 = NaN
    # in every dimension; keep the all-zero vector so the row stays finite.
    text2vec.append(totalWordVec / wordNumber if wordNumber else totalWordVec)

X = np.array(text2vec)
y = test_df.y

# 分割資料成 X_train, X_test, y_train, y_test

In [8]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split. random_state is fixed (same seed
# as the train/test split) so the metrics below are repeatable across runs.
clf = RandomForestClassifier(max_depth=300, n_estimators=400, random_state=42)
clf.fit(X_train, y_train)

# test: confusion matrix and related metrics on the held-out split
pred = clf.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# val: predict the validation set and write the submission file.
# NOTE: every model cell writes to the same "sample_submission.csv", so the
# file ends up holding the predictions of whichever cell ran last.
pred = clf.predict(x_val)
# assign() returns a new frame, so val_df itself is left unmodified and this
# cell can be re-run without depending on state left by an earlier run.
sample = val_df.assign(Obesity=pred).drop(["clean_words"], axis=1)
sample.to_csv("sample_submission.csv", index=False)

confusion_matrix: 
 [[16  5]
 [ 6 13]]
precison: 0.7222222222222222
recall: 0.6842105263157895
accuracy: 0.725
f1: 0.7027027027027027


# Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# test: confusion matrix and related metrics on the held-out split
pred = gnb.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# val: predict the validation set and write the submission file.
# NOTE: every model cell writes to the same "sample_submission.csv", so the
# file ends up holding the predictions of whichever cell ran last.
pred = gnb.predict(x_val)
# assign() returns a new frame, so val_df itself is left unmodified and this
# cell can be re-run without depending on state left by an earlier run.
sample = val_df.assign(Obesity=pred).drop(["clean_words"], axis=1)
sample.to_csv("sample_submission.csv", index=False)

confusion_matrix: 
 [[16  5]
 [ 7 12]]
precison: 0.7058823529411765
recall: 0.631578947368421
accuracy: 0.7
f1: 0.6666666666666667


# XGBoost

In [13]:
from xgboost.sklearn import XGBClassifier

# Fit a gradient-boosted tree classifier on the training split.
xgb = XGBClassifier(
                    n_estimators=200,    # number of trees
                    learning_rate= 0.3,  # acts as the learning rate
                    max_depth=50         # tree depth; larger values overfit more easily
                    )
xgb.fit(X_train, y_train)

# test: confusion matrix and related metrics on the held-out split
pred = xgb.predict(X_test)
c = confusion_matrix(y_test, pred)
print("confusion_matrix: \n", c)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
print("precison:", precision)
print("recall:", recall)
print("accuracy:", accuracy)
print("f1:", f1)

# val: predict the validation set and write the submission file.
# NOTE: every model cell writes to the same "sample_submission.csv", so the
# file ends up holding the predictions of whichever cell ran last.
pred = xgb.predict(x_val)
# assign() returns a new frame, so val_df itself is left unmodified and this
# cell can be re-run without depending on state left by an earlier run.
sample = val_df.assign(Obesity=pred).drop(["clean_words"], axis=1)
sample.to_csv("sample_submission.csv", index=False)




confusion_matrix: 
 [[16  5]
 [ 5 14]]
precison: 0.7368421052631579
recall: 0.7368421052631579
accuracy: 0.75
f1: 0.7368421052631579
