##  Team 13 TCM SVM ver.

This is the model built by SVM.

### Step 1. Package import

Import the necessary packages.

In [2]:
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

### Step 2. Data preparation

1. Read data from the .csv file as a Pandas DataFrame.
2. Change the columns with texts into numeric values using LabelEncoder.
3. Split the data into the status/diagnoses/symptoms and the prescriptions.
4. Delete the CHMs that are not often used enough.
5. Split the data into training data and validation data.

In [12]:
# Basic information about the dataset(REMINDER: The dataset is edited):
# Index(not used): 0
# Body status: 1~3
# Diagnosis: 4~7
# Prescription_text(not used): 8~10
# Symptom: 11~125
# Prescription: 126~226

# Total patient data: 797

# 1. Read data from the .csv file as a Pandas DataFrame.
def ReadData(FILENAME):
    data = pd.read_csv(FILENAME)
    return data

# 2. Change the columns with texts into numeric values using LabelEncoder.
def TextConvert(data):
    # Converting: 1~7 (Body status + Diagnosis)
    label_encoder = LabelEncoder()
    categorical_columns = list(range(1, 8))
    for i in categorical_columns:
        data.iloc[:, i] = label_encoder.fit_transform(data.iloc[:, i])
    return data

# 3. Split the data into the status/diagnoses/symptoms and the prescriptions.
def SplitXY(data):
    # X: 1~7 + 11~124 (Body status + Diagnosis + Symptom)
    # Y: 125~226 (Prescription)
    split_X = list(range(1, 8)) + list(range(11, 126))
    split_Y = list(range(125, 227))
    X = data.iloc[1:, split_X]
    y = data.iloc[1:, split_Y]
    return X, y

# 4. Delete the CHMs that are not often used enough.
def DeleteMedicine(y, threshold):
    for col in y.columns:
        if y[col].sum() < threshold:
            y = y.drop(col, axis=1)
    print(f"There are {y.shape[1]} medicines.")
    return y

# 5. Split the data into training data and validation data.
def SplitTrainValid(X, y):
    random_state = 114514
    random.seed(random_state)
    train_rate = 0.8
    border = int(X.shape[0] * train_rate)
    
    X_temp = X.sample(n=X.shape[0], random_state=random_state)
    y_temp = y.sample(n=y.shape[0], random_state=random_state)
    X_train = X_temp[:border - 1]
    X_test = X_temp[border:]
    y_train = y_temp[:border - 1]
    y_test = y_temp[border:]
    
    return X_train, X_test, y_train, y_test

### Step 3. Build the SVM model

For the information about sklearn SVC, go to: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

1. Build the SVM models.
    - Type 1: Multiple models: Build models according to every single columns.
    - Type 2: MultiOutputClassifier model: Build the model according to all the columns.
    - Type 3: Weighted model: Build the weighted model to handle the imbalanced data.
2. Use the model to predict the data.
3. Evaluate the model's performance, including the accuracy and the f1-score.

In [4]:
# 1. Build the SVM models.
# Type 1: Multiple Models
def SVMModel_1(X_train, y_train, C=5):
    svm_model = SVC(kernel='rbf', C=C)
    svm_model.fit(X_train, y_train)
    return svm_model

# Type 2: MultiOutputClassifier model
def SVMModel_2(X_train, y_train, C=5):
    svm_model = MultiOutputClassifier(SVC(kernel='rbf', C=C))
    svm_model.fit(X_train, y_train)
    return svm_model

# Type 3: weighted model
def SVMModel_3(X_train, y_train, C=5, bias_const=1):
    # Generate the dictionary of weight.
    class_weight_dic = {}
    unique_values, counts = np.unique(y_train, return_counts=True)
    value_frequency_dict = dict(zip(unique_values, counts))
    total = value_frequency_dict[0] + value_frequency_dict[1]
    class_weight_dic = {0: value_frequency_dict[1] / total, 1: bias_const * value_frequency_dict[0] / total}
    
    svm_model = SVC(kernel='rbf', C=C, class_weight=class_weight_dic)
    svm_model.fit(X_train, y_train)
    return svm_model

# 2. Use the model to predict the data.
def Predict(model, X_test):
    y_pred = pd.DataFrame(model.predict(X_test))
    return y_pred

# 3. Evaluate the models' performance, including the accuracy and the f1-score.
def Evaluate(y_test, y_pred, model_build_principle, acc=False, acc_detail=False, f1=False, f1_detail=False):
    total_count = 0
    total_accuracy = 0 # Calculate the partial and overall accuracy.
    data = [0, 0, 0, 0] # [TP, FP, TN, FN]. Calculate the TP, FP, TN, FN, and f1-score.
    for i in range(y_test.shape[0]):
        accuracy_temp = accuracy_score(y_test.iloc[i], y_pred.iloc[i])
        data_temp = [0, 0, 0, 0]
        total_count += 1
        total_accuracy += accuracy_temp
        if acc_detail == True:
            print(f'The accuracy for column {i} is {accuracy_temp}.')
        for j in range(y_test.shape[1]):
            if y_test.iloc[i, j] == 1 and y_pred.iloc[i, j] == 1:
                data_temp[0] += 1
            elif y_test.iloc[i, j] == 0 and y_pred.iloc[i, j] == 1:
                data_temp[1] += 1
            elif y_test.iloc[i, j] == 0 and y_pred.iloc[i, j] == 0:
                data_temp[2] += 1
            elif y_test.iloc[i, j] == 1 and y_pred.iloc[i, j] == 0:
                data_temp[3] += 1
        if f1_detail == True:
            print(f'TP, FP, TN, and FN for column {i} are [{data_temp[0]}, {data_temp[1]}, {data_temp[2]}, {data_temp[3]}].')
        for j in range(4):
            data[j] += data_temp[j]
    accuracy = total_accuracy / total_count
    if acc == True:
        print(f'Type {model_build_principle}: The overall accuracy = {accuracy}.')
    if f1 == True:
        print(f'Type {model_build_principle}:')
        print(f'The overall TP, FP, TN, and FN are [{data[0]}, {data[1]}, {data[2]}, {data[3]}].')
        precision = data[0] / (data[0] + data[1])
        recall = data[0] / (data[0] + data[3])
        f1_score = 2 / (1 / precision + 1 / recall)
        print(f'The overall f1-score is {f1_score}.')

### Step 4. Main code and model training

In [15]:
# Step 2 functions
FILENAME = './process_data.csv'
data = ReadData(FILENAME)
data = TextConvert(data)
X, y = SplitXY(data)
y = DeleteMedicine(y, threshold=250)
X_train, X_test, y_train, y_test = SplitTrainValid(X, y)
print("Data preparation completed.")

There are 10 medicines.
Data preparation completed.


In [17]:
# Single-time model building
model_build_principle = 3
# Step 3 functions
if model_build_principle == 1:
    models = []
    for col in y_train.columns:
        models.append(SVMModel_1(X_train, y_train[col], C=10))
    y_pred = pd.DataFrame()
    for i in range(len(models)):
        y_temp = Predict(models[i], X_test)
        y_pred = pd.concat([y_pred, y_temp], axis=1)
elif model_build_principle == 2:
    model = SVMModel_2(X_train, y_train, C=10)
    y_pred = Predict(model, X_test)
elif model_build_principle == 3:
    models = []
    for col in y_train.columns:
        models.append(SVMModel_3(X_train, y_train[col], C=19, bias_const=1.1))
    y_pred = pd.DataFrame()
    for i in range(len(models)):
        y_temp = Predict(models[i], X_test)
        y_pred = pd.concat([y_pred, y_temp], axis=1)
    
Evaluate(y_test, y_pred, model_build_principle, acc=True, f1=True)

Type 3: The overall accuracy = 0.6781250000000002.
Type 3:
The overall TP, FP, TN, and FN are [434, 227, 651, 288].
The overall f1-score is 0.6276211135213304.


In [None]:
# Iterative model building - parameter adjustment
for c in range(19, 31):
    for model_build_principle in range(3, 4):
        if model_build_principle == 3:
            for bias_const in range(7, 13):
                models = []
                for col in y_train.columns:
                    models.append(SVMModel_3(X_train, y_train[col], C=c, bias_const=bias_const / 10))
                y_pred = pd.DataFrame()
                for i in range(len(models)):
                    y_temp = Predict(models[i], X_test)
                    y_pred = pd.concat([y_pred, y_temp], axis=1)
                print(f'C = {c}, bias_const = {bias_const}.')
                Evaluate(y_test, y_pred, model_build_principle, acc=True, f1=True)
        elif model_build_principle == 2:
            model = SVMModel_2(X_train, y_train, C=c)
            y_pred = Predict(model, X_test)
            print(f'C = {c}.')
            Evaluate(y_test, y_pred, model_build_principle, acc=True, f1=True)

In [None]:
# DeleteMedicine: threshold/data number
# threshold=7, data number=102
# threshold=10, data number=89
# threshold=250, data number=10

# SVMModel_2: threshold/C/accuracy/f1_score
# threshold=10, C=4, accuracy=0.891, f1_score=0.453
# threshold=10, C=5, accuracy=0.891, f1_score=0.465
# threshold=10, C=6, accuracy=0.890, f1_score=0.472
# threshold=10, C=7, accuracy=0.889, f1_score=0.473
# threshold=10, C=8, accuracy=0.888, f1_score=0.475
# threshold=10, C=9, accuracy=0.888, f1_score=0.479
# threshold=10, C=10, accuracy=0.888, f1_score=0.482

# SVMModel_3: threshold/C/bias_const/accuracy/f1_score
# threshold=10, C=14, bias_const=0.6, accuracy=0.877, f1_score=0.473
# threshold=10, C=14, bias_const=0.9, accuracy=0.870, f1_score=0.484
# threshold=10, C=15, bias_const=0.7, accuracy=0.875, f1_score=0.480
# threshold=10, C=15, bias_const=1.2, accuracy=0.867, f1_score=0.486
# threshold=10, C=16, bias_const=0.7, accuracy=0.876, f1_score=0.481
# threshold=10, C=16, bias_const=1.2, accuracy=0.869, f1_score=0.489
# threshold=10, C=17, bias_const=0.7, accuracy=0.877, f1_score=0.480
# threshold=10, C=17, bias_const=1.2, accuracy=0.869, f1_score=0.486
# threshold=10, C=18, bias_const=0.7, accuracy=0.877, f1_score=0.481
# threshold=10, C=18, bias_const=0.8, accuracy=0.875, f1_score=0.485
# threshold=10, C=19, bias_const=0.8, accuracy=0.875, f1_score=0.483
# threshold=10, C=19, bias_const=1.1, accuracy=0.872, f1_score=0.487
# threshold=10, C=20, bias_const=0.8, accuracy=0.875, f1_score=0.481
# threshold=10, C=20, bias_const=1.1, accuracy=0.872, f1_score=0.485

# TODO:
# Completed.