##  Team 13 TCM SVM ver.

This is the model built by SVM.
Powered by ChatGPT. LOL.

### Step 1. Package import

Import the necessary packages.

In [119]:
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

### Step 2. Data preparation

1. Read data from the .csv file as a Pandas DataFrame.
2. Change the columns with texts into numeric values using LabelEncoder.
3. Split the data into the status/diagnoses/symptoms and the prescriptions.
4. Delete the CHMs that are not often used enough.
5. Split the data into training data and validation data.

In [120]:
# Basic information about the dataset(REMINDER: The dataset is edited):
# Index(not used): 0
# Body status: 1~3
# Diagnosis: 4~7
# Prescription_text(not used): 8~10
# Symptom: 11~125
# Prescription: 126~226

# Total patient data: 797

# 1. Read data from the .csv file as a Pandas DataFrame.
def ReadData(FILENAME):
    data = pd.read_csv(FILENAME)
    # Debug
    # print(f'ReadData: Shape of data = {data.shape}.')
    return data

# 2. Change the columns with texts into numeric values using LabelEncoder.
def TextConvert(data):
    # Converting: 1~7 (Body status + Diagnosis)
    label_encoder = LabelEncoder()
    categorical_columns = list(range(1, 8))
    for i in categorical_columns:
        data.iloc[:, i] = label_encoder.fit_transform(data.iloc[:, i])
    return data

# 3. Split the data into the status/diagnoses/symptoms and the prescriptions.
def SplitXY(data):
    # X: 1~7 + 11~124 (Body status + Diagnosis + Symptom)
    # Y: 125~226 (Prescription)
    split_X = list(range(1, 8)) + list(range(11, 126))
    split_Y = list(range(125, 227))
    X = data.iloc[1:, split_X]
    y = data.iloc[1:, split_Y]
    # Debug
    # print(f'SplitXY: Shape of X and y = {X.shape}, {y.shape}')
    return X, y

# 4. Delete the CHMs that are not often used enough.
def DeleteMedicine(y, threshold):
    for col in y.columns:
        # Debug
        # print(f'Number of {col} is {y[col].sum()}.')
        if y[col].sum() < threshold:
            y = y.drop(col, axis=1)
    # Debug
    # print(f'DeleteMedicine: shape of y is {y.shape}.')
    return y

# 5. Split the data into training data and validation data.
def SplitTrainValid(X, y, stratify=False):
    random_state = 114514
    train_rate = 0.8
    test_rate = 1 - train_rate
    
    if stratify == True:
        # TODO
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_rate, random_state=random_state, stratify=y)
    else:
        random.seed(random_state)
        border = int(X.shape[0] * train_rate)
    
        X_temp = X.sample(n=X.shape[0], random_state=random_state)
        y_temp = y.sample(n=y.shape[0], random_state=random_state)

        X_train = X_temp[:border - 1]
        X_test = X_temp[border:]
        y_train = y_temp[:border - 1]
        y_test = y_temp[border:]
    # Debug
    # print(f'SplitTrainValid: shape of X_train, X_Test, y_train, y_test are {X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}.')
    
    return X_train, X_test, y_train, y_test

### Step 3. Build the SVM model

For the information about sklearn SVC, go to: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

1. Build the SVM models according to every single class.
    - Type 1: Multiple models: Build models according to every single class.
    - Type 2: MultiOutputClassifier model: Build the model according to every single class.
    - Type 3: compute_class_weight model: Build the model using compute_class_weight.
2. Use the model(s) to predict the data.
3. Evaluate the model's performance.

In [126]:
# 1. Build the SVM classfiers according to every single class.
# Type 1: Multiple Models
def SVMModel_1(X_train, y_train):
    svm_model = SVC(kernel='rbf', C=5)
    svm_model.fit(X_train, y_train)
    return svm_model

# Type 2: MultiOutputClassifier model
def SVMModel_2(X_train, y_train):
    svm_model = MultiOutputClassifier(SVC(kernel='rbf', C=5))
    svm_model.fit(X_train, y_train)
    return svm_model

# Type 3: compute_class_weight model
def SVMModel_3(X_train, y_train):
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, class_weights))
    svm_model = SVC(kernel='rbf', C=5, class_weight=class_weight_dict)
    svm_model.fit(X_train, y_train)
    return svm_model

# 2. Use the models to predict the data.
def Predict(model, X_test):
    y_pred = pd.DataFrame(model.predict(X_test))
    return y_pred

# 3. Evaluate the models' performance.
def Evaluate(y_test, y_pred, acc=False, cla=False):
    total_count = 0
    total_accuracy = 0
    for i in range(y_test.shape[0]):
        accuracy_temp = accuracy_score(y_test.iloc[i], y_pred.iloc[i])
        total_count += 1
        total_accuracy += accuracy_temp
        if acc == True:
            print(f'The accuracy for column {i} is {accuracy_temp}.')
    accuracy = total_accuracy / total_count
    if cla == True:
        class_report = classification_report(y_test, y_pred)
        print(f'Classification Report:\n{class_report}')
    return accuracy

### Step 4. Main code and model training

In [122]:
# Step 2 functions
FILENAME = './process_data.csv'
data = ReadData(FILENAME)
data = TextConvert(data)
X, y = SplitXY(data)
y = DeleteMedicine(y, threshold=10)
X_train, X_test, y_train, y_test = SplitTrainValid(X, y)
print("Data preparation completed.")

Data preparation completed.


In [132]:
model_build_principle = 3
# Step 3 functions
if model_build_principle == 1:
    models = []
    for col in y_train.columns:
        models.append(SVMModel_1(X_train, y_train[col]))
    y_pred = pd.DataFrame()
    
    for i in range(len(models)):
        y_temp = Predict(models[i], X_test)
        y_pred = pd.concat([y_pred, y_temp], axis=1)
elif model_build_principle == 2:
    model = SVMModel_2(X_train, y_train)
    y_pred = Predict(model, X_test)
elif model_build_principle == 3:
    models = []
    for col in y_train.columns:
        models.append(SVMModel_3(X_train, y_train[col]))
    y_pred = pd.DataFrame()
    
    for i in range(len(models)):
        y_temp = Predict(models[i], X_test)
        y_pred = pd.concat([y_pred, y_temp], axis=1)
    
eval = Evaluate(y_test, y_pred, acc=True)
print(f'Type {model_build_principle}: The overall accuracy = {eval}.')

The accuracy for column 0 is 0.8089887640449438.
The accuracy for column 1 is 0.9101123595505618.
The accuracy for column 2 is 0.9101123595505618.
The accuracy for column 3 is 0.9662921348314607.
The accuracy for column 4 is 0.797752808988764.
The accuracy for column 5 is 0.9662921348314607.
The accuracy for column 6 is 0.8539325842696629.
The accuracy for column 7 is 0.8426966292134831.
The accuracy for column 8 is 0.898876404494382.
The accuracy for column 9 is 0.797752808988764.
The accuracy for column 10 is 0.9101123595505618.
The accuracy for column 11 is 0.898876404494382.
The accuracy for column 12 is 0.8426966292134831.
The accuracy for column 13 is 0.8876404494382022.
The accuracy for column 14 is 0.7191011235955056.
The accuracy for column 15 is 0.8089887640449438.
The accuracy for column 16 is 0.9213483146067416.
The accuracy for column 17 is 0.7865168539325843.
The accuracy for column 18 is 0.8539325842696629.
The accuracy for column 19 is 0.8314606741573034.
The accuracy f

In [None]:
# Threshold/data number
# Threshold = 10, data number = 89
# Thrshold = 250, data number = 10

# TODO:
# 1. (Completed) Type 1: Determine the kernel and C in SVC()