##  Team 13 TCM SVM ver.

This is the model built by SVM.
Powered by ChatGPT. LOL.

### Step 1. Package import

Import the necessary packages.

In [104]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt

### Step 2. Data preparation

1. Read data from the .csv file as a Pandas DataFrame.
2. Change the columns with texts into numeric values using LabelEncoder.
3. Split the data into the status/diagnoses/symptoms and the prescriptions.
4. Delete the CHMs that are not often used enough.
5. Split the data into training data and validation data.

In [121]:
# Basic information about the dataset(REMINDER: The data is edited):
# Indexing(not used): 0
# Body status: 1~3
# Diagnosis: 4~7
# Prescription-text(not used): 8~10
# Symptom: 11~125
# Prescription: 126~226

# Total patient data: 797

# 1. Read data from the .csv file as a Pandas DataFrame.
def ReadData(FILENAME):
    data = pd.read_csv(FILENAME)
    # Debug
    print("ReadData:")
    print(f'Shape of data = ({data.shape[0]}, {data.shape[1]}).')
    return data

# 2. Change the columns with texts into numeric values using LabelEncoder.
def TextConvert(data):
   # Body status: 1~3, Diagnosis: 4~7
    label_encoder = LabelEncoder()
    categorical_columns = list(range(1, 8))
    for i in categorical_columns:
        data.iloc[:, i] = label_encoder.fit_transform(data.iloc[:, i])
    return data

# 3. Split the data into the status/diagnoses/symptoms and the prescriptions.
def SplitXY(data):
    # Body status: 1~3, Diagnosis: 4~7, Symptom: 11~124
    # Prescription: 125~226
    split_X = list(range(1, 8)) + list(range(11, 126))
    split_Y = list(range(125, 227))
    X = data.iloc[1:, split_X]
    y = data.iloc[1:, split_Y]
    # Debug
    print("SplitXY:")
    print(f'Shape of X = ({X.shape[0]}, {X.shape[1]}). First 10 data of X:')
    print(X.iloc[:10, :10])
    print(f'Shape of y = ({y.shape[0]}, {y.shape[1]}). First 10 data of y:')
    print(y.iloc[:10, :10])
    return X, y

# 4. Delete the CHMs that are not often used enough.
def DeleteMedicine(y):
    threshold = 10
    for col in y.columns:
        if y[col].sum() < threshold:
            y = y.drop(col, axis=1)
    

# 5. Split the data into training data and validation data.
def SplitTrainValid(X, y):
    state = 114514
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=state, stratify=y)
    # Debug
    print("SplitTrainValid:")
    print(f'shape of X_train is ({X_train.shape[0]}, {X_train.shape[1]}).')
    print(f'shape of X_test is ({X_test.shape[0]}, {X_test.shape[1]}).')
    print(f'shape of y_train is ({y_train.shape[0]}, {y_train.shape[1]}).')
    print(f'shape of y_test is ({y_test.shape[0]}, {y_test.shape[1]}).')
    
    return X_train, X_test, y_train, y_test

### Step 3. Build the SVM model

For the information about sklearn SVC, go to: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
1. Build the SVM classfier.
2. Use the model to predict the data.
3. Evaluate the model's performance.

In [122]:
# 1. Build the SVM classfier.
def SVMModel(X_train, y_train):
    svm_model = MultiOutputClassifier(SVC(kernel='rbf', C=1.0)) # TODO: Determine the kernel function.
    svm_model.fit(X_train, y_train)
    return svm_model

# 2. Use the model to predict the data.
def Predict(model, X_test):
    y_pred = model.predict(X_test)
    return y_pred

# 3. Evaluate the model's performance.
def Evaluate(y_test, y_pred) :
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print('Accuracy: ' + accuracy)
    print(f'Confusion Matrix:\n' + conf_matrix)
    print(f'Classification Report:\n' + class_report)

### Step 4. Main code and model training

In [123]:
# Step 2 functions
FILENAME = './process_data.csv'
data = ReadData(FILENAME)
data = TextConvert(data)
X, y = SplitXY(data)
DeleteMedicine(y)
X_train, X_test, y_train, y_test = SplitTrainValid(X, y)
print("Data preparation completed.")

# Step 3 functions
model = SVMModel(X_train, y_train)
y_pred = Predict(model, X_test)
Evaluate(y_test, y_pred)

ReadData:
Shape of data = (797, 227).
SplitXY:
Shape of X = (796, 122). First 10 data of X:
   性別 年齡及體型 月經 脈診 舌診 眼診 耳診  乳癌  肺癌  胰臟癌
1   0     0  1  0  3  1  1   0   0    0
2   0     0  1  0  3  1  1   0   0    0
3   0     2  1  1  3  1  1   0   0    0
4   0     2  1  0  0  1  1   0   0    0
5   0     2  1  1  3  1  1   0   0    0
6   0     2  1  0  3  1  1   0   0    0
7   0     0  1  0  3  1  1   0   0    0
8   0     0  1  0  3  1  1   0   0    0
9   0     0  0  1  2  1  1   0   0    0
10  0     0  1  1  3  1  1   0   0    0
Shape of y = (796, 102). First 10 data of y:
    麻黃  桂枝  荊芥  防風  細辛  白芷  生薑  辛夷  葛根  升麻
1    0   1   0   0   1   1   0   0   0   0
2    0   1   0   0   0   0   1   0   0   0
3    0   0   0   0   0   0   0   0   0   0
4    0   1   0   0   0   0   1   1   0   0
5    0   1   0   0   0   0   0   0   0   0
6    0   0   0   0   0   0   0   0   0   0
7    0   1   0   0   0   0   0   0   0   0
8    0   1   0   0   0   0   0   0   0   0
9    0   1   0   0   1   0   0   0  

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.