In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('Annotated_data.csv')

# undersample the "no distortion" class so there's only 250 samples
no_distortion = data[data['Dominant Distortion'] == 'No Distortion']
distortion = data[data['Dominant Distortion'] != 'No Distortion']
no_distortion = no_distortion.sample(n=250, random_state=42)
data = pd.concat([no_distortion, distortion])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Patient Question'], data['Dominant Distortion'], test_size=0.2, random_state=42)

# Create a label encoder object
le = LabelEncoder()

# Fit the encoder to the training labels
y_train_encoded = le.fit_transform(y_train)

# Transform the testing labels
y_test_encoded = le.transform(y_test)

# Create the vectorizer and model objects
vectorizer = CountVectorizer()
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), n_jobs=-1, colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.5)

# Preprocess the text data and vectorize using the provided vectorizer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the model on the training data
model.fit(X_train_vectorized, y_train_encoded)

# Predict the labels for the testing data
y_pred = model.predict(X_test_vectorized)

# Calculate the F1-score
f1 = f1_score(y_test_encoded, y_pred, average='weighted')

# Output the F1-score
print(f"F1-score: {f1}")

# f1 before using best parameters: 0.1926
# f1 after using best parameters: 0.172

F1-score: 0.1719859377256203


In [6]:
# hyperparameter tuning on XGBoost using undersampled data
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import RandomizedSearchCV, HalvingGridSearchCV

# Create a dictionary of parameters to test using only arrays
params = {
    'learning_rate': [0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1],
    'min_child_weight': [1, 3]
    
}

"""

    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1],
    'min_child_weight': [1, 3]
"""

# Create a HalvingGridSearchCV and a RandomizedSearchCV object with the model and vectorizer
search = HalvingGridSearchCV(xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), n_jobs=-1), params, cv=5, n_jobs=-1, verbose=1)
#search = RandomizedSearchCV(xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), n_jobs=-1), params, n_iter=100, cv=5, n_jobs=-1, verbose=1)

# Train and test the model
search.fit(X_train_vectorized, y_train_encoded)

# Predict the labels for the testing data
y_pred = search.predict(X_test_vectorized)

# Calculate the F1-score
f1 = f1_score(y_test_encoded, y_pred, average='weighted')

# Output the F1-score
print(f"F1-score: {f1}")

# Print the best parameters
print(search.best_params_)

n_iterations: 3
n_required_iterations: 5
n_possible_iterations: 3
min_resources_: 110
max_resources_: 1477
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 162
n_resources: 110
Fitting 5 folds for each of 162 candidates, totalling 810 fits
----------
iter: 1
n_candidates: 54
n_resources: 330
Fitting 5 folds for each of 54 candidates, totalling 270 fits
----------
iter: 2
n_candidates: 18
n_resources: 990
Fitting 5 folds for each of 18 candidates, totalling 90 fits
F1-score: 0.16393040345996615
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}


In [20]:

def tuneParams(model, params, X_train, y_train, X_test, y_test):
    # Create a HalvingGridSearchCV and a RandomizedSearchCV object with the model and vectorizer
    search = HalvingGridSearchCV(model, params, cv=5, n_jobs=-1, verbose=1)
    #search = RandomizedSearchCV(model, params, n_iter=100, cv=5, n_jobs=-1, verbose=1)

    # Train and test the model
    search.fit(X_train, y_train)

    # Predict the labels for the testing data
    y_pred = search.predict(X_test)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Predict the labels for the testing data
    y_pred = model.predict(X_test)

    # Calculate the F1-score
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Output the F1-score
    print(f"{model.__class__.__name__} F1-score: {f1}")

    # Print the best parameters
    print(search.best_params_)

In [23]:
# Create a label encoder object
le = LabelEncoder()

# Fit the encoder to the training labels
y_train_encoded = le.fit_transform(y_train)

# Transform the testing labels
y_test_encoded = le.transform(y_test)
tuneParams(xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), n_jobs=-1), params, X_train_vectorized, y_train_encoded, X_test_vectorized, y_test_encoded)

n_iterations: 3
n_required_iterations: 5
n_possible_iterations: 3
min_resources_: 110
max_resources_: 1477
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 162
n_resources: 110
Fitting 5 folds for each of 162 candidates, totalling 810 fits
----------
iter: 1
n_candidates: 54
n_resources: 330
Fitting 5 folds for each of 54 candidates, totalling 270 fits
----------
iter: 2
n_candidates: 18
n_resources: 990
Fitting 5 folds for each of 18 candidates, totalling 90 fits
XGBClassifier F1-score: 0.192562687169555
{'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 1}


In [24]:
# turn parameters of LinearSVC
from sklearn.svm import LinearSVC

vectorizer = TfidfVectorizer()
model = LinearSVC(C=1)

X_train, X_test, y_train, y_test = train_test_split(data['Patient Question'], data['Dominant Distortion'], test_size=0.2, random_state=42)

# Preprocess the text data and vectorize using the provided vectorizer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the model on the training data
model.fit(X_train_vectorized, y_train)

# Predict the labels for the testing data
y_pred = model.predict(X_test_vectorized)

# Calculate the F1-score
f1 = f1_score(y_test, y_pred, average='weighted')

# Output the F1-score
print(f"F1-score: {f1}")


tuneParams(LinearSVC(), {'C': [0.1, 1, 10, 100]}, X_train_vectorized, y_train, X_test_vectorized, y_test)



F1-score: 0.19165301643143118
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 492
max_resources_: 1477
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 492
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 1
n_candidates: 2
n_resources: 1476
Fitting 5 folds for each of 2 candidates, totalling 10 fits
LinearSVC F1-score: 0.19165301643143118
{'C': 0.1}




In [None]:
# Load the data
data = pd.read_csv('Annotated_data.csv')

# train with no non-distortion data
distortion = data[data['Dominant Distortion'] != 'No Distortion']
data = distortion

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['Patient Question'], data['Dominant Distortion'], test_size=0.2, random_state=42)

# Create a label encoder object
le = LabelEncoder()

# Fit the encoder to the training labels
y_train_encoded = le.fit_transform(y_train)

# Transform the testing labels
y_test_encoded = le.transform(y_test)

# Create the vectorizer and model objects
vectorizer = CountVectorizer()
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_), n_jobs=-1, colsample_bytree=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.5)

# Preprocess the text data and vectorize using the provided vectorizer
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)