In [1]:
# This file is intended to implement Class Weighting and Hyperparameter Optimization for the best performing model found in the baseline.ipynb notebook, LinearSVC with TF-IDF Vectorization.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Load the data
data = pd.read_csv('Annotated_data.csv')
discreteData = data[['Id_Number', 'Patient Question', 'Dominant Distortion']].copy()
discreteData.loc[discreteData['Dominant Distortion'] != 'No Distortion', 'Dominant Distortion'] = 'Distorted'


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(discreteData['Patient Question'], discreteData['Dominant Distortion'], test_size=0.2, random_state=42)

# Vectorize the data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))

# Create the model
model = LinearSVC(class_weight=class_weights)

# Fit the model
model.fit(X_train, y_train)

# Predict the test set
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

# Hyperparameter Optimization
# Define the hyperparameters
param_distributions = {
    'C': np.logspace(-4, 4, 20),
    'loss': ['hinge', 'squared_hinge'],
    'dual': [True, False]
}

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(LinearSVC(class_weight=class_weights), param_distributions, n_iter=100, scoring='f1', n_jobs=-1, cv=5, random_state=42)

# Fit the RandomizedSearchCV object
random_search.fit(X_train, y_train)

# Print the best hyperparameters
print(random_search.best_params_)
print(random_search.best_score_)
print(random_search.best_estimator_)
print(random_search.best_index_)

# Predict the test set
y_pred = random_search.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

TypeError: compute_class_weight() takes 1 positional argument but 3 were given