In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [85]:
# Load data
data = pd.read_csv('/kaggle/input/medical-text/data.csv')
X = data['medical_abstract']
y = data['condition_label']

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Dataset shape: (11550, 2)


In [86]:
# Define the SVM with adjusted parameters and pipeline
pipeline = ImbPipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1500, ngram_range=(1, 2))),
    ('smote', SMOTE(random_state=42)),
    ('classifier', SVC(class_weight='balanced'))
])

# Parameters for GridSearchCV focusing on tighter regularization
parameters = {
    'classifier__C': [0.1, 1],  # Lower C values for stronger regularization
    'classifier__gamma': ['scale', 0.1],  # Scale uses 1 / (n_features * X.var()) as value of gamma
    'classifier__kernel': ['rbf']
}

In [87]:
# Running grid search on the training and validation set
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy', verbose=3)
grid_search.fit(X_train, y_train)
print("Grid search complete with best parameters:", grid_search.best_params_)

# Predicting on the validation set
y_pred_validate = grid_search.predict(X_validate)

# Evaluating the model on the validation set
print("Validation - Confusion Matrix:\n", confusion_matrix(y_validate, y_pred_validate))
print("Validation - F1 Score:", f1_score(y_validate, y_pred_validate, average='weighted'))
print("Validation - Classification Report:\n", classification_report(y_validate, y_pred_validate))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=0.1, classifier__kernel=rbf;, score=0.613 total time= 1.1min
[CV 2/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=0.1, classifier__kernel=rbf;, score=0.584 total time= 1.1min
[CV 3/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=0.1, classifier__kernel=rbf;, score=0.587 total time= 1.1min
[CV 4/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=0.1, classifier__kernel=rbf;, score=0.612 total time= 1.3min
[CV 5/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=0.1, classifier__kernel=rbf;, score=0.595 total time= 1.2min
[CV 1/5] END classifier__C=1, classifier__class_weight=balanced, classifier__gamma=1.0, classifier__kernel=rbf;, score=0.567 total time= 1.0min
[CV 2/5] END classifier__C=1, classifier__class_weight=balanced, classifier_