# ***Logistic Regression:***

In [None]:
import re
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used by preprocess_text: the stop-word lists and the
# WordNet data the lemmatizer looks words up in.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

@lru_cache(maxsize=1)
def _text_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use only.

    Loading the stop-word list and constructing a lemmatizer for every row is
    loop-invariant work, so the pair is created once and then reused.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document for bag-of-words vectorization.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, drop English stop words, and lemmatize the remaining
    whitespace-separated tokens.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces ('' for missing input).
    """
    # Missing values have no .lower(); `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, lemmatizer = _text_resources()

    # Convert to lowercase
    text = str(text).lower()
    # Remove punctuation (underscores survive because \w matches them)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords and apply lemmatization
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

def load_and_preprocess_data(filepath):
    """Read a CSV file and attach a cleaned version of its abstracts.

    Args:
        filepath: Location of the CSV file; it must contain a
            'medical_abstract' column.

    Returns:
        A DataFrame holding the CSV contents plus a 'processed_text' column
        produced by running preprocess_text over every abstract.
    """
    frame = pd.read_csv(filepath)
    cleaned_abstracts = frame['medical_abstract'].apply(preprocess_text)
    return frame.assign(processed_text=cleaned_abstracts)

# Load the dataset (path is relative to the current working directory) and
# add the 'processed_text' column used by every later cell.
data = load_and_preprocess_data('data.csv')


In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split into 60% train / 20% validation / 20% test.
# Stage 1 holds out 20% of all rows as the test set.
train_val, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
# Stage 2 takes 25% of the remaining 80% (= 20% of all rows) for validation.
train_data, val_data = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 5000 terms. The vocabulary and
# IDF weights are learned from the training split only; validation and test
# are projected onto that fitted vocabulary.
# NOTE(review): a later cell rebinds `vectorizer`, `y_train`, `y_val` and
# `y_test`; only the dense X_*_tfidf matrices are unique to this cell.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_data['processed_text']).toarray()
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(split['processed_text']).toarray()
    for split in (val_data, test_data)
)

# Target labels for each split.
y_train, y_val, y_test = (
    split['condition_label'] for split in (train_data, val_data, test_data)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# --- Features -------------------------------------------------------------
# TF-IDF with an uncapped vocabulary, fitted on the training split only and
# then applied unchanged to the validation and test splits (kept sparse).
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['processed_text'])
X_val = vectorizer.transform(val_data['processed_text'])
X_test = vectorizer.transform(test_data['processed_text'])

# --- Labels ---------------------------------------------------------------
y_train = train_data['condition_label']
y_val = val_data['condition_label']
y_test = test_data['condition_label']

# --- Baseline model -------------------------------------------------------
# max_iter is raised to 1000 to give the solver more iterations to converge.
# fit() returns the estimator itself, so construction and training can be chained.
log_reg_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# --- Predictions ----------------------------------------------------------
y_val_pred = log_reg_model.predict(X_val)
# NOTE(review): the test split is scored here, before the tuning cell runs;
# confirm that is intended, since it is normally reserved for the final model.
y_test_pred = log_reg_model.predict(X_test)

# --- Reports: per-class precision / recall / F1 ---------------------------
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

print("Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred))


Classification Report on Validation Set:
              precision    recall  f1-score   support

           1       0.74      0.74      0.74       526
           2       0.51      0.29      0.37       230
           3       0.52      0.32      0.40       296
           4       0.67      0.62      0.65       485
           5       0.45      0.59      0.51       773

    accuracy                           0.57      2310
   macro avg       0.58      0.51      0.53      2310
weighted avg       0.58      0.57      0.56      2310

Classification Report on Test Set:
              precision    recall  f1-score   support

           1       0.69      0.73      0.71       479
           2       0.54      0.36      0.43       224
           3       0.49      0.36      0.42       295
           4       0.68      0.64      0.66       520
           5       0.48      0.58      0.52       792

    accuracy                           0.57      2310
   macro avg       0.58      0.53      0.55      2310
w

### Hyperparameter Tuning:

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 6 regularization strengths x 2 penalties x 2 solvers = 24
# candidates. Both listed solvers accept both the 'l1' and 'l2' penalties.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger)
    'penalty': ['l1', 'l2'],               # Penalty type
    'solver': ['liblinear', 'saga']        # Solver for optimization
}

# Base estimator that GridSearchCV clones for every candidate and fold.
# 'liblinear' and 'saga' shuffle the data, so the seed is fixed to make the
# search (and the selected model) reproducible across kernel restarts.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on the training split, selecting by mean accuracy.
grid_search = GridSearchCV(estimator=log_reg_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV defaults to refit=True, so the winning configuration has
# already been retrained on the whole training split; reuse that model rather
# than fitting an identical one a second time.
best_log_reg_model = grid_search.best_estimator_

# Evaluate the final model on the validation set
y_val_pred = best_log_reg_model.predict(X_val)
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))
