In [1]:
import pandas as pd

# Location of the raw train/test CSV files; change DATA_DIR to point at another copy
DATA_DIR = '/home/jupyter/DrugPredictions'
train_file_path = f'{DATA_DIR}/drugsComTrain_raw.csv'
test_file_path = f'{DATA_DIR}/drugsComTest_raw.csv'

# Load training and test datasets
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# DataFrame.info() prints its summary and returns None, so it is called only for
# its printed output instead of being assigned and echoed back as "None".
train_data.info()
test_data.info()

# Keep the first few rows of each split to examine their structure
train_head = train_data.head()
test_head = test_data.head()

# Rich display of both previews (display() is provided by the Jupyter kernel)
display(train_head, test_head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   uniqueID     161297 non-null  int64 
 1   drugName     161297 non-null  object
 2   condition    160398 non-null  object
 3   review       161297 non-null  object
 4   rating       161297 non-null  int64 
 5   date         161297 non-null  object
 6   usefulCount  161297 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 8.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53766 entries, 0 to 53765
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   uniqueID     53766 non-null  int64 
 1   drugName     53766 non-null  object
 2   condition    53471 non-null  object
 3   review       53766 non-null  object
 4   rating       53766 non-null  int64 
 5   date         53766 non-null  object
 6   usefulCount  53766 no

(None,
    uniqueID                  drugName                     condition  \
 0    206461                 Valsartan  Left Ventricular Dysfunction   
 1     95260                Guanfacine                          ADHD   
 2     92703                    Lybrel                 Birth Control   
 3    138000                Ortho Evra                 Birth Control   
 4     35696  Buprenorphine / naloxone             Opiate Dependence   
 
                                               review  rating       date  \
 0  "It has no side effect, I take it in combinati...       9  20-May-12   
 1  "My son is halfway through his fourth week of ...       8  27-Apr-10   
 2  "I used to take another oral contraceptive, wh...       5  14-Dec-09   
 3  "This is my first time using any form of birth...       8   3-Nov-15   
 4  "Suboxone has completely turned my life around...       9  27-Nov-16   
 
    usefulCount  
 0           27  
 1          192  
 2           17  
 3           10  
 4         

In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np
import re
# Work on independent copies so the raw frames loaded above stay untouched
train_cleaned, test_cleaned = train_data.copy(), test_data.copy()

# Small hand-written stopword list, used so the cleaning step has no NLTK dependency
basic_stopwords = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', 'with',
}


def clean_review_basic(text):
    """Return a normalised version of a review.

    The input is converted with ``str()``, every character that is not an
    ASCII letter or whitespace is deleted, the result is lowercased, split on
    whitespace, and words found in ``basic_stopwords`` are dropped. The
    remaining words are joined with single spaces.
    """
    # Strip first, lowercase second: the character filter runs on the original casing.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", str(text))
    kept_words = []
    for word in letters_only.lower().split():
        if word in basic_stopwords:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

# Replace the raw review text in each split with its cleaned form
train_cleaned = train_cleaned.assign(
    review=train_cleaned['review'].apply(clean_review_basic)
)
test_cleaned = test_cleaned.assign(
    review=test_cleaned['review'].apply(clean_review_basic)
)

# Preview both splits to confirm the reviews were rewritten
train_cleaned.head(), test_cleaned.head()


(   uniqueID                  drugName                     condition  \
 0    206461                 Valsartan  Left Ventricular Dysfunction   
 1     95260                Guanfacine                          ADHD   
 2     92703                    Lybrel                 Birth Control   
 3    138000                Ortho Evra                 Birth Control   
 4     35696  Buprenorphine / naloxone             Opiate Dependence   
 
                                               review  rating       date  \
 0  no side effect i take combination bystolic mg ...       9  20-May-12   
 1  my son halfway through his fourth week intuniv...       8  27-Apr-10   
 2  i used take another oral contraceptive which h...       5  14-Dec-09   
 3  this my first time using any form birth contro...       8   3-Nov-15   
 4  suboxone completely turned my life around i fe...       9  27-Nov-16   
 
    usefulCount  
 0           27  
 1          192  
 2           17  
 3           10  
 4           37  ,

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer (top 5000 terms).
# float32 halves the memory of the dense matrices built below and matches the
# dtype the features are cast to when they become PyTorch tensors.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', dtype=np.float32)

# Fit on the training reviews only, then reuse the same vocabulary for the test reviews
tfidf_train = tfidf_vectorizer.fit_transform(train_cleaned['review'])
tfidf_test = tfidf_vectorizer.transform(test_cleaned['review'])

# Convert the TF-IDF matrices into DataFrames for easier handling
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_columns, index=train_cleaned.index)
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf_columns, index=test_cleaned.index)


def encode_with_train_categories(train_values, test_values):
    """Integer-encode two Series using the categories seen in ``train_values``.

    Returns ``(train_codes, test_codes)`` as integer arrays. Missing values,
    and test values that never occur in the training data, are encoded as -1.
    """
    categories = pd.Index(train_values.dropna().unique())
    train_codes = pd.Categorical(train_values, categories=categories).codes
    test_codes = pd.Categorical(test_values, categories=categories).codes
    return train_codes, test_codes


# Add drug-condition interaction as a feature.
# drugName and condition are strings, so they cannot be multiplied; the pair is
# represented by a combined key instead. Missing conditions get a placeholder
# so the drug part of the key is not lost.
train_cleaned['drug_condition_interaction'] = (
    train_cleaned['drugName'] + ' | ' + train_cleaned['condition'].fillna('unknown')
)
test_cleaned['drug_condition_interaction'] = (
    test_cleaned['drugName'] + ' | ' + test_cleaned['condition'].fillna('unknown')
)

# Numeric versions of the non-text columns. The string columns are replaced by
# integer codes so the combined feature matrix can be converted to a float tensor.
# NOTE(review): these codes and usefulCount are left unscaled.
tabular_columns = ['drugName', 'condition', 'usefulCount', 'drug_condition_interaction']
train_tabular = train_cleaned[tabular_columns].copy()
test_tabular = test_cleaned[tabular_columns].copy()
for column in ['drugName', 'condition', 'drug_condition_interaction']:
    train_tabular[column], test_tabular[column] = encode_with_train_categories(
        train_cleaned[column], test_cleaned[column]
    )
train_tabular = train_tabular.astype(np.float32)
test_tabular = test_tabular.astype(np.float32)

# Combine TF-IDF features and original dataset features (excluding the original review text)
train_features = pd.concat([train_tabular, tfidf_train_df], axis=1)
test_features = pd.concat([test_tabular, tfidf_test_df], axis=1)

# Check the shape of the resulting feature sets
train_features_shape = train_features.shape
test_features_shape = test_features.shape

train_features_shape, test_features_shape


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Define Satisfaction Categories
def categorize_rating(rating, low_max=3, medium_max=7):
    """Map a review rating to a satisfaction class.

    The dataset's ``rating`` column holds raw integer scores (values such as
    5, 8 and 9), not values normalised to [0, 1], so the cut-offs are
    expressed on that scale.

    Args:
        rating: Numeric rating.
        low_max: Highest rating still counted as Low.
        medium_max: Highest rating still counted as Medium.

    Returns:
        0 (Low) if ``rating <= low_max``, 1 (Medium) if
        ``rating <= medium_max``, otherwise 2 (High).
    """
    if rating <= low_max:
        return 0  # Low
    if rating <= medium_max:
        return 1  # Medium
    return 2  # High

# Derive the class label for every review in both splits
train_cleaned = train_cleaned.assign(
    satisfaction_category=train_cleaned['rating'].apply(categorize_rating)
)
test_cleaned = test_cleaned.assign(
    satisfaction_category=test_cleaned['rating'].apply(categorize_rating)
)

# 2. Hold out 20% of the training rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    train_features.values,
    train_cleaned['satisfaction_category'].values,
    test_size=0.2,
    random_state=42,
)

# Features become float32 tensors, labels become int64 (long) tensors
X_train_tensor, X_val_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_val)
)
y_train_tensor, y_val_tensor = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_val)
)

# 3. Define Neural Network Model
class SatisfactionPredictor(nn.Module):
    """Fully connected classifier: input_size -> 128 -> 64 -> 3 logits.

    The three outputs correspond to the Low, Medium and High classes.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=3)  # 3 classes: Low, Medium, High
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class logits for a batch of feature rows."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        # The last layer has no activation: CrossEntropyLoss expects raw logits.
        return self.fc3(hidden)

# Model initialization
# Seed torch first so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)
input_size = X_train_tensor.shape[1]
model = SatisfactionPredictor(input_size)

# 4. Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 5. Train the Model
# Mini-batches give many optimizer updates per epoch; a single full-batch step
# per epoch would update the weights only `epochs` times in total.
epochs = 10
batch_size = 256
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor),
    batch_size=batch_size,
    shuffle=True,
)

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_features, batch_labels in train_loader:
        # Forward pass
        batch_loss = criterion(model(batch_features), batch_labels)

        # Backward pass
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch figure is a per-sample mean
        running_loss += batch_loss.item() * batch_features.size(0)
    epoch_loss = running_loss / len(train_loader.dataset)

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_predictions = torch.argmax(val_outputs, dim=1)
    val_accuracy = accuracy_score(y_val_tensor.numpy(), val_predictions.numpy())

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# 6. Save the Model
torch.save(model.state_dict(), "satisfaction_predictor.pth")


In [None]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report

# Convert test features and labels to PyTorch tensors
X_test_tensor = torch.tensor(test_features.values, dtype=torch.float32)
y_test_tensor = torch.tensor(test_cleaned['satisfaction_category'].values, dtype=torch.long)

# Evaluate the model that is already in memory (nothing is reloaded from disk here)
model.eval()  # Set model to evaluation mode

# Make predictions on the test set
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_predictions = torch.argmax(test_outputs, dim=1)

# scikit-learn works on NumPy arrays, so convert once and reuse
y_test = y_test_tensor.numpy()
y_pred = test_predictions.numpy()

# 1. F1-Score (Macro and Weighted)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

# 2. ROC-AUC (One-vs-Rest for multi-class classification)
# Convert logits to probabilities using softmax
softmax = nn.Softmax(dim=1)
test_probs = softmax(test_outputs).numpy()

# Calculate ROC-AUC for each class
y_test_one_hot = np.eye(3)[y_test]  # Convert to one-hot encoding
roc_auc = roc_auc_score(y_test_one_hot, test_probs, multi_class='ovr')

# 3. Classification Report
# Explicit labels keep the three target names valid even if a class is absent
# from both the true and the predicted labels.
classification_report_summary = classification_report(
    y_test,
    y_pred,
    labels=[0, 1, 2],
    target_names=['Low', 'Medium', 'High'],
    zero_division=0,
)

# Display Evaluation Metrics
# The report is a multi-line string, so print it rather than echoing its repr.
print(classification_report_summary)
f1_macro, f1_weighted, roc_auc
