In [None]:
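# Authenticate with the Hugging Face Hub before running this notebook: either run
# `huggingface-cli login` in a terminal, or set the token in the environment
# (HF_TOKEN=some_token_here). Never commit a real token to the notebook.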

In [None]:

from datasets import load_dataset
import json
# Load the DDI2013 dataset from the Hugging Face Hub.
# Log in first (e.g. `huggingface-cli login`) to be able to access this dataset.
ds = load_dataset("clinicalnlplab/DDI2013_test")


# Show the available splits together with their features and row counts.
print(ds)


def processing(dataset, output_path):
    """Flatten prompt-style rows into plain records and save them as JSON.

    Each row of ``dataset`` must provide the keys ``'query'``, ``'answer'``
    and ``'gold'``. The sentence is cut out of ``'query'``: everything after
    the last ``"INPUT: "`` marker and before the first
    ``"What is the relationship between"`` marker, with surrounding
    whitespace removed.

    Args:
        dataset: Iterable of mapping-like rows.
        output_path: Destination of the JSON file (overwritten if present).

    Returns:
        None. The records are written to ``output_path`` as a JSON list of
        objects with the keys ``id`` (position of the row in ``dataset``),
        ``sentence``, ``answer`` and ``label`` (the row's ``'gold'`` value).
    """
    input_marker = "INPUT: "
    question_marker = "What is the relationship between"

    def _sentence_of(query):
        # Keep only the text between the input marker and the trailing question.
        after_input = query.split(input_marker)[-1]
        before_question = after_input.split(question_marker)[0]
        return before_question.strip()

    records = [
        {
            'id': position,
            'sentence': _sentence_of(row['query']),
            'answer': row['answer'],
            'label': row['gold'],
        }
        for position, row in enumerate(dataset)
    ]

    with open(output_path, 'w') as handle:
        json.dump(records, handle, indent=4)


# Export every split to its own JSON file (the 'valid' split is saved as val.json).
for split_name, json_path in (
    ('train', 'train.json'),
    ('test', 'test.json'),
    ('valid', 'val.json'),
):
    processing(ds[split_name], json_path)


data/train-00000-of-00001.parquet:   0%|          | 0.00/3.88M [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18779 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/7244 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5761 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'query', 'answer', 'choices', 'gold'],
        num_rows: 18779
    })
    valid: Dataset({
        features: ['id', 'query', 'answer', 'choices', 'gold'],
        num_rows: 7244
    })
    test: Dataset({
        features: ['id', 'query', 'answer', 'choices', 'gold'],
        num_rows: 5761
    })
})


In [None]:
import pandas as pd

# Read the exported JSON files back in as DataFrames, one per split.
train_df, val_df, test_df = (
    pd.read_json(json_path)
    for json_path in ('train.json', 'val.json', 'test.json')
)

# Preview the first rows of each split.
for heading, frame in (
    ("Train data:", train_df),
    ("\nValidation data:", val_df),
    ("\nTest data:", test_df),
):
    print(heading)
    display(frame.head())

Train data:


Unnamed: 0,id,sentence,answer,label
0,0,Both efavirenz and nevirapine have been compar...,none,4
1,1,Etonogestrel may interact with the following m...,none,4
2,2,Other concomitant therapy Although specific in...,none,4
3,3,Co-administration of aliskiren did not signifi...,none,4
4,4,No interactions have been observed with beta-r...,none,4



Validation data:


Unnamed: 0,id,sentence,answer,label
0,0,@DRUG1$: Cholestyramine may increase the clear...,none,4
1,1,Thyroid Physiology: The following agents may a...,none,4
2,2,"Agents that have been found, or are expected t...",none,4
3,3,The following agents may increase certain acti...,none,4
4,4,"Agents that have been found, or are expected t...",none,4



Test data:


Unnamed: 0,id,sentence,answer,label
0,0,Absorption of drugs from the stomach may be di...,none,4
1,1,"ECG intervals (PR, QRS, and QT) were not affec...",none,4
2,2,@DRUG1$: In 12 normal-weight subjects receivin...,none,4
3,3,Penicillin blood levels may be prolonged by co...,none,4
4,4,Human pharmacologic studies have shown that @D...,mechanism,1


In [None]:
# Class balance per split: number of examples for each label value.
for heading, frame in (
    ("Train set label distribution:", train_df),
    ("\nValidation set label distribution:", val_df),
    ("\nTest set label distribution:", test_df),
):
    print(heading)
    display(frame['label'].value_counts())

Train set label distribution:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,15842
0,1212
1,946
2,633
3,146



Validation set label distribution:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,6240
0,396
1,373
2,193
3,42



Test set label distribution:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
4,4782
0,360
1,302
2,221
3,96


In [None]:
# Store the length of every sentence (via len) in a new column of each split.
for frame in (train_df, val_df, test_df):
    frame['sentence_length'] = frame['sentence'].apply(len)

# Summary statistics of the sentence lengths, split by split.
for heading, frame in (
    ("Train set sentence length statistics:", train_df),
    ("\nValidation set sentence length statistics:", val_df),
    ("\nTest set sentence length statistics:", test_df),
):
    print(heading)
    display(frame['sentence_length'].describe())

Train set sentence length statistics:


Unnamed: 0,sentence_length
count,18779.0
mean,251.411737
std,135.585102
min,15.0
25%,156.0
50%,220.0
75%,304.0
max,626.0



Validation set sentence length statistics:


Unnamed: 0,sentence_length
count,7244.0
mean,358.342214
std,246.631101
min,15.0
25%,165.0
50%,262.0
75%,493.0
max,848.0



Test set sentence length statistics:


Unnamed: 0,sentence_length
count,5761.0
mean,272.509807
std,136.210966
min,21.0
25%,164.0
50%,247.0
75%,366.0
max,657.0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap on the TF-IDF vocabulary size; adjust to trade feature count for speed.
MAX_TFIDF_FEATURES = 5000

# Fit the vocabulary on the training sentences only, then reuse it unchanged
# for the validation and test sentences.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train = vectorizer.fit_transform(train_df['sentence'])
X_val, X_test = (
    vectorizer.transform(frame['sentence']) for frame in (val_df, test_df)
)

# Targets are the integer labels of each split.
y_train, y_val, y_test = (
    frame['label'] for frame in (train_df, val_df, test_df)
)

for caption, matrix in (
    ("Shape of X_train:", X_train),
    ("Shape of X_val:", X_val),
    ("Shape of X_test:", X_test),
):
    print(caption, matrix.shape)

Shape of X_train: (18779, 5000)
Shape of X_val: (7244, 5000)
Shape of X_test: (5761, 5000)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Build a linear-kernel SVM and fit it on the training features in one step
# (fit returns the fitted estimator). Other kernels can be swapped in here.
svm_model = SVC(kernel='linear').fit(X_train, y_train)

print("SVM model trained successfully!")

SVM model trained successfully!


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the model on the test set
y_pred = svm_model.predict(X_test)

# zero_division=0 reports a score of 0.0 for any class whose metric is
# undefined (e.g. a class the model never predicts) without emitting an
# UndefinedMetricWarning for each affected average.
report = classification_report(y_test, y_pred, zero_division=0)

# Print the evaluation metrics. The report is printed with its own print()
# call: passing it as a second argument makes print() insert a separator
# space in front of it, which shifts the header row out of alignment with
# the rest of the table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(report)

Accuracy: 0.8208644332581149

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.21      0.26       360
           1       0.50      0.01      0.02       302
           2       0.53      0.22      0.31       221
           3       0.00      0.00      0.00        96
           4       0.85      0.96      0.90      4782

    accuracy                           0.82      5761
   macro avg       0.44      0.28      0.30      5761
weighted avg       0.77      0.82      0.78      5761



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
