In [2]:
import warnings

warnings.filterwarnings('ignore')



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load your data
df = pd.read_json('drug_dictionary_number.json')



In [4]:
import pandas as pd


# Assuming df is your original DataFrame

# Expanding the 'uses' dictionary into separate DataFrame columns
uses_df = df['uses'].apply(pd.Series)
uses_df = uses_df.add_prefix('uses_')

# Expanding the 'categorized_serious_effects' dictionary into separate DataFrame columns
serious_effects_df = df['categorized_serious_effects'].apply(pd.Series)
serious_effects_df = serious_effects_df.add_prefix('side_effects_')

# Assuming 'ingredients' is also a dictionary and needs to be expanded into separate columns
ingredients_df = df['ingredients'].apply(pd.Series)
ingredients_df = ingredients_df.add_prefix('ingredient_')

# Combine the expanded columns with the original dataframe, excluding the original dictionary columns
df = pd.concat([df.drop(['uses', 'ingredients', 'categorized_serious_effects'], axis=1)], axis=1)
# For each drug class, create a new column in df

# Assuming df is your DataFrame and 'drug_classes' is a column of lists
all_drug_classes = set()

# Iterate over the column to collect unique drug classes
for drug_classes in df['drug_classes']:
    all_drug_classes.update(drug_classes)

# Create a new DataFrame for the binary columns
binary_columns = pd.DataFrame()

# Generate binary columns
for drug_class in all_drug_classes:
    binary_columns[drug_class] = df['drug_classes'].apply(lambda x: 1 if drug_class in x else 0)

# Concatenate the new columns with the original DataFrame
df = pd.concat([df, binary_columns], axis=1)

# Optionally, drop the original 'drug_classes' column
df = df.drop('drug_classes', axis=1)


# Fill NA values with 0 or another appropriate value
df = df.fillna(0)

# Display the first few rows of the updated DataFrame
df.head()


Unnamed: 0,name,status,generic_name,betweenness_score,closeness_score,degree_score,Antiviral combinations,Upper respiratory combinations,Urea cycle disorder agents,Ophthalmic antihistamines and decongestants,...,Methylxanthines,Aminopenicillins,Nasal antihistamines and decongestants,Glycopeptide antibiotics,Topical non-steroidal anti-inflammatories,Miscellaneous antihyperlipidemic agents,Selective serotonin reuptake inhibitors,Local injectable anesthetics,Sclerosing agents,Miscellaneous antimalarials
0,Azulfidine,Prescription only,sulfasalazine,0.070434,0.80684,0.280775,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Abstral,Discontinued,fentanyl,0.046356,0.892619,0.679413,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aczone,Prescription only,dapsone-topical,0.075351,0.71796,0.108434,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adzynma,Prescription only,"adamts13, recombinant-krhn",0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adzenys XR-ODT,Prescription only,amphetamine,0.011006,0.774746,0.242535,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Define your target variable
y = df['status'].apply(lambda x: 1 if x == 'Prescription only' or x == 'Discontinued' else 0)
X = df.drop(['status', 'generic_name', 'name'], axis=1)  # Assuming all other columns are features


In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming df is your DataFrame and preprocessing has been done
names = df['name']

# Specify the number of folds for k-fold cross-validation
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# List to store predictions for each test set
predictions_list = []
# List to store the scores
scores_list = []

fold = 0
for train_index, test_index in kf.split(X):
    fold += 1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    names_test = names.iloc[test_index]  # Extracting the corresponding names 

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate different evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # Append the scores to the scores list
    scores_list.append({
        'Fold': fold,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

    fold_results = pd.DataFrame({
        'Name': names_test,
        'Fold': fold,
        'Actual': y_test,
        'Predicted': y_pred
    })

    predictions_list.append(fold_results)

# Concatenate predictions from each fold into one DataFrame
all_predictions = pd.concat(predictions_list, ignore_index=True)

# Create a DataFrame from scores list
scores_df = pd.DataFrame(scores_list)

# Save the predictions with names to a CSV file
all_predictions.to_csv('cross_validation_predictions_with_names.csv', index=False)

# Display the scores for each fold
print(scores_df)

# Optionally save the scores to a CSV file
scores_df.to_csv('cross_validation_scores.csv', index=False)


   Fold  Accuracy  Precision    Recall  F1 Score
0     1  0.897208   0.937615  0.915771  0.926564
1     2  0.890863   0.931227  0.910909  0.920956
2     3  0.884518   0.933460  0.897623  0.915191
3     4  0.895939   0.926655  0.926655  0.926655
4     5  0.884518   0.933945  0.902482  0.917944
5     6  0.914975   0.931532  0.946886  0.939146
6     7  0.904822   0.928044  0.933210  0.930620
7     8  0.887056   0.923792  0.911927  0.917821
8     9  0.880559   0.907273  0.920664  0.913919
9    10  0.909784   0.933945  0.935662  0.934803


In [8]:
average_scores = scores_df.mean()
average_scores

Fold         5.500000
Accuracy     0.895024
Precision    0.928749
Recall       0.920179
F1 Score     0.924362
dtype: float64