In [51]:
import warnings

warnings.filterwarnings('ignore')



In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load your data
df = pd.read_json('drug_dictionary_number.json')



In [77]:
import pandas as pd


# Assuming df is your original DataFrame

# Expanding the 'uses' dictionary into separate DataFrame columns
uses_df = df['uses'].apply(pd.Series)
uses_df = uses_df.add_prefix('uses_')

# Expanding the 'categorized_serious_effects' dictionary into separate DataFrame columns
serious_effects_df = df['categorized_serious_effects'].apply(pd.Series)
serious_effects_df = serious_effects_df.add_prefix('side_effects_')

# Assuming 'ingredients' is also a dictionary and needs to be expanded into separate columns
ingredients_df = df['ingredients'].apply(pd.Series)
ingredients_df = ingredients_df.add_prefix('ingredient_')

# Combine the expanded columns with the original dataframe, excluding the original dictionary columns
df = pd.concat([df.drop(['uses', 'ingredients', 'categorized_serious_effects'], axis=1), uses_df, ingredients_df, serious_effects_df], axis=1)
# For each drug class, create a new column in df

# Assuming df is your DataFrame and 'drug_classes' is a column of lists
all_drug_classes = set()

# Iterate over the column to collect unique drug classes
for drug_classes in df['drug_classes']:
    all_drug_classes.update(drug_classes)

# Create a new DataFrame for the binary columns
binary_columns = pd.DataFrame()

# Generate binary columns
for drug_class in all_drug_classes:
    binary_columns[drug_class] = df['drug_classes'].apply(lambda x: 1 if drug_class in x else 0)

# Concatenate the new columns with the original DataFrame
df = pd.concat([df, binary_columns], axis=1)

# Optionally, drop the original 'drug_classes' column
df = df.drop('drug_classes', axis=1)


# Fill NA values with 0 or another appropriate value
df = df.fillna(0)

# Display the first few rows of the updated DataFrame
df.head()


Unnamed: 0,name,status,generic_name,betweenness_score,closeness_score,degree_score,uses_Infectious Diseases,uses_Neurological Disorders,uses_Gastrointestinal Disorders,uses_Cardiovascular Diseases,...,Tetracyclines,Otic steroids,Recombinant human erythropoietins,Non-iodinated contrast media,Catecholamines,Protease inhibitors,Interleukin inhibitors,GI stimulants,Growth hormone receptor blockers,Loop diuretics
0,Azulfidine,Prescription only,sulfasalazine,0.070434,0.80684,0.280775,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Abstral,Discontinued,fentanyl,0.046356,0.892619,0.679413,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Aczone,Prescription only,dapsone-topical,0.075351,0.71796,0.108434,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adzynma,Prescription only,"adamts13, recombinant-krhn",0.0,0.0,0.0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,Adzenys XR-ODT,Prescription only,amphetamine,0.011006,0.774746,0.242535,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# Define your target variable
y = df['status'].apply(lambda x: 1 if x == 'Prescription only' or x == 'Discontinued' else 0)
X = df.drop(['status', 'generic_name', 'name'], axis=1)  # Assuming all other columns are features


In [54]:
# Data preprocessing steps here...


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [55]:
# Predict and evaluate
predictions = model.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.9035532994923858
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       468
           1       0.94      0.92      0.93      1108

    accuracy                           0.90      1576
   macro avg       0.88      0.89      0.89      1576
weighted avg       0.91      0.90      0.90      1576



In [73]:
# Define your target variable
y = df['status'].apply(lambda x: 1 if x == 'Prescription only' or x == 'Discontinued' else 0)
X = df.drop(['status', 'generic_name', 'name'], axis=1)  # Assuming all other columns are features

In [83]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Assuming df is your DataFrame and preprocessing has been done
names = df['name']  # This will be used to include the 'name' in the results

# Specify the number of folds for k-fold cross-validation
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize a list to store predictions for each test set
predictions_list = []

scores_list = []

fold = 0
for train_index, test_index in kf.split(X):
    fold += 1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    names_test = names.iloc[test_index]  # Get the drug names for the test set

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # Calculate different evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # Append the scores to the scores list
    scores_list.append({
        'Fold': fold,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    })

    # Store fold results with names and predictions in the DataFrame
    fold_results = pd.DataFrame({
        'Name': names_test,            # Drug names
        'Fold': fold,                  # Fold number
        'Actual': y_test,
        'Predicted': y_pred
    })

    predictions_list.append(fold_results)

# Concatenate predictions from each fold into one DataFrame
all_predictions = pd.concat(predictions_list, ignore_index=True)


# Create a DataFrame from scores list
scores_df = pd.DataFrame(scores_list)

# Save the predictions with names to a CSV file
all_predictions.to_csv('cross_validation_predictions_with_names.csv', index=False)

# Display the scores for each fold
print(scores_df)

# Optionally save the scores to a CSV file
scores_df.to_csv('cross_validation_scores.csv', index=False)


   Fold  Accuracy  Precision    Recall  F1 Score
0     1  0.906091   0.938406  0.928315  0.933333
1     2  0.908629   0.950943  0.916364  0.933333
2     3  0.911168   0.940850  0.930530  0.935662
3     4  0.921320   0.950998  0.937388  0.944144
4     5  0.902284   0.948435  0.913121  0.930443
5     6  0.931472   0.945652  0.956044  0.950820
6     7  0.920051   0.947368  0.935065  0.941176
7     8  0.906091   0.933702  0.930275  0.931985
8     9  0.911055   0.938662  0.931734  0.935185
9    10  0.935197   0.945750  0.961397  0.953510


In [87]:
average_scores = scores_df.drop('Fold', axis=1).mean()
average_scores

Accuracy     0.915336
Precision    0.944077
Recall       0.934023
F1 Score     0.938959
dtype: float64

In [90]:
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# ... (setup your DataFrame and preprocessing steps)

scores_list = []
full_report = []

kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold = 0
for train_index, test_index in kf.split(X):
    fold += 1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Get the classification report for this fold
    report = classification_report(y_test, y_pred, output_dict=True)
    full_report.append(report)
    
    # Extract the scores for later averaging
    scores_list.append({
        'Fold': fold,
        'Accuracy': report['accuracy'],
        'Precision_0': report['0']['precision'],
        'Recall_0': report['0']['recall'],
        'F1_Score_0': report['0']['f1-score'],
        'Precision_1': report['1']['precision'],
        'Recall_1': report['1']['recall'],
        'F1_Score_1': report['1']['f1-score'],
        'Support_0': report['0']['support'],
        'Support_1': report['1']['support']
    })

# Average the scores from all folds
avg_scores = pd.DataFrame(scores_list).drop(columns="Fold").mean()

# Print the average report in the requested format
avg_report = {
    'precision': [avg_scores['Precision_0'], avg_scores['Precision_1']],
    'recall': [avg_scores['Recall_0'], avg_scores['Recall_1']],
    'f1-score': [avg_scores['F1_Score_0'], avg_scores['F1_Score_1']],
    'support': [avg_scores['Support_0'], avg_scores['Support_1']]
}


# Convert list of reports to DataFrames
report_frames = [pd.DataFrame(report).transpose() for report in full_report]

# Concatenate all DataFrames vertically
combined_report = pd.concat(report_frames)

# Group by the index (which corresponds to the class labels, 'macro avg', 'weighted avg', etc.)
# and calculate the mean of the scores for each group
average_report = combined_report.groupby(combined_report.index).mean()

# Save the average classification report to a CSV file
average_report.to_csv('average_classification_report.csv')

# At this point, ensure that the avg_report dictionary is set up correctly
# and that there are exactly four values for each label (0 and 1).

# The `try-except` block has been added for diagnostic purposes.

# Add any additional macro/micro averages and weighted averages as required
# This part is left as an exercise for the reader as it requires calculating or retrieving additional averages
