In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# obtain datasets

df_herbs = pd.read_csv('properties updated herbs.csv')

# Look-up from herb id to its row position in df_herbs; if an id appears more
# than once, the last occurrence is the one that is kept.
herb_index = {herb_id: row_pos for row_pos, herb_id in enumerate(df_herbs['id'])}

In [None]:
mingredients = pd.read_csv('meridian ingredients updated.csv')
descriptors = pd.read_csv('cancer ingredients mordred descriptors + morgan fp dropped columns.csv')

# Keep only the ingredients that have a row in the descriptor table, then
# renumber the rows from 0. A single vectorised `isin` mask replaces the
# previous per-row scan, which rebuilt the full list of descriptor IDs on
# every iteration.
has_descriptors = mingredients['IDs'].isin(descriptors['IDs'])
mingredients = mingredients[has_descriptors].reset_index(drop=True)

In [None]:
# Map each herb id to the list of ingredient IDs parsed from its
# comma-separated 'Ingredients' cell.
herb_dict = {}
for herb_id, ingredients in zip(df_herbs['id'], df_herbs['Ingredients']):
    # Rows whose 'Ingredients' cell is not a string (e.g. a missing value)
    # cannot be split and are skipped on purpose. Checking the type explicitly
    # avoids a bare `except`, which would also hide unrelated errors such as a
    # misspelled column name.
    if not isinstance(ingredients, str):
        continue
    herb_dict[herb_id] = ingredients.split(', ')

In [None]:
# Row position of every compound ID inside `descriptors`
# (for a repeated ID the last row wins).
index_dict = {
    compound_id: row_pos
    for row_pos, compound_id in enumerate(descriptors['IDs'])
}

In [None]:
# Property labels, one per binary target; the position in this list is the
# label's index wherever per-label results are stored.
names = [
    'Warm',
    'Pungent',
    'Bitter',
    'Mild',
    'Hot',
    'Sweet',
    'Cold',
    'Neutral',
    'Punkery',
    'Salty',
    'Cool',
    'Sour',
]

In [None]:
# construct dataset
#
# For every ingredient in `mingredients` this builds:
#   * `table`   - its descriptor values (every `descriptors` column after the first);
#   * `results` - a 0/1 vector with one entry per label in `names`, set to 1 when
#                 any herb containing the ingredient lists that label in its
#                 'Properties' cell.
# `file` is the feature DataFrame assembled from `table`.

# Gather, once, the set of property labels reachable from each ingredient.
# This replaces scanning every herb for every ingredient.
compound_properties = {}
for herb_id, ingredients in herb_dict.items():
    raw_properties = df_herbs.loc[herb_index[herb_id], 'Properties']
    if pd.isnull(raw_properties):
        continue  # this herb has no recorded properties
    labels = raw_properties.split('; ')
    for ingredient in ingredients:
        compound_properties.setdefault(ingredient, set()).update(labels)

table = []
results = []
for r, compound in enumerate(mingredients['IDs']):
    if r % 500 == 0:
        print(r)  # progress indicator
    # Skip the first descriptor column (presumably 'IDs' - confirm against the
    # CSV) so that only feature values remain.
    table.append(descriptors.loc[index_dict[compound]].to_list()[1:])
    found = compound_properties.get(compound, ())
    # One flag per label, in the order of `names`, so the target columns can
    # never drift out of step with the label list used when training.
    results.append([int(label in found) for label in names])

file = np.array(table)
file = pd.DataFrame(file, columns=descriptors.columns.to_list()[1:])

In [None]:
# train models
#
# Trains one binary random-forest classifier per label in `names`. For each
# label the data is split once, the features are standardised with a scaler
# fitted on the training part only, and ten forests are fitted; the forest
# with the highest F1 on the held-out part is kept together with its metrics.
#
# NOTE(review): the "best of ten" choice is made on the same held-out split
# that the reported metrics come from, so those metrics are likely optimistic.

models = {}
accuracies = []
rocs = []
f1s = []
scalers = {}
test_sets = []
test_results = []
x = file

for r, label_name in enumerate(names):
    print(f'iteration: {r + 1}')
    y = [row[r] for row in results]
    x_train, x_test, y_train, y_test = train_test_split(x, y)
    pre_save = x_test  # unscaled held-out features, kept for export
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    best_model = None
    best_score = None
    best_accuracy = None
    best_roc = None
    for r1 in range(0, 10):
        print(f'sub iteration: {r1 + 1}')
        clf = RandomForestClassifier()
        clf.fit(x_train, y_train)
        predictions = clf.predict(x_test)
        score = f1_score(y_test, predictions)
        if best_model is None or score > best_score:
            best_score = score
            best_accuracy = accuracy_score(y_test, predictions)
            # ROC AUC is undefined when the held-out labels (or the fitted
            # model) contain a single class; record NaN instead of raising.
            # `best_roc` used to be left unassigned, so appending it below
            # failed with a NameError.
            if len(set(y_test)) == 2 and len(clf.classes_) == 2:
                positive_proba = clf.predict_proba(x_test)[:, 1]
                best_roc = roc_auc_score(y_test, positive_proba)
            else:
                best_roc = float('nan')
            best_model = clf

    models[f'model{label_name}'] = best_model
    scalers[f'scaler{label_name}'] = scaler
    rocs.append(best_roc)
    accuracies.append(best_accuracy)
    f1s.append(best_score)
    test_sets.append(pre_save)
    test_results.append(y_test)

In [None]:
# save models, scalers, and test sets

# Put the true labels in front of each held-out feature frame as a
# 'Results' column (modifies the frames in place).
for held_out, labels in zip(test_sets, test_results):
    held_out.insert(0, 'Results', labels)

In [None]:
# Export the held-out sets for the labels at positions 0, 4, 6 and 10 of `names`.
for label_pos in (0, 4, 6, 10):
    csv_path = f'heat models/test sets/test set {names[label_pos]}.csv'
    test_sets[label_pos].to_csv(csv_path, index=False)

In [None]:
# Persist the scalers stored at positions 0, 4, 6 and 10 of `scalers`.
scaler_keys = list(scalers)
selected_scalers = [scaler_keys[pos] for pos in (0, 4, 6, 10)]
for scaler_key in selected_scalers:
    joblib.dump(scalers[scaler_key], f'heat models/scalers/{scaler_key}.save')

In [None]:
# Persist the models stored at positions 0, 4, 6 and 10 of `models`.
model_keys = list(models)
selected_models = [model_keys[pos] for pos in (0, 4, 6, 10)]
for model_key in selected_models:
    joblib.dump(models[model_key], f'heat models/{model_key}.save')