In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import linregress
from statistics import mean, median
from math import isfinite

# Silence NumPy's floating-point warnings for divide-by-zero and invalid
# operations (the ones that yield inf/nan) instead of emitting RuntimeWarnings.
# np.seterr returns the previous settings, which the notebook echoes below.
np.seterr(invalid='ignore', divide='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
# Load the full results table from 'data-final.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv('data-final.csv')
# 28k single points across ~700 molecules and a lot of methods

In [3]:
# Show the column names of the loaded table.
df.columns

Index(['name', 'geom', 'natoms', 'dlpno', 'mp2', 'wb97', 'b973c', 'pbe',
       'pbeSVP', 'pbeh3c', 'b3lypTZ', 'b3lypSVP', 'gfn0', 'gfn1', 'gfn2',
       'pm7E', 'pm7HOF', 'mmff', 'mmffNew', 'uff', 'gaff', 'ani1x', 'ani1cc',
       'ani2'],
      dtype='object')

In [4]:
# Sanity check on parsing: every method column should come back as float64
# ('name' and 'geom' are object/string columns and 'natoms' is int64).
df.dtypes # make sure the numbers are all float64

name         object
geom         object
natoms        int64
dlpno       float64
mp2         float64
wb97        float64
b973c       float64
pbe         float64
pbeSVP      float64
pbeh3c      float64
b3lypTZ     float64
b3lypSVP    float64
gfn0        float64
gfn1        float64
gfn2        float64
pm7E        float64
pm7HOF      float64
mmff        float64
mmffNew     float64
uff         float64
gaff        float64
ani1x       float64
ani1cc      float64
ani2        float64
dtype: object

In [8]:
# Columns of `df` to include in the analysis. Trim this list to work on a
# subset; the order here determines the order in which pairs are compared.
methods = [
    'mmff', 'uff', 'gaff',
    'pm7HOF',
    'ani1x', 'ani1cc', 'ani2',
    'gfn0', 'gfn1', 'gfn2',
    'pbe', 'pbeSVP', 'pbeh3c',
    'b3lypTZ', 'b3lypSVP',
    'wb97',
    'mp2',
    'dlpno',
]

In [6]:
# Distinct values of the 'name' column, one entry per molecule.
# np.unique returns them as a sorted NumPy array.
names = np.unique(df['name']) # all the molecule names
print(names)

['astex_1g9v' 'astex_1gkc' 'astex_1gm8' 'astex_1hnn' 'astex_1hp0'
 'astex_1hq2' 'astex_1hvy' 'astex_1hwi' 'astex_1ia1' 'astex_1ig3'
 'astex_1j3j' 'astex_1jd0' 'astex_1jje' 'astex_1jla' 'astex_1k3u'
 'astex_1ke5' 'astex_1kzk' 'astex_1l2s' 'astex_1l7f' 'astex_1lpz'
 'astex_1lrh' 'astex_1m2z' 'astex_1meh' 'astex_1mmv' 'astex_1mzc'
 'astex_1n1m' 'astex_1n2j' 'astex_1n2v' 'astex_1n46' 'astex_1nav'
 'astex_1of1' 'astex_1of6' 'astex_1opk' 'astex_1oq5' 'astex_1owe'
 'astex_1oyt' 'astex_1p2y' 'astex_1p62' 'astex_1pmn' 'astex_1q1g'
 'astex_1q41' 'astex_1q4g' 'astex_1r1h' 'astex_1r55' 'astex_1r58'
 'astex_1r9o' 'astex_1s3v' 'astex_1sg0' 'astex_1sj0' 'astex_1sq5'
 'astex_1t40' 'astex_1t46' 'astex_1t9b' 'astex_1tow' 'astex_1tt1'
 'astex_1tz8' 'astex_1u1c' 'astex_1u4d' 'astex_1uml' 'astex_1unl'
 'astex_1uou' 'astex_1v0p' 'astex_1v48' 'astex_1v4s' 'astex_1vcj'
 'astex_1w1p' 'astex_1w2g' 'astex_1xm6' 'astex_1xoq' 'astex_1xoz'
 'astex_1y6b' 'astex_1ygc' 'astex_1yqy' 'astex_1yvf' 'astex_1ywr'
 'astex_1z

In [11]:
# Pairwise comparison of methods.
#
# For every pair (A, B) taken from `methods`, and for every molecule in
# `names`, regress column B against column A over that molecule's rows and
# compute the Spearman rank correlation. Per-molecule results are written to
# '<A>_<B>_stats.csv'; one summary line per pair is printed to the notebook.
#
# The summary header lists exactly the six values printed per pair.
print("Method A, Method B, Mean R2, Median R2, Mean Spearman, Median Spearman")

# Slice the table once per molecule. The row selection does not depend on the
# method pair, so doing it here avoids re-filtering the whole frame twice for
# every (pair, molecule) combination.
rows_by_name = {name: df[df['name'] == name] for name in names}

for i in range(len(methods)):
    for j in range(i+1, len(methods)):
        # per-pair accumulators; the values from the last pair remain
        # available in `r2` / `sp` after the loop finishes
        r2 = []
        sp = []

        # now we loop through each of the unique molecules
        with open('%s_%s_stats.csv' % (methods[i], methods[j]), 'w') as out:
            # header fields are joined with the same bare ',' as the data
            # rows, so the column names carry no leading spaces when the
            # file is read back
            print("name", "len", "rsq", "spearman", "slope", "intercept", sep=',', file=out)
            for name in names:
                rows = rows_by_name[name]
                x = rows[methods[i]]
                y = rows[methods[j]]

                # keep only the rows where BOTH methods have a value, and use
                # that same selection for the regression and the correlation
                mask = x.notna() & y.notna()
                if mask.sum() < 3:
                    continue # fewer than 3 paired values (e.g., no DLPNO energies at all)

                d = pd.DataFrame({'x': x[mask], 'y': y[mask]})
                spearman = d.corr(method='spearman').values[0, 1]
                slope, intercept, r_value, p_value, std_err = linregress(d['x'], d['y'])
                rsq = r_value**2

                # TODO: calculate the relative energies
                #minX =

                # 'len' is the total number of rows for this molecule,
                # including rows where one of the two methods is missing
                print(name, len(x), rsq, spearman, slope, intercept, sep=',', file=out)

                # a constant column gives a nan statistic; leave those out of
                # the per-pair summary
                if isfinite(rsq):
                    r2.append(rsq)
                if isfinite(spearman):
                    sp.append(spearman)
        # okay, now summarize
        if len(r2) > 0 and len(sp) > 0:
            print(methods[i], methods[j], mean(r2), median(r2), mean(sp), median(sp), sep=', ')
        # TODO: instead of just getting mean(r2) or median(spearman)
        # .. generate a histogram from the list of r2 and list of spearman
        

Method A, Method B, Mean R2, Median R2, Mean Spearman, Median Spearman, Mean Absolute Energy
mmff, uff, 0.6395337350401585, 0.7414391989559452, 0.6084988220648094, 0.7398333340491294
mmff, gaff, 0.4166856208624982, 0.37014391756034615, -0.3922690983895583, -0.4800568148193976
mmff, pm7HOF, 0.4630272023916747, 0.45819894647188486, -0.4846319409826624, -0.5509542924460678
mmff, ani1x, 0.39259825772151435, 0.3180389796130046, -0.33201262606740584, -0.4146418546569908
mmff, ani1cc, 0.4032730117886259, 0.3345358851780318, -0.3620461351719351, -0.45454545454545453
mmff, gfn0, 0.3108587857079591, 0.18609508155912768, -0.048907800483395436, -0.027695578491268365
mmff, gfn1, 0.29765412741339725, 0.18048378790811925, 0.0049878432058838304, 0.06634984474500864
mmff, gfn2, 0.2894912666139334, 0.17131085000489366, 0.02328569723359199, 0.11065617431192237
mmff, pbe, 0.4303822047346006, 0.370594481353568, -0.39840981652679225, -0.5272727272727272
mmff, pbeSVP, 0.40339639388651966, 0.32763310407143986

In [None]:
# Debug check: `r2` is reset for every method pair in the comparison loop,
# so this shows only the R^2 values collected for the last pair processed.
# Requires the loop cell to have been run first.
print(len(r2), r2)