In [1]:
# Preliminaries to work with the data.   
%matplotlib inline
import os
import sys
# Put the directory two levels above the current working directory on the
# import path (presumably the repository root that holds `opc_python`).
curr_path = os.getcwd()
gerkin_path = os.path.dirname(curr_path)
olfaction_prediction_path = os.path.dirname(gerkin_path)
sys.path.append(olfaction_prediction_path)
import opc_python
from opc_python.utils import loading, scoring
from opc_python.gerkin import dream
import numpy as np
import matplotlib.pyplot as plt
import pandas

### Figure 3D

In [2]:
# Read the tab-separated table "PredInsights_Agg_sorted_odor_data.txt".
df = pandas.read_csv('../../data/PredInsights_Agg_sorted_odor_data.txt', sep='\t')
# Show the first 5 rows of the loaded table.
df.head()

Unnamed: 0,CID,Mean Correlation,Upper(95),Lower(95)
0,11567,0.99247,0.996283,0.988656
1,7797,0.990325,0.993448,0.987202
2,18467,0.986352,0.988279,0.984425
3,17898,0.982546,0.996641,0.968451
4,637758,0.982176,0.986457,0.977895


In [3]:
# Column 0 of the table holds the CIDs.  The table is assumed to be ordered by
# descending mean correlation, so the last 5 rows are the worst-predicted
# molecules and the first 64 rows are the remainder.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; both return the same NumPy array.
worst5 = df.iloc[-5:, 0].values
print("5 CIDs with lowest mean correlation:\n%s" % worst5)
rest64 = df.iloc[:64, 0].values
print("64 CIDs with highest mean correlation:\n%s" % rest64)

5 CIDs with lowest mean correlation:
[ 753 7657 8025 5962 5862]
64 CIDs with highest mean correlation:
[   11567     7797    18467    17898   637758    61523     7793    60998
     7137     6544  5281168   251531  5318599    12020  2733294     8878
     7770     8815     6669    62351      323 10857465     3314  5364231
    31283    31276     7092     7559  5371102   853433  5352837     7476
    62089     8094    62465     1031    21363     8468    31219    15654
      264      750     8438      702     6322    10886    27440    12265
  5281167     9012     8419    12377      262      180     6561      962
     8049    16537   440917     7302     6506     6274  1549025     3102]


In [4]:
# Fetch the CIDs and the CID dilutions belonging to the test set.
CIDs = loading.get_CIDs("testset")
# The target dilution passed here has no bearing on this analysis.
CID_dilutions = loading.get_CID_dilutions("testset", target_dilution=-3)

In [5]:
# Read the molecular descriptor table (column headers plus data), then derive
# the per-molecule feature vectors from it.
molecular_headers, molecular_data = loading.load_molecular_data()
molecular_vectors = dream.get_molecular_vectors(molecular_data, CID_dilutions)

In [6]:
def _stack_features(cids):
    """Return an array with one row of `molecular_vectors` per CID in `cids`."""
    return np.array([molecular_vectors[cid] for cid in cids])


# Feature matrix of the 5 worst-predicted molecules.
worst5_features = _stack_features(worst5)
print("Feature matrix for worst 5 molecules has shape (%d,%d)" % worst5_features.shape)

# Feature matrix of the remaining 64 molecules.
rest64_features = _stack_features(rest64)
print("Feature matrix for other 64 molecules has shape (%d,%d)" % rest64_features.shape)

Feature matrix for worst 5 molecules has shape (5,4869)
Feature matrix for other 64 molecules has shape (64,4869)


In [7]:
from scipy.stats import ttest_ind as ttest

# For each feature column, t-test the 5 worst molecules against the other 64
# and record the p-value together with both group means, keyed by feature name.
# A NaN in a column carries through to that feature's p-value and mean.
n_features = worst5_features.shape[1]
p_values = {}
for i in range(n_features):
    bad_column = worst5_features[:, i]
    rest_column = rest64_features[:, i]
    t, p = ttest(bad_column, rest_column, axis=0)
    feature = molecular_headers[i + 1]  # Header 0 is the CID index, so shift by one.
    p_values[feature] = dict(p=p,
                             bad_mean=bad_column.mean(),
                             rest_mean=rest_column.mean())

In [8]:
# One row per feature, ordered by ascending p-value, with columns in a fixed order.
# Note: this rebinds `df`, which earlier held the table read from
# "PredInsights_Agg_sorted_odor_data.txt".
df = (
    pandas.DataFrame(data=p_values)
    .transpose()
    .sort_values('p')
    [['p', 'bad_mean', 'rest_mean']]
)
df.head()  # The 5 smallest p-values.

Unnamed: 0,p,bad_mean,rest_mean
CATS2D_03_DD,3e-06,0.8,0.0625
G(O..S),0.000196,1.4778,0.0
S-106,0.000196,0.2,0.0
B03[N-S],0.000196,0.2,0.0
F07[N-O],0.000196,0.4,0.0


In [9]:
# Reference results: the pre-computed, tab-separated "PredInsights_Fdiffodor.txt" file,
# loaded for comparison with the table computed in this notebook.
df_static = pandas.read_csv('../../data/PredInsights_Fdiffodor.txt', sep='\t')
# Show the first 5 rows of the reference table.
df_static.head()

Unnamed: 0,Feature,p-value,BadMean,RestMean
0,CATS2D_03_DD,3e-06,0.8,0.0625
1,HATS4s,5.5e-05,3.9588,1.6255
2,R4s+,6.6e-05,0.9992,0.396031
3,SsSH,0.000196,0.7298,0.0
4,G(O..S),0.000196,1.4778,0.0


##### Some of the entries from "PredInsights_Fdiffodor.txt" are reproduced in the table computed here, but others are missing.

In [10]:
# Example: this feature has the second-smallest p-value in the "PredInsights_Fdiffodor.txt" file,
# but it has missing data in the Dragon feature list, so its p-value computed here is NaN.
df.loc['HATS4s']

p               NaN
bad_mean     3.9588
rest_mean       NaN
Name: HATS4s, dtype: float64