In [1]:
import os
import sys

# Make the project's ../src folder importable (needed by `from utils import *`).
current_dir = os.getcwd()
src_dir = os.path.join(current_dir, '..', 'src')
# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from utils import *

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb

# Apply seaborn's 'ticks' style to plots created after this point
sns.set_style('ticks')

In [3]:
# Folder holding the input files, relative to the notebook's working directory
input_path = '../Data'

# File names inside `input_path`
feature_file = 'deepnose_features.npy'
CID_file = 'molecules_train_cid.npy'
training_task_file = 'TrainingData_mixturedist.csv'
mixture_file = 'Mixure_Definitions_Training_set.csv'
intensity_file = 'Mixure_Definitions_Intensity_Training_set.csv'

# NumPy arrays: the feature array and the CID array later used to key its rows
features, features_CIDs = (
    np.load(os.path.join(input_path, npy_name))
    for npy_name in (feature_file, CID_file)
)

# CSV tables: training task, mixture definitions, and mixture intensities
training_set, mixtures_IDs, molecule_intensities = (
    pd.read_csv(os.path.join(input_path, csv_name))
    for csv_name in (training_task_file, mixture_file, intensity_file)
)

In [5]:
# Extended data set, stored under the 'Extra' sub-folder of `input_path`.

# NumPy arrays: extended feature array and the CID array used to key its rows
extended_features, extended_features_CIDs = (
    np.load(os.path.join(input_path, rel_path))
    for rel_path in (
        'Extra/deepnose_features_extRavia.npy',
        'Extra/extended_ravia_cid.npy',
    )
)

# CSV tables mirroring the three base training tables
extended_training_set, extended_mixture_IDs, extended_molecule_intensities = (
    pd.read_csv(os.path.join(input_path, rel_path))
    for rel_path in (
        'Extra/extended_training_set.csv',
        'Extra/extended_mixture_IDs.csv',
        'Extra/extended_molecule_intensites.csv',
    )
)

In [7]:
# Small offset added before taking the log (keeps zero entries away from log(0))
epsilon = 1e-8
scaler = StandardScaler(with_mean=True, with_std=True)

# Log-transform the features, then standardize each column.
# NOTE: `features` is overwritten with its transformed version, so this cell
# should be run only once per freshly loaded `features`.
log_features = np.log(features + epsilon)
features = scaler.fit(log_features).transform(log_features)

# Look-up table: CID -> its standardized feature row
CID2features = {cid: features[row] for row, cid in enumerate(features_CIDs)}

In [8]:
# Same log + standardization, applied to the extended feature set.
# NOTE(review): this refits the shared `scaler` on the extended data, so the
# statistics it learned from `features` are discarded and the two feature sets
# end up standardized with different means/variances. Confirm this is intended.
# NOTE: `extended_features` is overwritten, so run this cell only once per load.
log_extended_features = np.log(extended_features + epsilon)
extended_features = scaler.fit(log_extended_features).transform(log_extended_features)

# Look-up table: CID -> its standardized extended feature row
extended_CID2features = {
    cid: extended_features[row] for row, cid in enumerate(extended_features_CIDs)
}

In [12]:
# CIDs that appear in both the extended and the base CID arrays
extended_cid_set = set(extended_features_CIDs)
base_cid_set = set(features_CIDs)
overlapped_CIDs = list(extended_cid_set.intersection(base_cid_set))

In [18]:
# Pick one CID present in both feature sets to use as a spot check
overlapped_CIDs[2]

7685

In [19]:
# Standardized feature row stored for CID 7685 in the extended set
# (7685 is the shared CID displayed by the `overlapped_CIDs[2]` cell)
extended_CID2features[7685]

array([-0.6388651 ,  0.4503178 , -2.8675652 ,  0.609993  ,  0.97016764,
        0.23411527, -2.7468708 ,  0.24428843, -0.38222095,  0.5383045 ,
        0.15903747,  0.9139166 ,  1.1911384 , -1.0407802 ,  0.370728  ,
        0.30130258,  1.4418327 , -3.173565  ,  0.6516021 , -0.42085725,
        1.0365294 , -0.35263148, -2.3338556 ,  1.044887  , -3.4085736 ,
       -0.38133386,  0.7302112 , -0.09287866,  0.81119543, -2.03048   ,
        1.3058816 , -0.58691734,  0.26965508,  0.9689213 ,  1.4070765 ,
        0.74946094,  1.2294598 , -0.10233068, -0.512149  ,  1.2577751 ,
        0.79386145,  0.6082443 ,  1.1229733 , -0.0207714 ,  1.3238311 ,
       -0.3392976 , -1.2833303 ,  0.08008947,  1.1110983 , -0.6123032 ,
        0.82366085,  0.19103968,  1.3918239 , -2.6403458 , -1.5239561 ,
        0.59876573,  0.5386844 ,  0.7853049 , -0.13104261,  0.7949549 ,
        0.00553943,  0.3444532 ,  0.6002516 , -1.2594389 ,  0.05554546,
        1.1588099 ,  1.186529  , -1.4505392 ,  0.3017367 ,  0.43

In [20]:
# Standardized feature row stored for the same CID (7685) in the base set,
# for comparison with the extended-set row
CID2features[7685]

array([ 0.87533164, -0.6139562 , -0.17408392, -0.28298667, -0.6783192 ,
       -0.34800217, -0.2615543 ,  0.10881349,  0.74374294,  0.24179766,
        1.3151215 ,  0.6629258 ,  1.0865839 , -1.4804327 ,  0.6645047 ,
        0.4755322 ,  1.0925835 , -0.02570823,  0.35483354, -0.06304997,
        0.7771459 , -1.5664965 ,  0.6042431 , -0.8280097 ,  0.7055481 ,
        0.33589053,  0.66655105, -0.22875623,  0.0765844 , -0.08891495,
        0.83909965,  0.15219378,  0.544762  ,  0.5708272 , -0.13960183,
       -0.6636506 ,  0.5237899 , -1.1565725 ,  0.98964834,  0.4635179 ,
        0.8241205 ,  1.4461416 ,  0.58756804,  0.57770926,  0.6313476 ,
       -0.34951535, -0.27902558,  0.07611417,  1.1588628 , -0.01293533,
        0.6785559 ,  0.20618747,  1.1102014 ,  0.57313454,  0.62411094,
        0.3140369 ,  0.40320694,  0.10476496,  0.75319016,  1.087657  ,
        1.3832383 ,  0.57359654, -1.3234941 , -0.30602968, -1.4204576 ,
       -0.34772623,  1.0671082 , -0.02976478,  0.64885163,  1.08

In [None]:
# Multiplier applied to the selected cells (tune as needed)
scaling_constant = 6.5

# Rows to rescale: those whose "Dataset" value is one of the three listed sources
mask = molecule_intensities['Dataset'].isin(['Snitz 1', 'Snitz 2', 'Bushdid'])

# Columns to rescale: every column whose name contains "CID"
cid_columns = list(filter(lambda col: 'CID' in col, molecule_intensities.columns))

# Rescale the selected rows/columns of the table.
# NOTE: the table is modified in place, so re-running this cell multiplies
# the same values by `scaling_constant` again.
molecule_intensities.loc[mask, cid_columns] = (
    molecule_intensities.loc[mask, cid_columns] * scaling_constant
)