# processamentoBioreator

## Imports and configs

In [1]:
import os
from numpy import absolute
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def load_asv_file(file_path):
    """Read a tab-separated ASV table and return it transposed.

    The file is parsed with ``pd.read_csv`` using a tab separator and the
    first column as the index. The parsed table is then transposed, so the
    rows of the file become the columns of the result and vice versa.

    Parameters
    ----------
    file_path : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The transposed table.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file was read but the resulting table is empty.
    """
    # Guard clause: stop right away when the path is missing.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Erro: Arquivo ASV não encontrado: {file_path}")

    table = pd.read_csv(file_path, sep="\t", index_col=0)

    # A file that parses to no rows (or no columns) is treated as an error.
    if table.empty:
        raise ValueError(
            f"Erro: O arquivo ASV foi carregado, mas está vazio: {file_path}"
        )

    # Swap rows and columns before handing the table back.
    return table.T


def transform_sample_counts(df):
    """Divide every value in ``df`` by the total of its own row.

    In this notebook each row is a sample and each column a species/bin, so
    the result expresses every count as a fraction of that sample's total.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of counts, one row per sample.

    Returns
    -------
    pandas.DataFrame
        Table with the same index and columns, each row scaled by its sum.
    """
    # Total counts per sample: the sum across all columns of each row.
    row_totals = df.sum(axis=1)

    # axis=0 aligns the totals with the row index, so each row is divided
    # by its own total.
    return df.div(row_totals, axis=0)


# Folder that holds the experiment files; every path below lives inside it.
EXPERIMENT_DIR = "../drive/Bettle_experiments"

# CSV file locations, built from the shared folder prefix.
TRAIN_PATH = f"{EXPERIMENT_DIR}/06_train_HRT_2Class.csv"
TEST_PATH = f"{EXPERIMENT_DIR}/06_test_HRT_2Class.csv"
meta_all_csv_PATH = f"{EXPERIMENT_DIR}/04_metadata_bin_ML_abs.csv"

## 01 Loading and exploring tables

### Loading absolute hits table:

In [2]:
file_absolute = "../drive/Bettle_experiments/01_map_complete_absolute_n_hits_table.tsv"

absolute_hits = load_asv_file(file_absolute)

absolute_hits.head()

Unnamed: 0,UNDR01_2HCb-bin.0,UNDR01_2HCb-bin.13,UNDR01_2HCb-bin.15,UNDR01_2HCb-bin.18,UNDR01_2HCb-bin.19,UNDR01_2HCb-bin.21,UNDR01_2HCb-bin.22,UNDR01_2HCb-bin.44,UNDR01_2HCb-bin.49,UNDR01_2HCb-bin.5,...,merge_NG-28520_B96-bin.22,merge_NG-28520_B96-bin.28,merge_NG-28520_B96-bin.29,merge_NG-28520_B96-bin.32,merge_NG-28520_B96-bin.47,merge_NG-28520_B96-bin.48,merge_NG-28520_B96-bin.49,merge_NG-28520_B96-bin.72,merge_NG-28520_B96-bin.8,merge_NG-28520_B96-bin.82
UNDR01_2HCb,373235,44722,86186,109524,32450,2420116,51438,39819,3154962,109485,...,117,20,28,3577,51,122,24321,2348,488,33
UNDR01_2MCb,103091,1240,3594,4724,1463,80386,1215,7618,3915941,3962,...,186,32,15,4525,60,169,23699,2009,363,10
UNDR03_102,22,3,67,14,3,586,31,7,754,4,...,1051,2366,949,20,3431,8217,521,118,83,34295
UNDR03_117,67,1,0,16,1,166,37,12,567,2,...,35,3011,1260,13,2158,1635,118,65,57,3910
UNDR03_118,74,2,2,48,2,152,19,2,701,6,...,44,3170,1495,3,2517,1976,168,78,59,13824


In [3]:
absolute_hits.info()

<class 'pandas.DataFrame'>
Index: 55 entries, UNDR01_2HCb to merge_NG-28520_B96
Columns: 589 entries, UNDR01_2HCb-bin.0 to merge_NG-28520_B96-bin.82
dtypes: int64(589)
memory usage: 253.5+ KB


In [11]:
absolute_hits.isna().sum().sum()

np.int64(0)

### Loading relative abundance

In [4]:
rel_file = (
    "../drive/Bettle_experiments/01_map_complete_relative_abundance_table.tsv"
)
rel_abundance = load_asv_file(rel_file)
rel_abundance.sum(axis=1)

UNDR01_2HCb           0.458705
UNDR01_2MCb           0.442497
UNDR03_102            1.013480
UNDR03_117            0.944241
UNDR03_118            0.987501
UNDR03_119            0.402349
UNDR03_121            0.910266
UNDR03_122            0.868590
UNDR03_123            0.879697
UNDR03_1HAb           0.159712
UNDR03_1HBb           0.373698
UNDR03_1MAa           0.388885
UNDR03_1MCa           0.417879
UNDR03_2              0.904669
UNDR03_22             0.899949
UNDR03_23             0.973021
UNDR03_24             0.994456
UNDR03_25             0.938899
UNDR03_26             0.999307
UNDR03_27             0.964011
UNDR03_2HBb           0.463776
UNDR03_2MAa           0.413519
UNDR03_2MBb           0.457875
UNDR03_3              0.893755
UNDR03_4              0.940214
UNDR03_5              0.915345
UNDR03_6              0.883343
UNDR03_7              0.951245
UNDR03_74             0.911951
UNDR03_75             0.931099
UNDR03_76             0.839041
UNDR03_78             0.851097
UNDR03_7

This is unexpected: each sample (row) of a relative abundance table
should sum to 1, but among the rows shown above the sums range from
roughly 0.16 to 1.01.

In [5]:
rel_abundance.info()

<class 'pandas.DataFrame'>
Index: 55 entries, UNDR01_2HCb to merge_NG-28520_B96
Columns: 589 entries, UNDR01_2HCb-bin.0 to merge_NG-28520_B96-bin.82
dtypes: float64(589)
memory usage: 255.6+ KB


In [10]:
rel_abundance.isna().sum().sum()

np.int64(0)

#### Relative from absolute

This cell uses the `transform_sample_counts` function defined by the
author to build a relative abundance table from the absolute hits and
verify its row-wise sum:

In [6]:
rel_from_abs = transform_sample_counts(absolute_hits)
rel_from_abs.sum(axis=1)

UNDR01_2HCb           1.0
UNDR01_2MCb           1.0
UNDR03_102            1.0
UNDR03_117            1.0
UNDR03_118            1.0
UNDR03_119            1.0
UNDR03_121            1.0
UNDR03_122            1.0
UNDR03_123            1.0
UNDR03_1HAb           1.0
UNDR03_1HBb           1.0
UNDR03_1MAa           1.0
UNDR03_1MCa           1.0
UNDR03_2              1.0
UNDR03_22             1.0
UNDR03_23             1.0
UNDR03_24             1.0
UNDR03_25             1.0
UNDR03_26             1.0
UNDR03_27             1.0
UNDR03_2HBb           1.0
UNDR03_2MAa           1.0
UNDR03_2MBb           1.0
UNDR03_3              1.0
UNDR03_4              1.0
UNDR03_5              1.0
UNDR03_6              1.0
UNDR03_7              1.0
UNDR03_74             1.0
UNDR03_75             1.0
UNDR03_76             1.0
UNDR03_78             1.0
UNDR03_79             1.0
UNDR03_80             1.0
merge_NG-28520_B06    1.0
merge_NG-28520_B11    1.0
merge_NG-28520_B16    1.0
merge_NG-28520_B22    1.0
merge_NG-285

As we can see, this is more mathematically sound. We'll
further explore the author's code below.

### Loading Metadata

In [7]:
meta_all_file = "../drive/Bettle_experiments/01_metadata_productivity.txt"
meta_df = pd.read_csv(meta_all_file, sep="\t", index_col=0)
meta_df.info()

<class 'pandas.DataFrame'>
Index: 55 entries, UNDR03_2 to merge_NG-28520_B96
Data columns (total 14 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Gut compartiment                                  55 non-null     str    
 1   Experiment                                        55 non-null     str    
 2   Category                                          55 non-null     str    
 3   Category2                                         55 non-null     str    
 4   Specific CH4 production (mLNorm gVS-1)_corrected  40 non-null     float64
 5   CH4 (%)                                           40 non-null     float64
 6   O2 (%)                                            40 non-null     float64
 7   CO2 (%)                                           40 non-null     float64
 8   H2 (%)                                            40 non-null     str    
 9   N2 (%)          

In [8]:
meta_df[['Category', 'Category2']]

Unnamed: 0_level_0,Category,Category2
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1
UNDR03_2,1T,1
UNDR03_3,1T,1
UNDR03_4,1T,1
UNDR03_5,1T,1
UNDR03_6,1T,1
UNDR03_7,1T,1
UNDR03_22,3T,1
UNDR03_23,3T,1
UNDR03_24,3T,1
UNDR03_25,3T,1


### Copy absolute hits table

In [9]:
abs_hits_copy = absolute_hits.copy()

### SUMMARY SO FAR

We have 3 datasets loaded:

- `absolute_hits` has the absolute values for each ASV per sample
- `rel_abundance` has the weird relative abundance per sample (its rows do not sum to 1)
- `meta_df` has the metadata of the experiment

Below, a few sanity checks to make sure these datasets are compatible
for merging and further exploration

#### Equal number of rows

In [13]:
len(meta_df.index)

55

In [14]:
len(absolute_hits.index)

55

In [15]:
len(rel_abundance.index)

55

#### Same index values and columns

In [18]:
(rel_abundance.index == absolute_hits.index).sum()

np.int64(55)

In [20]:
len(rel_abundance.columns)

589

In [19]:
(rel_abundance.columns == absolute_hits.columns).sum()

np.int64(589)

In [22]:
set(meta_df.index) == set(absolute_hits.index) == set(rel_abundance.index)

True

## 02 Preparing tables

### Merging

The author does two merges: One between the absolute counts
and the metadata `Experiment` column (to use it as target label),
and another between the weird relative abundance
table and the metadata `Experiment` column. Just in case,
I'll also do a merge with the calculated relative abundance table:

In [33]:
# The metadata `Experiment` column is the target label; after the merge it is
# renamed to "y" in every table.
experiment_label = meta_df[["Experiment"]]
target_rename = {"Experiment": "y"}

# Join on the sample index. `validate="one_to_one"` makes pandas raise if a
# sample ID is duplicated on either side, instead of silently multiplying rows.
merge_options = dict(left_index=True, right_index=True, validate="one_to_one")

# Chained `.rename` returns a new frame, so re-running this cell is idempotent
# and nothing is mutated in place.
full_abs = pd.merge(absolute_hits, experiment_label, **merge_options).rename(
    columns=target_rename
)
full_rel_raw = pd.merge(rel_abundance, experiment_label, **merge_options).rename(
    columns=target_rename
)
full_rel_calc = pd.merge(rel_from_abs, experiment_label, **merge_options).rename(
    columns=target_rename
)


In [31]:
# All three merged tables must keep the same number of samples (rows).
# Each table appears exactly once so that `full_abs` is actually checked.
len(full_abs.index) == len(full_rel_raw.index) == len(full_rel_calc.index)

True

### Do we have ASVs with count zero in all samples?

In [34]:
full_abs.columns[(full_abs==0).all()]

Index([], dtype='str')

### Train - Test split

#### The split

In [37]:
# Hold out 25% of the rows, stratified on the target column "y"; the fixed
# random_state keeps the split reproducible across runs.
split_options = {
    "test_size": 0.25,
    "stratify": full_abs["y"],
    "random_state": 42,
}
train_abs, test_abs = train_test_split(full_abs, **split_options)

# Report the shape of each partition.
for split_name, split_frame in (("train", train_abs), ("test", test_abs)):
    print(f"{split_name} shape: {split_frame.shape}")

train shape: (41, 590)
test shape: (14, 590)


#### Encoding target feature


In [39]:
# Features: every column except the target "y", re-indexed from 0.
X_train = train_abs.drop(columns=["y"]).reset_index(drop=True)
X_test = test_abs.drop(columns=["y"]).reset_index(drop=True)

# Fit the encoder on the training labels only, then apply the same mapping
# to both partitions.
label_encoder = LabelEncoder()
label_encoder.fit(train_abs["y"])
y_train = label_encoder.transform(train_abs["y"])
y_test = label_encoder.transform(test_abs["y"])

In [40]:
# Pair each encoded integer with the original class label it stands for.
encoded_classes = label_encoder.transform(label_encoder.classes_)
class_mapping = dict(zip(encoded_classes, label_encoder.classes_))
print(f"Mapeamento de classes: {class_mapping}")

Mapeamento de classes: {np.int64(0): 'Enrichement', np.int64(1): 'Reactor'}


In [41]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (41, 589)
y_train shape: (41,)
X_test shape: (14, 589)
y_test shape: (14,)
