In [1]:
# This notebook removes the incorrect records (rows whose date carries the year 1601) and the mismatched quality/Felix samples.
# The quality/Felix removal part will probably not be needed again, since Iara did it manually shortly after.

In [50]:
import pandas as pd
from datetime import datetime
import math
import re

In [None]:
# Read the raw absorbance CSV into the working frame `df`.
raw_csv = '../data/absorbance/raw.csv'
df = pd.read_csv(raw_csv)

# extracting the year, so that we can remove the wrong data

In [39]:
# Captures the 4-digit year from values shaped like "<word><any char><word>,NN/NN/YYYY".
# Written as a raw string so that \w and \d reach the regex engine untouched instead of
# being parsed as (invalid) string escapes, which newer Python versions warn about.
# NOTE(review): the "." is unescaped and therefore matches any character; kept as-is so
# the set of matched values does not change.
regex = r'^\w+.\w+,\d{2}/\d{2}/(\d{4})$'

In [40]:
# Extract the year from every value of the first column. A value that does not match
# `regex` makes findall return [] and the [0] lookup raise IndexError.
first_col = df.columns[0]
df['year'] = df[first_col].map(lambda entry: re.findall(regex, entry)[0])

# dropping data with wrong date

In [41]:
# Rows whose extracted year is the string '1601' are considered wrong and removed in place.
rows_to_drop = df.loc[df['year'] == '1601']
df.drop(index=rows_to_drop.index, inplace=True)

In [42]:
# Renumber the rows 0..n-1 after the drop; the old labels are discarded, not kept as a column.
df.reset_index(inplace=True, drop=True)

# getting shelf life and treatment info

In [43]:
# Three capture groups taken from the start of a file name:
#   (T<digit>)  -> treatment, (B<digit>) -> block, (<digit>) -> shelf-life.
# Raw string so that \w and \d are passed to the regex engine instead of being parsed
# as (invalid) string escapes, which newer Python versions warn about.
regex = r'^\w*(T\d)(B\d)(\d)'

In [44]:
# Fill one column per capture group of `regex`, in group order. A file name that does
# not match makes findall return [] and the [0] lookup raise IndexError.
for group_pos, target in enumerate(('TREATMENT', 'BLOCK', 'SHELF-LIFE')):
    df[target] = df['Filename,'].apply(
        lambda name, pos=group_pos: re.findall(regex, name)[0][pos]
    )

# removing unnecessary info

In [45]:
# Columns to discard: the first two columns of the frame plus the helper 'year' column.
cols = [*df.columns[:2], 'year']

In [46]:
# Remove the columns collected in `cols`, modifying `df` in place.
df.drop(columns=cols, inplace=True)

# saving new data

In [47]:
# Persist the cleaned frame; with the default settings the row index is written as
# the first CSV column.
treated_csv = '../data/absorbance/treated.csv'
df.to_csv(treated_csv)

# extracted reflectance from sérgio's machine

In [173]:
# Load the reflectance export; this rebinds `df`, replacing the frame used above.
ref_csv = '../data/absorbance/ref.csv'
df = pd.read_csv(ref_csv)

In [174]:
# Cast every cell to float in a single vectorised call. This replaces
# `applymap(lambda x: float(x))`: `DataFrame.applymap` is deprecated in recent pandas
# and a per-element Python lambda is unnecessary for a plain dtype conversion.
# A value that cannot be converted still raises ValueError.
df = df.astype(float)

In [175]:
def _log_inverse(value):
    """Return ln(1 / value) for positive input; non-positive values are returned unchanged.

    NaN compares False against 0, so it goes through the log branch and stays NaN.
    """
    if value <= 0:
        return value
    return math.log(1 / value)


# Apply the transform cell by cell, one column at a time. `Series.map` is used instead of
# `DataFrame.applymap`, which is deprecated in recent pandas; the per-cell result is the same.
# NOTE(review): presumably this turns reflectance into absorbance; cells <= 0 keep their
# original value, so the output mixes both scales — confirm this is intended.
df = df.apply(lambda column: column.map(_log_inverse))

In [176]:
# Borrow the sample labels from a previously built file and put them in front of the
# spectra. Values are aligned on the row index of `df` and `old`.
old = pd.read_csv('../data/final/firmness_all_bands.csv')

for position, label in enumerate(['TREATMENT', 'BLOCK', 'SHELF-LIFE']):
    labels = old[label]
    if label == 'SHELF-LIFE':
        # Stored as text so it can be compared with string literals such as '1'.
        labels = labels.astype(str)
    df.insert(loc=position, column=label, value=labels)

# removing wrong felix data

In [177]:
# In the felix data there are 20 samples for B1 and 24 for B3, whereas the quality .csv
# has 42 samples in total. Assuming the 24 B3 samples are correct (this should be the
# exact value), there should be only 18 samples for B1, so the last two B1 samples are
# the ones to remove. This cell only previews them.
#
# The three conditions are combined into a single mask: chaining df[...][...][...] makes
# pandas reindex each later mask against an already filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning. The selected rows are the same.
is_t6_sl1_b1 = (
    (df['TREATMENT'] == 'T6')
    & (df['SHELF-LIFE'] == '1')
    & (df['BLOCK'] == 'B1')
)
df[is_t6_sl1_b1].tail(2)

  after removing the cwd from sys.path.


Unnamed: 0,TREATMENT,BLOCK,SHELF-LIFE,301457,304.7677,308.0792,311.3915,314.7047,318.0186,321.3333,...,1111735,1114892,1118047,1121199,1124.35,1127498,1130645,1133789,1136931,1140071
710,T6,B1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.639354,-0.55662,-0.403852,-0.254398,-0.040088,0.303237,0.63477,2.044289,0.0,1.750677
711,T6,B1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.57394,-0.495837,-0.298129,-0.177933,0.221039,0.53023,1.863197,3.834368,0.0,0.0


In [178]:
# Remove the surplus T6 / shelf-life 1 / B1 rows so that only the expected 18 remain.
# The row labels are taken from the selection itself rather than hard-coded: the labels
# 708 and 709 used before do not match the last two matching rows shown by the preview
# (710 and 711). Keeping the first 18 also makes the cell safe to re-run, because nothing
# is dropped once the count is already 18 or less.
expected_b1_samples = 18
is_t6_sl1_b1 = (
    (df['TREATMENT'] == 'T6')
    & (df['SHELF-LIFE'] == '1')
    & (df['BLOCK'] == 'B1')
)
surplus_labels = df[is_t6_sl1_b1].index[expected_b1_samples:]
df.drop(surplus_labels, inplace=True)
df.reset_index(drop=True, inplace=True)

In [179]:
# Save the labelled, transformed spectra; the row index is written as the first CSV column.
abs_csv = '../data/absorbance/abs_sergio_wo_incorrect.csv'
df.to_csv(abs_csv)

# getting quality data

In [180]:
# Read the raw quality measurements into `qlt`.
quality_csv = '../data/quality/raw.csv'
qlt = pd.read_csv(quality_csv)

In [181]:
# Discard the LEVELS column, then store both grouping keys as text so they can be
# compared with string literals such as 'T4' and '1'.
qlt.drop(columns=['LEVELS'], inplace=True)
for key in ('TREATMENT', 'SHELF-LIFE'):
    qlt[key] = qlt[key].map(str)

# removing wrong quality data

In [182]:
# For T4 / shelf-life 1 there are 46 samples from felix and 48 in the quality .csv.
# B3 is supposedly correct because it has 24 felix samples, whereas B1 has 22, so the
# last two B1 samples are the ones to remove from the quality data. This cell only
# previews them.
# NOTE(review): no block column is used here; positions 22-23 presumably are the last two
# B1 rows because B1 comes first in file order — confirm.
#
# Both conditions are combined into one mask: chaining qlt[...][...] makes pandas reindex
# the second mask against an already filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning. The selected rows are the same.
is_t4_sl1 = (qlt['TREATMENT'] == 'T4') & (qlt['SHELF-LIFE'] == '1')
qlt[is_t4_sl1].iloc[22:24]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SHELF-LIFE,TREATMENT,NUM,FRESH MASS,L,C,H,FIRMNESS,LP,CP,HP,SST,TOTAL ACIDITY,DRY MASS
492,1,T4,23,513.28,65.47,52.12,62.2,6.9,59.83,64.26,79.27,16.5,,20.51
493,1,T4,24,513.28,71.87,56.41,78.29,13.8,66.32,67.88,79.67,16.2,,19.98


In [183]:
# Remove the two rows labelled 492 and 493 (the ones shown by the preview), then
# renumber the remaining rows from 0. Re-running this cell raises KeyError once the
# labels are gone.
qlt.drop(index=[492, 493], inplace=True)
qlt.reset_index(inplace=True, drop=True)

In [184]:
# Save the cleaned quality table; the row index is written as the first CSV column.
quality_out_csv = '../data/quality/treated_wo_incorrect.csv'
qlt.to_csv(quality_out_csv)

# indexing by treatment and shelf-life to allow assignment

In [209]:
# Order the spectra by treatment, then shelf-life, and renumber the rows from 0 so they
# can be matched by position. Both keys are strings, so the order is lexicographic.
new_df = (
    df
    .sort_values(by=['TREATMENT', 'SHELF-LIFE'])
    .reset_index(drop=True)
)

In [211]:
# Order the quality rows by treatment, then shelf-life, and renumber the rows from 0 so
# they can be matched by position. Both keys are strings, so the order is lexicographic.
new_qlt = (
    qlt
    .sort_values(by=['TREATMENT', 'SHELF-LIFE'])
    .reset_index(drop=True)
)

In [212]:
# Every quality column from the fourth one onwards is copied onto the spectra frame.
# The assignment aligns on the row index of both frames.
# NOTE(review): this is only meaningful if both frames have the same number of rows per
# (treatment, shelf-life) group in the same order — confirm.
attrs = new_qlt.columns[3:].tolist()
for att in attrs:
    new_df[att] = new_qlt[att]

In [218]:
# Final column order: sample labels, quality attributes, then the remaining columns
# (the spectral bands). The remaining columns are found by excluding the labels and the
# attributes instead of slicing with a hard-coded -11, which was only correct while
# `attrs` held exactly 11 names.
id_cols = ['TREATMENT', 'BLOCK', 'SHELF-LIFE']
band_cols = [
    column for column in new_df.columns
    if column not in id_cols and column not in attrs
]
cols = id_cols + attrs + band_cols

In [219]:
# Keep all rows and arrange the columns in the order listed in `cols`.
new_df = new_df.loc[:, cols]

In [221]:
# Write the merged table; the row index is written as the first CSV column.
final_csv = '../data/final/sergio.csv'
new_df.to_csv(final_csv)