In [41]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import integrate

Explore individual data set for its preliminary features.
1. Rmax
2. Rmin
3. Resistance Ratio - 1
4. KMax
5. KMin
6. Area under curve
7. Time to Rmax

In [42]:
# Load data from 104
exp_no = 104
file_start = 152
last_file_no = 383
flow_rate = 0.25 #ml/min
ace_conc = 0.01 # %by volume in water

In [25]:
file_name = str(exp_no)+ '/'+ str(file_start)+".txt"
file_path = "../data/interim/" + file_name
df = pd.read_csv(file_path, sep=',' , usecols=['timestamp','relative_time', 'resistance_ohm'])

In [None]:
# Visualise data plot the resistance_ohm against relative_time
plt.figure(figsize=(20,10))
plt.plot(df['relative_time'], df['resistance_ohm'])
plt.xlabel('relative_time')
plt.ylabel('resistance_ohm')
plt.title('Resistance vs Time')
plt.show()

In [None]:
# create resistance df
resistance = df['resistance_ohm']
# search for resistance max and its index
resistance_max = resistance.max() 
resistance_max_index = resistance.idxmax()
# search for relative time to resistance max
t_max = df['relative_time'][resistance_max_index]
# search for resistnace min
resistance_min = resistance.min()
# calculate resistance ratio
resistance_ratio = (resistance_max/resistance_min)-1
# Search for maximum and minimum of first derivative of resistance
resistance_diff = resistance.diff(periods=100)
resistance_diff_max = resistance_diff.max()
resistance_diff_min = resistance_diff.min()
# Search for area under the curve with resistance and relative time
auc = integrate.trapz(resistance, df['relative_time'])

# Create a new DataFrame and add the results 1.max resistance 2.min resistance 3.resistance ratio 4.max resistance diff 5.min resistance diff
# 6. AUC 7. Time to max resistance 8. flow rate 250ml/min 9. gas concentration 1/1000 acetone by volume
df_vector = pd.DataFrame({'resistance_max': [resistance_max], 'resistance_min': [resistance_min], 
                          'resistance_ratio': [resistance_ratio], 'resistance_diff_max': [resistance_diff_max], 
                          'resistance_diff_min': [resistance_diff_min], 'AUC': [integrate.trapz(resistance)], 
                          't_max': [t_max], 'flow_rate': [flow_rate], 'gas_concentration': [ace_conc]})



In [65]:
# normalise the resistance
resistance = df['resistance_ohm']
resistance_normalized = (resistance/resistance.min())-1
resistance_normalized.head()

0    0.296987
1    0.297850
2    0.296987
3    0.296987
4    0.296987
Name: resistance_ohm, dtype: float64

<h1> Iterate through all the files in a folder to create a feature matrix<h1>

In [66]:
# Create an empty DataFrame to store the results
results = pd.DataFrame(columns=['file_no', 'resistance_max', 'resistance_min', 'resistance_ratio', 'resistance_diff_max', 'resistance_diff_min', 'AUC', 't_max', 'flow_rate', 'gas_concentration'])

# Iterate through all the files in the folder
for file_no in range(file_start, last_file_no+1):
    # Load the data
    file_name = str(exp_no)+ '/'+ str(file_no)+".txt"
    file_path = "../data/interim/" + file_name
    df = pd.read_csv(file_path, sep=',' , usecols=['timestamp','relative_time', 'resistance_ohm'])
    # Create resistance df
    resistance = df['resistance_ohm']
    # search for resistance max and its index
    resistance_max = resistance.max() 
    resistance_max_index = resistance.idxmax()
    # search for relative time to resistance max
    t_max = df['relative_time'][resistance_max_index]
    # search for resistnace min
    resistance_min = resistance.min()
    # calculate resistance ratio
    resistance_ratio = (resistance_max/resistance_min)-1
    # Search for maximum and minimum of first derivative of resistance
    resistance_diff = resistance.diff(periods=100)
    resistance_diff_max = resistance_diff.max()
    resistance_diff_min = resistance_diff.min()
    # Search for area under the curve with resistance_normalized and relative time
    resistance_normalized = (resistance/resistance.min())-1
    auc = integrate.trapz(resistance_normalized, df['relative_time'])

    df_vector = pd.DataFrame({'file_no':[file_no], 'resistance_max': [resistance_max], 'resistance_min': [resistance_min], 
                          'resistance_ratio': [resistance_ratio], 'resistance_diff_max': [resistance_diff_max], 
                          'resistance_diff_min': [resistance_diff_min], 'AUC': [integrate.trapz(resistance)], 
                          't_max': [t_max], 'flow_rate': [flow_rate], 'gas_concentration': [ace_conc]})

    results = pd.concat([results, pd.DataFrame(df_vector)], ignore_index=True)

  results = pd.concat([results, pd.DataFrame(df_vector)], ignore_index=True)


In [67]:
results.head()

Unnamed: 0,file_no,resistance_max,resistance_min,resistance_ratio,resistance_diff_max,resistance_diff_min,AUC,t_max,flow_rate,gas_concentration
0,152,1420926.5,792405.38,0.793181,79618.0,-183841.25,4376183000.0,14200,0.25,0.01
1,153,1424699.75,794215.0,0.793846,77191.37,-184639.87,4378478000.0,15400,0.25,0.01
2,154,1431028.87,807313.25,0.772582,77115.88,-178913.25,4411426000.0,16550,0.25,0.01
3,155,1434212.75,799456.94,0.793984,71194.63,-190535.25,4412691000.0,178599,0.25,0.01
4,156,1437409.12,809648.69,0.775349,78374.75,-180714.93,4432979000.0,179099,0.25,0.01


In [68]:
# write result to a file called feature_matriax.csv in the processed folder
results.to_csv("../data/processed/104_feature_matrix.csv", sep=',', index=False)
