# Data Visualisation and Extrapolation prior to JMP Analysis

The following data are collected via GC-MS after electrochemical CO<sub>2</sub> reduction testing.

In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import re

## First step: Understanding the data and the code with a single datasheet

The final goal of this Jupyter Notebook is to create a table which contains the area values of hydrogen, carbon monoxide, nitrogen, ethylene and other possible byproducts. Each Excel file contains three tables in three different sheets. So, the first step is to create three dataframes with the three tables.

In [2]:
# Open the workbook once; the `with` block closes the underlying file handle
# as soon as the three sheets have been parsed.
with pd.ExcelFile('Sample1_23_02_2023 09_51_37.XLS') as xls:
    # Sheets are selected by name: the first carries the file stem, the other
    # two add a "(2)" / "(3)" suffix. Each sheet is stored in its own dataframe.
    df1 = pd.read_excel(xls, sheet_name='Sample1_23_02_2023 09_51_37')
    df2 = pd.read_excel(xls, sheet_name='Sample1_23_02_2023 09_51_37(2)')
    df3 = pd.read_excel(xls, sheet_name='Sample1_23_02_2023 09_51_37(3)')

In [3]:
# Display the dataframe parsed from the first sheet of the workbook.
df1

Unnamed: 0,Index,Name,Time,Quantity,Height,Area,Area %
0,,,[Min],[%],[µV],[µV.Min],[%]
1,0,Oxygen,0.94,N.D.,N.D.,N.D.,N.D.
2,1,Hydrogen,1.03,95.34,1960705.3,88731,78.753
3,2,Nitrogen,1.38,0,381148.5,19603.6,17.399
4,0,Carbon monoxide,1.74,N.D.,N.D.,N.D.,N.D.
5,3,UNKNOWN,2.1,4.66,54396.1,4335,3.848
6,,,,,,,
7,Total,-,-,100,2396249.9,112669.5,100


Which data do we need from `df1`? Hydrogen Area

In [4]:
# Reduce the first sheet to the compound name and its peak area,
# then isolate the hydrogen entry.
df1 = df1.loc[:, ['Name', 'Area']]
is_hydrogen = df1['Name'].eq('Hydrogen')
row = df1.loc[is_hydrogen]
print(row)

       Name   Area
2  Hydrogen  88731


In [5]:
# Display the dataframe parsed from the second sheet of the workbook.
df2

Unnamed: 0,Index,Name,Time,Quantity,Height,Area,Area %
0,,,[Min],[ppm],[µV],[µV.Min],[%]
1,1,UNKNOWN,1.29,0,9046.5,1024.1,0.213
2,2,Nitrogen,1.53,0,2544215.9,164582.5,34.283
3,3,Carbon monoxide,1.82,36852.26,477656.6,36852.3,7.677
4,0,Methane,2.3,N.D.,N.D.,N.D.,N.D.
5,4,UNKNOWN,10.78,0,428638.7,277605.3,57.827
6,,,,,,,
7,Total,-,-,36852.26,3459557.7,480064.1,100


Which data do we need from `df2`? Nitrogen and Carbon monoxide areas.

In [6]:
# Only 'Name' and 'Area' are needed from the second sheet;
# pick the nitrogen peak out of that reduced table.
df2 = df2.loc[:, ['Name', 'Area']]
row = df2.loc[lambda frame: frame['Name'] == 'Nitrogen']
print(row)

       Name      Area
2  Nitrogen  164582.5


In [7]:
# Same reduced view of the second sheet, this time keeping the
# carbon monoxide peak.
df2 = df2.loc[:, ['Name', 'Area']]
is_carbon_monoxide = df2['Name'].isin(['Carbon monoxide'])
row2 = df2[is_carbon_monoxide]
print(row2)

              Name     Area
3  Carbon monoxide  36852.3


In [8]:
# Display the dataframe parsed from the third sheet of the workbook.
df3

Unnamed: 0,Index,Name,Time,Quantity,Height,Area,Area %
0,,,[Min],[ppm],[µV],[µV.Min],[%]
1,1,UNKNOWN,0.56,0,503.7,23.7,2.99
2,0,Methane,4.6,N.D.,N.D.,N.D.,N.D.
3,2,Ethylene,4.9,768.34,11913.8,518.6,65.368
4,3,UNKNOWN,5.04,0,49.9,4.7,0.592
5,4,Methanol,5.68,5.99,94.7,12.3,1.545
6,5,Ethanol,7.74,52.34,169.8,58.4,7.362
7,6,n-propanol,10.63,175.68,179.3,175.7,22.143
8,,,,,,,
9,Total,-,-,1002.35,12911.1,793.4,100


Which data do we need from `df3`? Ethylene Area (extras: methanol, ethanol and n-propanol)

In [9]:
# Trim the third sheet to name/area and extract the ethylene peak.
# Note: the result is stored in `row2`, the same name used for the
# carbon monoxide row of the second sheet.
df3 = df3.loc[:, ['Name', 'Area']]
row2 = df3[df3['Name'].eq('Ethylene')]
print(row2)

       Name   Area
3  Ethylene  518.6


In [10]:
# Collect the exported workbooks from the current working directory.
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f.endswith('XLS')]

# The file names embed a timestamp formatted as "DD_MM_YYYY HH_MM_SS".
# Pair every name with its parsed datetime; names without a timestamp are
# left out because they cannot be placed in the sequence.
pattern = r'\d{2}_\d{2}_\d{4}\s\d{2}_\d{2}_\d{2}'
filename_datetime_list = []
for filename in files_xls:
    match = re.search(pattern, filename)
    if match:
        datetime_obj = datetime.datetime.strptime(match.group(), "%d_%m_%Y %H_%M_%S")
        filename_datetime_list.append((filename, datetime_obj))

# Sort the list of tuples based on the datetime object
sorted_filename_datetime_list = sorted(filename_datetime_list, key=lambda x: x[1])
sorted_filenames = [x[0] for x in sorted_filename_datetime_list]

# Fail with an explicit message instead of the generic
# "No objects to concatenate" raised by pd.concat on an empty list.
if not sorted_filenames:
    raise ValueError(
        "No timestamped 'XLS' files found in {!r}; nothing to load.".format(path)
    )

# Read every sheet of every workbook, oldest file first. The `with` block
# closes each workbook's file handle once its sheets have been parsed.
dfs = []
for filename in sorted_filenames:
    with pd.ExcelFile(os.path.join(path, filename)) as dfname:
        for items in dfname.sheet_names:
            dfnew = pd.read_excel(dfname, sheet_name=items)
            dfnew['filename'] = filename  # add a new column to identify the source file
            dfs.append(dfnew)

# Stack all sheets into one long table with a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

In [11]:
# Preview the first 20 rows of the stacked table built from all sheets.
data.head(20)

Unnamed: 0,Index,Name,Time,Quantity,Height,Area,Area %,filename
0,,,[Min],[%],[µV],[µV.Min],[%],Sample1_23_02_2023 09_51_37.XLS
1,0,Oxygen,0.94,N.D.,N.D.,N.D.,N.D.,Sample1_23_02_2023 09_51_37.XLS
2,1,Hydrogen,1.03,95.34,1960705.3,88731,78.753,Sample1_23_02_2023 09_51_37.XLS
3,2,Nitrogen,1.38,0,381148.5,19603.6,17.399,Sample1_23_02_2023 09_51_37.XLS
4,0,Carbon monoxide,1.74,N.D.,N.D.,N.D.,N.D.,Sample1_23_02_2023 09_51_37.XLS
5,3,UNKNOWN,2.1,4.66,54396.1,4335,3.848,Sample1_23_02_2023 09_51_37.XLS
6,,,,,,,,Sample1_23_02_2023 09_51_37.XLS
7,Total,-,-,100,2396249.9,112669.5,100,Sample1_23_02_2023 09_51_37.XLS
8,,,[Min],[ppm],[µV],[µV.Min],[%],Sample1_23_02_2023 09_51_37.XLS
9,1,UNKNOWN,1.29,0,9046.5,1024.1,0.213,Sample1_23_02_2023 09_51_37.XLS


In [12]:
# Compounds whose peak areas are kept.
a = ['Hydrogen', 'Nitrogen', 'Carbon monoxide', 'Ethylene']

# Keep only the rows for those compounds and, when a compound occurs in more
# than one sheet of the same file, keep its last occurrence. This is done with
# a boolean filter plus drop_duplicates on (filename, Name) rather than
# groupby(...).apply(...): applying a function that touches the grouping
# column is deprecated in pandas, and once that column is excluded the
# 'filename' selection would fail.
data = (
    data.loc[data['Name'].isin(a), ['Name', 'Area', 'filename']]
    .drop_duplicates(subset=['filename', 'Name'], keep='last')
    .reset_index(drop=True)
)

In [13]:
# Preview the first 20 rows of the filtered Name / Area / filename table.
data.head(20)

Unnamed: 0,Name,Area,filename
0,Hydrogen,88731.0,Sample1_23_02_2023 09_51_37.XLS
1,Nitrogen,164582.5,Sample1_23_02_2023 09_51_37.XLS
2,Carbon monoxide,36852.3,Sample1_23_02_2023 09_51_37.XLS
3,Ethylene,518.6,Sample1_23_02_2023 09_51_37.XLS
4,Hydrogen,83266.6,Sample2_23_02_2023 10_35_49.XLS
5,Nitrogen,158322.0,Sample2_23_02_2023 10_35_49.XLS
6,Carbon monoxide,34808.6,Sample2_23_02_2023 10_35_49.XLS
7,Ethylene,420.2,Sample2_23_02_2023 10_35_49.XLS
8,Hydrogen,111550.7,Sample3_23_02_2023 10_50_52.XLS
9,Nitrogen,163771.4,Sample3_23_02_2023 10_50_52.XLS


In [14]:
# The sheets export undetected peaks as the text 'N.D.', so 'Area' can hold a
# mix of numbers and strings. Convert it to numeric first: such entries become
# NaN instead of breaking the aggregation done by pivot_table.
areas = data.assign(Area=pd.to_numeric(data['Area'], errors='coerce'))

# One row per source file (kept in load order), one column per compound.
pivot_table = areas.pivot_table(index='filename', columns='Name', values='Area', sort=False)

# Reorder the columns and re-insert any file or compound that pivot_table
# dropped because all of its values were NaN.
column_order = ['Hydrogen', 'Nitrogen', 'Carbon monoxide', 'Ethylene']
file_order = pd.Index(data['filename'].unique(), name='filename')
pivot_table = pivot_table.reindex(index=file_order, columns=column_order)

# Display the pivot table
print(pivot_table)

Name                              Hydrogen  Nitrogen  Carbon monoxide  \
filename                                                                
Sample1_23_02_2023 09_51_37.XLS    88731.0  164582.5          36852.3   
Sample2_23_02_2023 10_35_49.XLS    83266.6  158322.0          34808.6   
Sample3_23_02_2023 10_50_52.XLS   111550.7  163771.4          60494.1   
Sample4_23_02_2023 11_05_55.XLS   114152.6  164939.5          58894.2   
Sample5_23_02_2023 11_20_57.XLS   155033.5  168709.6          68766.0   
Sample6_23_02_2023 11_36_01.XLS   158409.4  173483.3          68089.4   
Sample7_23_02_2023 11_51_03.XLS   220230.1  174227.9          71942.9   
Sample8_23_02_2023 12_06_06.XLS   223446.4  180830.9          69695.0   
Sample9_23_02_2023 12_21_08.XLS   292623.4  182068.6          68335.2   
Sample10_23_02_2023 12_36_11.XLS  282881.9  182117.3          67858.4   
Sample11_23_02_2023 12_51_14.XLS  252022.9  185863.4          66643.1   
Sample1_23_02_2023 13_06_22.XLS   175526.2  473742.

In [15]:
# Write the per-file area table to "GC_outputs.xlsx"; the relative path
# resolves against the current working directory.
pivot_table.to_excel("GC_outputs.xlsx")

***

# Test

# Dimensions of the table currently held in `data`.
data.shape

# Collect the exported workbooks from the current working directory.
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'XLS']

# File names embed a timestamp formatted as "DD_MM_YYYY HH_MM_SS".
pattern = r'\d{2}_\d{2}_\d{4}\s\d{2}_\d{2}_\d{2}'
filename_datetime_list = []
for filename in files_xls:
    match = re.search(pattern, filename)
    if match:
        datetime_str = match.group()
        datetime_obj = datetime.datetime.strptime(datetime_str, "%d_%m_%Y %H_%M_%S")
        filename_datetime_list.append((filename, datetime_obj))

# Sort the list of tuples based on the datetime object
sorted_filename_datetime_list = sorted(filename_datetime_list, key=lambda x: x[1])
sorted_filenames = [x[0] for x in sorted_filename_datetime_list]

# Variant of the loader that labels each sheet with the 1-based chronological
# position of its workbook instead of the file name. enumerate() supplies the
# number, and the `with` block closes each workbook after it has been parsed.
dfs = []
for counter, filename in enumerate(sorted_filenames, start=1):
    with pd.ExcelFile(os.path.join(path, filename)) as dfname:
        for items in dfname.sheet_names:
            dfnew = pd.read_excel(dfname, sheet_name=items)
            dfnew['filename'] = counter  # add a new column to identify the source file
            dfs.append(dfnew)

data = pd.concat(dfs, ignore_index=True)

# Locate the exported workbooks in the working directory.
path = os.getcwd()
files = os.listdir(path)
files_xls = [name for name in files if name.endswith('XLS')]

# Pair each file with the timestamp ("DD_MM_YYYY HH_MM_SS") found in its name;
# names without a timestamp are skipped.
pattern = r'\d{2}_\d{2}_\d{4}\s\d{2}_\d{2}_\d{2}'
filename_datetime_list = []
for filename in files_xls:
    match = re.search(pattern, filename)
    if match is None:
        continue
    datetime_obj = datetime.datetime.strptime(match.group(), "%d_%m_%Y %H_%M_%S")
    filename_datetime_list.append((filename, datetime_obj))

# Chronological order, then just the names.
sorted_filename_datetime_list = sorted(filename_datetime_list, key=lambda pair: pair[1])
sorted_filenames = [name for name, _ in sorted_filename_datetime_list]

# Build a nested list: one inner list of per-sheet dataframes per workbook.
data = []
for filename in sorted_filenames:
    dfname = pd.ExcelFile(os.path.join(path, filename))
    data.append([pd.read_excel(dfname, sheet_name=sheet) for sheet in dfname.sheet_names])

print(data)

# Parse every sheet of the sample workbook through a single ExcelFile handle
# (closed by the `with` block) instead of re-opening the file for each sheet.
with pd.ExcelFile('Sample1_23_02_2023 09_51_37.XLS') as dfname:
    print(dfname.sheet_names)
    sheet_frames = [pd.read_excel(dfname, sheet_name=items) for items in dfname.sheet_names]

# Stack the sheets in one concat call. The index is not reset, so each
# sheet keeps its own row labels in the combined frame.
df = pd.concat(sheet_frames)

df

# Keep every row up to and including the first 'Nitrogen' entry.
# `df` was stacked from several sheets without resetting the index, so its row
# labels repeat; a label slice (.loc[:label]) cannot resolve a repeated,
# non-contiguous label and raises KeyError. Slice by position instead.
# As before, an IndexError is raised when no 'Nitrogen' row exists.
nitrogen_positions = (df['Name'] == 'Nitrogen').to_numpy().nonzero()[0]
tst = df.iloc[: nitrogen_positions[0] + 1, :]

df

import os

# List everything in the current working directory.
path = os.getcwd()
files = os.listdir(path)
files

# Keep the entries whose name ends with the upper-case letters 'XLS'.
files_xls = [name for name in files if name.endswith('XLS')]
files_xls

import datetime
import re

# Timestamp as embedded in the export names: "DD_MM_YYYY HH_MM_SS".
pattern = r'\d{2}_\d{2}_\d{4}\s\d{2}_\d{2}_\d{2}'

# Pair every workbook name with the datetime parsed from it; names that do
# not contain the pattern are left out.
filename_datetime_list = []
for filename in files_xls:
    match = re.search(pattern, filename)
    if match is None:
        continue
    datetime_obj = datetime.datetime.strptime(match.group(), "%d_%m_%Y %H_%M_%S")
    filename_datetime_list.append((filename, datetime_obj))

# Order the pairs chronologically (copy first, then sort in place).
sorted_filename_datetime_list = list(filename_datetime_list)
sorted_filename_datetime_list.sort(key=lambda pair: pair[1])

# Print the file names in that order, one per line.
for entry in sorted_filename_datetime_list:
    print(entry[0])

# Match the timestamp pattern against every workbook name, then keep only
# the names that matched, together with the parsed datetime.
name_match_pairs = [(filename, re.search(pattern, filename)) for filename in files_xls]
filename_datetime_list = [
    (filename, datetime.datetime.strptime(match.group(), "%d_%m_%Y %H_%M_%S"))
    for filename, match in name_match_pairs
    if match
]

# Oldest first.
sorted_filename_datetime_list = sorted(filename_datetime_list, key=lambda pair: pair[1])

# Print the sorted file names, one per line.
for entry in sorted_filename_datetime_list:
    print(entry[0])

# Build one stacked frame per workbook and combine them in a single concat.
# Re-assigning `df` inside the loop kept only the last file; the per-file
# results are collected first so every workbook ends up in `df`. A separate
# name is used for the sheet dict so `data` is not overwritten here.
per_file_frames = []
for f in files_xls:
    sheets = pd.read_excel(f, sheet_name=None)  # dict: sheet name -> DataFrame
    # Concatenating the dict keeps the sheet name as an outer index level.
    per_file_frames.append(pd.concat(sheets, ignore_index=False))

# keys=files_xls adds the file name as the outermost index level.
# With no files, `df` stays an empty DataFrame.
df = pd.concat(per_file_frames, keys=files_xls) if per_file_frames else pd.DataFrame()

# Open the sample workbook once, list its sheets and parse each of them through
# the same handle; the `with` block closes the file afterwards.
with pd.ExcelFile('Sample1_23_02_2023 09_51_37.XLS') as dfname:
    print(dfname.sheet_names)
    sheet_frames = [pd.read_excel(dfname, sheet_name=items) for items in dfname.sheet_names]

# Single concat instead of growing `df` one sheet at a time. The index is
# not reset, so row labels from the individual sheets are kept as they are.
df = pd.concat(sheet_frames)

# Collect the exported workbooks from the current working directory.
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'XLS']

# Build a nested list: one inner list of per-sheet dataframes per workbook.
# Each workbook is opened once and parsed through that handle (closed by the
# `with` block); the extra read of all sheets whose result was never used
# has been dropped.
data = []
for f in files_xls:
    with pd.ExcelFile(f) as dfname:
        dfs = [pd.read_excel(dfname, sheet_name=items) for items in dfname.sheet_names]
    data.append(dfs)

data

print(data)

import os
import pandas as pd
import numpy as np
import datetime
import re

# Collect the exported workbooks from the current working directory.
path = os.getcwd()
files = os.listdir(path)
files_xls = [f for f in files if f[-3:] == 'XLS']

# File names embed a timestamp formatted as "DD_MM_YYYY HH_MM_SS".
pattern = r'\d{2}_\d{2}_\d{4}\s\d{2}_\d{2}_\d{2}'
filename_datetime_list = []
for filename in files_xls:
    match = re.search(pattern, filename)
    if match:
        datetime_str = match.group()
        datetime_obj = datetime.datetime.strptime(datetime_str, "%d_%m_%Y %H_%M_%S")
        filename_datetime_list.append((filename, datetime_obj))

# Sort the list of tuples based on the datetime object
sorted_filename_datetime_list = sorted(filename_datetime_list, key=lambda x: x[1])
sorted_filenames = [x[0] for x in sorted_filename_datetime_list]

# Build a nested list: one inner list of per-sheet dataframes per workbook,
# oldest workbook first. Iterate over the file names, not over the
# (filename, datetime) tuples, which pd.ExcelFile cannot open. Each workbook
# is opened once and closed by the `with` block.
data = []
for f in sorted_filenames:
    with pd.ExcelFile(os.path.join(path, f)) as dfname:
        dfs = [pd.read_excel(dfname, sheet_name=items) for items in dfname.sheet_names]
    data.append(dfs)


    
# NOTE(review): this is a user-specific absolute path with no file extension;
# it looks like a folder, whereas pd.ExcelFile expects a workbook file.
# Confirm the intended target before running this cell.
filepath = r'C:\Users\GadolS01\OneDrive - Johnson Matthey\Documents\Python Scripts\230222 ECOF-027-2_CuO Galvanostatic_2nd day'

# Load the Excel file using Pandas and parse each worksheet into its own
# DataFrame; the `with` block closes the file handle once parsing is done.
with pd.ExcelFile(filepath) as f:
    list_of_dfs = [f.parse(sheet) for sheet in f.sheet_names]

# Combine all DataFrames into one
data = pd.concat(list_of_dfs, ignore_index=True)

# Preview first 10 rows
data.head(10)