# Import necessary packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

import DiadFit as pf
pf.__version__


'1.0.5'

# Set up the folder access and paths

In [2]:
# Platform path separator. Kept as a variable because other cells build
# paths by string concatenation with it.
slash = os.sep

# Project root: two directory levels above the current working directory.
MasterFolder = os.path.dirname(os.path.dirname(os.getcwd()))

# #Folder to save figures

# figpath=MasterFolder +slash+'Figs'
# if not os.path.exists(figpath):
#     os.mkdir(figpath)

# Folder to save full (compiled) datasets
compilation_folder = os.path.join(
    MasterFolder,
    "Data_processing_notebooks",
    'Data_processing_KS20-527',
    'Notebook_Compiled_data',
)
# makedirs(exist_ok=True) also creates any missing parent folders and does
# not fail if the folder already exists (no check-then-create race).
os.makedirs(compilation_folder, exist_ok=True)

# Compile all the Raman data into one file and export it

## First create the function

In [3]:

## This function compiles the Raman sessions found in the given folder, including any of its subdirectories

def compile_Raman_sessions(*, path=None, sheet_name='Sheet1', keyword='Raman_session'):
    """Stack every matching Raman-session workbook found under ``path``.

    The directory tree rooted at ``path`` is walked recursively. Each file
    whose name contains ``keyword`` and ends with ``.xlsx`` is printed, read
    with ``pandas.read_excel`` and appended to the result.

    Parameters
    ----------
    path : str or path-like
        Root folder to search (subfolders included). Keyword-only.
    sheet_name : str, default 'Sheet1'
        Sheet to read from each workbook; forwarded to ``pandas.read_excel``.
    keyword : str, default 'Raman_session'
        Substring a file name must contain to be included.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching sheets, in the order the files are encountered,
        with a fresh 0..n-1 index. Columns whose label contains ``'Unnamed'``
        are dropped. An empty DataFrame is returned when no file matches.
    """
    frames = []

    for root, _dirs, files in os.walk(path):
        for file in files:
            if keyword in file and file.endswith('.xlsx'):
                print(file)
                file_path = os.path.join(root, file)
                sheet = pd.read_excel(file_path, sheet_name=sheet_name)
                # Only string labels can contain 'Unnamed'; the isinstance
                # guard avoids a TypeError on numeric column labels.
                unnamed_columns = [
                    col for col in sheet.columns
                    if isinstance(col, str) and 'Unnamed' in col
                ]
                frames.append(sheet.drop(columns=unnamed_columns))

    # pd.concat raises on an empty list, so keep the "nothing found" result
    # an empty DataFrame.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames, axis=0, ignore_index=True)

## Now run the function

In [4]:
# Folder holding the fitted Raman session workbooks for KS20-527
raman_folder = slash.join([MasterFolder, "Data", "Data_KS20-527", "Raman"])

K20 = compile_Raman_sessions(
    path=raman_folder,
    sheet_name='Sheet1',
    keyword='Raman_session',
)

Raman_session_January 10, 2025_fitted_2025-01-13.xlsx
Raman_session_January 13, 2025_fitted_2025-01-13.xlsx


# Now let's tidy names and filter bad analyses before exporting

In [5]:
# Filter out bad analyses (original note: this includes one standard that drifted).
# A row is kept when its filename does not contain 'test', its density is
# positive, and its density uncertainty is not above 0.04 g/cm3.
# na=False treats a missing filename as "does not contain 'test'" so the
# mask stays boolean and `~` cannot fail on NaN.
good_rows = (
    ~K20['filename'].str.contains('test', na=False)
    & (K20['Density g/cm3'] > 0)
    & ~(K20['σ Density g/cm3'] > 0.04)
)
# .copy() makes the filtered frame independent, so adding a column below
# does not write into a slice of the unfiltered data (SettingWithCopyWarning).
K20 = K20[good_rows].copy()

# Parsed date, used for chronological sorting.
K20['date_object'] = pd.to_datetime(K20['date'])

K20 = K20.sort_values(by=['date_object', 'filename'])


# def transform_name(name):
#     # Check if the name starts with 'KL0919'
#     if name.startswith('KL0919'):
#         # Replace 'KL0919' with '919' and replace '-' with '_'
#         new_name = '919' + name[6:].replace('-', '_')
#     else:
#         # Replace '-' with '_'
#         new_name = name.replace('-', '_')
#     return new_name
# # Apply the transformation to the column
# K20['consistent_name'] = K20['filename'].apply(transform_name)

# Renumber rows 0..n-1 in the sorted order.
K20 = K20.reset_index(drop=True)

# K20['filename_4merge'] = K20['filename'] + '_index' + K20.index.astype(str)
# K20['consistent_name_4merge'] = K20['consistent_name'] + '_index' + K20.index.astype(str)

# Copy the table to the system clipboard in an Excel-pasteable format.
K20.to_clipboard(excel=True, index=False)

# os.path.join uses the platform separator instead of a hard-coded '/'.
K20.to_excel(os.path.join(compilation_folder, "K20_Raman_allreps.xlsx"))

K20


Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,Peak_Area_SO2,Peak_Height_SO2,Model_name,SO2_Diad_Ratio,SO2_mol_ratio,time,preferred_values,lower_values,upper_values,date_object
0,FG04-A1-4-100X-start_r4,0.701217,0.006477,0.006019,0.002393,104.348297,0.011751,0.001518,0.011678,5.971,...,,,,,,43053,0.997801,0.997786,0.997815,2025-01-10
1,FG04-A1-4-100X-start_r5,0.696614,0.005741,0.005218,0.002392,104.339301,0.010211,0.001514,0.010121,5.982,...,,,,,,43480,0.997801,0.997787,0.997816,2025-01-10
2,FG04-A1-4-100X-start_r6,0.691555,0.00608,0.00559,0.002391,104.329388,0.010967,0.001511,0.010887,5.999,...,,,,,,43798,0.997801,0.997787,0.997816,2025-01-10
3,FG04-A1-4-50X-end_r7,0.691514,0.004433,0.003733,0.002391,104.329307,0.007323,0.00161,0.007159,6.044,...,,,,,,66248,0.99782,0.997804,0.997835,2025-01-10
4,FG04-A1-4-50X-end_r8,0.682888,0.004391,0.003684,0.002389,104.312348,0.00726,0.001614,0.007093,6.048,...,,,,,,66499,0.99782,0.997805,0.997835,2025-01-10
5,FG04-A1-4-50X-end_r9,0.689823,0.003308,0.002287,0.002391,104.325989,0.00449,0.001619,0.004198,6.028,...,,,,,,66791,0.99782,0.997805,0.997836,2025-01-10
6,FG04-A1-4-50X-start_r1,0.679494,0.003135,0.002031,0.002388,104.305655,0.00401,0.001531,0.003714,6.002,...,,,,,,41701,0.9978,0.997785,0.997814,2025-01-10
7,FG04-A1-4-50X-start_r2,0.699008,0.003154,0.002055,0.002393,104.343981,0.004016,0.001529,0.003722,5.977,...,,,,,,41983,0.9978,0.997785,0.997815,2025-01-10
8,FG04-A1-4-50X-start_r3,0.693926,0.003037,0.001871,0.002392,104.334036,0.003667,0.001525,0.003342,5.96,...,,,,,,42331,0.9978,0.997786,0.997815,2025-01-10
9,K20_c001_a1_FIA,0.115712,0.004567,0.002501,0.003821,103.042279,0.006312,0.001477,0.00615,9.979,...,280.217696,157.089449,Spline,0.188018,0.075134,45781,0.997803,0.997789,0.997817,2025-01-10


In [6]:
# Helper workbook read from the 'Helper files' folder; its columns are merged onto the analyses below.
rep_FI_names = pd.read_excel(
    'Helper files/' + 'Helperfile_K20.xlsx',
    sheet_name='Sheet1',
)

# Inner join: only analyses that match a helper row on all three keys are kept.
K20_reps = K20.merge(
    rep_FI_names,
    how='inner',
    on=['filename', 'date_object', 'sec since midnight'],
)
K20_reps

Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,FI#,Sample_crystal,Sample_crystal_region,FI_name,Name_on_SEM,Notes regarding SEM matching,EBSD_simple_name,EBSD_grainID,EBSDname+grainID,EBSD_comment
0,FG04-A1-4-100X-start_r4,0.701217,0.006477,0.006019,0.002393,104.348297,0.011751,0.001518,0.011678,5.971,...,4,FG04_A1,FG04_A1,FG04-A1-4-100X-start-45667,,,,,,
1,FG04-A1-4-100X-start_r5,0.696614,0.005741,0.005218,0.002392,104.339301,0.010211,0.001514,0.010121,5.982,...,4,FG04_A1,FG04_A1,FG04-A1-4-100X-start-45667,,,,,,
2,FG04-A1-4-100X-start_r6,0.691555,0.00608,0.00559,0.002391,104.329388,0.010967,0.001511,0.010887,5.999,...,4,FG04_A1,FG04_A1,FG04-A1-4-100X-start-45667,,,,,,
3,FG04-A1-4-50X-end_r7,0.691514,0.004433,0.003733,0.002391,104.329307,0.007323,0.00161,0.007159,6.044,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-end-45667,,,,,,
4,FG04-A1-4-50X-end_r8,0.682888,0.004391,0.003684,0.002389,104.312348,0.00726,0.001614,0.007093,6.048,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-end-45667,,,,,,
5,FG04-A1-4-50X-end_r9,0.689823,0.003308,0.002287,0.002391,104.325989,0.00449,0.001619,0.004198,6.028,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-end-45667,,,,,,
6,FG04-A1-4-50X-start_r1,0.679494,0.003135,0.002031,0.002388,104.305655,0.00401,0.001531,0.003714,6.002,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-start-45667,,,,,,
7,FG04-A1-4-50X-start_r2,0.699008,0.003154,0.002055,0.002393,104.343981,0.004016,0.001529,0.003722,5.977,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-start-45667,,,,,,
8,FG04-A1-4-50X-start_r3,0.693926,0.003037,0.001871,0.002392,104.334036,0.003667,0.001525,0.003342,5.96,...,4,FG04_A1,FG04_A1,FG04-A1-4-50X-start-45667,,,,,,
9,K20_c001_a1_FIA,0.115712,0.004567,0.002501,0.003821,103.042279,0.006312,0.001477,0.00615,9.979,...,FIA,K20_c001,K20_c001_a1,K20_c001_a1_FIA,,,,,,


## Now let's take the mean of repeated analyses

In [7]:
# One group per FI_name value; repeated analyses share a name.
grouped = K20_reps.groupby('FI_name')

# Split the columns by dtype: numeric ones get averaged, the rest are carried over.
numeric_cols = K20_reps.select_dtypes(include='number')
non_numeric_cols = K20_reps.select_dtypes(exclude='number')

# Per-group mean and standard deviation of every numeric column
numeric_K20_averaged_mean = grouped[numeric_cols.columns].agg('mean')
numeric_K20_averaged_std = grouped[numeric_cols.columns].agg('std')

# Non-numeric columns: keep the first entry found in each group
non_numeric_K20_averaged = grouped[non_numeric_cols.columns].agg('first')

# Put means and first-values side by side, restore the original column order,
# then append the standard deviations with a '_Raman_STD' suffix and drop the
# group index in favour of a plain 0..n-1 index.
K20_averaged = pd.concat(
    [numeric_K20_averaged_mean, non_numeric_K20_averaged], axis=1
).reindex(columns=K20_reps.columns)
K20_averaged = pd.concat(
    [K20_averaged, numeric_K20_averaged_std.add_suffix('_Raman_STD')], axis=1
).reset_index(drop=True)

# Number of rows per group, flagged 'Yes' when more than one row was averaged
row_counts = grouped.size().reset_index(name='row_count')
row_counts['averaged?'] = [
    'Yes' if count > 1 else 'No' for count in row_counts['row_count']
]

# Attach the flag to the aggregated table by FI_name
K20_averaged = K20_averaged.merge(
    row_counts[['FI_name', 'averaged?']], on='FI_name', how='left'
)

# Any row left without a flag after the merge is marked 'No'
K20_averaged['averaged?'] = K20_averaged['averaged?'].fillna('No')

#exclude a weird null average spectrum
# K20_averaged=K20_averaged[~(K20_averaged['filename'].str.contains("186-9-118-A"))]
K20_averaged.to_excel('/'.join((compilation_folder, "K20_Raman_averaged.xlsx")))

K20_averaged


Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,preferred_values_Raman_STD,lower_values_Raman_STD,upper_values_Raman_STD,Name_on_SEM_Raman_STD,Notes regarding SEM matching_Raman_STD,EBSD_simple_name_Raman_STD,EBSD_grainID_Raman_STD,EBSDname+grainID_Raman_STD,EBSD_comment_Raman_STD,averaged?
0,FG04-A1-4-100X-start_r4,0.696462,0.006099,0.005609,0.002392,104.338995,0.010976,0.001514,0.010895,5.984,...,3.052248e-07,3.386316e-07,2.718187e-07,,,,,,,Yes
1,FG04-A1-4-50X-end_r7,0.688075,0.004044,0.003235,0.00239,104.322548,0.006358,0.001614,0.00615,6.04,...,2.218869e-07,1.77231e-07,2.665431e-07,,,,,,,Yes
2,FG04-A1-4-50X-start_r1,0.690809,0.003109,0.001986,0.002391,104.327891,0.003898,0.001528,0.003593,5.979667,...,2.576636e-07,2.905995e-07,2.247281e-07,,,,,,,Yes
3,FG04-A1-4-end_r4,0.694369,0.005155,0.004541,0.002392,104.33487,0.008904,0.000712,0.008893,6.004,...,3.395517e-07,3.847807e-07,2.943229e-07,,,,,,,Yes
4,FG04-A1-4-start_r1,0.693323,0.004397,0.00369,0.002391,104.332851,0.007232,0.00065,0.007218,6.092667,...,4.024905e-07,3.745016e-07,4.304802e-07,,,,,,,Yes
5,K20_c001_a1_FIA,0.115712,0.004567,0.002501,0.003821,103.042279,0.006312,0.001477,0.00615,9.979,...,,,,,,,,,,No
6,K20_c002_a1_FIA,0.136977,0.004003,0.00105,0.003863,103.095948,0.002649,0.001467,0.002211,10.093,...,,,,,,,,,,No
7,K20_c002_a1_FIB,0.130733,0.004021,0.001167,0.003848,103.080189,0.002944,0.001465,0.002559,10.087,...,,,,,,,,,,No
8,K20_c003_a1_FIA_r2,0.070133,0.006159,0.004837,0.003809,102.927252,0.012208,0.00146,0.012147,10.093,...,1.518415e-07,1.562791e-07,1.474038e-07,,,,,,,Yes
9,K20_c004_a1_FIA,0.12303,0.004805,0.002897,0.003833,103.060749,0.007311,0.00146,0.007179,10.102,...,,,,,,,,,,No
