# Import necessary packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

import DiadFit as pf
pf.__version__


'1.0.5'

# Set up folder access and paths

In [2]:
# Path separator for the current OS. Kept as a variable because other cells
# assemble paths by concatenating strings with it.
slash = os.sep

# Two directory levels above the current working directory
MasterFolder = os.path.dirname(os.path.dirname(os.getcwd()))

# #Folder to save figures

# figpath = os.path.join(MasterFolder, 'Figs')
# os.makedirs(figpath, exist_ok=True)

# #Folder to save full datasets

compilation_folder = os.path.join(
    MasterFolder,
    "Data_processing_notebooks",
    'Data_processing_KS24-628',
    'Notebook_Compiled_data',
)
# makedirs also creates any missing parent folders, and exist_ok=True makes the
# call a no-op when the folder is already there (no separate existence check needed).
os.makedirs(compilation_folder, exist_ok=True)

# This compiles all the Raman data into one file and exports it

## First create the function

In [3]:

## This function compiles the Raman sessions found in the specified folder, including all of its subdirectories

def compile_Raman_sessions(*, path=None, sheet_name='Sheet1', keyword='Raman_session'):
    """Compile every matching Raman session workbook under ``path`` into one DataFrame.

    The folder tree rooted at ``path`` is searched recursively. Every file whose
    name contains ``keyword`` and ends with ``.xlsx`` is read with
    ``pd.read_excel`` and the resulting tables are stacked row-wise.

    Parameters
    ----------
    path : str or path-like
        Top-level folder to search. Required, despite the ``None`` default.
    sheet_name : str, default 'Sheet1'
        Sheet to read from each workbook.
    keyword : str, default 'Raman_session'
        Substring that a file name must contain to be included.

    Returns
    -------
    pandas.DataFrame
        All matching sheets concatenated with a fresh 0..n-1 index and with any
        column whose label contains 'Unnamed' removed. An empty DataFrame is
        returned when no file matches.

    Raises
    ------
    TypeError
        If ``path`` is None.
    """
    if path is None:
        raise TypeError("compile_Raman_sessions() requires a 'path' to search")

    session_frames = []
    for root, dirs, files in os.walk(path):
        # Sorting in place fixes the traversal order, so the compiled row order
        # does not depend on the order the filesystem happens to list entries in.
        dirs.sort()
        for file in sorted(files):
            if keyword in file and file.endswith('.xlsx'):
                print(file)
                file_path = os.path.join(root, file)
                sheet = pd.read_excel(file_path, sheet_name=sheet_name)
                # Column labels are not always strings (e.g. numeric headers),
                # so check the type before the substring test.
                unnamed_columns = [
                    col for col in sheet.columns
                    if isinstance(col, str) and 'Unnamed' in col
                ]
                session_frames.append(sheet.drop(columns=unnamed_columns))

    if not session_frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a DataFrame inside the loop
    return pd.concat(session_frames, axis=0, ignore_index=True)

## Now run the function

In [4]:
# Collect every fitted Raman session workbook below the KS24-628 Raman folder
KD24 = compile_Raman_sessions(
    path=slash.join([MasterFolder, "Data", "Data_KS24-628", "Raman"]),
    sheet_name='Sheet1',
    keyword='Raman_session',
)

Raman_session_January 31, 2025_fitted_2025-02-01.xlsx
Raman_session_January 17, 2025_fitted_2025-01-22.xlsx


# Now let's tidy names and filter bad analyses before exporting

In [5]:
# Filter out bad analyses, including one standard that drifted. A row is kept when:
#   - its filename does not match 'test' or 'end_r4',
#   - its density is greater than zero, and
#   - its density uncertainty is not above 0.04 (rows with a missing uncertainty are kept).
is_excluded_name = KD24['filename'].str.contains('test|end_r4')
has_density = KD24['Density g/cm3'] > 0
is_too_uncertain = KD24['σ Density g/cm3'] > 0.04

# .copy() makes the filtered table an independent frame, so the column
# assignment below does not write into a slice of the unfiltered data
# (which is what triggers pandas' SettingWithCopyWarning).
KD24 = KD24[~is_excluded_name & has_density & ~is_too_uncertain].copy()

KD24['date_object'] = pd.to_datetime(KD24['date'])

KD24 = KD24.sort_values(by=['date_object', 'filename'])


# def transform_name(name):
#     # Check if the name starts with 'KL0919'
#     if name.startswith('KL0919'):
#         # Replace 'KL0919' with '919' and replace '-' with '_'
#         new_name = '919' + name[6:].replace('-', '_')
#     else:
#         # Replace '-' with '_'
#         new_name = name.replace('-', '_')
#     return new_name
# # Apply the transformation to the column
# KD24['consistent_name'] = KD24['filename'].apply(transform_name)

# Renumber rows 0..n-1 in the sorted order
KD24 = KD24.reset_index(drop=True)

# KD24['filename_4merge'] = KD24['filename'] + '_index' + KD24.index.astype(str)
# KD24['consistent_name_4merge'] = KD24['consistent_name'] + '_index' + KD24.index.astype(str)

# NOTE(review): to_clipboard needs a system clipboard backend, so this line may
# fail when the notebook is run on a headless machine — confirm it is still wanted.
KD24.to_clipboard(excel=True, index=False)

# os.path.join uses the separator of the current OS instead of a hard-coded '/'
KD24.to_excel(os.path.join(compilation_folder, "KD24_Raman_allreps.xlsx"))

KD24


Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,preferred_values,lower_values,upper_values,Model_name_x,Peak_Cent_Carb,Peak_Area_Carb,Peak_Height_Carb,Model_name_y,Carb_Diad_Ratio,date_object
0,FG04-A1-4-end_r5,0.692163,0.004278,0.003548,0.002391,104.330580,0.006958,0.000553,0.006951,6.023,...,0.997796,0.997791,0.997801,,,,,,,2025-01-17
1,FG04-A1-4-end_r6,0.690476,0.002953,0.001734,0.002391,104.327270,0.003403,0.000555,0.003365,6.014,...,0.997796,0.997791,0.997801,,,,,,,2025-01-17
2,FG04-A1-4-end_r7,0.693164,0.003053,0.001897,0.002391,104.332543,0.003719,0.000556,0.003686,6.011,...,0.997796,0.997791,0.997801,,,,,,,2025-01-17
3,FG04-A1-4-start_r1,0.686361,0.004383,0.003674,0.002390,104.319185,0.007227,0.000536,0.007223,6.005,...,0.997833,0.997828,0.997838,,,,,,,2025-01-17
4,FG04-A1-4-start_r2,0.686856,0.003134,0.002027,0.002390,104.320159,0.003986,0.000535,0.003958,5.969,...,0.997832,0.997827,0.997837,,,,,,,2025-01-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,KD24_c031_a1_FIA,0.114821,0.006327,0.005044,0.003820,103.040033,0.012729,0.001012,0.012716,9.875,...,0.997841,0.997831,0.997851,,,,,,,2025-01-31
61,KD24_c031_a2_FIB,0.092461,0.008161,0.007221,0.003802,102.983600,0.018225,0.001022,0.018236,9.885,...,0.997840,0.997830,0.997850,,,,,,,2025-01-31
62,KD24_c031_a2_FIC,0.101569,0.004780,0.002892,0.003806,103.006586,0.007298,0.001030,0.007241,9.889,...,0.997839,0.997829,0.997849,,,,,,,2025-01-31
63,KD24_c032_a1_FIA,0.099343,0.004534,0.002467,0.003805,103.000970,0.006226,0.001041,0.006152,9.876,...,0.997837,0.997827,0.997848,,,,,,,2025-01-31


In [6]:
# Helper workbook with per-analysis bookkeeping columns to attach to each analysis
rep_FI_names = pd.read_excel('Helper files/'+'Helperfile_KD24.xlsx', sheet_name='Sheet1')

# Columns that must agree between the Raman table and the helper file
merge_keys = ['filename', 'date_object', 'sec since midnight']

# Inner join (the pandas default): only analyses present in both tables are kept
KD24_reps = pd.merge(KD24, rep_FI_names, on=merge_keys)

# An inner join drops unmatched analyses without any message, so check the keys
# separately and report every analysis that has no row in the helper file.
match_check = KD24[merge_keys].merge(
    rep_FI_names[merge_keys].drop_duplicates(),
    on=merge_keys,
    how='left',
    indicator=True,
)
unmatched_files = match_check.loc[match_check['_merge'] == 'left_only', 'filename']
if not unmatched_files.empty:
    print(
        f"WARNING: {len(unmatched_files)} analyses have no match in the helper file "
        f"and were dropped by the merge: {sorted(unmatched_files.astype(str).unique())}"
    )

KD24_reps

Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,FI#,Sample_crystal,Sample_crystal_region,FI_name,Name_on_SEM,Notes regarding SEM matching,EBSD_simple_name,EBSD_grainID,EBSDname+grainID,EBSD_comment
0,FG04-A1-4-end_r5,0.692163,0.004278,0.003548,0.002391,104.330580,0.006958,0.000553,0.006951,6.023,...,4,FG04_A1,FG04_A1,FG04-A1-4-end-45674,,,,,,
1,FG04-A1-4-end_r6,0.690476,0.002953,0.001734,0.002391,104.327270,0.003403,0.000555,0.003365,6.014,...,4,FG04_A1,FG04_A1,FG04-A1-4-end-45674,,,,,,
2,FG04-A1-4-end_r7,0.693164,0.003053,0.001897,0.002391,104.332543,0.003719,0.000556,0.003686,6.011,...,4,FG04_A1,FG04_A1,FG04-A1-4-end-45674,,,,,,
3,FG04-A1-4-start_r1,0.686361,0.004383,0.003674,0.002390,104.319185,0.007227,0.000536,0.007223,6.005,...,4,FG04_A1,FG04_A1,FG04-A1-4-start-45674,,,,,,
4,FG04-A1-4-start_r2,0.686856,0.003134,0.002027,0.002390,104.320159,0.003986,0.000535,0.003958,5.969,...,4,FG04_A1,FG04_A1,FG04-A1-4-start-45674,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,KD24_c031_a1_FIA,0.114821,0.006327,0.005044,0.003820,103.040033,0.012729,0.001012,0.012716,9.875,...,FIA,KD24_c031,KD24_c031_a1,KD24_c031_a1_FIA,KD24_c031_a1,,,,,
61,KD24_c031_a2_FIB,0.092461,0.008161,0.007221,0.003802,102.983600,0.018225,0.001022,0.018236,9.885,...,FIB,KD24_c031,KD24_c031_a2,KD24_c031_a2_FIB,KD24_c031_a2,,,,,
62,KD24_c031_a2_FIC,0.101569,0.004780,0.002892,0.003806,103.006586,0.007298,0.001030,0.007241,9.889,...,FIC,KD24_c031,KD24_c031_a2,KD24_c031_a2_FIC,KD24_c031_a2,,,,,
63,KD24_c032_a1_FIA,0.099343,0.004534,0.002467,0.003805,103.000970,0.006226,0.001041,0.006152,9.876,...,FIA,KD24_c032,KD24_c032_a1,KD24_c032_a1_FIA,KD24_c032_a1,,,,,


## Now let's take mean of repeated analyses

In [7]:
# Split the replicate table by dtype: numeric columns are averaged,
# all other columns keep the first value found in each group.
numeric_cols = KD24_reps.select_dtypes(include='number')
non_numeric_cols = KD24_reps.select_dtypes(exclude='number')

# One group per FI_name
grouped = KD24_reps.groupby('FI_name')

# Per-group statistics
numeric_KD24_averaged_mean = grouped[numeric_cols.columns].mean()
numeric_KD24_averaged_std = grouped[numeric_cols.columns].std()
non_numeric_KD24_averaged = grouped[non_numeric_cols.columns].first()

# Means and first values side by side, put back in the original column order;
# the standard deviations are then appended with a '_Raman_STD' suffix and the
# FI_name index is discarded.
KD24_averaged = pd.concat(
    [
        pd.concat([numeric_KD24_averaged_mean, non_numeric_KD24_averaged], axis=1)
        .reindex(columns=KD24_reps.columns),
        numeric_KD24_averaged_std.add_suffix('_Raman_STD'),
    ],
    axis=1,
).reset_index(drop=True)

# Number of rows per group; groups with more than one row are flagged as averaged
row_counts = grouped.size().reset_index(name='row_count')
row_counts['averaged?'] = ['Yes' if n > 1 else 'No' for n in row_counts['row_count']]

# Attach the flag to the averaged table; anything left unflagged counts as 'No'
KD24_averaged = KD24_averaged.merge(row_counts[['FI_name', 'averaged?']], on='FI_name', how='left')
KD24_averaged['averaged?'] = KD24_averaged['averaged?'].fillna('No')

#exclude a weird null average spectrum
# KD24_averaged=KD24_averaged[~(KD24_averaged['filename'].str.contains("186-9-118-A"))]
KD24_averaged.to_excel(compilation_folder+'/'+"KD24_Raman_averaged.xlsx")

KD24_averaged


Unnamed: 0,filename,Density g/cm3,σ Density g/cm3,σ Density g/cm3 (from Ne+peakfit),σ Density g/cm3 (from densimeter),Corrected_Splitting,Corrected_Splitting_σ,Corrected_Splitting_σ_Ne,Corrected_Splitting_σ_peak_fit,power (mW),...,Peak_Area_Carb_Raman_STD,Peak_Height_Carb_Raman_STD,Carb_Diad_Ratio_Raman_STD,Mount_Raman_STD,Notes regarding SEM matching_Raman_STD,EBSD_simple_name_Raman_STD,EBSD_grainID_Raman_STD,EBSDname+grainID_Raman_STD,EBSD_comment_Raman_STD,averaged?
0,FG04-A1-4-end_r5,0.691934,0.003428,0.002393,0.002391,104.330131,0.004693,0.000555,0.004667,6.016,...,,,,,,,,,,Yes
1,FG04-A1-4-end-rep3,0.688868,0.004143,0.003362,0.00239,104.324111,0.006603,0.001066,0.006526,6.06575,...,,,,,,,,,,Yes
2,FG04-A1-4-start_r1,0.688678,0.004276,0.003481,0.00239,104.323735,0.006836,0.000534,0.006827,6.001667,...,,,,,,,,,,Yes
3,KD24_c001_a1_FIA_r1,0.086661,0.007504,0.006332,0.003802,102.968963,0.015979,0.000515,0.016004,9.912,...,,,,,,,,,,Yes
4,KD24_c001_a1_FIB,0.062668,0.006605,0.005391,0.003817,102.908411,0.013605,0.000513,0.013625,9.855,...,,,,,,,,,,No
5,KD24_c001_a1_FIC,0.101646,0.009129,0.008297,0.003806,103.006781,0.02094,0.000509,0.02098,9.836,...,,,,,,,,,,No
6,KD24_c002_a1_FIA,0.095798,0.004789,0.002912,0.003803,102.992023,0.007348,0.000507,0.007347,9.848,...,,,,,,,,,,No
7,KD24_c003_a1_FIA,0.10436,0.004583,0.002549,0.003808,103.013631,0.006434,0.000506,0.006428,9.86,...,,,,,,,,,,No
8,KD24_c003_a1_FIB,0.095473,0.004505,0.002416,0.003803,102.991202,0.006096,0.000505,0.006089,9.859,...,,,,,,,,,,No
9,KD24_c003_a1_FIC,0.107992,0.005762,0.004321,0.003812,103.022797,0.010906,0.000505,0.010918,9.858,...,,,,,,,,,,No
