In [109]:
#%% import libraries

import glob, os, numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import time
import shutil

In [110]:
#%% define functions

def find_times(filename='2018-05-04pedotpss', pressure_col_name='RH stpnt'):
    '''Find the rows at which the pressure setpoint is about to change.

    The file is read with ``pd.read_table`` and must contain a 'date/time'
    column plus the column named by ``pressure_col_name``. A row is kept
    when its pressure differs from the pressure in the following row, i.e.
    it is the last row recorded at that pressure level. The final row of
    the file has no successor and is therefore never kept.

    Parameters
    ----------
    filename : path of the log file (passed through ``str``).
    pressure_col_name : name of the pressure column (passed through ``str``).

    Returns
    -------
    pandas.DataFrame with columns 'time' and 'pressure', one row per
    pressure change, in file order. 'time' holds the 'date/time' values
    exactly as read from the file.
    '''
    data = pd.read_table(str(filename))
    pressure_col_name = str(pressure_col_name)
    p_raw = np.array(data[pressure_col_name])

    # Compare each row with its successor in a single vectorized pass
    # instead of looping in Python and growing an array with np.append.
    change_idx = np.flatnonzero(p_raw[:-1] != p_raw[1:])

    time_table = pd.DataFrame(
        {'time': np.array(data['date/time'])[change_idx],
         'pressure': p_raw[change_idx]},
        columns=['time', 'pressure'])
    return time_table

Pressure is controlled using mass flow controllers or a humidity generator. The pressures (in this case relative humidity (RH)) and the timestamps associated with each RH level are stored in the file '2018-05-04pedotpss'. We use the function 'find_times' (written above) to find the last timestamp recorded at each RH level, i.e. the row just before the RH setpoint changes. The times are formatted as 'year-month-day hour:minute:second', as shown below. Here 'pressure' corresponds to relative humidity measured in %.

In [111]:
#%% locate the timestamp at which each pressure (RH) step ends

time_table = find_times(filename='2018-05-04pedotpss',
                        pressure_col_name='RH stpnt')
time_table

Unnamed: 0,time,pressure
0,2018-05-04 18:53:32.74,2.0
1,2018-05-04 20:23:37.66,5.0
2,2018-05-04 21:53:42.58,10.0
3,2018-05-04 23:23:47.51,15.0
4,2018-05-05 00:53:52.46,20.0
5,2018-05-05 02:23:57.45,25.0
6,2018-05-05 03:53:57.55,30.0
7,2018-05-05 05:24:02.33,35.0
8,2018-05-05 06:54:02.38,40.0
9,2018-05-05 08:24:07.35,45.0


In the above table, the entries in the 'time' column are just strings; they are not actual timestamp objects. We convert them into timestamps using the datetime library (note that the last field of each value is displayed in microseconds):

In [112]:
# parse the time strings of the pressure table into datetime objects

# one datetime per pressure step, in the same order as time_table
p_time = list(map(
    lambda stamp: datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S.%f'),
    time_table['time']))
print(np.array(p_time))

[datetime.datetime(2018, 5, 4, 18, 53, 32, 740000)
 datetime.datetime(2018, 5, 4, 20, 23, 37, 660000)
 datetime.datetime(2018, 5, 4, 21, 53, 42, 580000)
 datetime.datetime(2018, 5, 4, 23, 23, 47, 510000)
 datetime.datetime(2018, 5, 5, 0, 53, 52, 460000)
 datetime.datetime(2018, 5, 5, 2, 23, 57, 450000)
 datetime.datetime(2018, 5, 5, 3, 53, 57, 550000)
 datetime.datetime(2018, 5, 5, 5, 24, 2, 330000)
 datetime.datetime(2018, 5, 5, 6, 54, 2, 380000)
 datetime.datetime(2018, 5, 5, 8, 24, 7, 350000)
 datetime.datetime(2018, 5, 5, 9, 54, 12, 180000)
 datetime.datetime(2018, 5, 5, 11, 24, 12, 200000)
 datetime.datetime(2018, 5, 5, 12, 54, 17, 130000)
 datetime.datetime(2018, 5, 5, 14, 24, 22, 50000)
 datetime.datetime(2018, 5, 5, 15, 54, 26, 970000)
 datetime.datetime(2018, 5, 5, 17, 24, 31, 940000)
 datetime.datetime(2018, 5, 5, 18, 54, 36, 890000)
 datetime.datetime(2018, 5, 5, 20, 24, 41, 800000)
 datetime.datetime(2018, 5, 5, 21, 54, 46, 760000)
 datetime.datetime(2018, 5, 5, 23, 24, 51,

While pressure (or RH) is changing, we measure QCM impedance spectra. The QCM data files we measured are inside the folder called '2018-05-04pedotpss_long'. The files are all in csv format and labeled numerically:

In [113]:
#%% collect every impedance file in the designated folder, oldest first
datafoldername = 'C:\\Users\\a6q\\2018-05-04pedotpss_long'

# ordering by modification time makes list order follow file age
datafolder = sorted(glob.glob(datafoldername + '/*'), key=os.path.getmtime)

print('found {} spectra'.format(len(datafolder)))

# preview the first and last five paths
print(np.array(datafolder[:5]))
print('...')
print(np.array(datafolder[-5:]))


found 790 spectra
['C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_1.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_2.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_3.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_4.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_5.csv']
...
['C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_790.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_791.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_792.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_793.csv'
 'C:\\Users\\a6q\\2018-05-04pedotpss_long\\SARK_201805041701_794.csv']


If we look inside a file, we see that it contains a column for frequency, series resistance (Rs), and series reactance (Xs):

In [114]:
# row slice applied to each data file: start index, stop index, step
index1, index2, skip_n = 3, -1, 1

# load the first file (skipping its first line), then keep the sliced rows
data_raw = pd.read_csv(datafolder[0], skiprows=1)
data_raw = data_raw.iloc[index1:index2:skip_n, :]

data_raw

Unnamed: 0,Freq(MHz),Rs,Xs
3,0.805003,0.0,-8450.129883
4,0.805004,0.0,-8437.253906
5,0.805005,0.0,-8451.085938
6,0.805006,0.0,-8462.063477
7,0.805007,0.0,-8492.322266
8,0.805008,0.0,-8435.054688
9,0.805009,0.0,-8425.537109
10,0.805010,0.0,-8421.113281
11,0.805011,0.0,-8535.351563
12,0.805012,0.0,-8483.681641


Each file is measured at a specific frequency band. In this experiment, we measured at 4 different frequency bands: 0.8 MHz, 4.9 MHz, 14.9 MHz, and 24.9 MHz. Since each data file contains one frequency band, the sequence of 4 frequency bands repeats every 4 files. We can see this by printing the first frequency value in the first 16 files:

In [115]:
# the row slice is identical for every file, so define it once up front
index1, index2, skip_n = 3, -1, 1

# print the first kept frequency of each of the first 16 files
for i in range(min(16, len(datafolder))):
    data_raw = pd.read_csv(datafolder[i], skiprows=1)
    data_raw = data_raw.iloc[index1:index2:skip_n, :]

    print(data_raw['Freq(MHz)'].iloc[0])

0.805003
4.950015
14.89003
24.840036
0.805003
4.950015
14.89003
24.840036
0.805003
4.950015
14.89003
24.840036
0.805003
4.950015
14.89003
24.840036


Each QCM spectrum was measured at a specific frequency band and at a specific pressure/RH. However, none of the QCM files is labeled by frequency band or by pressure/RH level. What we want is to organize the data files by frequency band and by pressure. That means we must determine the frequency band of each file by looking at its frequency values, and determine the pressure/RH of each file by looking at the time it was created and comparing that to our 'time_table' list of timestamps and pressures.

By copying files and renaming them, we can make a list of files with names like '80_14.csv', where the first number corresponds to the pressure level, and the 2nd number to the frequency band. There will be multiple files at each pressure level, but we only want to keep the last set of files at each pressure. So if we have measured n frequency bands (in this example n = 4), we will have a set of n files corresponding to each pressure.

That means the total number of labeled files we should have will be n * p, where n is the number of frequency bands we measured, and p is the number of distinct pressures. The first file could be called '02_08.csv' (for 2% humidity at 0.8 MHz) and the last file could be called '96_24.csv' (for 96% humidity at 24 MHz).

We can make a new directory for the new labeled files to be copied to:

In [116]:
#make new directory for files separated by frequency range

folder_sep = datafoldername +'_labeled_by_freq_and_RH'

# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the folder and creating it; it still raises if the path
# exists but is not a directory.
os.makedirs(folder_sep, exist_ok=True)

Get timestamp for each data file so that we can compare it with the timestamp at each pressure level. Here we show the first 10 data file timestamps:

In [117]:
#%% time corresponding to data files

# Build each datetime straight from the file's modification time instead of
# formatting it with time.ctime and re-parsing the text: the '%a'/'%b'
# directives of strptime depend on the current locale, so the round trip
# can fail outside an English locale. int() drops the sub-second part so
# the values keep whole-second resolution, as time.ctime produced.
d_time = [datetime.datetime.fromtimestamp(int(os.path.getmtime(file)))
          for file in datafolder]

print(np.array(d_time[:10]))

[datetime.datetime(2018, 5, 4, 17, 5, 26)
 datetime.datetime(2018, 5, 4, 17, 9, 26)
 datetime.datetime(2018, 5, 4, 17, 13, 28)
 datetime.datetime(2018, 5, 4, 17, 17, 28)
 datetime.datetime(2018, 5, 4, 17, 21, 30)
 datetime.datetime(2018, 5, 4, 17, 25, 30)
 datetime.datetime(2018, 5, 4, 17, 29, 32)
 datetime.datetime(2018, 5, 4, 17, 33, 32)
 datetime.datetime(2018, 5, 4, 17, 37, 34)
 datetime.datetime(2018, 5, 4, 17, 41, 34)]


Finally, compare timestamps for each data file to each pressure step and copy the last n files before each pressure step changes. An example of how to copy the data file and change its name is:

In [118]:
# os.path.join supplies the platform's path separator instead of a
# hard-coded backslash; copy2 returns the destination path.
shutil.copy2(datafolder[0], os.path.join(folder_sep, 'sub_' + format(0) + '.csv'))

'C:\\Users\\a6q\\2018-05-04pedotpss_long_labeled_by_freq_and_RH\\sub_0.csv'

Check that the file was copied and experiment with the filename. Make sure to change the file names to show frequency band and pressure level. I would attempt to do it something like this:

In [124]:
#%% copy data files and rename them based on their frequency bands

#indices to start, stop, and every nth points to skip per data file:
index1, index2, skip_n = 3, -1, 1

# (lower bound, upper bound, filename prefix); bounds are exclusive and in
# the units of the 'Freq(MHz)' column. Each file is matched against proper
# ranges so it is copied under exactly one prefix. The earlier conditions
# ('< 4 and < 5', '< 14 and < 15', '< 24 and < 25') were upper bounds only,
# so low-frequency files were copied under several prefixes and files
# starting above 24 were never copied.
freq_bands = [
    (-np.inf, 4, 'sub_'),
    (4, 5, 'first_'),
    (14, 15, 'third_'),
    (24, 25, 'fifth_'),
]

#loop over the first 30 files in the folder
for i in range(len(datafolder))[:30]:

    print('spectrum '+format(i+1)+' / '+format(len(datafolder)))

    #read data file
    data_raw = pd.read_csv(datafolder[i], skiprows=1).iloc[index1:index2:skip_n,:]
    freq = np.array(data_raw['Freq(MHz)'])

    #check the frequency band and copy and rename files based on frequency;
    #files whose first frequency falls in no band are not copied
    for low, high, prefix in freq_bands:
        if low < freq[0] < high:
            shutil.copy2(datafolder[i],
                         os.path.join(folder_sep, prefix + format(i) + '.csv'))
            if prefix == 'sub_':
                print("last modified: %s" % time.ctime(os.path.getmtime(datafolder[i])))
            break


spectrum 1 / 790
last modified: Fri May  4 17:05:26 2018
spectrum 2 / 790
spectrum 3 / 790
spectrum 4 / 790
spectrum 5 / 790
last modified: Fri May  4 17:21:30 2018
spectrum 6 / 790
spectrum 7 / 790
spectrum 8 / 790
spectrum 9 / 790
last modified: Fri May  4 17:37:34 2018
spectrum 10 / 790
spectrum 11 / 790
spectrum 12 / 790
spectrum 13 / 790
last modified: Fri May  4 17:53:38 2018
spectrum 14 / 790
spectrum 15 / 790
spectrum 16 / 790
spectrum 17 / 790
last modified: Fri May  4 18:09:40 2018
spectrum 18 / 790
spectrum 19 / 790
spectrum 20 / 790
spectrum 21 / 790
last modified: Fri May  4 18:25:44 2018
spectrum 22 / 790
spectrum 23 / 790
spectrum 24 / 790
spectrum 25 / 790
last modified: Fri May  4 18:41:48 2018
spectrum 26 / 790
spectrum 27 / 790
spectrum 28 / 790
spectrum 29 / 790
last modified: Fri May  4 18:57:52 2018
spectrum 30 / 790



Notes and old code:

In [125]:
# number of frequency bands measured (not referenced by any other visible cell)
band_num = 4

In [123]:



#%% separate files by measured frequency range

# create dataframes for each frequency range
#df_sub = pd.DataFrame(folder[0]columns='freq')


'''
for i in range(len(folder)): #loop over each file in folder
    
    #print('spectrum '+format(i+1)+' / '+format(len(folder)))
        
    data_raw = pd.read_csv(folder[i], skiprows=1).iloc[index1:index2:skip_n,:]
    
    f = np.array(data_raw['Freq(MHz)'])
    
    print(f.min()) 
'''



#%% find size of each data file

'''

data_example_full = pd.read_csv(folder[0], skiprows=1)



data_example = data_example_full.iloc[index1:index2:skip_n,:]

freq = np.array(data_example['Freq(MHz)']) #frequencies in MHz

'''

#%% get pressures from filenames

#p_list = np.array([file.split('\\')[-1].split('.')[0] for file in folder]).astype(float)
#np.array([os.path.basename(file).split('.')[0] for file in folder]).astype(folat)

#%% organize data
'''
starttime = time.time()
var_list = np.empty((len(freq), len(folder)+1))
var_list[:,0] = freq
var_max_list = np.array([])
res_freq_list = np.array([])


for i in range(len(folder)): #loop over each file in folder
    print('spectrum '+format(i+1)+' / '+format(len(folder)))
    
    
    data_raw = pd.read_csv(folder[i], skiprows=1).iloc[index1:index2:skip_n,:]
    var_spectrum = np.array(data_raw['Rs'])
    
    #remove minimum outlier points
    min_index = np.argmin(var_spectrum)
    var_spectrum[min_index] = var_spectrum[min_index+1]
    min_index = np.argmin(var_spectrum)
    var_spectrum[min_index] = var_spectrum[min_index+1]
       
    
    #normalize
    var_spectrum = var_spectrum - np.min(var_spectrum)
    #var_spectrum = var_spectrum  / np.max(var_spectrum)
    
    
    #add to table
    var_list[:,i+1] = var_spectrum
    
    var_max = np.max(var_spectrum)
    var_max_list = np.append(var_max_list, var_max)
    res_freq_list = np.append(res_freq_list, freq[np.argmax(var_spectrum)])
    
    rh0 = p_list[i]
    
    plt.plot(freq, var_spectrum, c='k')
    label_axes('$\Delta$F (Hz)', 'Signal')
    plt.title(format(rh0)+'% RH', fontsize=18)
    plt.show()


endtime = time.time()
tottime = (endtime-starttime)/60
print('elapsed time = %.2f minutes' %tottime)


'''






"\nstarttime = time.time()\nvar_list = np.empty((len(freq), len(folder)+1))\nvar_list[:,0] = freq\nvar_max_list = np.array([])\nres_freq_list = np.array([])\n\n\nfor i in range(len(folder)): #loop over each file in folder\n    print('spectrum '+format(i+1)+' / '+format(len(folder)))\n    \n    \n    data_raw = pd.read_csv(folder[i], skiprows=1).iloc[index1:index2:skip_n,:]\n    var_spectrum = np.array(data_raw['Rs'])\n    \n    #remove minimum outlier points\n    min_index = np.argmin(var_spectrum)\n    var_spectrum[min_index] = var_spectrum[min_index+1]\n    min_index = np.argmin(var_spectrum)\n    var_spectrum[min_index] = var_spectrum[min_index+1]\n       \n    \n    #normalize\n    var_spectrum = var_spectrum - np.min(var_spectrum)\n    #var_spectrum = var_spectrum  / np.max(var_spectrum)\n    \n    \n    #add to table\n    var_list[:,i+1] = var_spectrum\n    \n    var_max = np.max(var_spectrum)\n    var_max_list = np.append(var_max_list, var_max)\n    res_freq_list = np.append(res