# Grain size distribution using dataframe

This notebook is my first use of Pandas in Python.
Pandas provides a handy data structure called a 'DataFrame'.

- Here I calculate the height-height correlation function (HHcorr) for every density, every temperature and every file. The additional parameters I obtain are the correlation length, the RMS, and alpha.
- Collecting the data in a dataframe will help me later, when I calculate the average HH-corr function for every temperature and Ru density.
- The dataframe also allows me to view my data in a single table, and to choose which files to keep or discard by referring to their indices.
- Here I use dataframes, dictionaries, `pickle.dump` and, finally, a profiling tool (`pandas_profiling`).

In [24]:
import os
import sys
import json
import cv2

import numpy as np
import pandas as pd
import pickle

%matplotlib nbagg
%reload_ext autoreload
%autoreload 2
import matplotlib
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

from matplotlib import cm
from skimage import data, img_as_float

from scipy.optimize import curve_fit
from scipy.ndimage import gaussian_filter

%reload_ext autoreload
%autoreload 2
%aimport AFM_lib

In [25]:
# Flag kept for interactive debugging; none of the visible cells reads it.
debug = True

# Folder holding the exported AFM scans (one .txt file per scan).
path = "/Users/sfiligoj/Desktop/Thesis/Data Analysis/AFM/SSP"
# path = "/Users/sfiligoj/Google Drive/Crispies/Ru/Data analysis with Python/Pandas and Matplotlib - HHcorr, Basics and Images/DSP_all"

#old data is in path = "/Users/sfiligoj/Google Drive/Crispies/Ru/Data analysis with Python/Method1/SSP_all"

# Work from inside the data folder: the scan files, 'ScanSizes.json' and the
# output folder are all addressed with relative paths from here on.
os.chdir(path)

# Keep only the .txt exports whose name does not contain a '*' marker.
filess = [f for f in os.listdir() if f.endswith('txt') and '*' not in f]

# Output folder for the results. exist_ok makes re-running the cell safe and
# avoids the separate "does it exist?" check.
new_folder_name = "ALL"
os.makedirs(new_folder_name, exist_ok=True)
print(f"Found {len(filess)} files")

Found 74 files


In [26]:
# Load the scan-size table from its JSON file in the current working folder.
with open('ScanSizes.json', 'r') as scan_size_file:
    ScanSize_dict = json.load(scan_size_file)

In [27]:
# Build the dictionary that holds all the image data, keyed by file name:
#   data[file]['orig'] : height map as read from disk (resampled to 512x512
#                        when the scan is 256 or 1024 px wide)
#   data[file]['img']  : 'orig' after AFM_lib.remove_outliers
data = {}
z_score = 3.5   # threshold handed to AFM_lib.remove_outliers (was 3)
for file in filess:
    print(f'Running file : "{file}"')

    # The first line of every export is skipped as a header.
    heights = np.genfromtxt(file, skip_header=1)

    # Scans are treated as square. The side length is taken from the number of
    # values actually read, not from the size of the file on disk, so it does
    # not depend on how the numbers are formatted or how long the header is.
    Pix_width = int(round(np.sqrt(heights.size)))
    Pix_height = Pix_width
    if Pix_width * Pix_height != heights.size:
        raise ValueError(
            f'"{file}" holds {heights.size} values, which is not a square image')
    image = heights.reshape((Pix_width, Pix_height))

    # Bring 256 px and 1024 px scans onto the common 512x512 grid.
    # NOTE(review): OpenCV's documentation suggests INTER_AREA when shrinking
    # (the 1024 px case); INTER_LINEAR is kept so existing results do not change.
    if Pix_width in (256, 1024):
        print(f"Resampling file: {file}")
        image = cv2.resize(image, dsize=(512, 512), interpolation=cv2.INTER_LINEAR)

    data[file] = {
        'orig': image,
        'img': AFM_lib.remove_outliers(image, threshold=z_score),
    }

Running file : "Ru_H_asdep_FLAT.001.txt"
Resampling file: Ru_H_asdep_FLAT.001.txt
Running file : "Ru_H_700C_FLAT.031.txt"
Running file : "Ru_L_250C_500nm_FLAT.000.txt"
Running file : "Ru_H_500C_FLAT.004.txt"
Resampling file: Ru_H_500C_FLAT.004.txt
Running file : "Ru_LH_600C_FLAT.017.txt"
Running file : "Ru_H_asdep_FLAT.003.txt"
Running file : "Ru_LH_600C_FLAT.016.txt"
Running file : "Ru_L_500C_FLAT.002.txt"
Running file : "Ru_H_asdep_1um_FLAT_106.txt"
Running file : "Ru_H_500C_FLAT.003.txt"
Resampling file: Ru_H_500C_FLAT.003.txt
Running file : "Ru_LH_asdep_FLAT.023.txt"
Running file : "Ru_H_700C_FLAT.022.txt"
Running file : "Ru_LH_asdep_FLAT.022.txt"
Running file : "Ru_H_800C_FLAT.008.txt"
Running file : "Ru_LH_600C_FLAT.013.txt"
Running file : "Ru_LH_asdep_FLAT_002.txt"
Running file : "Ru_H_500C_FLAT.002.txt"
Running file : "Ru_L_800C_FLAT.033.txt"
Running file : "Ru_LH_asdep_FLAT.020.txt"
Running file : "Ru_L_asdep_500nm_FLAT.005.txt"
Running file : "Ru_H_asdep_FLAT.005.txt"
Running

In [28]:
# https://thispointer.com/pandas-how-to-create-an-empty-dataframe-and-append-rows-columns-to-it-in-python/

In [29]:
# Turn every NumPy floating-point warning (overflow, divide-by-zero, invalid,
# underflow) into an exception, so a numerically broken fit fails loudly.
# The setting stays active after this cell.
np.seterr(all='raise')

# Substrings that mark a file to be left out of the table.
_SKIP_MARKERS = ('*', 'nm', '10um', '5um')
# (tag in the file name, density label), tested in this order.
_DENSITY_TAGS = (('_H_', 'H'), ('_LH_', 'LH'), ('_L_', 'L'))
# Column layout of the result table: one row per analysed scan.
_DF_COLUMNS = ['filename', 'density', 'temp', 'xi', 'hhcorr', 'scansize',
               'n_pix', 'rms', 'rms_sq', 'alpha']


def _parse_sample(filename):
    """Return ``(density, temperature)`` read from a scan file name.

    Returns ``None`` when the name carries none of the known density tags
    ('_S_', '_H_', '_LH_', '_L_'), so the caller can skip the file instead of
    reusing the values parsed for the previous one.
    """
    if '_S_' in filename:
        return 'LH', '20'
    for tag, density in _DENSITY_TAGS:
        if tag in filename:
            if 'asdep' in filename:
                return density, '25'
            # The three characters right after the tag hold the temperature,
            # e.g. 'Ru_H_700C_FLAT.031.txt' -> '700'.
            start = filename.index(tag) + len(tag)
            return density, filename[start:start + 3]
    return None


def Model(x, a, alpha):
    """Power law ``a * x**(2*alpha)`` fitted to the first points of HHcorr."""
    return a * x**(2 * alpha)


# One dict per analysed file; the DataFrame is built once after the loop
# (DataFrame.append is gone from pandas 2.x and copied the frame on every call).
rows = []

for file in filess:

    # Parses information from filename
    if any(marker in file for marker in _SKIP_MARKERS):
        continue
    sample = _parse_sample(file)
    if sample is None:
        print(f'skipping file (no density tag in name): {file}')
        continue
    density, temperature = sample

    print(f'running file: {file}')

    # ***    Calculate theoretical asymptotic value of the Height difference correlation: 2w^2   ***

    ScanSize = ScanSize_dict[file]              # nm
    N_Pixel = data[file]['img'].shape[1]
    StepSize = ScanSize/N_Pixel                 # nm/px
    Xdelta = np.linspace(StepSize, ScanSize, num=N_Pixel)
    RMS_sq = np.mean((data[file]['img'])**2)    #   RMS^2 = w^2 - also called Interface Width
    RMS = np.sqrt(RMS_sq)

    # ***    Calculate Height-Height correlation function:   ***

    HHcorr = AFM_lib.calc_HHcorr(data[file]['img'])
    # NOTE(review): autocorr is not used anywhere in this cell; the call is kept
    # in case a later cell or the helper itself is relied upon.
    autocorr = AFM_lib.calc_autocorr(data[file]['img'])

    # ***    Fit the data:   ***

    # Cut-off: first index at which HHcorr reaches the horizontal asymptote
    # 2w^2. A crossing later than index 20 is replaced by 15; the same default
    # is used when HHcorr never reaches the asymptote (previously an IndexError).
    # Assumes HHcorr is 1-D; the original int() conversion only worked for that case.
    crossings = np.flatnonzero(HHcorr >= 2*RMS_sq)
    if crossings.size == 0 or crossings[0] > 20:
        CutoffPointPLaw = 15
    else:
        CutoffPointPLaw = int(crossings[0])

    a0 = 0                                      # first index of the fit window
    fit_window = slice(a0, CutoffPointPLaw + a0)
    y_err = np.arange(a0, CutoffPointPLaw + a0)**2
    # Passed to curve_fit as sigma: it grows as exp(k**2), so points further
    # from the origin count progressively less in the fit.
    weights = np.exp(y_err)

    # Fit data using Model:
    popt, pcov = curve_fit(Model, Xdelta[fit_window], HHcorr[fit_window],
                           sigma=weights)

    # Correlation Length
    # *popt= a,alpha
    a = popt[0]                     # first fitted parameter
    alpha = popt[1]                 # second fitted parameter

    # Equating a*x**(2*alpha) = 2*RMS_sq you extract x, the correlation length
    CorrLength = (1/a *(2*RMS_sq))**(1/(2*alpha))

    # collect the row for this file:
    rows.append(
        {'filename': file[-7:-4],
         'density': density,
         'temp': temperature,
         'xi': CorrLength,
         'hhcorr': HHcorr,
         'scansize': ScanSize,
         'n_pix': N_Pixel,
         'rms': RMS,
         'rms_sq': RMS_sq,
         'alpha': alpha,
        })

# Build the DataFrame in one go; `columns` fixes the column order and still
# yields an empty, correctly labelled table when no file was analysed.
df = pd.DataFrame(rows, columns=_DF_COLUMNS)

running file: Ru_H_asdep_FLAT.001.txt
running file: Ru_H_700C_FLAT.031.txt
running file: Ru_H_500C_FLAT.004.txt
running file: Ru_LH_600C_FLAT.017.txt
running file: Ru_H_asdep_FLAT.003.txt
running file: Ru_LH_600C_FLAT.016.txt
running file: Ru_L_500C_FLAT.002.txt
running file: Ru_H_asdep_1um_FLAT_106.txt
running file: Ru_H_500C_FLAT.003.txt
running file: Ru_LH_asdep_FLAT.023.txt
running file: Ru_H_700C_FLAT.022.txt
running file: Ru_LH_asdep_FLAT.022.txt
running file: Ru_H_800C_FLAT.008.txt
running file: Ru_LH_600C_FLAT.013.txt
running file: Ru_LH_asdep_FLAT_002.txt
running file: Ru_H_500C_FLAT.002.txt
running file: Ru_L_800C_FLAT.033.txt
running file: Ru_LH_asdep_FLAT.020.txt
running file: Ru_H_asdep_FLAT.005.txt
running file: Ru_LH_asdep_FLAT.009.txt
running file: Ru_LH_800C_FLAT.004.txt
running file: Ru_H_asdep_FLAT.117.txt
running file: Ru_L_250C_FLAT.001.txt
running file: Ru_LH_700C_FLAT.004.txt
running file: Ru_H_600C_FLAT.021.txt
running file: Ru_L_600C_FLAT.008.txt
running file: 

In [30]:
# Save the results DataFrame into a pickle file; the z-score threshold is part
# of the file name. The with-block guarantees the file is flushed and closed.
with open(f"{new_folder_name}/save_z{z_score}.p", "wb") as pickle_file:
    pickle.dump(df, pickle_file)

In [None]:
# Build an HTML profiling report of the results table.
import pandas_profiling  # imported for df.profile_report, used below
from pathlib import Path

# NOTE(review): the other outputs of this notebook go to new_folder_name
# ("ALL"), this one goes to "NEW" - confirm that this is intended.
report_path = Path("NEW/AFMdata.html")
# Create the target folder first so that writing the report cannot fail on a
# missing directory.
report_path.parent.mkdir(parents=True, exist_ok=True)

profile = df.profile_report(title="AFMdata")
profile.to_file(output_file=report_path)