In [2]:
import pandas as pd
from CifFile import ReadCif
import os

def cif_to_dataframe(file_path):
    """Convert the first data block of a CIF file into a per-atom DataFrame.

    The file is parsed with ``ReadCif`` and only its first data block is
    used. Each atom site becomes one row; the molecule name and the six
    unit-cell parameters are repeated on every row.

    Args:
        file_path: Path of the CIF file, passed straight to ``ReadCif``.

    Returns:
        A DataFrame with the columns ``Molecule_Name``, ``Cell_Length_A``,
        ``Cell_Length_B``, ``Cell_Length_C``, ``Cell_Angle_Alpha``,
        ``Cell_Angle_Beta``, ``Cell_Angle_Gamma``, ``Label``,
        ``Fractional_X``, ``Fractional_Y`` and ``Fractional_Z`` (in that
        order), or an empty DataFrame when the file has no data block or
        lacks any of the required ``_atom_site_*`` items. Values are stored
        exactly as the parser returned them; no numeric conversion is done.
    """
    # Column name -> CIF tag, listed in the order the columns must appear.
    cell_tags = {
        'Cell_Length_A': '_cell_length_a',
        'Cell_Length_B': '_cell_length_b',
        'Cell_Length_C': '_cell_length_c',
        'Cell_Angle_Alpha': '_cell_angle_alpha',
        'Cell_Angle_Beta': '_cell_angle_beta',
        'Cell_Angle_Gamma': '_cell_angle_gamma',
    }
    site_tags = {
        'Label': '_atom_site_label',
        'Fractional_X': '_atom_site_fract_x',
        'Fractional_Y': '_atom_site_fract_y',
        'Fractional_Z': '_atom_site_fract_z',
    }

    # Load the CIF file
    cif = ReadCif(file_path)

    # A file without any data block would otherwise fail with an opaque
    # IndexError on the [0] lookup below.
    block_names = list(cif.keys())
    if not block_names:
        print("No data blocks found in the CIF file.")
        return pd.DataFrame()

    # Access the first (and likely only) block
    data_block = cif[block_names[0]]

    # Extract unit cell parameters, falling back to 'N/A' when a tag is absent
    unit_cell_params = {
        column: data_block.get(tag, 'N/A') for column, tag in cell_tags.items()
    }

    # Extract molecule name (if available)
    molecule_name = data_block.get('_chemical_name_common', 'Unknown Molecule').strip()

    # All four atom-site items are required to build any rows
    if not all(tag in data_block for tag in site_tags.values()):
        print("No atomic site data found in the CIF file.")
        return pd.DataFrame()

    # NOTE(review): a non-looped item presumably comes back as a single
    # string rather than a list — confirm against PyCifRW. Wrapping it keeps
    # len() from counting characters instead of atoms.
    site_columns = {}
    for column, tag in site_tags.items():
        values = data_block[tag]
        site_columns[column] = [values] if isinstance(values, str) else list(values)

    atom_count = len(site_columns['Label'])

    # Insertion order fixes the column order, which callers that append rows
    # without a header depend on.
    data = {'Molecule_Name': [molecule_name] * atom_count}
    for column, value in unit_cell_params.items():
        data[column] = [value] * atom_count
    data.update(site_columns)

    return pd.DataFrame(data)

def process_directory(directory_path, output_csv):
    """Process all CIF files in a directory and stream rows to a CSV file.

    The output file is created (or overwritten) with a header row first, so
    it exists even when no CIF file yields data. Each CIF file is then
    converted with ``cif_to_dataframe`` and its rows are appended right away,
    so only one file's data is held in memory at a time.

    Args:
        directory_path: Directory whose entries are scanned (not recursive).
        output_csv: Path of the CSV file to write.

    Files are visited in sorted name order so the output is reproducible,
    and the ``.cif`` extension is matched case-insensitively. A failure on
    one file is printed and does not stop the remaining files.
    """
    # Define the header structure
    header = [
        'Molecule_Name',
        'Cell_Length_A', 'Cell_Length_B', 'Cell_Length_C',
        'Cell_Angle_Alpha', 'Cell_Angle_Beta', 'Cell_Angle_Gamma',
        'Label', 'Fractional_X', 'Fractional_Y', 'Fractional_Z'
    ]

    # Create or overwrite the CSV with just the header. Writing it through
    # pandas keeps its encoding and line ending identical to the rows
    # appended below.
    pd.DataFrame(columns=header).to_csv(output_csv, index=False)

    # os.listdir gives no ordering guarantee; sort for a stable row order.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.lower().endswith('.cif'):  # Process only .cif files
            continue

        file_path = os.path.join(directory_path, file_name)
        print(f"Processing file: {file_name}")

        try:
            # Convert CIF to DataFrame
            df = cif_to_dataframe(file_path)

            if not df.empty:
                # Append rows to the CSV; columns=header pins the column
                # order to the header written above.
                df.to_csv(output_csv, mode='a', header=False, index=False,
                          columns=header)
        except Exception as e:
            # Log the error and continue with the next file
            print(f"Error processing file {file_name}: {e}")

def main(directory_path="./data", output_csv="./cif_data_streamed.csv"):
    """Convert every CIF file in a directory into one CSV file.

    Args:
        directory_path: Directory containing the CIF files. Defaults to
            ``./data``.
        output_csv: Path of the CSV file to create or overwrite. Defaults
            to ``./cif_data_streamed.csv``.

    Calling ``main()`` with no arguments uses the default paths.
    """
    # Process all CIF files in the directory and save to CSV
    process_directory(directory_path, output_csv)
    print(f"All CIF files processed. Data saved to '{output_csv}'.")

# Run the conversion only when executed as a script or notebook cell
# (where __name__ is "__main__"), not when this code is imported.
if __name__ == "__main__":
    main()

['_chemical_name_common: ethylene                              ', '_cell_length_a: 6.79002', '_cell_length_b: 4.70173', '_cell_length_c: 2.56389', '_cell_angle_alpha: 90', '_cell_angle_beta: 90', '_cell_angle_gamma: 89.99996', '_space_group_name_h-m_alt: P 1', '_space_group_it_number: 1', "_space_group_symop_operation_xyz: ['x, y, z']", "_atom_site_label: ['C1', 'C2', 'C3', 'C4', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8']", "_atom_site_occupancy: ['1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0', '1.0']", "_atom_site_fract_x: ['0.043180', '0.956818', '0.456820', '0.543182', '0.203697', '0.016827', '0.796302', '0.983173', '0.483173', '0.296303', '0.516828', '0.703697']", "_atom_site_fract_y: ['0.064406', '0.934928', '0.564405', '0.434928', '0.036683', '0.294946', '0.962649', '0.704388', '0.794946', '0.536684', '0.204387', '0.462649']", "_atom_site_fract_z: ['0.250001', '0.750000', '0.749999', '0.250000', '0.250001', '0.250001', '0.750000', '0.750000', '0.