In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile
from pathlib import Path
from csv import writer
import os

In [None]:
# Returns [chemicalSum, crystalSystem, spaceGroupName, spaceGroupITNumber, pubYear, volume],
# or None when the file does not look like it contains atomic positions.
def readFile(f):
    """Extract summary fields from the lines of one .cif file.

    Parameters
    ----------
    f : iterable of bytes or str
        The lines of the file. Binary handles (e.g. from ``ZipFile.open`` or
        ``open(path, 'rb')``) and text handles are both accepted.

    Returns
    -------
    list or None
        ``[chemicalSum, crystalSystem, spaceGroupName, spaceGroupITNumber,
        pubYear, volume]``. Every entry is the stripped text that follows the
        corresponding tag, or None if the tag was never seen. ``volume`` has
        any trailing ``(...)`` part cut off.
        None is returned when fewer than two ``loop_`` lines are found; the
        second ``loop_`` is what this function treats as evidence of atomic
        positions.

    Notes
    -----
    * Tags are detected with a substring test anywhere in the line, but the
      value is sliced as if the tag started at column 0.
    * Once ``_cell_volume`` has been read no further tags are parsed, and
      reading stops entirely at the second ``loop_`` line.
    """
    chemicalSum = None
    spaceGroupName = None
    crystalSystem = None
    volume = None
    spaceGroupITNumber = None
    pubYear = None

    badCrystalSystemTag = '_symmetry_cell_setting' # may contain 'rhombohedral', which is mapped to trigonal
    crystalSystemTag = '_space_group_crystal_system' # does not contain 'rhombohedral'; this is the standard tag
    chemFormulaTag = '_chemical_formula_sum'
    spaceGroupTag = '_symmetry_space_group_name_H-M' # space group name tag
    otherSpaceGroupTag = '_space_group_name_H-M_alt' # other space group name tag
    spaceGroupITTag = '_space_group_IT_number'
    loopTag = 'loop_'
    cellVolumeTag = '_cell_volume'
    yearTag = '_journal_year'

    loopCounter = 0
    hasAtomicPositions = False
    Skipping = False
    for line in f:
        # Zip members and 'rb' handles yield bytes, text handles yield str.
        # Undecodable bytes are replaced rather than aborting the whole file:
        # every tag parsed here is plain ASCII.
        if isinstance(line, bytes):
            line = line.decode('utf-8', errors='replace')
        # Grab all useful information from the .cif file
        if not Skipping:
            if badCrystalSystemTag in line:
                crySys = line[len(badCrystalSystemTag):].strip()
                crystalSystem = 'trigonal' if 'rhombohedral' in crySys else crySys
            elif crystalSystemTag in line:
                crystalSystem = line[len(crystalSystemTag):].strip()
            elif chemFormulaTag in line:
                chemicalSum = line[len(chemFormulaTag):].strip()
            elif spaceGroupTag in line:
                spaceGroupName = line[len(spaceGroupTag):].strip()
            elif otherSpaceGroupTag in line:
                spaceGroupName = line[len(otherSpaceGroupTag):].strip()
            elif yearTag in line:
                pubYear = line[len(yearTag):].strip()
            elif spaceGroupITTag in line:
                spaceGroupITNumber = line[len(spaceGroupITTag):].strip()
            elif cellVolumeTag in line:
                volume = line[len(cellVolumeTag):].strip()
                # The volume is the last field collected; stop parsing tags.
                Skipping = True
                # NOTE(review): some lines leave 'l_cell_volume' inside the
                # sliced remainder; 13 == len('l_cell_volume'). Which tag
                # produces this is not visible here - confirm against data.
                if 'l_cell_volume' in volume:
                    volume = volume[13:].strip()
        if loopTag in line:
            loopCounter += 1
            if loopCounter >= 2:
                hasAtomicPositions = True
                break
    # Make sure the file actually has atomic positions
    if not hasAtomicPositions:
        return None

    # Truncate the parenthesised part of the measurement, e.g. '179.4(2)' -> '179.4'
    if volume is not None and '(' in volume:
        volume = volume[:volume.find('(')]
    return [chemicalSum, crystalSystem, spaceGroupName, spaceGroupITNumber, pubYear, volume]

# FIXME: find the location of it, and then just add whatever; TL;DR: it's going to be a lot harder to encode each one.
# Go through each journal and see if they have a different format.

In [None]:
# If zipped is True, directory is the path to a .zip archive.
# If zipped is False, directory is the path to a folder.
def createCifs(directory,zipped):
    """List the .cif entries found in a zip archive or in a folder.

    Parameters
    ----------
    directory : str or path-like
        Path to the .zip archive when ``zipped`` is true, otherwise path to
        the folder to scan (not recursive).
    zipped : bool
        Selects between the two modes above.

    Returns
    -------
    list
        ``zipfile.ZipInfo`` objects when ``zipped`` is true; bare file names
        (not joined with ``directory``) otherwise. Only names ending in
        ``'.cif'`` are kept.
    """
    if zipped:
        # The archive is only needed to read its table of contents; close it
        # right away instead of leaking the handle. The ZipInfo objects stay
        # usable after the archive is closed.
        with zipfile.ZipFile(directory) as archive:
            return [zinfo for zinfo in archive.infolist() if zinfo.filename.endswith('.cif')]
    return [name for name in os.listdir(directory) if name.endswith('.cif')]



In [None]:
directory = ''  # set this: path to the .cif folder, or to the .zip archive when zipped is True
zipped = False
cifs = createCifs(directory,zipped)

# createCifs only returns the entry list, so the archive has to be opened
# here in order to read the members themselves.
archive = zipfile.ZipFile(directory) if zipped else None

noAtomicPosCounter = 0
try:
    # Open the output once in append mode (rows accumulate across runs)
    # instead of reopening it for every row.
    with open('compiledOutput.csv', 'a', newline='') as f_object:
        writer_object = writer(f_object)
        for x in cifs:
            # Zipped entries are ZipInfo objects; folder entries are plain names.
            name = x.filename if zipped else x
            if zipped:
                handle = archive.open(x, 'r')
            else:
                # Binary mode so readFile receives bytes, the same as from the
                # zip; the entry name is relative to `directory`.
                handle = open(os.path.join(directory, x), 'rb')
            with handle:
                lis = readFile(handle)
            if lis is None:
                noAtomicPosCounter += 1
                # Only report the first few offenders to keep the output short.
                if noAtomicPosCounter < 50:
                    print("No Atomic Position File Detected:" + name)
            else:
                # Extracted fields followed by the source file name as the last column.
                writer_object.writerow(lis + [name])
finally:
    if archive is not None:
        archive.close()
print(f'Amount without Atomic Positions:{noAtomicPosCounter}')

### Some Pandas Editing:

In [None]:

# compiledOutput.csv is written row by row without a header line, so the
# column names are supplied here (read_csv takes `names=`, not `columns=`).
df = pd.read_csv(
    'compiledOutput.csv',
    header=None,
    names=['Chemical Sum','Crystal System','Space Group Name','Space Group IT Number','pubYear','volume','Local Zip Path'],
)

# Sorting Space Group Numbers
# Anything non-numeric (including the '?' placeholder) is coerced to <NA>.
df['Space Group IT Number'] = np.floor(pd.to_numeric(df['Space Group IT Number'],errors='coerce')).astype('Int64')
# dropna returns a new frame; the result has to be kept for it to have any effect.
df = df.dropna(subset=['Space Group IT Number'])
# .copy() so the column assignment below works on an independent frame, not a slice.
df = df[df['Space Group IT Number'] >= 1].copy()

# Adding Bravais Lattices
spaceGroupDf = pd.read_csv('spaceGroups.csv')
df['Space Group IT Number'] = df['Space Group IT Number'].astype(str)
# NOTE(review): the left key is a string here, so this assumes the
# 'Point Group' column of spaceGroups.csv holds the space-group number as
# text as well - confirm against that file.
merged_df = df.merge(spaceGroupDf, left_on='Space Group IT Number', right_on='Point Group', how='left')

# NOTE(review): 'Crystal System_x' only exists if spaceGroups.csv also has a
# 'Crystal System' column (pandas adds the _x/_y suffixes on a name clash) - confirm.
merged_df = merged_df.drop(columns=['Space Group Name', 'Crystal System_x','Full Name'])

# Major changes made to the Bravais lattice labels:
#   - capitalisation is normalised,
#   - P-Trigonal is relabelled as R-Trigonal,
#   - A-Orthorhombic is relabelled as C-Orthorhombic.
bravaisRenames = {
    'P-tetragonal': 'P-Tetragonal',
    'I-tetragonal': 'I-Tetragonal',
    'P-Trigonal': 'R-Trigonal',
    'A-Orthorhombic': 'C-Orthorhombic',
    'P-hexagonal': 'P-Hexagonal',
}
merged_df['Bravais Lattice'] = merged_df['Bravais Lattice'].replace(bravaisRenames)

# NOTE: this overwrites the raw extraction output with the processed table
# (which now has a header row and an index column), so this cell cannot be
# re-run on its own output without regenerating the raw file first.
merged_df.to_csv('compiledOutput.csv')
