## Some utilities for a more fine-grained analysis of the data.

As initially proposed, tables were constructed in such a way that two elements X,Y appear in the same table only if there exist 2 compounds $A = R-X_n$ and $B = R-Y_n$, where if X were replaced by Y in compound A, the result would be compound B. This approximation ignores every structural factor and relies only on compositional data.

In [1]:
import numpy as np
import bz2
import pickle
import re
from scipy import sparse as sp

dataPath = "../Preprocess/Data/"

# Load matches_pickle.bin (a bz2-compressed pickle).
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load this file when it comes from a trusted source.
# The `with` block closes the compressed file handle as soon as the data is read.
with bz2.BZ2File(dataPath+'matches_pickle.bin', 'r') as Match_file:
    Matches = pickle.load(Match_file)

# Load Rs_sparse.npz
Rs = sp.load_npz(dataPath+'Rs_sparse.npz')
print(Rs.shape)

# Load element list: one entry per line, surrounding whitespace stripped.
# dataPath already ends with '/', so it is concatenated like the paths above.
with open(dataPath+'ElementList.txt','r') as f:
    elemList = [line.strip() for line in f]

print(elemList[:10])


def getFormula(R,elemList):
    """Render a sparse composition row as a formula string.

    Every stored entry of R except the last one is written as the symbol
    elemList[column index] followed by its count; a count of 1 is omitted.
    The last stored entry is always written as 'X' with the same count rule:
    sparse([0,1,0,...,4,6]) --> Ti4X6 for instance.
    """
    pieces = []
    for col, count in zip(R.indices[:-1], R.data[:-1]):
        suffix = '' if count == 1 else str(int(count))
        pieces.append(elemList[col] + suffix)

    # The final stored value is the subindex of the placeholder X
    x_count = R.data[-1]
    x_suffix = "" if x_count == 1 else int(x_count)
    return ''.join(pieces) + f'X{x_suffix}'

(6096, 61)
['Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C']


## The idea of having information of substance ID and years is to have different query methods.

1. For instance, we might be interested only in substances discovered before 1939.

2. Or you may be interested in checking in which tables a particular compound occurs. For instance, what tables does H2O appear in? Then query by substance ID!

---

## First case: Filter out by year.

The other case works the same way: just replace the condition on the year with a condition on the substance ID.

In [2]:
year = 1930
matches1930 = []
idx1930 = []

# Each entry of Matches is indexed as [elements, years, ids] (parallel sequences).
for j,mat in enumerate(Matches):
    # Positions inside this table whose year is strictly earlier than `year`
    keep = [pos for pos,y in enumerate(mat[1]) if int(y)<year]
    # A table is only kept when more than one entry survives the filter
    if len(keep)>1:
        matches1930.append([[mat[0][pos] for pos in keep],
                            [mat[1][pos] for pos in keep],
                            [mat[2][pos] for pos in keep]])
        idx1930.append(j)

# Rows of Rs that correspond to the tables kept above
rs1930 = Rs[idx1930]

# List Subs. formula and related elements (found before 1930)
N = 10
for i in range(N):
    formula_str = getFormula(rs1930[i],elemList)
    print(f"{formula_str}\t{matches1930[i][0]}")

C14H10N2OX4	['H', 'O']
C9H12O3X	['C', 'S']
C5H7ClOX2	['O', 'C']
C9H12O3X2	['C', 'N']
C5H7ClOX3	['O', 'C']
C5H7ClOX4	['H', 'O', 'C']
C9H12O3X4	['N', 'H']
C5H7ClX3	['O', 'C']
Br2H12N4X	['Cd', 'Pd', 'Pt']
C6H9BX6	['O', 'H']


In [3]:
def findRns_withConds(Rs,formula,allOtherZero=True,n=False):
    """Print the formula of every (R,n) row of Rs that matches `formula`.

    Rs is the sparse matrix of (R,n) vectors where to look up, one per row.
        The last column holds n (the X subindex); every other column holds the
        count of the element found at the same position in the global `elemList`.
    formula is any composition to look for inside any R, written as element
        symbols, each optionally followed by an integer count (no count means 1).
        For instance formula = 'Cl3Si' looks for Rs with exactly 3 Cl and 1 Si.
    allOtherZero is True if we want Rs where only the aforementioned elements
        are present (the non-X counts must add up to the counts in `formula`);
        if we're open to other elements being present, then False.
    n is the X subindex. If falsy (default False), then any n is possible.

    Returns: 0 after printing a notice when no row matches. Otherwise prints
        one line per matching row (via `getFormula`) and returns None.

    Raises: ValueError if `formula` cannot be parsed, or if it names an
        element that is not in `elemList`.
    """
    ## Convert given formula into conditions
    # A symbol is one uppercase letter followed by optional lowercase letters;
    # the count is an optional run of digits ('H2O' -> [('H','2'), ('O','')]).
    tokens = re.findall(r"([A-Z][a-z]*)([0-9]*)", formula)
    # Re-assembling the tokens must give back the input, otherwise some part
    # of the formula (e.g. a decimal point or a stray character) was skipped.
    if not tokens or "".join(sym + num for sym, num in tokens) != formula:
        raise ValueError(f"Cannot parse formula {formula!r}: expected element symbols "
                         "optionally followed by integer counts, e.g. 'Cl3Si'")
    elems = [sym for sym, _ in tokens]                      # Element symbols
    subin = [int(num) if num else 1 for _, num in tokens]   # Num. atoms of each element

    # Only the columns that are actually needed are densified, never all of Rs.
    if n:
        condition = (Rs[:,-1].toarray()==n).flatten()   # Fix choice of n
    else:
        condition = np.ones(Rs.shape[0], dtype=bool)    # Otherwise any choice is ok

    suma = 0
    for elem,N in zip(elems,subin):
        column = Rs[:,elemList.index(elem)].toarray().flatten()
        condition = condition & (column==N)  # Apply each condition
        suma += N

    # Condition for looking only for exactly the inputted composition:
    # the total count over all non-X columns must equal the requested total.
    if allOtherZero:
        totals = np.asarray(Rs[:,:-1].sum(axis=1)).ravel()
        condition = condition & (totals==suma)

    if not condition.any():
        print('No (R,n)s where found that meet the provided conditions')
        return 0

    # Make query
    selectedRs = Rs[condition]

    for R in selectedRs:
        print(getFormula(R,elemList))

In [4]:
# Example: find all Rs containing exactly 3 Cl and 2 C, under three settings
formula = 'Cl3C2'

print("Example:\nFind all Rs containing exactly 3 Cl and 2 C\n")
print("\t- And anything else, with any n for X:\n")
# allOtherZero=False: other elements may be present; n left at its default (any n)
findRns_withConds(Rs,formula,False)
print("\n\t- And anything else, but n for X == 2:")
# Same query, but restricted to rows whose X subindex is 2
findRns_withConds(Rs,formula,False,2)
print("\n\t- And NOTHING else (except for X), with any n:")
# allOtherZero=True: only the elements named in `formula` may be present
findRns_withConds(Rs,formula,True)

Example:
Find all Rs containing exactly 3 Cl and 2 C

	- And anything else, with any n for X:

C2H3Cl3OX
C2HCl3X2
C2H2Cl3OX
C2Cl3X
C2Cl3X3
C2HCl3NOX
C2HCl3O2X2
C2Cl3OX
C2H2Cl3X

	- And anything else, but n for X == 2:
C2HCl3X2
C2HCl3O2X2

	- And NOTHING else (except for X), with any n:
C2Cl3X
C2Cl3X3


### Explore which substitution formulas relate 2 elements.

In [5]:
def showCommonOccurences(elem1,elem2,maxNum):
    """Print the substitution formulas whose table contains both elements.

    Scans the global `Matches`, keeping the positions whose element list
    (entry 0 of each match) contains both elem1 and elem2. Prints how many
    such positions exist, then the formula (row of the global `Rs` rendered
    by `getFormula`) and the element list for the first maxNum of them.
    """
    shared = []
    for pos, match in enumerate(Matches):
        members = match[0]
        if elem1 in members and elem2 in members:
            shared.append(pos)

    print(f"Number of common occurences: {len(shared)}\n")
    print(f"X = ({elem1}, {elem2}) \n")

    for pos in shared[:maxNum]:
        rendered = getFormula(Rs[pos],elemList)
        print(f"{rendered}\t{Matches[pos][0]}")

In [6]:
# Print the first 20 formulas whose table contains both H and C
showCommonOccurences('H','C',20)

Number of common occurences: 542

X = (H, C) 

C5H7ClOX2	['O', 'C', 'H']
C5H7ClOX4	['H', 'O', 'C']
C12H19NX6	['H', 'C']
C13H14O4X2	['O', 'C', 'H']
C6H5NO2X2	['H', 'O', 'C']
C11H8O2X2	['C', 'H']
C16H26O4X2	['C', 'H']
C8H9NX2	['C', 'H', 'O']
C8H11NO2X6	['H', 'C']
C8H6N2O4X2	['H', 'C']
C3H4N4O3X2	['H', 'C']
C6H8O5X2	['C', 'O', 'H']
C6H8O5X4	['H', 'C']
C11H16O3X6	['H', 'C']
C10H10O3X4	['C', 'H']
C4H4O6X2	['H', 'Ag', 'K', 'Na', 'C', 'Tl', 'Hg', 'Li']
C6H7NO3X2	['N', 'C', 'H']
C6H7NO3X4	['C', 'H']
C8H13NO3X2	['H', 'C']
C16H20X10	['C', 'H']


## New section: How many possible PTs are there? (Search space for an optimization algorithm)

In [7]:
from itertools import permutations
from math import factorial

$N$ is the total number of different possible positions, e.g. $32 \times 7 = 224$ for the long PT.

n is the number of elements. e.g. 60 in 1868

### No. unique configurations not keeping order:

\begin{equation}
\frac{N!}{(N-n)!}
\end{equation}

### No. configurations keeping order but not unique:

\begin{equation}
\frac{N!}{n!}
\end{equation}

### No. unique configurations and keeping order:

\begin{equation}
\frac{N!}{(N-n)!n!}
\end{equation}

In [8]:
N = 9     # Number of positions (32*7 for the PT)
n = 4     # Number of elements (1,2,3,4,...)

# Empty positions are represented by 0, elements by the labels 1..n
zeros = [0]*(N-n)
unique_elems = list(range(1,n+1))

# Every permutation of the N slots; the zeros are treated as distinguishable
# here, so this array has N! rows.
perms = np.array(list(permutations(zeros + unique_elems)))

# According to the formulas, the counts are as follows.
# Integer floor division keeps the results exact: these quotients are always
# whole numbers, whereas float division loses precision for large N.
print("No. unique configurations not keeping order:\t",factorial(N)//factorial(N-n))
print("No. configurations keeping order but not unique:",factorial(N)//factorial(n))
print("No. unique configurations and keeping order:\t",factorial(N)//(factorial(N-n)*factorial(n)))

No. unique configurations not keeping order:	 3024
No. configurations keeping order but not unique: 15120
No. unique configurations and keeping order:	 126


### Run empirical calculations to corroborate the results above

In [9]:
# No. unique configurations not keeping order
# Dropping duplicate rows makes all zeros equivalent to each other
a = np.unique(perms,axis=0)
print("No. unique configurations not keeping order:\t",a.shape[0])

# Keep the permutations where element i sits to the left of element i+1,
# for every consecutive pair of elements.
condition = True
for i in unique_elems[:-1]:
    # Each label occurs exactly once per row, so the first match is its column
    col_of_i = np.argmax(perms == i, axis=1)
    col_of_next = np.argmax(perms == i+1, axis=1)
    condition = condition & (col_of_i < col_of_next)

# No. configurations keeping order but not unique
a = perms[condition]
print("No. configurations keeping order but not unique:",a.shape[0])

# Removing duplicate rows from the ordered ones gives the last count
uniqs = np.unique(a,axis=0)
print("No. unique configurations and keeping order:\t",uniqs.shape[0])

No. unique configurations not keeping order:	 3024
No. configurations keeping order but not unique: 15120
No. unique configurations and keeping order:	 126


## That said, how many different possible PTs exist where the order is preserved?

In [10]:
N = 30*7     # Number of positions (30*7 here; the long PT would be 32*7)
n = 60     # Number of elements (1,2,3,4,...)

# True division is used on purpose so the huge counts print in float notation
count_any_order = factorial(N)/factorial(N-n)
count_fixed_order = factorial(N)/(factorial(N-n)*factorial(n))

print("No. unique configurations not keeping order:\t",count_any_order)

# The "keeping order but not unique" count (N!/n!) is left disabled:
#print("No. configurations keeping order but not unique:",factorial(N)//(factorial(n)))

print("No. unique configurations and keeping order:\t",count_fixed_order)

# The two printed counts differ exactly by the factor n!
print(f"\nDifference is a factor of:\t\t\t {factorial(n):.15e}")

No. unique configurations not keeping order:	 1.8522056472827472e+135
No. unique configurations and keeping order:	 2.225944617131523e+53

Difference is a factor of:			 8.320987112741390e+81
