In [3]:
import numpy as np
import pandas as pd

# Finding the simplest mechanisms and most complicated mechanisms

In [35]:
mechanisms = pd.read_csv('collections/MT_library.csv')
mechanisms.head()

Unnamed: 0,MT_class,mechanistic pathway,Unnamed: 2
0,DCC_condensation,"[(5, 4.1), ([4.1, 4], 4), (4, 6), ([6, 5], 5),...",
1,nucleophilic_attack_to_(thio)carbonyl_or_sulfonyl,"[(1, 2), ([2, 3], 3), (3, [3, 2]), ([2, 4], 4)]",
2,SN1,"[([2, 3], 3), (1, 2)]",
3,SN2,"[(1, 2), ([2, 3], 3)]",
4,SNAr,"# removed [(1, 2), ([2, 3], 3), (3, [3, 2]), (...",


## Show the 5 most complicated reactions

In [20]:
mechanisms['num_steps']=mechanisms['mechanistic pathway'].map(lambda x: np.nan if x.startswith('#') else len(eval(x)))
mechanisms.sort_values(by='num_steps', ascending=False, inplace=True)
mechanisms.head(5)

Unnamed: 0,MT_class,mechanistic pathway,Unnamed: 2,num_steps
26,Vilsmeier_formylation,"[(6, [6, 2]), ([2, 1], 7), ([7, 8], 8), (8, [8...",,23.0
34,imidazole_synthesis,"[(10,4.1), ([4.1,4],4), (4,6), ([6,7],7), (7,[...",,22.0
7,Swern_oxidation,"[([9, 10], 6), ([6, 7], 7), (7, [7, 6]), ([6, ...",,19.0
36,Fischer_indole_synthesis,"[(8, 2), ([2, 7], 9), (7, 8.1), ([8.1, 8], 8),...",,18.0
16,Wolf_Kishner_reduction,"[(3,1), ([1,2], 5), (2,3.1), ([3.1,3],3), (3,[...",,17.0


## Show the 10 least complicated reactions

In [26]:
mechanisms.loc[mechanisms.num_steps.notna()].tail(5)

Unnamed: 0,MT_class,mechanistic pathway,Unnamed: 2,num_steps
14,alcohol_condensation,"[(3, 4), (1, 2), ([2, 3], 3), ([1.1,1],1)]",,4.0
32,Markovnikov_addition,"[([2, 3], 4), (1, 2), ([1.1,1],1)]",,3.0
66,imine_reduction,"[(2,4), ([3,3.1],1), ([1,2],2),]",,3.0
3,SN2,"[(1, 2), ([2, 3], 3)]",,2.0
2,SN1,"[([2, 3], 3), (1, 2)]",,2.0


## Join MT and RT dataframes to see which MT maps to the most RTs

In [34]:
rxn_templates = pd.read_csv('collections/LRT_library.csv')
rxn_templates.head()

Unnamed: 0,No,LRT,Criterion,MT_class,Reagent,remap,LRT_extension,Unnamed: 7
0,1,[N:1].[O:4]-[C:2]=[O:3]>>[N:1]-[C:2]=[O:3],"is_carboxylic_acid, [4,2]","{True: 'DCC_condensation', False: 'nucleophili...",{True: {'[N:5](=[C:6]=[N:7]C1CCCCC1)C2CCCCC2' ...,,,
1,2,[N:1].[X:5]-[c:2](:[a:3]):[a:4]>>[N:1]-[c:2](:...,"contains_op_EWG, 2","{'True_ortho': 'SNAr(ortho)', 'True_para' : 'S...","{'True_ortho': None, 'True_para' : None, Fals...",,"{'True_ortho' : {'target_map_num' : 2,\n ...",
2,3,[N:1].[C:2]-[X:3]>>[N:1]-[C:2],"get_degree, 2","{1: 'SN2', 2: 'SN2', 3: 'SN2', 4: 'SN1'}",,,,
3,4,[N:1].[X:4]-[C:2]=[O:3]>>[N:1]-[C:2]=[O:3],,nucleophilic_attack_to_(thio)carbonyl_or_sulfonyl,,,,
4,5,[O:1].[C:2]-[X:3]>>[O:1]-[C:2],"get_degree_and_check_nucleophile, [2,1]",{'SN2_deprotonation_needed': 'SN2_alcohol(thio...,{'SN2_deprotonation_needed': {'[H-:4].[Na+]' :...,,,


In [40]:
rxn_templates['mt_class_exploded']=rxn_templates['MT_class'].map(lambda x: [x] if not x.startswith('{') else [a for a in eval(x).values()])
#pd.json_normalize(tmp, 0, [])
df_exploded = rxn_templates.explode('mt_class_exploded').reset_index()
#pd.json_normalize(tmp, 0, [])
df_exploded=df_exploded.drop('MT_class', axis='columns')
df_exploded

Unnamed: 0,index,No,LRT,Criterion,Reagent,remap,LRT_extension,Unnamed: 7,mt_class_exploded
0,0,1,[N:1].[O:4]-[C:2]=[O:3]>>[N:1]-[C:2]=[O:3],"is_carboxylic_acid, [4,2]",{True: {'[N:5](=[C:6]=[N:7]C1CCCCC1)C2CCCCC2' ...,,,,DCC_condensation
1,0,1,[N:1].[O:4]-[C:2]=[O:3]>>[N:1]-[C:2]=[O:3],"is_carboxylic_acid, [4,2]",{True: {'[N:5](=[C:6]=[N:7]C1CCCCC1)C2CCCCC2' ...,,,,nucleophilic_attack_to_(thio)carbonyl_or_sulfonyl
2,1,2,[N:1].[X:5]-[c:2](:[a:3]):[a:4]>>[N:1]-[c:2](:...,"contains_op_EWG, 2","{'True_ortho': None, 'True_para' : None, Fals...",,"{'True_ortho' : {'target_map_num' : 2,\n ...",,SNAr(ortho)
3,1,2,[N:1].[X:5]-[c:2](:[a:3]):[a:4]>>[N:1]-[c:2](:...,"contains_op_EWG, 2","{'True_ortho': None, 'True_para' : None, Fals...",,"{'True_ortho' : {'target_map_num' : 2,\n ...",,SNAr(para)
4,1,2,[N:1].[X:5]-[c:2](:[a:3]):[a:4]>>[N:1]-[c:2](:...,"contains_op_EWG, 2","{'True_ortho': None, 'True_para' : None, Fals...",,"{'True_ortho' : {'target_map_num' : 2,\n ...",,catalytic_amination
...,...,...,...,...,...,...,...,...,...
163,95,96,[C:2]=[O:1].[C:3]#[C:4]>>[O:1]-[C:2]-[C:3]#[C:4],,{'[NH2-:5].[Na+]' : '_'},,,,alkynyl_attack_to_carbonyl
164,96,97,[C:1]-[O:5].[a:3]:[n:2]:[a:4]>>[C:1]-[n:2](:[a...,,"{'c1ccccc1[P:6](c2ccccc2)c3ccccc3' : '_', '[O:...","{1:2,2:1,3:5,4:6,5:8,6:7,7:10,8:9}",,,Mitsunobu
165,97,98,[N:1]-[C:2](=[O:3])-[a:4]1:c:c:c:c:c:1>>[N:1],,"{'[OH2:5]': '_', 'H+':6}","{1:5, 2:2, 3:3, 4:1, 5:6}",,,carboxylic_acid_derivative_hydrolysis_or_forma...
166,98,99,[O:1].[C:2]=[N:3]=[N:4]>>[O:1]-[C:2],,,,,,methyl_ester_synthesis


In [49]:
tmp=df_exploded.merge(mechanisms, left_on='mt_class_exploded', right_on='MT_class')
tmp.groupby('mt_class_exploded').mt_class_exploded.count().sort_values(ascending=False).head()

mt_class_exploded
SN2                                                  28
SN1                                                  14
nucleophilic_attack_to_(thio)carbonyl_or_sulfonyl    12
SNAr(para)                                            7
SNAr(ortho)                                           7
Name: mt_class_exploded, dtype: int64

# Future Data Analysis tasks

1. join LRT_library and MT_library and see which MT's are mapped to the most RT's
2. find reactions in a college chemistry textbook and determine which MT's are the most common in the reactions in the textbook. 
3. Determine if this program can find mechanisms for reactions that are not in the USPTO_33k.csv reaction dataset
4. generate mechanisms for sample reactions in each of the MTs in the college textbook using this software, and give it to a chemist at Stony brook to evaluate whether the mechanisms are realistic. If not, why not.

If the mechanisms are correct, then we have enough to make a software to throw up on the Web. The licence is MIT, which i think is commercial friendly. 

Still, there might be more to add to the web app in order to make the educational software as effective as possible. Specifically, we might want to use the software as well as the ai reaction extractor to explore chemical reaction space and find a minimal reductionist approach to determining mechanism.