In [1]:
import pandas as pd

In [2]:
physics = pd.read_csv("../data/equation/physics_post_eq.csv", lineterminator='\n')
chemistry = pd.read_csv("../data/equation/chemistry_post_eq.csv", lineterminator='\n')
biology = pd.read_csv("../data/equation/biology_post_eq.csv", lineterminator='\n')

In [3]:
print("physics has " + str(physics.shape[0]) + " data")
print("chemistry has " + str(chemistry.shape[0]) + " data")
print("biology has " + str(biology.shape[0]) + " data")

physics has 161313 data
chemistry has 29269 data
biology has 1968 data


# take chemistry as an example

In [5]:
chemistry.head()

Unnamed: 0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount,Equation,Label
0,56.0,4.0,<p>My understanding is that $\mathrm{NaCl}$ is...,,1,,2012-04-25T18:25:27.290,3.0,2,2018-08-23T13:43:44.867,...,,10.0,,1,21,<ions><crystal-structure><ionic-compounds><sol...,How does NaCl maintain its crystalline structure?,3318.0,"['\\mathrm{NaCl}', '\\mathrm{Cl}', '\\mathrm{C...",chemistry
1,13.0,4.0,<p>A reaction proceeds towards the direction o...,,0,,2012-04-25T18:33:07.480,8.0,4,2019-04-06T05:02:28.063,...,,22.0,,1,31,<equilibrium><free-energy>,Gibbs free energy-minimum or zero?,15422.0,"['T', 'P', '\\Delta G=\\Delta G^\\circ + RT\\l...",chemistry
2,,,<p>Gibbs free energy is a measure of how much ...,,2,,2012-04-25T18:39:16.133,,6,2012-04-25T18:49:03.220,...,,10.0,4.0,2,10,,,,['\\Delta G = -T\\Delta S'],chemistry
3,12.0,3.0,"<p>The nitration of <em>N</em>,<em>N</em>-dime...",,3,,2012-04-25T18:44:29.807,6.0,7,2016-09-09T07:08:09.893,...,,12.0,,1,44,<organic-chemistry><aromatic-compounds><nitro-...,"Why does nitration of N,N-dimethylaniline occu...",9455.0,"['\\ce{H2SO4}', '\\ce{HNO3}', '\\ce{-NMe2}']",chemistry
4,,,<p>In the presence of these strong acids the $...,,0,,2012-04-25T18:51:23.513,,12,2016-09-09T07:08:09.893,...,,7.0,7.0,2,36,,,,['\\ce{-NMe2}'],chemistry


In [6]:
chemistry = chemistry[['Id', 'Tags']]
chemistry.head()

Unnamed: 0,Id,Tags
0,2,<ions><crystal-structure><ionic-compounds><sol...
1,4,<equilibrium><free-energy>
2,6,
3,7,<organic-chemistry><aromatic-compounds><nitro-...
4,12,


# drop nan values

In [7]:
chemistry.dropna(inplace = True)

In [8]:
chemistry.reset_index(drop = True, inplace = True)
chemistry.head()

Unnamed: 0,Id,Tags
0,2,<ions><crystal-structure><ionic-compounds><sol...
1,4,<equilibrium><free-energy>
2,7,<organic-chemistry><aromatic-compounds><nitro-...
3,17,<organic-chemistry><thermodynamics>
4,25,<organic-chemistry><carbocation>


# extract tags from string

In [9]:
import re

In [10]:
chemistry['Tags'] = chemistry['Tags'].astype(str)
pattern = re.compile("<(.+?)>")
chemistry['Tag_list'] = chemistry.apply(lambda row: re.findall(pattern, row.Tags), axis=1)
chemistry.head()

Unnamed: 0,Id,Tags,Tag_list
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[ions, crystal-structure, ionic-compounds, sol..."
1,4,<equilibrium><free-energy>,"[equilibrium, free-energy]"
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[organic-chemistry, aromatic-compounds, nitro-..."
3,17,<organic-chemistry><thermodynamics>,"[organic-chemistry, thermodynamics]"
4,25,<organic-chemistry><carbocation>,"[organic-chemistry, carbocation]"


In [11]:
chemistry['Tag_str'] = chemistry.apply(lambda row: ','.join(row.Tag_list), axis = 1)
chemistry.head()

Unnamed: 0,Id,Tags,Tag_list,Tag_str
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[ions, crystal-structure, ionic-compounds, sol...","ions,crystal-structure,ionic-compounds,solid-s..."
1,4,<equilibrium><free-energy>,"[equilibrium, free-energy]","equilibrium,free-energy"
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[organic-chemistry, aromatic-compounds, nitro-...","organic-chemistry,aromatic-compounds,nitro-com..."
3,17,<organic-chemistry><thermodynamics>,"[organic-chemistry, thermodynamics]","organic-chemistry,thermodynamics"
4,25,<organic-chemistry><carbocation>,"[organic-chemistry, carbocation]","organic-chemistry,carbocation"


# vectorize the string

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans


In [13]:
tag = chemistry['Tag_str']
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
X = vectorizer.fit_transform(tag)
vectorizer.get_feature_names()

['ab-initio',
 'absorption',
 'acid-base',
 'adhesion',
 'adsorption',
 'alcohols',
 'aldol-reaction',
 'alkali-metals',
 'alkaline-earth-metals',
 'allotropes',
 'alloy',
 'amides',
 'amines',
 'amino-acids',
 'analytical-chemistry',
 'applied-chemistry',
 'aqueous-solution',
 'aromatic-compounds',
 'aromaticity',
 'astrochemistry',
 'atmospheric-chemistry',
 'atomic-radius',
 'atomic-structure',
 'atoms',
 'basis-set',
 'bent-bond',
 'biochemistry',
 'boiling-point',
 'bond',
 'books',
 'boron-family',
 'bravais-lattices',
 'c-c-addition',
 'c-x-addition',
 'calorimetry',
 'capillary-forces',
 'carbene',
 'carbocation',
 'carbohydrates',
 'carbon-allotropes',
 'carbon-family',
 'carbonyl-complexes',
 'carbonyl-compounds',
 'catalysis',
 'ceramics',
 'chemical-biology',
 'chemical-engineering',
 'cheminformatics',
 'chemistry-in-fiction',
 'chemoselectivity',
 'chirality',
 'chromatography',
 'cleaning',
 'colligative-properties',
 'colloids',
 'color',
 'combustion',
 'computational-

In [14]:
len(vectorizer.get_feature_names())

307

In [15]:
X.shape

(11859, 307)

In [16]:
chemistry.shape[0]

11859

# use kmeans to cluster tags into 5 clusters

In [17]:
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)

In [18]:
chemistry['Cluster_label'] = kmeans.labels_
chemistry.head()

Unnamed: 0,Id,Tags,Tag_list,Tag_str,Cluster_label
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[ions, crystal-structure, ionic-compounds, sol...","ions,crystal-structure,ionic-compounds,solid-s...",0
1,4,<equilibrium><free-energy>,"[equilibrium, free-energy]","equilibrium,free-energy",0
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[organic-chemistry, aromatic-compounds, nitro-...","organic-chemistry,aromatic-compounds,nitro-com...",1
3,17,<organic-chemistry><thermodynamics>,"[organic-chemistry, thermodynamics]","organic-chemistry,thermodynamics",1
4,25,<organic-chemistry><carbocation>,"[organic-chemistry, carbocation]","organic-chemistry,carbocation",1


# cluster result

In [19]:
df_list = []
for i in range(5):
    cluster = chemistry[chemistry.apply(lambda row: row.Cluster_label == i and len(row.Tag_list) > 1, axis = 1)]
    df = cluster.groupby(['Tag_str'])['Tag_str'].count().rename('count').reset_index().sort_values('count', ascending=False)
    print(df.head(10))
    print("\n")
    df_list.append(df)
    

                                        Tag_str  count
2573                    thermodynamics,enthalpy     41
680                      electrochemistry,redox     35
2592                 thermodynamics,equilibrium     27
2587                     thermodynamics,entropy     23
629               electrochemistry,electrolysis     21
2197                      redox,oxidation-state     20
1916  quantum-chemistry,computational-chemistry     20
2636                        thermodynamics,heat     18
1422                     homework,stoichiometry     17
2109                reaction-mechanism,kinetics     16


                                               Tag_str  count
645               organic-chemistry,reaction-mechanism    157
561                     organic-chemistry,nomenclature     58
0                          organic-chemistry,acid-base     51
109               organic-chemistry,aromatic-compounds     29
963                        organic-chemistry,synthesis     26
928                  

# get the centroid

In [20]:
from sklearn.metrics import pairwise_distances_argmin_min

In [21]:
kmeans.cluster_centers_

array([[2.25733634e-03, 1.38913006e-03, 0.00000000e+00, ...,
        2.08369509e-03, 0.00000000e+00, 1.56277131e-03],
       [0.00000000e+00, 9.19540230e-04, 8.41379310e-02, ...,
        0.00000000e+00, 1.83908046e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 6.74763833e-04, ...,
        1.34952767e-03, 0.00000000e+00, 1.34952767e-03],
       [0.00000000e+00, 1.46412884e-03, 1.46412884e-03, ...,
        7.32064422e-04, 0.00000000e+00, 7.32064422e-04]])

In [22]:
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
for i in closest:
    print(chemistry.iloc[i].Tag_str)

thermodynamics
organic-chemistry
acid-base
inorganic-chemistry
physical-chemistry


# evaluate the result - silhouette coefficient

In [23]:
chemistry.shape

(11859, 5)

In [24]:
from sklearn import metrics
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, kmeans.labels_, sample_size=chemistry.shape[0]))

Silhouette Coefficient: 0.120


# below is the generalized code for each subject

In [26]:
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn import metrics
import re
from ast import literal_eval

In [27]:
pattern = re.compile("<(.+?)>")

In [28]:
def get_data(sub_string):
     # read data
    subject = pd.read_csv("../data/equation/" + sub_string + "_post_eq.csv", lineterminator='\n')
    subject.Equation = subject.Equation.apply(lambda x: literal_eval(x))
    subject = subject[['Id', 'Tags', 'Equation', 'Label']]
    # drop nan values
    subject.dropna(inplace = True)
    subject.reset_index(drop = True, inplace = True)
    subject['Tags'] = subject['Tags'].astype(str)
    # extract tags from "<>"
    subject['Tag_list'] = subject.apply(lambda row: re.findall(pattern, row.Tags), axis=1)
    # join tags with ","
    subject['Tag_str'] = subject.apply(lambda row: ','.join(row.Tag_list), axis = 1)
    
    return subject

In [29]:
def tf_idf_vectorizer(subject):
    tag = subject['Tag_str']
    # convert tag string to vectors
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(','))
    X = vectorizer.fit_transform(tag)
    return X

In [30]:
def count_vectorizer(subject):
    tag = subject['Tag_str']
    # convert tag string to vectors
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(','))
    X = vectorizer.fit_transform(tag)
    return X

In [31]:
def k_means(X, subject, n):        
    # kmeans clustering
    kmeans = KMeans(n_clusters= n, random_state=0).fit(X)
    subject['Cluster_label'] = kmeans.labels_
    
    # get the centroid
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
    for i in closest:
        print(subject.iloc[i].Tag_str)
        print("\n")
    return subject
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, kmeans.labels_, sample_size=subject.shape[0]))
    

In [32]:
def k_means_res(subject, n):    
    # get the most freq tag pairs
    df_list = []
    for i in range(n):
        cluster = subject[subject.apply(lambda row: row.Cluster_label == i, axis = 1)]
        df = cluster.groupby(['Tag_str'])['Tag_str'].count().rename('count').reset_index().sort_values('count', ascending=False)
        df_list.append(df)
    return df_list

In [33]:
def cluster(subject_string, number):
    subject = get_data(subject_string)
    X = count_vectorizer(subject)
    subject = k_means(X, subject, number)
    return subject

In [34]:
def save_result(subject, subject_string):
    subject_new = subject[['Id', 'Tags', 'Equation', 'Label', 'Cluster_label']]
    start = time.time()
    post_rows = []
    _ = subject_new.apply(lambda row: [post_rows.append([eqn, row["Label"], row['Cluster_label'], row["Id"], row["Tags"]]) for eqn in row.Equation], axis=1)
    subject_relabel = pd.DataFrame(post_rows, columns=["Equation", "Label", "Cluster_label", "Id", "Tags"])
    end = time.time()
    print(len(subject_relabel.index))
    print((end-start)/60.0)

    start = time.time()
    subject_relabel.to_csv("../data/equation/" + subject_string + "_post_eq_tag_id.csv", index = False)
    end = time.time()
    print((end-start)/60.0)

# physics with count vectorizer and cluster in 8 groups

In [35]:
physics = cluster("physics", 8)

quantum-field-theory


special-relativity


electromagnetism


newtonian-mechanics


quantum-mechanics


homework-and-exercises


quantum-mechanics,operators


general-relativity




In [36]:
df_list = k_means_res(physics, 8)
for i in range(8):
    print(df_list[i].head(10))
    print("\n")

                                           Tag_str  count
272                           quantum-field-theory    179
2401          quantum-field-theory,renormalization     72
3297        quantum-mechanics,quantum-field-theory     45
777          quantum-field-theory,feynman-diagrams     41
37     homework-and-exercises,quantum-field-theory     28
485    quantum-field-theory,conformal-field-theory     25
2989            quantum-field-theory,supersymmetry     25
2260  quantum-field-theory,quantum-electrodynamics     20
1322    quantum-field-theory,klein-gordon-equation     19
1824         quantum-field-theory,particle-physics     18


                                        Tag_str  count
712                          special-relativity    218
285   homework-and-exercises,special-relativity     76
14          electromagnetism,special-relativity     44
1830           special-relativity,time-dilation     29
1653          special-relativity,speed-of-light     26
155       general-relativity,s

In [37]:
physics.groupby(['Cluster_label'])['Cluster_label'].count()

Cluster_label
0     5040
1     2982
2    27829
3     4932
4     7963
5     6333
6     1486
7     3319
Name: Cluster_label, dtype: int64

In [38]:
physics[physics['Cluster_label'] == 1]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
32,716,<particle-physics><special-relativity>,"[E^2 = (pc)^2 + (mc^2)^2., pc = Ev/c, E = mc^2...",physics,"[particle-physics, special-relativity]","particle-physics,special-relativity",1
66,1251,<quantum-mechanics><classical-mechanics><speci...,"[RM_c, c \rightarrow \infty, QM_h, h \rightarr...",physics,"[quantum-mechanics, classical-mechanics, speci...","quantum-mechanics,classical-mechanics,special-...",1
68,1268,<special-relativity><history>,"[E=mc^2, E=mc^2]",physics,"[special-relativity, history]","special-relativity,history",1
69,1273,<special-relativity>,"[c, E=mc^2]",physics,[special-relativity],special-relativity,1
70,1307,<special-relativity><speed-of-light><refractio...,"[c, c]",physics,"[special-relativity, speed-of-light, refractio...","special-relativity,speed-of-light,refraction,f...",1
74,1368,<newtonian-mechanics><special-relativity><ener...,"[V, v, \frac{1}{2}m(v+V)^2, \frac{1}{2}mv^2]",physics,"[newtonian-mechanics, special-relativity, ener...","newtonian-mechanics,special-relativity,energy-...",1
94,1574,<homework-and-exercises><electromagnetism><spe...,"[\frac{1}{\sqrt{\mu_0 \epsilon_0}}, v = \frac{...",physics,"[homework-and-exercises, electromagnetism, spe...","homework-and-exercises,electromagnetism,specia...",1
151,2586,<quantum-field-theory><special-relativity><spe...,"[n=\sqrt{\epsilon_r \mu_r}, \rightarrow 0, c]",physics,"[quantum-field-theory, special-relativity, spe...","quantum-field-theory,special-relativity,speed-...",1
168,2774,<special-relativity><faster-than-light><contin...,"[\omega, V = \omega * r, c, \omega = V / r, 20...",physics,"[special-relativity, faster-than-light, contin...","special-relativity,faster-than-light,continuum...",1
179,2978,<special-relativity><forces>,"[e, \mathbf{E}, \frac{\mathrm{d}}{\mathrm{d}t}...",physics,"[special-relativity, forces]","special-relativity,forces",1


In [39]:
physics[physics['Cluster_label'] == 6]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
240,4049,<quantum-mechanics><operators><heisenberg-unce...,[\langle ( \Delta A )^{2} \rangle \langle ( \D...,physics,"[quantum-mechanics, operators, heisenberg-unce...","quantum-mechanics,operators,heisenberg-uncerta...",6
397,6584,<quantum-mechanics><time><operators><observables>,[\langle x | P | \psi \rangle ~=~ -i \hbar \fr...,physics,"[quantum-mechanics, time, operators, observables]","quantum-mechanics,time,operators,observables",6
458,7638,<quantum-mechanics><homework-and-exercises><op...,[\hat{A}^{\dagger} + \hat{B}^{\dagger} = \left...,physics,"[quantum-mechanics, homework-and-exercises, op...","quantum-mechanics,homework-and-exercises,opera...",6
563,9183,<quantum-mechanics><statistical-mechanics><ope...,"[Z=tr(e^{-\beta H}), H, Z= \int D(\bar{\gamma}...",physics,"[quantum-mechanics, statistical-mechanics, ope...","quantum-mechanics,statistical-mechanics,operat...",6
565,9194,<quantum-mechanics><operators><commutator><obs...,"[A, H]",physics,"[quantum-mechanics, operators, commutator, obs...","quantum-mechanics,operators,commutator,observa...",6
575,9349,<quantum-mechanics><momentum><operators><commu...,"[A, B, AB, \frac{AB+BA}{2}., \frac{\vec{p}\cdo...",physics,"[quantum-mechanics, momentum, operators, commu...","quantum-mechanics,momentum,operators,commutator",6
588,9551,<quantum-mechanics><mathematical-physics><math...,"[H, A, H, P^{A}, A~=~\int_{\mathbb{R}} \lambda...",physics,"[quantum-mechanics, mathematical-physics, math...","quantum-mechanics,mathematical-physics,mathema...",6
598,9641,<quantum-mechanics><operators><hamiltonian-for...,"[c, Q, P\in \mathbb{R}, \exp\left[\frac{i}{\hb...",physics,"[quantum-mechanics, operators, hamiltonian-for...","quantum-mechanics,operators,hamiltonian-formal...",6
634,10004,<quantum-mechanics><mathematical-physics><oper...,"[\Psi, \Psi, |\textbf{u}_k\rangle = e^{-i k x ...",physics,"[quantum-mechanics, mathematical-physics, oper...","quantum-mechanics,mathematical-physics,operato...",6
720,11158,<quantum-mechanics><operators>,"[x_4, \langle a\rangle = \int\Psi A\Psi^* \mat...",physics,"[quantum-mechanics, operators]","quantum-mechanics,operators",6


In [40]:
physics_new = physics[(physics['Cluster_label'] == 1) | (physics['Cluster_label'] == 6) | (physics['Cluster_label'] == 0) | (physics['Cluster_label'] == 7)]


In [42]:
physics_new['Cluster_label'] = physics_new.Cluster_label.map({1:"physics-relativity", 7:"physics-relativity", 0: "physics-quantum",
                           6: "physics-quantum"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [43]:
physics_new.head()

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
20,494,<quantum-field-theory><particle-physics><resou...,[\phi^4],physics,"[quantum-field-theory, particle-physics, resou...","quantum-field-theory,particle-physics,resource...",physics-quantum
25,575,<electromagnetism><general-relativity><gravity>,"[V\propto \frac{1}{r}, V\propto \frac{1}{r}]",physics,"[electromagnetism, general-relativity, gravity]","electromagnetism,general-relativity,gravity",physics-relativity
28,625,<quantum-field-theory><quantization><second-qu...,"[t\mapsto \vec x(t), \vec x(t), x^\mu(\lambda)...",physics,"[quantum-field-theory, quantization, second-qu...","quantum-field-theory,quantization,second-quant...",physics-quantum
32,716,<particle-physics><special-relativity>,"[E^2 = (pc)^2 + (mc^2)^2., pc = Ev/c, E = mc^2...",physics,"[particle-physics, special-relativity]","particle-physics,special-relativity",physics-relativity
35,768,<quantum-mechanics><electromagnetism><quantum-...,"[\psi(x,t), \psi : M \to \mathbb{C}, M, M=\mat...",physics,"[quantum-mechanics, electromagnetism, quantum-...","quantum-mechanics,electromagnetism,quantum-fie...",physics-quantum


In [None]:
# some reason, the save_result function has some issue in the environment of GCP AISE, use alternative way to create equationsa nd save it/


In [44]:
def row_to_labeled_equation(row: pd.Series):
    for equation in row['Equation']:
        yield {
            'Equation': equation,
            'Label': row['Label'],
            'Cluster_label': row['Cluster_label'],
            'Id': row['Id'],
            'Tags': row['Tags'],

        }

In [45]:
equations = []
_ = physics_new.apply(lambda row: [equations.append(e) for e in row_to_labeled_equation(row)], axis=1)


In [46]:
physics_new = pd.DataFrame(equations)

In [47]:
physics_new.to_csv("../data/equation/physics_post_eq_tag_id.csv", index = False)


In [48]:
physics_new.shape[0]

101970

# chemistry with count vectorizer and cluster in 4 groups

In [None]:
# some unknown reason that we cannot run get_data for chemistry dataset on GCP.

In [49]:
subject = pd.read_csv("../data/equation/chemistry_post_eq.csv", lineterminator='\n')

In [50]:
subject.Equation = subject.Equation.apply(lambda x: literal_eval(x))
subject = subject[['Id', 'Tags', 'Equation', 'Label']]
# drop nan values
subject.dropna(inplace = True)
subject.reset_index(drop = True, inplace = True)
subject['Tags'] = subject['Tags'].astype(str)

In [51]:
tag_list = []
for i, row in subject.iterrows():
    try:
        res = re.findall(pattern, row.Tags)
        tag_list.append(res)
    except:
        print(i)

In [52]:
subject['Tag_list'] = tag_list

In [53]:
 # join tags with ","
subject['Tag_str'] = subject.apply(lambda row: ','.join(row.Tag_list), axis = 1)
    

In [54]:
X = count_vectorizer(subject)

In [55]:
chemistry = k_means(X, subject, 4)
chemistry.head()

physical-chemistry


inorganic-chemistry


acid-base


organic-chemistry




Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[\mathrm{NaCl}, \mathrm{Cl}, \mathrm{Cl^-}, \m...",chemistry,"[ions, crystal-structure, ionic-compounds, sol...","ions,crystal-structure,ionic-compounds,solid-s...",0
1,4,<equilibrium><free-energy>,"[T, P, \Delta G=\Delta G^\circ + RT\ln Q, , wh...",chemistry,"[equilibrium, free-energy]","equilibrium,free-energy",0
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[\ce{H2SO4}, \ce{HNO3}, \ce{-NMe2}]",chemistry,"[organic-chemistry, aromatic-compounds, nitro-...","organic-chemistry,aromatic-compounds,nitro-com...",3
3,17,<organic-chemistry><thermodynamics>,"[\ce{C-P}, \ce{C-O-P}]",chemistry,"[organic-chemistry, thermodynamics]","organic-chemistry,thermodynamics",3
4,25,<organic-chemistry><carbocation>,"[R, &gt;C(+)-C(R_1R_2R_3)]",chemistry,"[organic-chemistry, carbocation]","organic-chemistry,carbocation",3


In [56]:
chemistry.groupby(['Cluster_label'])['Cluster_label'].count()

Cluster_label
0    7093
1    1649
2    1091
3    2026
Name: Cluster_label, dtype: int64

In [57]:
chemistry[chemistry['Cluster_label'] == 0]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[\mathrm{NaCl}, \mathrm{Cl}, \mathrm{Cl^-}, \m...",chemistry,"[ions, crystal-structure, ionic-compounds, sol...","ions,crystal-structure,ionic-compounds,solid-s...",0
1,4,<equilibrium><free-energy>,"[T, P, \Delta G=\Delta G^\circ + RT\ln Q, , wh...",chemistry,"[equilibrium, free-energy]","equilibrium,free-energy",0
5,27,<hybridization><color>,"[d^3s, \mathrm{Cr}, \mathrm{Mn}, d^3s]",chemistry,"[hybridization, color]","hybridization,color",0
9,87,<coordination-compounds><home-experiment><color>,[_{\textrm{(s)}}],chemistry,"[coordination-compounds, home-experiment, color]","coordination-compounds,home-experiment,color",0
11,162,<solubility>,[\ce{MgKPO4}],chemistry,[solubility],solubility,0
12,163,<electronic-configuration><periodic-trends><el...,[Z_\mathrm{eff}],chemistry,"[electronic-configuration, periodic-trends, el...","electronic-configuration,periodic-trends,elect...",0
13,172,<theoretical-chemistry><intermolecular-forces>...,"[\pi\cdots\pi, \pi\cdots\pi]",chemistry,"[theoretical-chemistry, intermolecular-forces,...","theoretical-chemistry,intermolecular-forces,ca...",0
14,191,<electrons><nuclear><crystal-structure>,[\ce{NaCl}],chemistry,"[electrons, nuclear, crystal-structure]","electrons,nuclear,crystal-structure",0
17,236,<hybridization><quantum-chemistry>,"[sp^3, sp^2, \ce{PH3}, p, \ce{P-H}, s, \ce{B2H...",chemistry,"[hybridization, quantum-chemistry]","hybridization,quantum-chemistry",0
18,238,<orbitals><quantum-chemistry>,"[\ce{N^3+}, 2p]",chemistry,"[orbitals, quantum-chemistry]","orbitals,quantum-chemistry",0


In [58]:
chemistry[chemistry['Cluster_label'] == 1]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
10,148,<inorganic-chemistry><ions><electronic-configu...,"[[\ce{Ar}]\mathrm{4s^2 3d^3}, \ce{V^{3+}}]",chemistry,"[inorganic-chemistry, ions, electronic-configu...","inorganic-chemistry,ions,electronic-configurat...",1
16,235,<inorganic-chemistry><synthesis><noble-gases>,"[\ce{XeF4}, \ce{XeF6}, \ce{NiF4}, \ce{Xe}]",chemistry,"[inorganic-chemistry, synthesis, noble-gases]","inorganic-chemistry,synthesis,noble-gases",1
21,275,<organic-chemistry><inorganic-chemistry><organ...,"[\ce{-SH}, \ce{=S}, \ce{Hg}]",chemistry,"[organic-chemistry, inorganic-chemistry, organ...","organic-chemistry,inorganic-chemistry,organosu...",1
38,372,<inorganic-chemistry><stoichiometry>,"[\ce{W}, \ce{Cl}, \frac{0.7715}{0.3302}\approx...",chemistry,"[inorganic-chemistry, stoichiometry]","inorganic-chemistry,stoichiometry",1
42,383,<inorganic-chemistry><equilibrium><solubility>,"[\ce{AgNO3 + AlCl3}, \ce{3AgNO3 + AlCl3 -&gt; ...",chemistry,"[inorganic-chemistry, equilibrium, solubility]","inorganic-chemistry,equilibrium,solubility",1
43,385,<inorganic-chemistry><coordination-compounds><...,"[(\ce{ZnCl2.2H2O}), \ce{FeCl2.4H2O}, \ce{H2O}]",chemistry,"[inorganic-chemistry, coordination-compounds, ...","inorganic-chemistry,coordination-compounds,aqu...",1
48,431,<inorganic-chemistry><bond><allotropes>,[\mathrm{p}\pi\text{-}\mathrm{d}\pi],chemistry,"[inorganic-chemistry, bond, allotropes]","inorganic-chemistry,bond,allotropes",1
50,472,<inorganic-chemistry><coordination-compounds><...,"[\ce{Fe3O4}, \ce{As(III)}, \ce{As(V)}, \ce{As}...",chemistry,"[inorganic-chemistry, coordination-compounds, ...","inorganic-chemistry,coordination-compounds,wat...",1
59,537,<inorganic-chemistry><acid-base><nomenclature>,"[\ce{H3AsO3}, \ce{As(OH)3}, \ce{H2SO4}, \ce{H3...",chemistry,"[inorganic-chemistry, acid-base, nomenclature]","inorganic-chemistry,acid-base,nomenclature",1
63,569,<inorganic-chemistry><crystal-structure>,[\ce{NaBO2 +B2O3}],chemistry,"[inorganic-chemistry, crystal-structure]","inorganic-chemistry,crystal-structure",1


In [59]:
chemistry[chemistry['Cluster_label'] == 2]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
7,72,<acid-base><ph>,"[\mathrm{p}K_\mathrm{a}, \mathrm{pH}]",chemistry,"[acid-base, ph]","acid-base,ph",2
44,395,<acid-base><experimental-chemistry><titration>,"[\ce{H2SO4}, \ce{ HNO3}, \ce{HCl}, \ce{NaOH}]",chemistry,"[acid-base, experimental-chemistry, titration]","acid-base,experimental-chemistry,titration",2
58,531,<acid-base><equilibrium>,"[\ce{HF}, \ce{HF + H2O &lt;=&gt; H3O+ + F-}, [...",chemistry,"[acid-base, equilibrium]","acid-base,equilibrium",2
105,969,<acid-base><experimental-chemistry>,"[\mathrm{p}K_\mathrm{a}, \ce{Ca_3(PO_4)_2}, \c...",chemistry,"[acid-base, experimental-chemistry]","acid-base,experimental-chemistry",2
113,1048,<acid-base><hydrogen>,"[\ce{H2}, \ce{H2}, \ce{H2}]",chemistry,"[acid-base, hydrogen]","acid-base,hydrogen",2
136,1171,<acid-base><water><ph>,[[\ce{H3O+}]],chemistry,"[acid-base, water, ph]","acid-base,water,ph",2
164,1340,<acid-base><ph>,"[\ce{H2NNH2}, K_\text{b} = 3.0 \times 10^{-6},...",chemistry,"[acid-base, ph]","acid-base,ph",2
165,1341,<organic-chemistry><acid-base>,"[\ce{H2F+}, \ce{F-H}, \ce{CH3OH}, \ce{(CH3)2OH...",chemistry,"[organic-chemistry, acid-base]","organic-chemistry,acid-base",2
174,2423,<acid-base><ph>,"[\ce{HCl}, \ce{HCl}, vol,vol, \ce{HCl}]",chemistry,"[acid-base, ph]","acid-base,ph",2
193,2563,<acid-base><ph>,"[\text{pH}, \left[\text{H}^{+}_{(\text{aq})}\r...",chemistry,"[acid-base, ph]","acid-base,ph",2


In [60]:
chemistry[chemistry['Cluster_label'] == 3]

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[\ce{H2SO4}, \ce{HNO3}, \ce{-NMe2}]",chemistry,"[organic-chemistry, aromatic-compounds, nitro-...","organic-chemistry,aromatic-compounds,nitro-com...",3
3,17,<organic-chemistry><thermodynamics>,"[\ce{C-P}, \ce{C-O-P}]",chemistry,"[organic-chemistry, thermodynamics]","organic-chemistry,thermodynamics",3
4,25,<organic-chemistry><carbocation>,"[R, &gt;C(+)-C(R_1R_2R_3)]",chemistry,"[organic-chemistry, carbocation]","organic-chemistry,carbocation",3
6,45,<organic-chemistry><reaction-mechanism><synthe...,[\mathrm{H^+}],chemistry,"[organic-chemistry, reaction-mechanism, synthe...","organic-chemistry,reaction-mechanism,synthesis",3
8,79,<organic-chemistry><reaction-mechanism><aromat...,[\ce{KMnO4}],chemistry,"[organic-chemistry, reaction-mechanism, aromat...","organic-chemistry,reaction-mechanism,aromatic-...",3
15,215,<organic-chemistry><bent-bond>,"[\pi, \sigma, \ce{NO2}]",chemistry,"[organic-chemistry, bent-bond]","organic-chemistry,bent-bond",3
29,299,<organic-chemistry><molecular-orbital-theory>,"[\ce{sp^2-sp^2}, \ce{p\pi-p\pi}, \ce{p\pi-p\pi...",chemistry,"[organic-chemistry, molecular-orbital-theory]","organic-chemistry,molecular-orbital-theory",3
36,359,<organic-chemistry><reaction-mechanism><free-e...,"[\sigma, \sigma^\pm]",chemistry,"[organic-chemistry, reaction-mechanism, free-e...","organic-chemistry,reaction-mechanism,free-energy",3
47,429,<organic-chemistry><ionic-compounds><organomet...,[\ce{PhLi}],chemistry,"[organic-chemistry, ionic-compounds, organomet...","organic-chemistry,ionic-compounds,organometall...",3
68,597,<organic-chemistry><polymers>,[\ce{(C6H10O5)_{n} + C3H8O3 + C2H4O2 + H2O -&g...,chemistry,"[organic-chemistry, polymers]","organic-chemistry,polymers",3


In [61]:
chemistry.head()

Unnamed: 0,Id,Tags,Equation,Label,Tag_list,Tag_str,Cluster_label
0,2,<ions><crystal-structure><ionic-compounds><sol...,"[\mathrm{NaCl}, \mathrm{Cl}, \mathrm{Cl^-}, \m...",chemistry,"[ions, crystal-structure, ionic-compounds, sol...","ions,crystal-structure,ionic-compounds,solid-s...",0
1,4,<equilibrium><free-energy>,"[T, P, \Delta G=\Delta G^\circ + RT\ln Q, , wh...",chemistry,"[equilibrium, free-energy]","equilibrium,free-energy",0
2,7,<organic-chemistry><aromatic-compounds><nitro-...,"[\ce{H2SO4}, \ce{HNO3}, \ce{-NMe2}]",chemistry,"[organic-chemistry, aromatic-compounds, nitro-...","organic-chemistry,aromatic-compounds,nitro-com...",3
3,17,<organic-chemistry><thermodynamics>,"[\ce{C-P}, \ce{C-O-P}]",chemistry,"[organic-chemistry, thermodynamics]","organic-chemistry,thermodynamics",3
4,25,<organic-chemistry><carbocation>,"[R, &gt;C(+)-C(R_1R_2R_3)]",chemistry,"[organic-chemistry, carbocation]","organic-chemistry,carbocation",3


In [81]:
chemistry_new = chemistry[(chemistry['Cluster_label'] == 1) | (chemistry['Cluster_label'] == 3)]

In [None]:
# save organic and inorganic chemistry

In [82]:
chemistry_new['Cluster_label'] = chemistry_new.Cluster_label.map({1:"chemistry-inorganic", 3:"chemistry-organic"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
def row_to_labeled_equation(row: pd.Series):
    for equation in row['Equation']:
        yield {
            'Equation': equation,
            'Label': row['Label'],
            'Cluster_label': row['Cluster_label'],
            'Id': row['Id'],
            'Tags': row['Tags'],

        }

In [83]:
equations = []
_ = chemistry_new.apply(lambda row: [equations.append(e) for e in row_to_labeled_equation(row)], axis=1)

In [84]:
chemistry_new = pd.DataFrame(equations)
chemistry_new.head()

Unnamed: 0,Cluster_label,Equation,Id,Label,Tags
0,chemistry-organic,\ce{H2SO4},7,chemistry,<organic-chemistry><aromatic-compounds><nitro-...
1,chemistry-organic,\ce{HNO3},7,chemistry,<organic-chemistry><aromatic-compounds><nitro-...
2,chemistry-organic,\ce{-NMe2},7,chemistry,<organic-chemistry><aromatic-compounds><nitro-...
3,chemistry-organic,\ce{C-P},17,chemistry,<organic-chemistry><thermodynamics>
4,chemistry-organic,\ce{C-O-P},17,chemistry,<organic-chemistry><thermodynamics>


In [85]:
chemistry_new.shape[0]

15830

In [86]:
chemistry_new.to_csv("../data/equation/chemistry_post_eq_tag_id.csv", index = False)

# since biology dataset is small, we don't do the clustering, just keep the cluster label as "biology"

In [87]:
biology = pd.read_csv("../data/equation/biology_post_eq.csv", lineterminator='\n')
biology.Equation = biology.Equation.apply(lambda x: literal_eval(x))


In [88]:
biology['Cluster_label'] = 'biology'

In [89]:
biology.shape[0]

1968

In [90]:
biology = biology[biology.apply(lambda row: len(row.Equation) > 0, axis = 1)]

In [91]:
biology.shape[0]

1835

In [92]:
biology = biology[['Cluster_label','Equation','Id','Label','Tags']]

In [93]:
save_result(biology, "biology")

12571
0.007124773661295573
0.0006338357925415039
