In [1]:
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote

import os, requests
import pandas as pd

In [2]:
def formula_as_file( formula, file, negate=False ):
    """Render a LaTeX formula to a PNG file through the codecogs web service.

    Parameters
    ----------
    formula : str
        LaTeX source of the formula.
    file : str
        Path of the PNG file to write.
    negate : bool, optional
        When True, the downloaded image is first stored in a temporary file
        next to ``file`` and then colour-inverted into ``file`` by the
        external ``convert`` command.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails, times out, or returns an error status.
    subprocess.CalledProcessError, OSError
        If ``negate`` is set and ``convert`` fails or cannot be started.
    """
    # Raw string: "\d" and "\h" are not valid Python escape sequences.
    latex = r'\dpi{300} \huge %s' % formula
    # Percent-encode the whole query so that characters such as '#', '&',
    # '+' and '%' in the formula reach the server instead of being treated
    # as URL syntax (a bare '#' would otherwise cut the formula short).
    url = 'http://latex.codecogs.com/png.latex?' + quote( latex, safe='' )
    r = requests.get( url, timeout=30 )
    # Do not store an HTTP error page under a .png name.
    r.raise_for_status()

    # A per-target temporary name keeps concurrent calls from overwriting
    # each other's intermediate image.
    tfile = '{}.tmp.png'.format( file ) if negate else file
    with open( tfile, 'wb' ) as f:
        f.write( r.content )

    if negate:
        try:
            # Argument list instead of a shell string: paths with spaces or
            # shell metacharacters are passed through unchanged.
            subprocess.run(
                [ 'convert', tfile, '-channel', 'RGB', '-negate',
                  '-colorspace', 'rgb', file ],
                check=True )
        finally:
            os.remove( tfile )

In [3]:
# Load the per-site equation tables. Records are split on "\n" only.
bio = pd.read_csv("../data/equation/biology_post_eq_filtered_id.csv", lineterminator="\n")
chem = pd.read_csv("../data/equation/chemistry_post_eq_filtered_id.csv", lineterminator="\n")
physics = pd.read_csv("../data/equation/physics_post_eq_filtered_id.csv", lineterminator="\n")
# The math table was left disabled in the original notebook:
#math = pd.read_csv("../data/equation/math_post_eq_filtered_id.csv", lineterminator='\n')

In [4]:
# Pruning, step 1: count the physics rows whose Equation is not a string
# (presumably the NaN cells that are removed in the next cell).
count = sum(not isinstance(text, str) for text in physics["Equation"])
print(count)

309


In [5]:
# Keep only the rows whose Equation value is not of type float.
physics = physics.loc[physics.Equation.apply(type).ne(float)]
chem = chem.loc[chem.Equation.apply(type).ne(float)]
bio = bio.loc[bio.Equation.apply(type).ne(float)]

In [6]:
# Character length of every equation, per site. These lists are used below
# to preview how many rows survive dropping equations of 2 characters or less.
bio_lens = [len(text) for text in bio["Equation"]]
chem_lens = [len(text) for text in chem["Equation"]]
physics_lens = [len(text) for text in physics["Equation"]]

In [7]:
# Rows that remain in bio once equations of length 1 or 2 are removed.
len(bio) - sum(n in (1, 2) for n in bio_lens)

9591

In [8]:
# Keep biology equations longer than 2 characters, then show the row count.
bio = bio.loc[bio.Equation.apply(len).gt(2)]
len(bio)

9591

In [9]:
# Rows that remain in chem once equations of length 1 or 2 are removed.
len(chem) - sum(n in (1, 2) for n in chem_lens)

180156

In [10]:
# Keep chemistry equations longer than 2 characters, then show the row count.
chem = chem.loc[chem.Equation.apply(len).gt(2)]
len(chem)

180156

In [11]:
# Rows that remain in physics once equations of length 1 or 2 are removed.
len(physics) - sum(n in (1, 2) for n in physics_lens)

1065471

In [12]:
# Keep physics equations longer than 2 characters, then show the row count.
physics = physics.loc[physics.Equation.apply(len).gt(2)]
len(physics)

1065471

In [13]:
# Chemical equations are excluded for now: count the biology equations that
# contain the substring "ce{" and print how many rows would be left.
count = sum("ce{" in text for text in bio.Equation)
print(len(bio.index) - count)

9001


In [14]:
# Drop biology equations containing the literal substring "ce{".
# regex=False: the pattern is plain text, not a regular expression; as a
# regex it only worked because a dangling "{" happens to be read literally.
bio = bio[~bio.Equation.str.contains("ce{", regex=False)]
print(len(bio.index))

9001


In [15]:
# Count the chemistry equations that contain the substring "ce{" and print
# how many rows would be left after removing them.
count = sum("ce{" in text for text in chem.Equation)
print(len(chem.index) - count)

90860


In [16]:
# Drop chemistry equations containing the literal substring "ce{".
# regex=False: the pattern is plain text, not a regular expression; as a
# regex it only worked because a dangling "{" happens to be read literally.
chem = chem[~chem.Equation.str.contains("ce{", regex=False)]
print(len(chem.index))

90860


In [17]:
# Preview, then apply, the removal of physics equations containing the
# literal substring "ce{". Both printed numbers should agree.
count = sum("ce{" in text for text in physics.Equation)
print(len(physics.index) - count)
# regex=False: the pattern is plain text, not a regular expression; as a
# regex it only worked because a dangling "{" happens to be read literally.
physics = physics[~physics.Equation.str.contains("ce{", regex=False)]
print(len(physics.index))

1064429
1064429


In [18]:
#generate equation filenames
bio_lst = []
_ = bio.apply(lambda row: [bio_lst.append(str(row.name) + "_" + str(row["Label"]) + "_" + str(row["Id"]))], axis=1)
                                                       
                            

In [20]:
# Attach the file stems as a new column. assign() returns a new frame, which
# avoids writing into a frame that was produced by boolean filtering
# (the pattern that triggers pandas' SettingWithCopyWarning).
bio = bio.assign(FileName=bio_lst)

In [21]:
# Preview the first five biology rows, including the new FileName column.
bio.head(5)

Unnamed: 0,Equation,Label,Id,Tags,FileName
0,v = 6d,biology,424,<human-biology><neuroscience>,0_biology_424
3,v = \sqrt{\frac{i_\text{Na max}}{r_i c_m^2 V_\...,biology,424,<human-biology><neuroscience>,3_biology_424
6,V_d=k\frac{\large a.([C_2]-[C_1])}{\large l},biology,1000,,6_biology_1000
7,"counts, mids=h",biology,1207,,7_biology_1207
8,even &lt;- (df,biology,1207,,8_biology_1207


In [22]:
def bio_process(row):
    """Render one biology row's Equation to ../data/images/bio/<FileName>.png.

    ``row`` must expose ``FileName`` and ``Equation`` attributes; the actual
    rendering is delegated to ``formula_as_file``.
    """
    target = "".join(("../data/images/bio/", row.FileName, ".png"))
    formula_as_file(row.Equation, target)

In [23]:
# Render every biology equation with 8 worker threads and print the elapsed
# wall-clock time in minutes.
start = time.time()
with ThreadPoolExecutor(max_workers=8) as executor:
    # The worker must be bio_process: no function named `process` is defined
    # in the visible notebook, so the old call failed on a fresh kernel.
    for row_label, row in bio.iterrows():
        executor.submit(bio_process, row)
    # NOTE: the returned futures are discarded, so an exception raised inside
    # a worker is not reported here.
# Leaving the `with` block waits for all submitted tasks to finish.
end = time.time()
print((end-start)/60.0)

2.3051753560702006


In [24]:
def chem_process(row):
    """Render one chemistry row's Equation to ../data/images/chem/<FileName>.png.

    ``row`` must expose ``FileName`` and ``Equation`` attributes; the actual
    rendering is delegated to ``formula_as_file``.
    """
    target = "".join(("../data/images/chem/", row.FileName, ".png"))
    formula_as_file(row.Equation, target)

In [25]:
# One file stem per chemistry equation: "<row index>_<Label>_<Id>".
# Built directly from the columns instead of appending from inside
# DataFrame.apply, so the result does not depend on how often pandas
# invokes the row function.
chem_lst = [
    "%s_%s_%s" % (row_label, label, post_id)
    for row_label, label, post_id in zip(chem.index, chem["Label"], chem["Id"])
]
# assign() returns a new frame, avoiding a write into a frame produced by
# boolean filtering (the pattern that triggers SettingWithCopyWarning).
chem = chem.assign(FileName=chem_lst)
chem.head()

Unnamed: 0,Equation,Label,Id,Tags,FileName
0,\mathrm{NaCl},chemistry,2,<ions><crystal-structure><ionic-compounds><sol...,0_chemistry_2
1,\mathrm{Cl},chemistry,2,<ions><crystal-structure><ionic-compounds><sol...,1_chemistry_2
2,\mathrm{Cl^-},chemistry,2,<ions><crystal-structure><ionic-compounds><sol...,2_chemistry_2
3,\mathrm{Na},chemistry,2,<ions><crystal-structure><ionic-compounds><sol...,3_chemistry_2
4,\mathrm{Na^+},chemistry,2,<ions><crystal-structure><ionic-compounds><sol...,4_chemistry_2


In [26]:
# Number of chemistry equations that will be rendered.
print(len(chem))

90860


In [27]:
# Render every chemistry equation with 8 worker threads and print the
# elapsed wall-clock time in minutes.
start = time.time()
with ThreadPoolExecutor(max_workers=8) as executor:
    for row_label, row in chem.iterrows():
        executor.submit(chem_process, row)
# Leaving the `with` block waits for all submitted tasks to finish.
end = time.time()
print((end - start) / 60.0)

37.6723642150561


In [28]:
def phys_process(row):
    """Render one physics row's Equation to ../data/images/physics/<FileName>.png.

    ``row`` must expose ``FileName`` and ``Equation`` attributes; the actual
    rendering is delegated to ``formula_as_file``.
    """
    target = "".join(("../data/images/physics/", row.FileName, ".png"))
    formula_as_file(row.Equation, target)

In [29]:
# One file stem per physics equation: "<row index>_<Label>_<Id>".
# Built directly from the columns instead of appending from inside
# DataFrame.apply, so the result does not depend on how often pandas
# invokes the row function.
phys_lst = [
    "%s_%s_%s" % (row_label, label, post_id)
    for row_label, label, post_id in zip(physics.index, physics["Label"], physics["Id"])
]
# assign() returns a new frame, avoiding a write into a frame produced by
# boolean filtering (the pattern that triggers SettingWithCopyWarning).
physics = physics.assign(FileName=phys_lst)
physics.head()

Unnamed: 0,Equation,Label,Id,Tags,FileName
2,S = \hbar \sqrt{s (s + 1)},physics,5,,2_physics_5
3,\displaystyle \frac{\delta S}{\delta q},physics,14,,3_physics_14
5,q_{t_{1}},physics,14,,5_physics_14
6,q_{t_{2}},physics,14,,6_physics_14
7,\delta S=0,physics,14,,7_physics_14


In [30]:
# Number of physics equations that will be rendered.
print(len(physics))

1064429


In [None]:
# Render every physics equation with 8 worker threads and print the elapsed
# wall-clock time in minutes.
start = time.time()
with ThreadPoolExecutor(max_workers=8) as executor:
    for row_label, row in physics.iterrows():
        executor.submit(phys_process, row)
# Leaving the `with` block waits for all submitted tasks to finish.
end = time.time()
print((end - start) / 60.0)