In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
pd.options.display.max_colwidth = 100

In [4]:
data_path = '/Volumes/ja2/vegan/vegan_parser/scraping/ewg_data/'

In [5]:
column_names = ['datetime_pulled',
               'search_page_num',
               'chemical_name',
               'score_url',
               'data_availability',
               'chemical_concerns',
               'chemical_functions',
               'chemical_about',
               'chemical_synonyms']

In [6]:
filepaths = [os.path.join(data_path,f) for f in os.listdir(data_path) if f.startswith('chemical_details')]

In [7]:
len(filepaths)

252

In [8]:
df = pd.concat([pd.read_csv(f, names=column_names) for f in filepaths], axis=0)

In [9]:
df.shape

(9062, 9)

In [10]:
df[df['chemical_name']=='SODIUM RAPESEEDATE']

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms
2,2019-12-09 21:55:34.303180,157,SODIUM RAPESEEDATE,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: None,,"surfactant - cleansing agent, surfactant - emulsifying agent, viscosity increasing agent -nonaqu...",Sodium Rapeseedate is a sodium salt of the fatty acids derived from Brassica Campestris(Rapeseed...,"FATTY ACIDS, RAPESEED OIL, SODIUM SALT, RAPESEED OIL SODIUM SALT FATTY ACIDS, SODIUM RAPESEEDATE..."


In [11]:
df[df['chemical_name']=='SODIUM RAPESEEDATE']['chemical_synonyms'].values

array(['FATTY ACIDS, RAPESEED OIL, SODIUM SALT, RAPESEED OIL SODIUM SALT FATTY ACIDS, SODIUM RAPESEEDATE, and SODIUM SALT FATTY ACIDS, RAPESEED OIL'],
      dtype=object)

In [12]:
# Oh, it's not just a list, there's an "and" in there... but if the names are always capitalized, that's a way to deal with it.
# Let's look up the alternative names


In [13]:
df[df['chemical_name']=='FATTY ACIDS']

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms
27,2019-12-09 09:41:55.692990,124,FATTY ACIDS,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Fair,"Multiple, additive exposure sources (high)",,,FATTY ACIDS


In [14]:
# OH COME ON - these data are different!
# Because Sodium Rapeseedate is one of many Fatty Acids out there.

In [15]:
df[df['chemical_name']=='RAPESEED OIL']

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms


In [16]:
# Okay, but it looks like direct synoyms are not in the list.

### A few things to do:
- duplicate rows for all the chemical synonyms, so we can have all the names in a column to look up
- translate the score_url to actual scores


In [17]:
# Translating score_url to actual scores.

df['score_url'].unique()

array(['https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=8&score_min=8',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=2&score_min=1',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=2&score_min=2',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=3&score_min=1',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=4&score_min=2',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=3&score_min=3',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=4&score_min=4',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=7&score_min=5',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=5&score_min=2',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=6&score_min=3',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=4&score_min=1',
       'https://www.ewg.org/skindeep/squircle/show.svg?score=3&score_min=2',

The scores that have a range means it depends on usage.
But, to be fair, water has a value of 1, but if you inhale too much water, or drink way too much of it, you can die. So, we should take these with a grain of salt

In [18]:
df[df['chemical_name']=='WATER']

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms
0,2019-12-07 11:47:29.665736,1,WATER,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,"AQUA, DEIONIZED WATER, DISTILLED WATER, ONSEN-SUI, PURIFIED WATER, and WATER"


In [19]:
df.iloc[0,3]

'https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1'

In [20]:
# Let's write out the min and max scores to their own columns, and average them to get an average score
# we'll need a function to read these out

In [21]:
def extract_max(string):
    score_range = [int(s) for s in string if s.isdigit()]
    return score_range[0]

def extract_min(string):
    score_range = [int(s) for s in string if s.isdigit()]
    return score_range[1]

def extract_mean(string):
    score_range = [int(s) for s in string if s.isdigit()]
    return np.mean(score_range)

In [22]:
'111'.isdigit()

True

In [23]:
string = 'https://www.ewg.org/skindeep/squircle/show.svg?score=9&score_min=4'

In [24]:
[int(s) for s in string if s.isdigit()]

[9, 4]

In [25]:
np.mean([1, 1])

1.0

In [26]:
df['min_score'] = df['score_url'].apply(lambda x: extract_min(x))
df['max_score'] = df['score_url'].apply(lambda x: extract_max(x))
df['mean_score'] = df['score_url'].apply(lambda x: extract_mean(x))

In [27]:
df.head()

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms,min_score,max_score,mean_score
0,2019-12-07 11:47:29.665736,1,WATER,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,"AQUA, DEIONIZED WATER, DISTILLED WATER, ONSEN-SUI, PURIFIED WATER, and WATER",1,1,1.0
1,2019-12-07 11:47:43.295987,1,FRAGRANCE,https://www.ewg.org/skindeep/squircle/show.svg?score=8&score_min=8,Data: Fair,"Non-reproductive organ system toxicity (moderate), Ecotoxicology (low), and Irritation (skin, ey...","deodorant, masking, and perfuming","The word ""fragrance"" or ""parfum"" on the product label represents an undisclosed mixture of vario...","AROMA, FRAGRANCE, and PARFUM",8,8,8.0
2,2019-12-07 11:47:56.948139,1,GLYCERIN,https://www.ewg.org/skindeep/squircle/show.svg?score=2&score_min=1,Data: Good,Use restrictions (moderate),"denaturant, fragrance ingredient, hair conditioning agent, humectant, oral care agent;oral healt...",Glycerin (also called glycerol) is a naturally occurring alcohol compound and a component of man...,"1,2,3-PROPANETRIOL, 1,2,3-TRIHYDROXYPROPANE, 1,2,3PROPANETRIOL, CONCENTRATED GLYCERIN, GLYCERIN,...",1,2,1.5
3,2019-12-07 11:48:10.616512,1,IRON OXIDES,https://www.ewg.org/skindeep/squircle/show.svg?score=2&score_min=2,Data: Fair,"Enhanced skin absorption, Persistence and bioaccumulation (high), and Non-reproductive organ sys...",colorant and cosmetic colorant,Iron oxides are inorganic chemicals used as colorants.,"BLACK IRON OXIDE, BLACK OXIDE OF IRON, BROWN IRON OXIDE, C.I. PIGMENT BROWN 7, CI 77489, CI 7749...",2,2,2.0
4,2019-12-07 11:48:24.130988,1,TITANIUM DIOXIDE,https://www.ewg.org/skindeep/squircle/show.svg?score=3&score_min=1,Data: Good,Non-reproductive organ system toxicity (moderate) and Occupational hazards (high),"colorant, opacifying agent, sunscreen agent, and ultraviolet light absorber",Titanium dioxide is an inorganic compound used in a range of body care products such as sunscree...,"1385RN 59, 1700 WHITE, 234DA, 500HD, 63B1 WHITE, A 200 (PIGMENT), A 330 (PIGMENT), A-FIL, A-FIL ...",1,3,2.0


In [28]:
pd.options.display.max_colwidth = 250

In [29]:
df['chemical_synonyms'].sample(40)

21                                                                                                                                                                   BRAN, ORYZA SATIVA, BRAN, RICE, ORYZA SATIVA (RICE) BRAN, ORYZA SATIVA BRAN, and RICE BRAN
12                                                                                                       SILANE, TRIMETHYL(OCTADECYLOXY)-, STEAROXYTRIMETHYLSILANE, TRIMETHYL(OCTADECYLOXY)- SILANE, TRIMETHYL(OCTADECYLOXY)SILANE, and TRIMETHYLSILYL STEARATE
7                                                                                                                                                                                                                                         LINUM ALPINUM EXTRACT
10                                                                                          BENZIMIDAZOLE DIAMOND AMINOETHYL UREA CARBAMOYL PROPYL POLYMETHYLSILSESQUIOXANE and BENZIMIDAZOLE DIAMOND AMIDOETHYL UREA CARBAMOYL PROPYL P

# After looking at a bunch of entries, it looks like each ingredient is split by ',' OR ";" OR "and".  (And sometimes, both.)
# Also noticed a few instances of spaces where there shouldn't be (e.g. "DIS ODIUM" instead of "DISODIUM"), but we can't do much about that. 
# Also, some commas are included where they are part of the name of the chemical: "1,2,3-PROPANETRIOL, 1,2,3-TRIHYDROXYPROPANE, 1,2,3PROPANETRIOL, ..."
Hopefully, we can do some fuzzy matching and catch any missing text
We'll have to ignore any numbers

In [85]:
not "1".isdigit()

False

In [86]:
def extract_synonymns_list(str1):
#     print(str1)
    if pd.isna(str1):
        return []
    
    split_and_list = str1.split('and')
    split_and_comma_list_in_list = [string.split(',') for string in split_and_list]
    split_and_comma_semi_list_in_list = [item.strip().split(';') for sublist in split_and_comma_list_in_list for item in sublist]

    intermediate_list = [item for sublist in split_and_comma_semi_list_in_list for item in sublist if item!='']
   
    # also need to filter out numbers in the ingredients due to odd splitting
    
    return [item for item in intermediate_list if not item.isdigit()]
    

In [91]:
str1 = 'PEG-75 MEADOWFOAM OIL, POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL, and POLYOXYETHYLENE (75) MEADOWFOAM OIL'
str2 = 'PEG-75 MEADOWFOAM OIL, POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL and POLYOXYETHYLENE (75) MEADOWFOAM OIL'
str3 = 'PEG-75 MEADOWFOAM OIL, POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL; and POLYOXYETHYLENE (75) MEADOWFOAM OIL'
str4 = '1, 2, 3 SOMETHING OR OTHER, WATER'


In [92]:
extract_synonymns_list(str1)

['PEG-75 MEADOWFOAM OIL',
 'POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL',
 'POLYOXYETHYLENE (75) MEADOWFOAM OIL']

In [93]:
extract_synonymns_list(str2)

['PEG-75 MEADOWFOAM OIL',
 'POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL',
 'POLYOXYETHYLENE (75) MEADOWFOAM OIL']

In [94]:
extract_synonymns_list(str3)

['PEG-75 MEADOWFOAM OIL',
 'POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL',
 'POLYOXYETHYLENE (75) MEADOWFOAM OIL']

In [95]:
extract_synonymns_list(str4)

['3 SOMETHING OR OTHER', 'WATER']

### Okay, that looks about right... 
I don't see any way out of iterating through the rows to deal with dupes.
We'll make a new dataframe and append to that.
I'm not going to bother including a value for the synonym'ed rows



In [96]:
syn_col_names = list(df.columns)+['is_syn']
syn_col_names

['datetime_pulled',
 'search_page_num',
 'chemical_name',
 'score_url',
 'data_availability',
 'chemical_concerns',
 'chemical_functions',
 'chemical_about',
 'chemical_synonyms',
 'min_score',
 'max_score',
 'mean_score',
 'is_syn']

In [97]:
print(datetime.now())

2020-01-05 22:07:01.504249


In [98]:
df_test = df.head(20)

In [99]:
# OH man this is going to be SLOW AF
df_wsyns = pd.DataFrame([],columns=syn_col_names)  # DataFrame WithSYNonymS
print(datetime.now())

for index, row in df_test.iterrows():
    
    if index%100==0:
        print(f'index: {index}')
    
    syn_list = extract_synonymns_list(row[8])
    
    if syn_list!=[]:

        # append the original
        row['is_syn'] = False
        df_wsyns = df_wsyns.append(row, ignore_index=True)
    
        for syn in syn_list:
            row['chemical_name'] = syn
            row['chemical_synonyms'] = np.nan
            row['is_syn'] = True
            df_wsyns = df_wsyns.append(row, ignore_index=True)

    else:
        row['is_syn'] = False
        df_wsyns = df_wsyns.append(row, ignore_index=True)
    
print(datetime.now())
df_wsyns.to_csv('dataframe_with_syns.csv', index=False)
            

2020-01-05 22:07:03.119859
index: 0
2020-01-05 22:07:07.984080


In [100]:
df_wsyns

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms,min_score,max_score,mean_score,is_syn
0,2019-12-07 11:47:29.665736,1,WATER,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,"AQUA, DEIONIZED WATER, DISTILLED WATER, ONSEN-SUI, PURIFIED WATER, and WATER",1,1,1.0,False
1,2019-12-07 11:47:29.665736,1,AQUA,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,,1,1,1.0,True
2,2019-12-07 11:47:29.665736,1,DEIONIZED WATER,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,,1,1,1.0,True
3,2019-12-07 11:47:29.665736,1,DISTILLED WATER,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,,1,1,1.0,True
4,2019-12-07 11:47:29.665736,1,ONSEN-SUI,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Robust,,solvent,,,1,1,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,2019-12-07 11:51:48.394000,1,ESTER WITH 1,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Limited,,fragrance ingredient and skin-conditioning agent - occlusive,Caprylic/Capric Triglyceride is a mixed triester of glycerin and caprylic and capric acids.,,1,1,1.0,True
594,2019-12-07 11:51:48.394000,1,3-PRPANETRIOL OCTANOATE DECANOIC ACID,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Limited,,fragrance ingredient and skin-conditioning agent - occlusive,Caprylic/Capric Triglyceride is a mixed triester of glycerin and caprylic and capric acids.,,1,1,1.0,True
595,2019-12-07 11:51:48.394000,1,GLYCEROL CALRYLATE CAPRINATE,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Limited,,fragrance ingredient and skin-conditioning agent - occlusive,Caprylic/Capric Triglyceride is a mixed triester of glycerin and caprylic and capric acids.,,1,1,1.0,True
596,2019-12-07 11:51:48.394000,1,GLYCEROL CAPRYLATE CAPRINATE,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Limited,,fragrance ingredient and skin-conditioning agent - occlusive,Caprylic/Capric Triglyceride is a mixed triester of glycerin and caprylic and capric acids.,,1,1,1.0,True


In [79]:

pwd

'/Volumes/ja2/vegan/vegan_parser/notebooks'

In [72]:
df_wsyns.append(row, ignore_index=True)

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms,min_score,max_score,mean_score,is_syn
0,2019-12-07 11:51:48.394000,1,OCTANOIC/DECANOIC ACID TRIGLYCERIDE,https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1,Data: Limited,,fragrance ingredient and skin-conditioning agent - occlusive,Caprylic/Capric Triglyceride is a mixed triester of glycerin and caprylic and capric acids.,,1,1,1.0,True


In [73]:
df_wsyns

Unnamed: 0,datetime_pulled,search_page_num,chemical_name,score_url,data_availability,chemical_concerns,chemical_functions,chemical_about,chemical_synonyms,min_score,max_score,mean_score,is_syn


In [170]:
df.iloc[0,:]

datetime_pulled                                                         2019-12-07 11:47:29.665736
search_page_num                                                                                  1
chemical_name                                                                                WATER
score_url                       https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1
data_availability                                                                     Data: Robust
chemical_concerns                                                                              NaN
chemical_functions                                                                         solvent
chemical_about                                                                                 NaN
chemical_synonyms     AQUA, DEIONIZED WATER, DISTILLED WATER, ONSEN-SUI, PURIFIED WATER, and WATER
min_score                                                                                        1
max_score 

In [178]:
testrow = df.iloc[110,:]

In [182]:
testrow
# we'll leave these the same, exept we'll change testrow['chemical_name'] = first chemical synonym  & testrow['chemical_synonyms'] = np.nan
# and so on, for all the chemical synonyms

datetime_pulled                                                                                                                                            2019-12-09 00:36:55.835887
search_page_num                                                                                                                                                                   101
chemical_name                                                                                                                                                        SODIUM BISULFATE
score_url                                                                                                          https://www.ewg.org/skindeep/squircle/show.svg?score=1&score_min=1
data_availability                                                                                                                                                          Data: Fair
chemical_concerns                                                                         

nan

In [121]:
split_and_list = str1.split('and')
split_and_list

['PEG-75 MEADOWFOAM OIL, POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL, ',
 ' POLYOXYETHYLENE (75) MEADOWFOAM OIL']

In [123]:
split_and_comma_list = [string.split(',') for string in split_and_list]
split_and_comma_list

[['PEG-75 MEADOWFOAM OIL', ' POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL', ' '],
 [' POLYOXYETHYLENE (75) MEADOWFOAM OIL']]

In [127]:
parsed_list_of_list = [item.strip().split(';') for sublist in split_and_comma_list for item in sublist]
parsed_list_of_list

[['PEG-75 MEADOWFOAM OIL'],
 ['POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL'],
 [''],
 ['POLYOXYETHYLENE (75) MEADOWFOAM OIL']]

In [128]:
[item for sublist in parsed_list_of_list for item in sublist if item!='']

['PEG-75 MEADOWFOAM OIL',
 'POLYETHYLENE GLYCOL 4000 MEADOWFOAM OIL',
 'POLYOXYETHYLENE (75) MEADOWFOAM OIL']

In [None]:
#