In [None]:
# Importing all the necessary libraries
import pandas as pd
import re


In [None]:
# Read the two CSV files
df1 = pd.read_csv("cosmetic_notifications.csv")
df2 = pd.read_csv("cosmetic_products_with_risk_description.csv") # Refer to create_description.ipynb for more details

In [4]:
# Show the first 10 entries of df1
print("=== First 10 entries of df1 ===\n")
print(df1.head(10))

=== First 10 entries of df1 ===

        notif_no                                            product  \
0  NOT200706378K  La Maison Du Savon De Marseille 100g Argan Oil...   
1  NOT200706384K  La Maison Du Savon De Marseille Shampoing Soli...   
2  NOT200706392K  La Maison Du Savon De Marseille 100g Argan Oil...   
3  NOT220707665K             I57 HYPNOBEAUTY WATER BOOSTER (ENERGY)   
4  NOT220707660K             HELEN SEWARD MEDITER THERAPY SCRUB 6/P   
5  NOT220707668K              EUNYUL Collagen Special Program Cream   
6  NOT220707669K            EUNYUL Collagen Special Program Essence   
7  NOT220707670K          EUNYUL Collagen Special Program Eye Cream   
8  NOT220707699K  HSF NATURALS - JOJOBA HAIR SERUM WITH ORGANIC ...   
9  NOT220707700K              HSF NATURALS - OCEANIC SEA SALT SCRUB   

                                     company  date_notif  
0  LA MAISON DU SAVON DE MARSEILLE SDN. BHD.  2022-07-25  
1  LA MAISON DU SAVON DE MARSEILLE SDN. BHD.  2022-07-25  
2  LA MA

In [5]:
print("\n=== Read the first 10 entries of df2 ===\n")
print(df2.head(10))


=== Read the first 10 entries of df2 ===

        notif_no                                            product  \
0  NOT200603276K   DELUXE BEAUTY - ULTRA LIGHTENING CREAM PEARL ...   
1  NOT180503436K          3RD SERIES YANKO FADE OUT CREAM DAY CREAM   
2  NOT180503437K       3RD SERIES YANKO WHITENING CREAM NIGHT CREAM   
3  NOT180503439K          5TH SERIES YANKO FADE OUT CREAM DAY CREAM   
4  NOT180503440K       5TH SERIES YANKO WHITENING CREAM NIGHT CREAM   
5  NOT180503441K               7TH SERIES YANKO WHITENING CREAM DAY   
6  NOT180503442K             7TH SERIES YANKO WHITENING CREAM NIGHT   
7  NOT221004660K  ABYRA SEAWEED SKIN CARE- SEAWEED CELL RENEWAL ...   
8  NOT181003521K                        AILISI Acne Purifying Cream   
9  NOT220202802K                       AIREE BEAUTYCARE NIGHT CREAM   

                       holder                       manufacturer  \
0        LUNA GROUP SDN. BHD.               ROYAL INTERNATIONAL.   
1           LURVEYA SDN. BHD.          

In [6]:
# Show the columns of df1 and df2
print(df1.columns)
print(df2.columns)

Index(['notif_no', 'product', 'company', 'date_notif'], dtype='object')
Index(['notif_no', 'product', 'holder', 'manufacturer', 'substance_detected',
       'Description'],
      dtype='object')


In [7]:
# Merge the two dataframes df1 and df2 on the notif_no column
common_notifs = pd.merge(df1, df2, on="notif_no")

# Check the columns of the merged dataframe
print(common_notifs)

Empty DataFrame
Columns: [notif_no, product_x, company, date_notif, product_y, holder, manufacturer, substance_detected, Description]
Index: []


In [8]:
# Convert the notif_no column to 'string' datatype and remove any whitespaces
df1['notif_no'] = df1['notif_no'].astype(str).str.strip()
df2['notif_no'] = df2['notif_no'].astype(str).str.strip()

# Check the duplicated notif_no between df1 and df2
common = set(df1['notif_no']) & set(df2['notif_no'])
print("number of duplicated notif_no：", len(common))  # 0 if no duplicated notif_no

number of duplicated notif_no： 0


In [9]:
# Create a 'status' column for df1 and df2 with the boolean values True and False
df1['status'] = True # True represents 'Safe' product
df2['status'] = False # False represents 'Unsafe' product

In [10]:
# Check for correct data type of the 'status' column
print(df1['status'].dtype)  # correct output should be: bool
print(df2['status'].dtype) 

bool
bool


In [11]:
# Function for Data Preprocessing: Title case capitalisation
def clean_text_column(series):
    return series.astype(str).str.strip().str.lower().str.title()

In [12]:
# Apply title case capitalisation on column 'product' and 'company' for df1 
df1['product'] = clean_text_column(df1['product'])
df1['company'] = clean_text_column(df1['company'])

In [13]:
# Check the first 10 entries of df1
print("First 10 entries of df1:\n")
print(df1.head(10))

First 10 entries of df1:

        notif_no                                            product  \
0  NOT200706378K  La Maison Du Savon De Marseille 100G Argan Oil...   
1  NOT200706384K  La Maison Du Savon De Marseille Shampoing Soli...   
2  NOT200706392K  La Maison Du Savon De Marseille 100G Argan Oil...   
3  NOT220707665K             I57 Hypnobeauty Water Booster (Energy)   
4  NOT220707660K             Helen Seward Mediter Therapy Scrub 6/P   
5  NOT220707668K              Eunyul Collagen Special Program Cream   
6  NOT220707669K            Eunyul Collagen Special Program Essence   
7  NOT220707670K          Eunyul Collagen Special Program Eye Cream   
8  NOT220707699K  Hsf Naturals - Jojoba Hair Serum With Organic ...   
9  NOT220707700K              Hsf Naturals - Oceanic Sea Salt Scrub   

                                     company  date_notif  status  
0  La Maison Du Savon De Marseille Sdn. Bhd.  2022-07-25    True  
1  La Maison Du Savon De Marseille Sdn. Bhd.  2022-07-25  

In [14]:
# Apply title case capitalisation on column 'product', 'holder', 'manufacturer', 'substance_detected' for df2
df2['product'] = clean_text_column(df2['product'])
df2['holder'] = clean_text_column(df2['holder'])
df2['manufacturer'] = clean_text_column(df2['manufacturer'])
df2['substance_detected'] = clean_text_column(df2['substance_detected'])

In [15]:
# Check the first 10 entries of df2
print("\nFirst 10 entries of df2:\n")
print(df2.head(10))


First 10 entries of df2:

        notif_no                                            product  \
0  NOT200603276K  Deluxe Beauty - Ultra Lightening Cream Pearl P...   
1  NOT180503436K          3Rd Series Yanko Fade Out Cream Day Cream   
2  NOT180503437K       3Rd Series Yanko Whitening Cream Night Cream   
3  NOT180503439K          5Th Series Yanko Fade Out Cream Day Cream   
4  NOT180503440K       5Th Series Yanko Whitening Cream Night Cream   
5  NOT180503441K               7Th Series Yanko Whitening Cream Day   
6  NOT180503442K             7Th Series Yanko Whitening Cream Night   
7  NOT221004660K  Abyra Seaweed Skin Care- Seaweed Cell Renewal ...   
8  NOT181003521K                        Ailisi Acne Purifying Cream   
9  NOT220202802K                       Airee Beautycare Night Cream   

                       holder                       manufacturer  \
0        Luna Group Sdn. Bhd.               Royal International.   
1           Lurveya Sdn. Bhd.           Taiwan Biotech 

In [16]:
# Export the pre-processed data to CSV files
df1.to_csv("cleaned_cosmetic_notifications.csv", index=False) # df1 to 'cleaned_cosmetic_notifications.csv'
df2.to_csv("cleaned_cosmetic_products_with_risk_description.csv", index=False) # df2 to 'cleaned_cosmetic_products_with_risk_description.csv'

In [17]:
# Read the 'cleaned_.csv' files 
df_1 = pd.read_csv("cleaned_cosmetic_notifications.csv")
df_2 = pd.read_csv("cleaned_cosmetic_products_with_risk_description.csv")

# Rename the column 'holder' to 'company' in df2. No change in the contents
if 'holder' in df_2.columns:
    df_2 = df_2.rename(columns={'holder': 'company'})

# Ensure df1 and df2 have 'company' column (now both have the same column name) and then merge both dataframes
merged_df = pd.concat([df_2, df_1], ignore_index=True, sort=False)

# Save the merged dataframe to a new a CSV file named 'merged_cosmetic_data.csv'
merged_df.to_csv("merged_cosmetic_data.csv", index=False)

# Show the first 10 entries of the merged dataframe
print(merged_df.head(10))

        notif_no                                            product  \
0  NOT200603276K  Deluxe Beauty - Ultra Lightening Cream Pearl P...   
1  NOT180503436K          3Rd Series Yanko Fade Out Cream Day Cream   
2  NOT180503437K       3Rd Series Yanko Whitening Cream Night Cream   
3  NOT180503439K          5Th Series Yanko Fade Out Cream Day Cream   
4  NOT180503440K       5Th Series Yanko Whitening Cream Night Cream   
5  NOT180503441K               7Th Series Yanko Whitening Cream Day   
6  NOT180503442K             7Th Series Yanko Whitening Cream Night   
7  NOT221004660K  Abyra Seaweed Skin Care- Seaweed Cell Renewal ...   
8  NOT181003521K                        Ailisi Acne Purifying Cream   
9  NOT220202802K                       Airee Beautycare Night Cream   

                      company                       manufacturer  \
0        Luna Group Sdn. Bhd.               Royal International.   
1           Lurveya Sdn. Bhd.           Taiwan Biotech Co., Ltd.   
2           Lu

In [18]:
# Read the 'merged_cosmetic_data.csv' file
df = pd.read_csv("merged_cosmetic_data.csv")

# Create a new variable to store the split rows
split_rows = []

# Split the 'Description' column into 'substance_detected' and 'Description' columns
for _, row in df.iterrows():
    notif_no = row['notif_no']
    substances = []
    descriptions = {}

    # Step 1: Split the 'Description'
    description_text = str(row['Description'])
    matches = re.findall(r"【(.*?)】(.*?)(?=(【|$))", description_text, flags=re.DOTALL)
    for match in matches:
        name = match[0].strip().lower().capitalize()  # Standardise the case of the substance name
        desc = match[1].strip().replace('\n', ' ')
        desc = desc.replace("||", "").strip()         # Remove ||
        desc = re.sub(r"\s+", " ", desc)              # Remove whitespace
        descriptions[name] = desc

    # Step 2: Split the substances
    raw_substances = str(row['substance_detected'])
    cleaned_substance_list = re.sub(r"\s+and\s+", ",", raw_substances, flags=re.IGNORECASE)
    substances = [s.strip().lower().capitalize() for s in cleaned_substance_list.split(",")]

    # Step 3: Generate the split structure
    for substance in substances:
        desc = descriptions.get(substance, "")  # If the substance is not found, fill in an empty string
        split_rows.append({
            "notif_no": notif_no,
            "substance_detected": substance,
            "Description": desc
        })

# Convert the split rows to a dataframe
df_split = pd.DataFrame(split_rows)

# Drop duplicates and split into two dataframes
df_risk_info = df_split[["notif_no", "substance_detected"]].drop_duplicates() # For substance detected
df_substance_desc = df_split[["substance_detected", "Description"]].drop_duplicates() # For substance description

# Save the two dataframes to CSV files
df_risk_info.to_csv("cleaned_risk_info.csv", index=False)
df_substance_desc.to_csv("cleaned_substance_description.csv", index=False)


  df = pd.read_csv("merged_cosmetic_data.csv")


In [19]:
# Read the original merged data CSV file
df = pd.read_csv('merged_cosmetic_data.csv')

# Clean by keeping only the necessary columns
columns_needed = ['notif_no', 'product', 'company', 'manufacturer', 'date_notif', 'status']
cleaned_df = df[columns_needed]

# Save the cleaned dataframe to a new CSV file named 'cleaned_product_info.csv'
cleaned_df.to_csv('cleaned_product_info.csv', index=False)


  df = pd.read_csv('merged_cosmetic_data.csv')


In [22]:
# Read the cleaned_risk_info CSV file
df_risk_info = pd.read_csv("cleaned_risk_info.csv")

# Check the total number of entries in the dataframe
print("\n=== Total Rows ===")
print(len(df_risk_info))

# Display the first 10 entries of the dataframe
print("\n=== First 10 Entries ===")
print(df_risk_info.head(10))



=== Total Rows ===
202835

=== First 10 Entries ===
        notif_no substance_detected
0  NOT200603276K            Mercury
1  NOT180503436K            Mercury
2  NOT180503437K            Tretnon
3  NOT180503439K            Mercury
4  NOT180503440K            Tretnon
5  NOT180503441K            Mercury
6  NOT180503442K            Tretnon
7  NOT221004660K            Mercury
8  NOT181003521K        Clindamycin
9  NOT220202802K            Mercury


In [23]:
# Read the cleaned_product_info CSV file
df_product_info = pd.read_csv("cleaned_product_info.csv")

# Check the total number of entries in the dataframe
print("\n=== Total Rows ===")
print(len(df_product_info))

# Display the first 10 entries of the dataframe
print("\n=== First 10 Entries ===")
print(df_product_info.head(10))



=== Total Rows ===
202790

=== First 10 Entries ===
        notif_no                                            product  \
0  NOT200603276K  Deluxe Beauty - Ultra Lightening Cream Pearl P...   
1  NOT180503436K          3Rd Series Yanko Fade Out Cream Day Cream   
2  NOT180503437K       3Rd Series Yanko Whitening Cream Night Cream   
3  NOT180503439K          5Th Series Yanko Fade Out Cream Day Cream   
4  NOT180503440K       5Th Series Yanko Whitening Cream Night Cream   
5  NOT180503441K               7Th Series Yanko Whitening Cream Day   
6  NOT180503442K             7Th Series Yanko Whitening Cream Night   
7  NOT221004660K  Abyra Seaweed Skin Care- Seaweed Cell Renewal ...   
8  NOT181003521K                        Ailisi Acne Purifying Cream   
9  NOT220202802K                       Airee Beautycare Night Cream   

                      company                       manufacturer date_notif  \
0        Luna Group Sdn. Bhd.               Royal International.        NaN   
1      

  df_product_info = pd.read_csv("cleaned_product_info.csv")


In [24]:
# Read the cleaned_substance_description CSV file
df_substance_desc = pd.read_csv("cleaned_substance_description.csv")

# Check the total number of entries in the dataframe
print("\n=== Total Rows ===")
print(len(df_substance_desc))

# Display the first 10 entries of the dataframe
print("\n=== First 10 Entries ===")
print(df_substance_desc.head(10))





=== Total Rows ===
21

=== First 10 Entries ===
  substance_detected                                        Description
0            Mercury  Ingredient's risk: Toxic heavy metal that dama...
1            Tretnon                                                NaN
2        Clindamycin  Ingredient's risk: Contributes to antibiotic r...
3       Hydroquinone  Ingredient's risk: May cause ochronosis (perma...
4            Steroid  Ingredient's risk: Skin thinning, stretch mark...
5    Diphenhydramine  Ingredient's risk: Drowsiness, dizziness, dry ...
6          Tretinoin  Ingredient's risk: Severe skin irritation, inc...
7  Methyl salicylate  Ingredient's risk: Highly toxic if ingested, c...
8            Menthol  Ingredient's risk: Can cause severe irritation...
9             Thymol  Ingredient's risk: Skin and mucous membrane ir...
