In [None]:
import pandas as pd
import re
from transformers import BertTokenizer

In [None]:
# Load cleaned data
cleaned_product_data = pd.read_csv('/content/cleaned_product_data.csv')
cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black
...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,,Silver
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,,Black
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,,Brown
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,,Grey


In [None]:
# Function to extract specific attributes from the 'product_title" column
def extract_attributes(product_title):
    # Extracting brand as the first word
    brand = product_title.split()[0]

    # Extracting model (next 1 or 2 words after the brand)
    model = " ".join(product_title.split()[1:])

    model_match = re.search(r'([A-Za-z0-9\s]+?)(?=\s*[-]|i\d+|AMD\s*\w*|M\d+|\(\d*\w*\s*)', model)
    filtered_model = model_match.group(0) if model_match else " ".join(product_title.split()[1:3])

    # Extracting processor
    processor_match = re.search(r'(i\d+\s*\d+\w* Gen|i\d+\s*-\d+\w*|i\d+-\d+\s*\w*|M\d+|AMD\s*[\w\d]+|i\d+)', product_title)
    processor = processor_match.group(0) if processor_match else None

    # Extracting RAM
    ram_match = re.search(r'(\d+GB\s*RAM|\d+GB\s*-R\w+)', product_title)
    ram = ram_match.group(0) if ram_match else None

    # Extracting storage
    storage_match = re.search(r'(\d+GB\s*(SSD|HDD))', product_title)
    storage = storage_match.group(0) if storage_match else None

    return pd.Series([brand, filtered_model, processor, ram, storage])

In [None]:
# Applying the extract_attributes function to the "product_title" column
cleaned_product_data[['Brand', 'Model', 'Processor', 'RAM', 'Storage']] = cleaned_product_data['product_title'].apply(extract_attributes)
cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color,Brand,Model,Processor,RAM,Storage
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black,Dell,Vostro 3520,i3 12th Gen,16GB RAM,512GB SSD
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey,Apple,MacBook Air 13,M1,,
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black,Dell,Vostro 3520,i5 12th Gen,16GB RAM,512GB SSD
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black,Dell,Vostro 3520,i5 12th Gen,8GB RAM,256GB SSD
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black,Acer,Nitro V 15,i7 13th Gen,,512GB SSD
...,...,...,...,...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,,Silver,Asus,X515,i5 11th Gen,8GB RAM,
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,,Black,ASUS,VivoBook 16 F1605VA Intel Core,i5 13th Gen,8GB RAM,512GB SSD
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,,Brown,Lenovo,Ideapad 3,AMD Ryzen,4GB RAM,
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,,Grey,Lenovo,IdeaPad slim,,,


In [None]:
cleaned_product_data.iloc[368]

Unnamed: 0,368
link,https://www.daraz.com.np/products/dell-inspiro...
product_title,Dell Inspiron 16 5625 (AMD Ryzen 5 - 5625U Pro...
price,94500.0
actual_price,102000.0
ratings,
color,Silver
Brand,Dell
Model,Inspiron 16 5625
Processor,AMD Ryzen
RAM,8GB RAM


In [None]:
none_storage_count = cleaned_product_data.isnull().sum()
none_storage_count

Unnamed: 0,0
link,0
product_title,0
price,0
actual_price,0
ratings,436
color,0
Brand,0
Model,0
Processor,170
RAM,285


In [None]:
# Replace NaN values with 0.0 in the 'ratings' column
cleaned_product_data['ratings'] = cleaned_product_data['ratings'].fillna(0.0)

# Replace None values with 'Unknown' in the 'Processor', 'RAM', and 'Storage' columns
cleaned_product_data['Processor'] = cleaned_product_data['Processor'].fillna('Unknown')
cleaned_product_data['RAM'] = cleaned_product_data['RAM'].fillna('Unknown')
cleaned_product_data['Storage'] = cleaned_product_data['Storage'].fillna('Unknown')

cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color,Brand,Model,Processor,RAM,Storage
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black,Dell,Vostro 3520,i3 12th Gen,16GB RAM,512GB SSD
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey,Apple,MacBook Air 13,M1,Unknown,Unknown
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black,Dell,Vostro 3520,i5 12th Gen,16GB RAM,512GB SSD
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black,Dell,Vostro 3520,i5 12th Gen,8GB RAM,256GB SSD
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black,Acer,Nitro V 15,i7 13th Gen,Unknown,512GB SSD
...,...,...,...,...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,0.0,Silver,Asus,X515,i5 11th Gen,8GB RAM,Unknown
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,0.0,Black,ASUS,VivoBook 16 F1605VA Intel Core,i5 13th Gen,8GB RAM,512GB SSD
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,0.0,Brown,Lenovo,Ideapad 3,AMD Ryzen,4GB RAM,Unknown
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,0.0,Grey,Lenovo,IdeaPad slim,Unknown,Unknown,Unknown


In [None]:
# Drop the 'product_title' column
cleaned_product_data = cleaned_product_data.drop('product_title', axis=1)

# Display the first few rows to confirm the column has been dropped
cleaned_product_data.head()

Unnamed: 0,link,price,actual_price,ratings,color,Brand,Model,Processor,RAM,Storage
0,https://www.daraz.com.np/products/dell-vostro-...,55999.0,55999.0,13.0,Black,Dell,Vostro 3520,i3 12th Gen,16GB RAM,512GB SSD
1,https://www.daraz.com.np/products/apple-macboo...,109900.0,139900.0,76.0,Space Grey,Apple,MacBook Air 13,M1,Unknown,Unknown
2,https://www.daraz.com.np/products/dell-vostro-...,68999.0,68999.0,22.0,Black,Dell,Vostro 3520,i5 12th Gen,16GB RAM,512GB SSD
3,https://www.daraz.com.np/products/dell-vostro-...,64000.0,64000.0,8.0,Black,Dell,Vostro 3520,i5 12th Gen,8GB RAM,256GB SSD
4,https://www.daraz.com.np/products/acer-nitro-v...,137999.0,137999.0,2.0,Black,Acer,Nitro V 15,i7 13th Gen,Unknown,512GB SSD


In [None]:
# Save the preprocessed Data
cleaned_product_data.to_csv('preprocessed_product_data.csv', index=False)