In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams
from transformers import BertTokenizer

In [None]:
# Read the cleaned product listing from CSV into a DataFrame and show it
cleaned_product_data = pd.read_csv(filepath_or_buffer='/content/cleaned_product_data.csv')
cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black
...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,,Silver
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,,Black
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,,Brown
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,,Grey


In [None]:
# Download the NLTK resources used by this notebook.
# 'punkt' backs word_tokenize on older NLTK releases, while recent releases
# (3.9+) look up 'punkt_tab' instead; both are requested so tokenization works
# on a fresh kernel regardless of the installed NLTK version.
# NOTE(review): nothing visible in this notebook uses 'wordnet'; it is kept in
# case a later step relies on it.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Build the English stop-word collection once, as a set, for membership tests
stop_words = {word for word in stopwords.words('english')}

In [None]:
# Function to preprocess the product title
def preprocess_product_title(title):
    """Normalize a product title into a space-separated string of unique tokens.

    Steps, in order:
      1. lowercase the title;
      2. delete every character that is not a-z, 0-9 or whitespace;
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. drop repeated tokens, keeping the first occurrence of each.

    Args:
        title: Raw product title. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing values) is
            treated as a missing title.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when the
        title is missing.

    Notes:
        * Punctuation is deleted rather than replaced by a space, so
          "13-inch" becomes "13inch" and "15.6" becomes "156".
        * De-duplication also applies to numeric tokens, so a second "3" in
          "ideapad 3 amd ryzen 3" is dropped.
    """
    # Missing titles have no .lower(); return an empty result instead of
    # raising AttributeError in the middle of a DataFrame.apply.
    if not isinstance(title, str):
        return ''

    lowered = title.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)  # Remove punctuation
    tokens = [word for word in word_tokenize(alnum_only) if word not in stop_words]

    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in one pass.
    return ' '.join(dict.fromkeys(tokens))

In [None]:
# Run every raw title through preprocess_product_title and keep the result in a new column
cleaned_product_data['processed_title'] = cleaned_product_data['product_title'].map(preprocess_product_title)
cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color,processed_title
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black,dell vostro 3520 i3 12th gen 16gb ram 512gb ss...
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey,apple macbook air 13inch m1 256gb oliz store
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black,dell vostro 3520 i5 12th gen 16gb ram 512gb ss...
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black,dell vostro 3520 i5 12th gen 8gb ram 256gb ssd...
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black,acer nitro v 15 i7 13th gen 13620h 16gb ddr5 5...
...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,,Silver,asus x515 i5 11th gen 8gb ram 512gb ssd 2gb nv...
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,,Black,asus vivobook 16 f1605va intel core i5 13th ge...
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,,Brown,lenovo ideapad 3 amd ryzen 5300u processor 4gb...
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,,Grey,lenovo ideapad slim 3 windows 11 156 inch hd i...


In [None]:
# Function to extract specific attributes from the 'product_title' column
def extract_attributes(product_title):
    """Extract brand, model, processor, RAM and storage from a product title.

    Extraction is heuristic and regex-based. Matching is case-insensitive, so
    both lowercased titles and raw mixed-case titles are handled; matched text
    is returned exactly as it appears in the input.

    Args:
        product_title: Title string to parse. A non-string value (for example
            ``None`` or NaN) or a blank string is treated as a missing title.

    Returns:
        pandas.Series: Five positional values
        ``[brand, model, processor, ram, storage]``.
          * brand: the first whitespace-separated word.
          * model: the text after the brand up to the first hyphen, processor
            marker (``i<digits>``, ``amd``, ``m<digits>``) or ``(``; if that
            yields nothing, the two words following the brand.
          * processor, ram, storage: the first regex match, or ``None`` when
            nothing matches.
        For a missing title all five values are ``None``.
    """
    # A missing or blank title has no first word to use as the brand.
    if not isinstance(product_title, str):
        return pd.Series([None] * 5)
    words = product_title.split()
    if not words:
        return pd.Series([None] * 5)

    # Extracting brand as the first word
    brand = words[0]

    # Everything after the brand is the candidate text for the model
    remainder = " ".join(words[1:])

    model_match = re.search(
        r'([a-z0-9\s]+?)(?=\s*[-]|i\d+|amd\s*\w*|m\d+|\(\d*\w*\s*)',
        remainder,
        flags=re.IGNORECASE,
    )
    # The lazy match runs right up to the marker, so it normally ends with the
    # separating space; strip it so "vostro 3520 " becomes "vostro 3520".
    filtered_model = model_match.group(0).strip() if model_match else ""
    if not filtered_model:
        filtered_model = " ".join(words[1:3])

    # Extracting processor
    processor_match = re.search(
        r'(i\d+\s*\d+\w* gen|i\d+\s*-\d+\w*|i\d+-\d+\s*\w*|m\d+|amd\s*[\w\d]+|i\d+)',
        product_title,
        flags=re.IGNORECASE,
    )
    processor = processor_match.group(0) if processor_match else None

    # Extracting RAM
    ram_match = re.search(r'(\d+gb\s*ram|\d+GB\s*-r\w+)', product_title, flags=re.IGNORECASE)
    ram = ram_match.group(0) if ram_match else None

    # Extracting storage
    storage_match = re.search(r'(\d+gb\s*(ssd|hdd))', product_title, flags=re.IGNORECASE)
    storage = storage_match.group(0) if storage_match else None

    return pd.Series([brand, filtered_model, processor, ram, storage])

In [None]:
# # Applying the extract_attributes function to the "product_title" column
# cleaned_product_data[['Brand', 'Model', 'Processor', 'RAM', 'Storage']] = cleaned_product_data['processed_title'].apply(extract_attributes)
# cleaned_product_data

Unnamed: 0,link,product_title,price,actual_price,ratings,color,processed_title,Brand,Model,Processor,RAM,Storage
0,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i3 12th Gen | 16GB RAM | 512G...,55999.0,55999.0,13.0,Black,dell vostro 3520 i3 12th gen 16gb ram 512gb ss...,dell,vostro 3520,i3 12th gen,16gb ram,512gb ssd
1,https://www.daraz.com.np/products/apple-macboo...,Apple MacBook Air 13-inch M1 256GB - Oliz Store,109900.0,139900.0,76.0,Space Grey,apple macbook air 13inch m1 256gb oliz store,apple,macbook air 13inch,m1,,
2,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 16GB RAM | 512G...,68999.0,68999.0,22.0,Black,dell vostro 3520 i5 12th gen 16gb ram 512gb ss...,dell,vostro 3520,i5 12th gen,16gb ram,512gb ssd
3,https://www.daraz.com.np/products/dell-vostro-...,Dell Vostro 3520 i5 12th Gen | 8GB RAM | 256GB...,64000.0,64000.0,8.0,Black,dell vostro 3520 i5 12th gen 8gb ram 256gb ssd...,dell,vostro 3520,i5 12th gen,8gb ram,256gb ssd
4,https://www.daraz.com.np/products/acer-nitro-v...,Acer Nitro V 15 i7 13th Gen 13620H | 16GB DDR5...,137999.0,137999.0,2.0,Black,acer nitro v 15 i7 13th gen 13620h 16gb ddr5 5...,acer,nitro v 15,i7 13th gen,,512gb ssd
...,...,...,...,...,...,...,...,...,...,...,...,...
528,https://www.daraz.com.np/products/asus-x515-i5...,Asus X515 i5 11th Gen 8GB RAM 512GB ssd 2GB Nv...,75990.0,88000.0,,Silver,asus x515 i5 11th gen 8gb ram 512gb ssd 2gb nv...,asus,x515,i5 11th gen,8gb ram,512gb ssd
529,https://www.daraz.com.np/products/asus-vivoboo...,ASUS VivoBook 16 F1605VA Intel Core i5 13th Ge...,89990.0,110000.0,,Black,asus vivobook 16 f1605va intel core i5 13th ge...,asus,vivobook 16 f1605va intel core,i5 13th gen,8gb ram,512gb ssd
530,https://www.daraz.com.np/products/lenovo-ideap...,"Lenovo Ideapad 3 AMD Ryzen 3 5300U processor, ...",52000.0,52000.0,,Brown,lenovo ideapad 3 amd ryzen 5300u processor 4gb...,lenovo,ideapad 3,amd ryzen,4gb ram,
531,https://www.daraz.com.np/products/lenovo-ideap...,Lenovo IdeaPad slim 3 Windows 11 15.6 inch HD ...,55000.0,55000.0,,Grey,lenovo ideapad slim 3 windows 11 156 inch hd i...,lenovo,ideapad slim,,,


In [None]:
# Spot-check a single record: show every column of the row at integer position 368
cleaned_product_data.iloc[368]

Unnamed: 0,368
link,https://www.daraz.com.np/products/dell-inspiro...
product_title,Dell Inspiron 16 5625 (AMD Ryzen 5 - 5625U Pro...
price,94500.0
actual_price,102000.0
ratings,
color,Silver
processed_title,dell inspiron 16 5625 amd ryzen 5 5625u proces...


In [None]:
# Count missing values per column (despite the variable's name, this covers every column)
none_storage_count = cleaned_product_data.isna().sum()
none_storage_count

Unnamed: 0,0
link,0
product_title,0
price,0
actual_price,0
ratings,436
color,0
processed_title,0


In [None]:
# # Replace NaN values with 0.0 in the 'ratings' column
# cleaned_product_data['ratings'] = cleaned_product_data['ratings'].fillna(0.0)

# # Replace None values with 'Unknown' in the 'Processor', 'RAM', and 'Storage' columns
# cleaned_product_data['Processor'] = cleaned_product_data['Processor'].fillna('Unknown')
# cleaned_product_data['RAM'] = cleaned_product_data['RAM'].fillna('Unknown')
# cleaned_product_data['Storage'] = cleaned_product_data['Storage'].fillna('Unknown')

# cleaned_product_data

KeyError: 'Processor'

In [None]:
# Drop the 'product_title' column.
# errors='ignore' keeps this cell re-runnable: the frame is rebound to the same
# name, so a second execution would otherwise raise KeyError because the
# column is already gone.
cleaned_product_data = cleaned_product_data.drop(columns=['product_title'], errors='ignore')

# Display the first few rows to confirm the column has been dropped
cleaned_product_data.head()

Unnamed: 0,link,price,actual_price,ratings,color,processed_title
0,https://www.daraz.com.np/products/dell-vostro-...,55999.0,55999.0,13.0,Black,dell vostro 3520 i3 12th gen 16gb ram 512gb ss...
1,https://www.daraz.com.np/products/apple-macboo...,109900.0,139900.0,76.0,Space Grey,apple macbook air 13inch m1 256gb oliz store
2,https://www.daraz.com.np/products/dell-vostro-...,68999.0,68999.0,22.0,Black,dell vostro 3520 i5 12th gen 16gb ram 512gb ss...
3,https://www.daraz.com.np/products/dell-vostro-...,64000.0,64000.0,8.0,Black,dell vostro 3520 i5 12th gen 8gb ram 256gb ssd...
4,https://www.daraz.com.np/products/acer-nitro-v...,137999.0,137999.0,2.0,Black,acer nitro v 15 i7 13th gen 13620h 16gb ddr5 5...


In [None]:
# Write the preprocessed frame to CSV, leaving the DataFrame index out of the file
cleaned_product_data.to_csv(path_or_buf='preprocessed_product_data.csv', index=False)