In [1]:
import numpy as np 
import pandas as pd
import re  
import matplotlib.pyplot as plt  
import seaborn as sns  
from sklearn.neighbors import NearestNeighbors  
from sklearn.preprocessing import StandardScaler  
from IPython.display import display, HTML
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer  
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import KFold  
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score  
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.neighbors import NearestNeighbors  
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
df_sample= pd.read_csv('Amazon-Products.csv')

# __Исследование данных__

In [6]:
def styled_heading(text, gradient=True):
    background = "linear-gradient(to right, #ff7e5f, #feb47b);" if gradient else "#e4d9bd"
    
    return f"""
    <p style="
        background: {background}; 
        font-family: 'Poppins', sans-serif; 
        font-weight: bold; 
        color: white; 
        font-size: 120%; 
        text-align: center; 
        border-radius: 10px; 
        padding: 10px; 
        box-shadow: 3px 4px 15px rgba(0, 0, 0, 0.2);
        transition: transform 0.3s ease-in-out, box-shadow 0.3s ease-in-out;
        border: 2px solid #ff6b6b;">
        {text}
    </p>
    """

def print_error(message):
    error_style = """
    <p style="background-color: #ff3131;
               font-family: 'Poppins', sans-serif;
               font-weight: bold;
               color: white;
               font-size: 110%;
               text-align: center;
               border-radius: 10px;
               padding: 10px;
               border: 2px solid black;
               box-shadow: 0 4px 10px rgba(0, 0, 0, 0.3);">
        ❌ Error: {message}
    </p>
    """
    display(HTML(error_style.format(message=message)))

def colored_line(color='#323c6a'):
    return f"""<hr style="border: none; height: 3px; background: {color}; margin: 10px 0;">"""

def print_dataset_analysis(train_dataset, n_top=5, heading_color='#323c6a', line_color='#323c6a'):
    try:
        train_heading = styled_heading(f" Top {n_top} rows of Dataset")
        display(HTML(colored_line(line_color)))
        display(HTML(train_heading))
        display(HTML(colored_line(line_color)))
        display(HTML(train_dataset.head(n_top).to_html()))

        summary_heading = styled_heading(" Summary of Dataset")
        display(HTML(colored_line(line_color)))
        display(HTML(summary_heading))
        display(HTML(colored_line(line_color)))
        display(HTML(train_dataset.describe().to_html()))

        null_heading = styled_heading(" Null Values in Dataset")
        train_null_count = train_dataset.isnull().sum()
        display(HTML(colored_line(line_color)))
        display(HTML(null_heading))
        display(HTML(colored_line(line_color)))

        if train_null_count.sum() == 0:
            display(HTML("<p style='color: green; font-weight: bold;'> No null values in the dataset.</p>"))
        else:
            null_values_df = train_null_count[train_null_count > 0].to_frame(name='Null Count')
            display(HTML(null_values_df.to_html()))
            display(HTML("<p style='color: red;'> These are the null values in each column.</p>"))

        duplicate_heading = styled_heading(" Duplicate Values in Dataset")
        train_duplicates = train_dataset.duplicated().sum()
        display(HTML(colored_line(line_color)))
        display(HTML(duplicate_heading))
        display(HTML(colored_line(line_color)))
        display(HTML(f"<p style='color: blue;'>🔄 {train_duplicates} duplicate rows</p>"))


        shape_heading = styled_heading(" Number of Rows and Columns in Dataset")
        display(HTML(colored_line(line_color)))
        display(HTML(shape_heading))
        display(HTML(colored_line(line_color)))
        display(HTML(f"<p style='font-weight: bold;'> Rows: {train_dataset.shape[0]}, Columns: {train_dataset.shape[1]}</p>"))

    except Exception as e:
        print_error(str(e))

def print_unique_values(train_dataset, heading_color='#323c6a', line_color='#323c6a'):
    try:
        unique_values_heading = styled_heading(" Unique Values in Dataset")
        display(HTML(colored_line(line_color)))
        display(HTML(unique_values_heading))
        display(HTML(colored_line(line_color)))

        unique_values_table = """
        <table border='1' style="border-collapse: collapse; width: 100%; text-align: left;">
        <tr style="background-color: #ff7e5f; color: white; font-weight: bold;">
            <th style="padding: 10px; border: 1px solid black;">Column Name</th>
            <th style="padding: 10px; border: 1px solid black;">Data Type</th>
            <th style="padding: 10px; border: 1px solid black;">Unique Values</th>
        </tr>
        """

        for column in train_dataset.columns:
            unique_values = train_dataset[column].unique()[:7]
            unique_values_str = ', '.join(map(str, unique_values))
            data_type = train_dataset[column].dtype
            unique_values_table += f"""
            <tr>
                <td style="padding: 8px; border: 1px solid black;">{column}</td>
                <td style="padding: 8px; border: 1px solid black;">{data_type}</td>
                <td style="padding: 8px; border: 1px solid black;">{unique_values_str}</td>
            </tr>
            """

        unique_values_table += "</table>"
        display(HTML(unique_values_table))

    except Exception as e:
        print_error(str(e))

In [7]:
print_dataset_analysis(df_sample)

Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
0,0,"Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1 Convertible, Copper, Anti-Viral + Pm 2.5 Filter, 2023 Model, White, Gls18I3...",appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sYL._AC_UL320_.jpg,https://www.amazon.in/Lloyd-Inverter-Convertible-Anti-Viral-GLS18I3FWAMC/dp/B0BRKXTSBT/ref=sr_1_4?qid=1679134237&s=kitchen&sr=1-4,4.2,2255,"₹32,999","₹58,990"
1,1,"LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (Copper, Super Convertible 6-in-1 Cooling, HD Filter with Anti-Virus Protectio...",appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctDL._AC_UL320_.jpg,https://www.amazon.in/LG-Convertible-Anti-Virus-Protection-RS-Q19YNZE/dp/B0BQ3MXML8/ref=sr_1_5?qid=1679134237&s=kitchen&sr=1-5,4.2,2948,"₹46,490","₹75,990"
2,2,"LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Copper, Super Convertible 6-In-1 Cooling, Hd Filter With Anti Virus Protection,...",appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctDL._AC_UL320_.jpg,https://www.amazon.in/LG-Inverter-Convertible-protection-RS-Q13JNYE/dp/B0BPYN9JGF/ref=sr_1_6?qid=1679134237&s=kitchen&sr=1-6,4.2,1206,"₹34,490","₹61,990"
3,3,"LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (Copper, Super Convertible 6-in-1 Cooling, HD Filter with Anti-Virus Protectio...",appliances,Air Conditioners,https://m.media-amazon.com/images/I/51JFb7FctDL._AC_UL320_.jpg,https://www.amazon.in/LG-Convertible-Anti-Virus-Protection-RS-Q19JNXE/dp/B0BQ3MJ1TG/ref=sr_1_7?qid=1679134237&s=kitchen&sr=1-7,4.0,69,"₹37,990","₹68,990"
4,4,"Carrier 1.5 Ton 3 Star Inverter Split AC (Copper,ESTER Dxi, 4-in-1 Flexicool Inverter, 2022 Model,R32,White)",appliances,Air Conditioners,https://m.media-amazon.com/images/I/41lrtqXPiWL._AC_UL320_.jpg,https://www.amazon.in/Carrier-Inverter-Split-Copper-Flexicool/dp/B0B67RLLJC/ref=sr_1_8?qid=1679134237&s=kitchen&sr=1-8,4.1,630,"₹34,490","₹67,790"


Unnamed: 0.1,Unnamed: 0
count,551585.0
mean,7006.200471
std,5740.835523
min,0.0
25%,1550.0
50%,5933.0
75%,11482.0
max,19199.0


Unnamed: 0,Null Count
ratings,175794
no_of_ratings,175794
discount_price,61163
actual_price,17813


## __Очистка данных__

In [9]:
df_sample['name'].unique()

array(['Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1 Convertible, Copper, Anti-Viral + Pm 2.5 Filter, 2023 Model, White, Gls18I3...',
       'LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (Copper, Super Convertible 6-in-1 Cooling, HD Filter with Anti-Virus Protectio...',
       'LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Copper, Super Convertible 6-In-1 Cooling, Hd Filter With Anti Virus Protection,...',
       ..., 'Redwolf Noice Toit Smort - Hoodie (Black)',
       'Redwolf Schrute Farms B&B - Hoodie (Navy Blue)',
       'Mothercare Printed Cotton Elastane Girls Infant Leggings (S21VF342MU-P)'],
      dtype=object)

In [11]:
df_sample['name'].duplicated().sum()

155375

In [13]:
duplicate_names = df_sample[df_sample['name'].duplicated(keep=False)]
duplicate_names_sorted = duplicate_names.sort_values(by='name')
duplicate_names_sorted.head(50)

Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
347148,7293,""" SF-400A"" ,10kg x 1gm kitchen weighing scale ...",appliances,Kitchen & Home Appliances,https://m.media-amazon.com/images/I/31So5vgWk-...,https://www.amazon.in/SF-400A-weighing-Electro...,,,₹415,₹899
10092,9372,""" SF-400A"" ,10kg x 1gm kitchen weighing scale ...",appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/SF-400A-weighing-Electro...,,,₹415,₹899
244112,324,"""A"" PLUS Hygiene Portable Room Air Purifier an...",appliances,Heating & Cooling Appliances,https://m.media-amazon.com/images/I/61HihavsUS...,https://www.amazon.in/Hygiene-Portable-Humidif...,4.1,911.0,"₹1,499","₹2,678"
341117,1262,"""A"" PLUS Hygiene Portable Room Air Purifier an...",appliances,Kitchen & Home Appliances,https://m.media-amazon.com/images/I/61HihavsUS...,https://www.amazon.in/Hygiene-Portable-Humidif...,4.1,911.0,"₹1,499","₹2,678"
2342,1622,"""A"" PLUS Hygiene Portable Room Air Purifier an...",appliances,All Appliances,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Hygiene-Portable-Humidif...,4.1,911.0,"₹1,499","₹2,678"
38402,1730,"""INTERN SOPRANO 21"" UKULELE WITH BAG (BLACK)",accessories,Bags & Luggage,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Intern-INT-UK21LD-BK-Con...,3.9,523.0,"₹1,690","₹2,995"
214330,350,"""INTERN SOPRANO 21"" UKULELE WITH BAG (BLACK)",accessories,Handbags & Clutches,https://m.media-amazon.com/images/I/413gQVbxZl...,https://www.amazon.in/Intern-INT-UK21LD-BK-Con...,3.9,523.0,"₹1,690","₹2,995"
117541,13309,"""PH"" POSHAKHUB Women Georgette Hand Embroidery...",women's clothing,Clothing,https://m.media-amazon.com/images/I/61yFCDHRxg...,https://www.amazon.in/PH-POSHAKHUB-Georgette-E...,4.1,26.0,₹989.10,"₹1,666"
135561,5241,"""PH"" POSHAKHUB Women Georgette Hand Embroidery...",women's clothing,Ethnic Wear,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/PH-POSHAKHUB-Georgette-E...,4.1,26.0,₹989.10,"₹1,666"
117071,12839,"""PH"" POSHAKHUB Women's Black Amerian Crepe Foi...",women's clothing,Clothing,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/PH-POSHAKHUB-Printed-Jum...,4.2,57.0,₹809.10,"₹1,304"


На самом деле эти значения не дублируются - адреса изображений, скидки и другие значения отличаются, их лучше оставить, чтобы обеспечить большую релевантность в системе рекомендаций.

In [15]:
def clean_product_names(name):
    if isinstance(name, str):
        name = re.sub(r'[^a-zA-Z0-9\s]+', '', name)
        name = name.lower()
        name = name.strip()
    return name

df_sample['name'] = df_sample['name'].apply(clean_product_names)

In [16]:
df_sample['name'].unique()

array(['lloyd 15 ton 3 star inverter split ac 5 in 1 convertible copper antiviral  pm 25 filter 2023 model white gls18i3',
       'lg 15 ton 5 star ai dual inverter split ac copper super convertible 6in1 cooling hd filter with antivirus protectio',
       'lg 1 ton 4 star ai dual inverter split ac copper super convertible 6in1 cooling hd filter with anti virus protection',
       ..., 'redwolf noice toit smort  hoodie black',
       'redwolf schrute farms bb  hoodie navy blue',
       'mothercare printed cotton elastane girls infant leggings s21vf342mup'],
      dtype=object)

In [19]:
df_sample['main_category'].unique()

array(['appliances', 'car and motorbike', 'tv audio and cameras',
       'sports and fitness', 'grocery and gourmet foods',
       'home and kitchen', 'pet supplies', 'stores',
       'toys and baby products', "kids' fashion", 'bags and luggage',
       'accessories', "women's shoes", 'beauty and health', "men's shoes",
       "women's clothing", 'industrial supplies', "men's clothing",
       'music'], dtype=object)

Надо бы стандартизировать:

In [18]:
def clean_category(category):
    category = category.lower()
    category = category.strip()
    category = category.replace('&', 'and')
    category = category.replace(',', '')
    if "home" in category:
        category = 'home and kitchen'
    return category
df_sample['main_category'] = df_sample['main_category'].apply(clean_category)