In [None]:
import pandas as pd
import numpy as np
# Load the category lookup table and the product listings.
categories_df = pd.read_csv('../data/amazon_categories.csv')
products_df = pd.read_csv('../data/amazon_products.csv')

# Join every product to its category row (categories.id == products.category_id,
# default inner join), then drop the category key 'id': after the join it always
# equals 'category_id'.
data = categories_df.merge(
    products_df, left_on='id', right_on='category_id'
).drop(columns='id')

In [None]:
# Preview the first 10 rows of the merged category/product frame.
data.head(10)

In [None]:
# Raise pandas' display row limit to 4000 (the setting stays in effect for later
# cells), then print how many rows each category_id has.
pd.set_option("display.max_rows", 4000)
print(data["category_id"].value_counts())

In [None]:
# Candidate category ids, written as strings.
category_id_to_keep_str = [
    '45', '46', '47', '48', '49', '50', '52', '71', '72', '84',
    '90', '91', '97', '101', '103', '104', '105', '107', '108', '109',
    '110', '111', '112', '113', '114', '116', '118', '120', '121', '122',
    '123', '173', '174', '270',
]
# NOTE(review): the next assignment replaces the full list above, so only
# category 46 is actually kept - presumably a temporary narrowing; confirm.
category_id_to_keep_str = ['46']
category_id_to_keep = list(map(int, category_id_to_keep_str))

# Keep only the rows whose category_id is in the selected set, then summarise them.
filtered_data = data.loc[data['category_id'].isin(category_id_to_keep)]
filtered_data.info()

In [None]:
# Preview the first 20 rows of the category-filtered frame.
filtered_data.head(20)

In [None]:
# Show how often each boughtInLastMonth value occurs in the filtered frame.
print(filtered_data['boughtInLastMonth'].value_counts())

In [None]:
# Discard the rows whose boughtInLastMonth value is 0.
filtered_data = filtered_data.loc[~filtered_data['boughtInLastMonth'].isin([0])]


In [None]:
# Summarise the frame again now that the boughtInLastMonth == 0 rows are gone.
filtered_data.info()

In [None]:
# Preview the first 20 remaining rows.
filtered_data.head(20)

In [None]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
import json

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Azure OpenAI client; endpoint, key and API version all come from environment
# variables (each lookup yields None when the variable is not set).
client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_VERSION"),
)

# Names of the completion deployment and of the model, also read from the environment.
deployment_name = os.environ.get("AZURE_OPENAI_COMPLETION_DEPLOYMENT_NAME")
model_name = os.environ.get("AZURE_OPENAI_COMPLETION_MODEL")

def create_summary(name, content):
    """Ask the chat model for a short summary of a product description.

    Args:
        name (str): Product title; interpolated into the prompt.
        content (str): Raw description text to summarise; interpolated into the prompt.

    Returns:
        The message content of the first completion choice, exactly as the API
        returned it.

    Note:
        The request uses temperature 1.0, so repeated calls with the same input
        are not expected to produce identical text.
    """
    prompt = f"Create a simple but correct summary of an online shop product description based on the following details in less than 500 characters. Remove rating and pricing information and keep color, size and relevant configuration. If details are available describe how the product can be used and who the ideal customer for this product is. Do not add or use information that is not part of the prompt. Your description should be in english. This is the name of the product: {name} This are the details: {content}"

    response = client.chat.completions.create(
        # NOTE(review): Azure OpenAI normally expects the *deployment* name here;
        # `deployment_name` is read from the environment next to `model_name` but
        # never used - confirm the two values are the same.
        model = model_name,
        temperature=1.0,
        # The prompt is an instruction the model should answer, so it is sent as a
        # "user" turn. Sending it as "assistant" presents it as a previous model reply.
        messages = [{"role" : "user", "content" : prompt}],
    )
    return response.choices[0].message.content


In [None]:
import requests
import json

def web_search_basic(
    name, subscription_key, auth_header_name="Ocp-Apim-Subscription-Key", mkt="en-us"
):
    """Search Bing for a product and return an LLM summary of the result snippets.

    Calls the Bing Web Search API with a query restricted to amazon.de, joins the
    snippets of all returned pages and passes them to `create_summary`.
    Documentation: https://docs.microsoft.com/en-us/bing/search-apis/bing-web-search/overview

    Errors (HTTP errors, timeouts, unexpected response shape, summarisation
    failures) are printed and swallowed so that one failing product does not
    abort a batch run.

    Args:
        name (str): Product name; appended to the search query and passed to
            `create_summary`.
        subscription_key (str): Azure subscription key of Bing Web Search service
        auth_header_name (str): Name of the authorization header
        mkt (str): Market to search in

    Returns:
        str | None: The summary with newline characters removed, or None when the
        search returned no snippets, the summary was empty, or any step failed.
    """
    # Construct a request
    endpoint = "https://api.bing.microsoft.com/v7.0/search"
    params = {"q": "site:amazon.de what is the description of " + name, "mkt": mkt}
    headers = {auth_header_name: subscription_key}

    # Call the API
    try:
        response = requests.get(endpoint, headers=headers, params=params, timeout=10)
        response.raise_for_status()

        json_response = response.json()

        # A response without hits has no "webPages" entry; treat that as "no data"
        # instead of relying on a KeyError.
        pages = json_response.get("webPages", {}).get("value", [])

        # Use the snippets of *all* hits. The previous loop re-assigned the variable
        # on every iteration, so only the last snippet reached the summary.
        snippets = [page["snippet"] for page in pages if page.get("snippet")]
        if not snippets:
            print("No search snippets found for: " + name)
            return None
        description = " ".join(snippets)

        summary = create_summary(name, description)
        if not summary:
            return None
        return summary.replace("\n", "")
    except Exception as ex:
        print(ex)
        print("++The above exception was thrown and handled succesfully++")
        # Do not return `response` here: it is unbound when the request itself
        # failed, and otherwise it is a Response object, not a description.
        return None


In [None]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: Fraction completed. An int is converted to float, any other
            non-float value is treated as 0, and the result is clamped to the
            range 0..1 before drawing.
    """
    bar_length = 20

    # Normalise the input type: ints become floats, everything else that is not
    # a float counts as "no progress".
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    # Clamp to the 0..1 range.
    progress = min(max(progress, 0), 1)

    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True defers clearing until the new line is printed.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))


subscription_key = os.getenv("BING_SUBSCRIPTION_KEY")

print(len(filtered_data))

# Maximum number of products to enrich in one run.
limit = 300

# filtered_data was produced by boolean filtering; take an explicit copy so the
# per-row writes below modify an independent frame (avoids SettingWithCopyWarning).
filtered_data = filtered_data.copy()

# Create the target column up front with object dtype so its dtype does not
# depend on whether the first lookup yields a string or None.
if 'description' not in filtered_data.columns:
    filtered_data['description'] = None

rows_to_process = filtered_data.head(limit).index
# Use the real number of rows as the denominator: with fewer than `limit` rows the
# progress bar would otherwise never reach 100%.
total = len(rows_to_process)
counter = 0

# loop through the filtered data and search for the description of each product
for ind in rows_to_process:
    # .at reads a single cell directly instead of chained ['title'][ind] indexing.
    name = filtered_data.at[ind, 'title']

    description = web_search_basic(name, subscription_key)

    # Store each result immediately so partial results stay in the frame if the
    # loop is interrupted.
    filtered_data.at[ind, 'description'] = description
    counter = counter + 1
    update_progress(counter / total)


print("Processing complete!")

# NOTE(review): this writes every filtered row, including rows beyond `limit` that
# have no description - confirm that is intended.
filtered_data.to_csv('../data/filtered_dataset.csv', index=False)

In [None]:
# Write the frame to CSV without the index column. This repeats the save at the
# end of the previous cell - presumably kept so partial results can be saved by
# hand after an interrupted run; TODO confirm or remove.
filtered_data.to_csv('../data/filtered_dataset.csv', index=False)