# In [None]:
import os
from urllib.parse import quote

import cv2
import matplotlib.pyplot as plt
import pandas as pd
import pytesseract
import seaborn as sns
from fuzzywuzzy import fuzz
from PIL import Image

# Use seaborn's white-grid theme for every plot drawn below.
sns.set(style='whitegrid')

# 1. Shopping-list text dataset
df_text = pd.read_csv('data/text/shopping_list.csv')
print('Info Dataset Teks Shopping List:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df_text.info()
print('\nContoh Data:')
print(df_text.head())

# Item frequency, most common item first.
plt.figure(figsize=(10, 5))
item_order = df_text['item_name'].value_counts().index
sns.countplot(y='item_name', data=df_text, order=item_order)
plt.title('Distribusi Item di Shopping List')
plt.show()

# 2. Analisis Dataset Gambar Shopping List
def extract_text_from_images(image_folder):
    """Run OCR over every readable image in a folder.

    Args:
        image_folder: Path of the directory whose entries are loaded with
            OpenCV and passed to Tesseract.

    Returns:
        A DataFrame with one row per decoded image and the columns
        ``image`` (file name) and ``text`` (raw Tesseract output). Both
        columns are present even when no image could be read.
    """
    image_texts = []
    # os.listdir returns entries in arbitrary order; sort for stable rows.
    for img_name in sorted(os.listdir(image_folder)):
        img_path = os.path.join(image_folder, img_name)
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports failure by returning None (e.g. for
            # sub-directories or non-image files); cvtColor would raise on it.
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        image_texts.append({'image': img_name, 'text': text})
    # Explicit columns keep the schema intact for an empty result.
    return pd.DataFrame(image_texts, columns=['image', 'text'])

df_images = extract_text_from_images('data/images')
print('Info Dataset Gambar:')
print(df_images.head())

# Show one sample image together with the text OCR produced for it.
# The file name is defined once so the path and the lookup cannot drift apart.
sample_name = 'synthetic_list_1.png'
img_path = f'data/images/{sample_name}'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread returns None instead of raising; fail with a clear message
    # rather than an opaque OpenCV assertion inside cvtColor.
    raise FileNotFoundError(f'Cannot read sample image: {img_path}')
# OpenCV loads pixels as BGR while matplotlib expects RGB.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title('Contoh Gambar Shopping List')
plt.axis('off')
plt.show()
sample_text = df_images.loc[df_images['image'] == sample_name, 'text']
print('Teks yang diekstrak:', sample_text.iloc[0])

# Distribution of the extracted text length (in characters) per image.
df_images['text_length'] = df_images['text'].str.len()
sns.histplot(df_images['text_length'], bins=10)
plt.title('Distribusi Panjang Teks dari Gambar')
plt.show()

# 3. E-commerce product dataset (Olist)
df_products = pd.read_csv('data/ecommerce/olist_products_dataset.csv')
df_categories = pd.read_csv('data/ecommerce/product_category_name_translation.csv')
df_order_items = pd.read_csv('data/ecommerce/olist_order_items_dataset.csv')

# Attach the translated category name to every product; a left join keeps
# products whose category has no translation row.
df_products = df_products.merge(df_categories, on='product_category_name', how='left')
print('Info Dataset Produk:')
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
df_products.info()
print('\nContoh Data Produk:')
# NOTE(review): 'product_name_lenght' must match the CSV header exactly;
# the spelling presumably comes from the dataset itself - confirm.
print(df_products[['product_id', 'product_category_name_english', 'product_name_lenght']].head())

# Product counts for the ten most frequent categories.
plt.figure(figsize=(12, 6))
top_categories = df_products['product_category_name_english'].value_counts().index[:10]
sns.countplot(y='product_category_name_english', data=df_products, order=top_categories)
plt.title('10 Kategori Produk Teratas')
plt.show()

# Price distribution of ordered items, with the category name attached.
product_categories = df_products[['product_id', 'product_category_name_english']]
df_order_items = df_order_items.merge(product_categories, on='product_id', how='left')
sns.histplot(df_order_items['price'], bins=50)
plt.title('Distribusi Harga Produk')
plt.xlabel('Harga (BRL)')
plt.show()

# 4. Mapping Item Shopping List ke Produk
def match_items(shopping_item, product_list, threshold=70, top_n=3):
    """Return the candidates most similar to a shopping-list item.

    Args:
        shopping_item: The item text to look up.
        product_list: Iterable of candidate strings. Duplicates are scored
            once; non-string entries (e.g. NaN or None) are ignored.
        threshold: A candidate is kept only when its ``fuzz.ratio`` score
            is strictly greater than this value.
        top_n: Maximum number of matches to return.

    Returns:
        A list of ``(candidate, score)`` tuples sorted by descending score,
        at most ``top_n`` long. The comparison is case-insensitive.
    """
    query = shopping_item.lower()
    matches = []
    # dict.fromkeys removes duplicates while preserving first-seen order, so
    # one repeated candidate cannot occupy every slot of the result.
    for product in dict.fromkeys(product_list):
        if not isinstance(product, str):
            continue
        score = fuzz.ratio(query, product.lower())
        if score > threshold:
            matches.append((product, score))
    matches.sort(key=lambda pair: pair[1], reverse=True)
    return matches[:top_n]

# Example mapping
shopping_item = 'Susu UHT'
# The English category names are used as the match candidates here.
product_names = df_products['product_category_name_english'].fillna('').tolist()
# The column has one entry per product row, so drop blanks and duplicates
# before matching; otherwise the same category is scored repeatedly and can
# fill every slot of the top matches.
candidate_names = list(dict.fromkeys(name for name in product_names if name))
matches = match_items(shopping_item, candidate_names)
print(f'Match untuk {shopping_item}:')
for product, score in matches:
    print(f'- {product}: {score}%')

# Simulasi pembuatan link (menggunakan nama produk sebagai kata kunci)
def generate_ecommerce_links(item):
    """Build a Shopee Indonesia search URL for a shopping-list item.

    Args:
        item: The search keyword; it is converted with ``str()`` first.

    Returns:
        The search URL with the keyword percent-encoded.
    """
    # safe='' also escapes '/', and quote() covers '&', '#', '+' and non-ASCII
    # text, all of which would otherwise corrupt the query string. Spaces
    # still become '%20'.
    keyword = quote(str(item), safe='')
    shopee_link = f'https://shopee.co.id/search?keyword={keyword}'
    return shopee_link

# Print a search link for each of the first five shopping-list items.
for item in df_text['item_name'].iloc[:5]:
    link = generate_ecommerce_links(item)
    print(f'Link untuk {item}: {link}')