In [1]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
embedding = embedding_model.encode('What is the last order date?')

print(len(embedding))

384


In [22]:
from qdrant_client import QdrantClient
import os
from dotenv import load_dotenv
load_dotenv()

client = QdrantClient(
    url="https://d7061209-094a-4e53-8514-261b9664f4b1.us-east-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv('QDRANT_API_KEY'),
    timeout=300
)

In [23]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='sales_data')])

In [24]:
from qdrant_client.models import VectorParams, Distance, PointStruct

if not client.collection_exists(collection_name="amazon_sales_data"):
    client.create_collection(
        collection_name="amazon_sales_data",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE)
    )

In [29]:
data = [
    "My name is Eren Jeager.",
    "I am the Manga Protagonist",
    "I have the power of Attack Titan",
    "I will become the founding TItan"
]

for i,sentence in enumerate(data):
    print(sentence)
    sentence_embedding = embedding_model.encode(sentence)
    print(len(sentence_embedding))
    client.upsert(
        collection_name="amazon_sales_data",
        wait=True,
        points=[PointStruct(id=i, vector=sentence_embedding.tolist(), payload={"text": sentence})]
    )


My name is Eren Jeager.
384
I am the Manga Protagonist
384
I have the power of Attack Titan
384
I will become the founding TItan
384


In [None]:
# {"id":0,"payload":{"text":"My name is Eren Jeager."},"vector":[-0.08021367,0.077232674,0.009683972,-0.012284201,-0.02412347,-0.037259124,0.13313149,0.013217617,0.04877893,-0.023283033,-0.023609154,-0.10090429,0.0021865855,-0.06954921,0.019132357,-0.11597423,0.0323909,0.1132109,0.0018544986,-0.11150991,-0.101764,0.076562665,0.07328048,-0.045957968,-0.02729991,-0.0725306,0.02957693,0.051280603,-0.032551125,-0.12800857,0.035059907,-0.08827656,0.051677365,0.035032444,-0.0060879765,-0.0051962724,-0.069427975,-0.022000683,-0.0183215,0.007077316,0.003458833,-0.020686304,-0.091841854,-0.08495421,0.0076427963,0.0064750244,-0.041669857,0.066852234,0.016110387,0.029999755,-0.06796628,-0.04531114,-0.028113047,-0.017279565,0.07375291,0.022957498,0.055931605,0.02690958,-0.020058218,0.08498525,-0.005730725,0.003072651,-0.109918796,-0.06691236,-0.064041235,-0.019870112,0.0438106,0.07425611,-0.0044348883,0.008682139,0.06409181,-0.014376789,0.011437945,0.05639392,-0.015651517,-0.07211493,-0.028716091,0.06898219,0.044066884,0.08031868,-0.07679837,-0.070746265,-0.054260887,-0.080806784,-0.053057555,0.005430981,0.009056933,0.025669755,0.016405163,-0.007960849,-0.023694322,-0.037435576,-0.005332007,0.03286239,-0.014463317,-0.058593545,-0.014777087,0.036502,-0.059861362,0.08082024,-0.0010815614,0.025385188,0.08667456,0.08457056,0.017320018,0.031950817,-0.002515597,0.008070259,0.041573808,-0.09049667,-0.016119061,-0.010252027,-0.022486484,-0.01130871,0.041577395,-0.039479386,0.03849049,-0.045412187,-0.029426135,-0.027651867,0.07083362,0.116295874,-0.090052314,0.013956171,0.0098455,-0.0317156,0.019464936,-2.0868627e-33,-0.016803818,0.024293162,0.016904287,0.11963235,-0.025264867,0.012457736,-0.12028544,0.028832974,-0.0380688,-0.092006765,0.02304834,-0.07254785,-0.02607385,0.0115309,-0.06480589,0.031387888,0.012821324,0.06022447,0.070814274,0.04459399,-0.036736134,-0.0735673,0.026313523,0.003594487,0.028474757,-0.08203188,-0.000489336,-0.08570028,0.010933096,0.022168618,0.08810326,-0.031786762,0.03342303,0.022795884,0.025252094,-0.015149288,0.017298711,-0.020344542,-0.023722136,0.0039966116,0.0360682,0.043727037,-0.010181375,0.061759055,-0.0066735414,-0.012339429,0.13088684,0.018603748,0.045273263,0.016178496,-0.084499575,-0.036866616,-0.012975127,0.034183063,-0.07870345,0.09773774,0.038132988,0.05878455,-0.049306925,-0.026554955,-0.019383203,0.0772709,-0.03127997,0.09806207,0.07158047,0.061094552,0.020833965,-0.0060264044,0.06065412,-0.028315775,-0.025105149,-0.059640676,0.10011277,0.07650795,-0.01253392,0.025608137,-0.037742473,0.017637404,-0.10315377,-0.02020886,-0.01831238,0.024807319,-0.08648056,0.022577316,0.044200674,0.03410425,0.028396517,-0.048535228,-0.0022969593,0.027534438,0.039594807,0.016146434,0.04071901,-0.03757804,-0.09415141,1.622547e-33,0.03587382,0.007008477,-0.0052152076,0.0582435,0.061827224,-0.0648075,0.049328428,0.13700466,-0.058615044,-0.0134177515,0.013126324,-0.07841013,0.06937662,-0.01713183,-0.02807243,0.07217004,0.008137113,0.04303801,0.010461334,-0.051161483,-0.11261328,0.028370304,-0.07118663,0.050415576,-0.014124503,-0.010868401,0.045490034,0.0036656475,-0.06859565,-0.025067657,-0.0097935265,-0.02131845,-0.115495704,-0.025337135,-0.03722762,0.01835233,0.079542644,-0.0006059098,0.0018755713,-0.061378818,-0.039472297,0.06528638,-0.031448763,0.006881429,-0.017749095,-0.027296193,0.0061024027,0.048883867,-0.030993534,-0.03232896,-0.034058988,0.0068669487,0.00097620283,-0.028310493,0.041263994,-0.021515729,0.0962571,-0.008956543,-0.0512396,0.074139014,-0.029062256,0.048125755,0.029449856,0.089784585,0.09798354,-0.10711775,-0.090962924,0.019356994,0.021218212,-0.029016513,0.08003458,-0.06615405,-0.011397215,0.022949273,-0.083914645,0.008986926,0.05207649,0.06089451,-0.028656852,0.041872907,0.030648954,-0.048196007,-0.04808645,0.014393893,0.017703978,0.0014002958,0.020187669,0.010929512,0.010234363,0.014684535,0.026793448,-0.01723357,0.036991853,-0.014907512,-0.0077281394,-1.7662376e-8,0.050452687,-0.0230493,0.03445931,-0.013685451,-0.0040137083,-0.028479846,0.003768014,0.010040977,-0.058563437,0.04749859,-0.021353377,0.08315722,-0.054933738,0.004675717,0.09875394,-0.040974054,0.01399346,0.10552997,-0.030146409,-0.024889227,0.011171243,0.029989397,-0.031240912,-0.0882971,0.06629935,0.015746454,0.025119912,0.0018529418,-0.001066323,-0.013623688,0.015432152,0.13038224,-0.008190465,0.045035854,-0.002932293,-0.04338443,-0.06016696,0.001815137,-0.015553206,0.029901447,-0.0032151416,0.088203624,0.053716302,-0.014213221,-0.044673033,0.068479694,0.039725352,-0.025956573,0.022666799,0.0140725,0.023674212,0.004017945,0.113510996,0.059366964,0.014589575,0.019980313,0.0013747595,0.022212505,0.054865386,0.0009957941,0.10529928,-0.011614637,-0.061334483,-0.027663376]}

In [35]:
prompt_template = """
    You are a helpful AI assistant analyzing sales data. Use the following context to answer the question accurately.
    
    Context: {context}
    
    Question: {question}
    
    Instructions:
    - Provide specific, data-driven answers based on the context
    - If the context contains numerical data, include relevant numbers in your response
    - If you cannot find the answer in the context, say so clearly
    - Be concise but comprehensive
    - For questions about trends or patterns, explain what the data shows
    - Do not make data on your own, only give answers based on relevant data, otherwise don't answer.
    
    Answer:
"""

query = "Who is the Manga Protagonist?"
query_embedding = embedding_model.encode(query)

results = client._client.query_points(
    collection_name="amazon_sales_data",
    query=query_embedding,
    with_payload=True,
    limit=2
)

for point in results.points:
    print(point.payload['text'])

# Build context string from results
context = "\n".join([point.payload['text'] for point in results.points])


I am the Manga Protagonist
My name is Eren Jeager.


In [36]:
from groq import Groq

llm = Groq(api_key=os.getenv("GROQ_API_KEY"))

filled_prompt = prompt_template.format(context=context, question=query)

response = llm.chat.completions.create(
    model="llama3-70b-8192",
    messages=[
        {
            "role": "system",
            "content": filled_prompt
        },
        {
            "role": "user",
            "content":query
        }
    ],
    temperature=0.1,
    max_tokens=1024
)
answer = response.choices[0].message.content
print("answer:", answer)

answer: According to the provided information, the Manga Protagonist is Eren Jeager.


In [37]:
import pandas as pd
sales_df = pd.read_csv('../data/final/final_sales_data.csv')

In [40]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1', device='cuda')
sales_df['new_embedding'] = model.encode(sales_df['text'].tolist(), show_progress_bar=True).tolist()


Batches: 100%|██████████| 4031/4031 [27:06<00:00,  2.48it/s] 


In [48]:
len(sales_df['new_embedding'][0])

1024

In [44]:
print(len(model.encode('hii')))

1024


In [42]:
# # Upload to Qdrant
# client.upsert(collection_name="sales_data", points=points)


import os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

client = QdrantClient(
    url="https://d7061209-094a-4e53-8514-261b9664f4b1.us-east-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv('QDRANT_API_KEY'),
    port=6333,
    timeout=300,  # Increase timeout in seconds
)

# Create or recreate the collection
client.recreate_collection(
    collection_name="new_sales_data",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE)
)

# Define batch upload function
def batch_upload(sales_df, batch_size=500):
    for start in range(0, len(sales_df), batch_size):
        end = start + batch_size
        points_batch = [
            PointStruct(
                id=i,
                vector=row['new_embedding'],
                payload={
                    "text": row['text'],
                    "metadata": row.to_dict()
                }
            )
            for i, row in sales_df.iloc[start:end].iterrows()
        ]
        client.upsert(collection_name="sales_data", points=points_batch)

# Call the batch upload
batch_upload(sales_df)

  client.recreate_collection(


UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Wrong input: Vector dimension error: expected dim: 384, got 1024"},"time":0.200789019}'

In [1]:
# # Upload to Qdrant
# client.upsert(collection_name="sales_data", points=points)


import os
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, Distance, VectorParams

new_client = QdrantClient(
    url="https://d7061209-094a-4e53-8514-261b9664f4b1.us-east-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv('QDRANT_API_KEY'),
    port=6333,
    timeout=300,  # Increase timeout in seconds
)

# # Create or recreate the collection
# new_client.recreate_collection(
#     collection_name="test_sales_data",
#     vectors_config=VectorParams(size=1024, distance=Distance.COSINE)
# )

# # Define batch upload function
# def batch_upload(sales_df, batch_size=500):
#     for start in range(0, len(sales_df), batch_size):
#         end = start + batch_size
#         points_batch = [
#             PointStruct(
#                 id=i,
#                 vector=row['new_embedding'],
#                 payload={
#                     "text": row['text'],
#                     "metadata": row.to_dict()
#                 }
#             )
#             for i, row in sales_df.iloc[start:end].iterrows()
#         ]
#         new_client.upsert(collection_name="test_sales_data", points=points_batch)

# # Call the batch upload
# batch_upload(sales_df)

In [16]:
llm_query = {
    "parameters": {
        "date_range": ["04-20-22", "04-30-22"],
        "status": "cancelled",
        "ship_city": "BANGALORE"
    },
    "expanded_query": (
        "give me the count of all orders in bangalore "
        "between 04-20-22 to 04-30-22 which are cancelled"
    ),
    "query_representations": [
        "count of all orders in bangalore between 04-20-22 and 04-30-22 which are cancelled",
        "cancelled orders count in bangalore from 04-20-22 and 04-30-22",
        "how many cancelled orders are there in bangalore on 04-20-22 and 04-30-22"
    ],
    "qdrant_filter": {
        "must": [
            {"key": "metadata.status", "match": {"value": "cancelled"}},
            {"key": "metadata.ship_city", "match": {"value": "BANGALORE"}}
        ]
    }
}


In [None]:
{"key": "metadata.date", "match": {"value": "04-20-22"}},
{"key": "metadata.date", "match": {"value": "04-21-22"}},
{"key": "metadata.date", "match": {"value": "04-22-22"}},
{"key": "metadata.date", "match": {"value": "04-23-22"}},
{"key": "metadata.date", "match": {"value": "04-24-22"}},
{"key": "metadata.date", "match": {"value": "04-25-22"}},
{"key": "metadata.date", "match": {"value": "04-26-22"}},
{"key": "metadata.date", "match": {"value": "04-27-22"}},
{"key": "metadata.date", "match": {"value": "04-28-22"}},
{"key": "metadata.date", "match": {"value": "04-29-22"}},
{"key": "metadata.date", "match": {"value": "04-30-22"}}

In [3]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("hkunlp/instructor-large")

In [4]:
query_embedding = embedding_model.encode("give me the count of all canceled orders in bangalore between 04-20-22 to 04-30-22")

In [5]:
query_embedding.shape

(768,)

In [9]:
from qdrant_client import models

In [17]:
for q_rep in llm_query['query_representations']:
    embedding = embedding_model.encode(q_rep)
    print(embedding.shape, q_rep)
    search_result = new_client.query_points(
        collection_name="sales_data_v3",
        query=embedding,
        # Corrected filter with date range
        # query_filter = models.Filter(
        #     must=[
        #         models.FieldCondition(
        #             key="metadata.status",
        #             match=models.MatchValue(value="cancelled")
        #         ),
        #         models.FieldCondition(
        #             key="metadata.ship_city",
        #             match=models.MatchValue(value="BANGALORE")
        #         ),
        #         # models.FieldCondition(
        #         #     key="metadata.date",
        #         #     match=models.MatchValue(value="04-20-22")
        #         # ),
        #         # models.FieldCondition(
        #         #     key="metadata.date",
        #         #     match=models.MatchValue(value="04-30-22")
        #         # )
        #     ]
        # ),
        query_filter=llm_query['qdrant_filter'],
        with_payload=True,
        limit=15 * 2,
        score_threshold=0.3
        
    )
    print(search_result)

(768,) count of all orders in bangalore between 04-20-22 and 04-30-22 which are cancelled
points=[ScoredPoint(id=1748588089008827595, version=685, score=0.90370595, payload={'text': 'Order 406-5569075-7749953 (Cancelled) - Date: 05-16-22, Amount: 0.00 INR, Product: NW008 (NW008-ST-CP-L), Category: Set, Size: L, Ship to: Bangalore, KARNATAKA 560048.0, Fulfilled by: Unknown, Courier: Cancelled', 'metadata': {'order_id': '406-5569075-7749953', 'date': '05-16-22', 'status': 'cancelled', 'amount': 0.0, 'currency': 'INR', 'product_style': 'NW008', 'sku': 'NW008-ST-CP-L', 'category': 'Set', 'size': 'L', 'ship_city': 'BANGALORE', 'ship_state': 'KARNATAKA', 'ship_country': 'IN', 'ship_postal_code': '560048.0', 'fulfilment': 'Amazon', 'fulfilled_by': 'Unknown', 'b2b': False, 'courier_status': 'Cancelled'}}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=1748587797531724568, version=506, score=0.90354335, payload={'text': 'Order 407-8917781-1638716 (Cancelled) - Date: 05-30-22, Am

In [3]:
# store sales data in mysql 
!pip install sqlalchemy pymysql

Defaulting to user installation because normal site-packages is not writeable


In [11]:
import pymysql

# Connect to MySQL server
conn = pymysql.connect(
    host='localhost',
    user='ml',
    password='rgbXYZ@9182',
)
cur = conn.cursor()

# Create the database
db_name = 'sales_db'
cur.execute(f'CREATE DATABASE IF NOT EXISTS {db_name}')
conn.commit()
# cur.close()
# conn.close()

In [8]:
import pandas as pd
from sqlalchemy import create_engine
# Create a connection to the MySQL database
engine = create_engine('mysql+pymysql://ml:rgbXYZ%409182@localhost:3306/sales_db')

df = pd.read_csv("../src/data_generation/shoe_store_dataset.csv")

table_name = "sales_data"
df.to_sql(table_name, con=engine, if_exists='replace', index=False)

print(f"✅ Data uploaded successfully to table `{table_name}` in the database.")

✅ Data uploaded successfully to table `sales_data` in the database.


In [13]:
cur.execute(f"use {db_name}")
cur.execute(f"SELECT * FROM {table_name} LIMIT 5")
rows = cur.fetchall()
for row in rows:
    print(row)
# Close the connection
cur.close()
conn.close()

('SH00000001', 'SH011', '2024-05-20', 'SALE', 'Paragon Sliders - Orange', 9600133890838, 'SH539898', 'Sliders', 'Orange', 7, 'Paragon', 'Sandals', 6404, 656, 1, 65.6, 656, 590.4, 696.67, 2024)
('SH00000001', 'SH011', '2024-05-20', 'SALE', 'Crocs Sports Sandals - Red', 6542351161559, 'SH377370', 'Sports Sandals', 'Red', 6, 'Crocs', 'Sandals', 6404, 953, 1, 0.0, 953, 953.0, 1124.54, 2024)
('SH00000001', 'SH011', '2024-05-20', 'SALE', 'Van Heusen Dress Boots - Cognac', 1849593103413, 'SH205907', 'Dress Boots', 'Cognac', 7, 'Van Heusen', 'Formal Shoes', 6403, 4600, 1, 920.0, 4600, 3680.0, 4342.4, 2024)
('SH00000001', 'SH011', '2024-05-20', 'SALE', 'Action Sandals - Multi-color', 5534192832764, 'SH771088', 'Sandals', 'Multi-color', 6, 'Action', 'Kids Footwear', 6403, 1066, 1, 159.9, 1066, 906.1, 1069.2, 2024)
('SH00000002', 'SH012', '2023-12-17', 'SALE', 'Metro Heels - Black', 4139537672423, 'SH881177', 'Heels', 'Black', 8, 'Metro', 'Ladies Footwear', 6403, 4086, 1, 0.0, 4086, 4086.0, 4821.

In [1]:
import pandasai as pai

df1 = pai.read_csv("../src/data_generation/shoe_store_dataset.csv")

response = df1.chat("What is the average revenue by store?")
print(response)

AttributeError: module 'pandasai' has no attribute 'read_csv'

In [3]:
import pandasai as pai

df = pai.read_csv("../src/data_generation/shoe_store_dataset.csv")

AttributeError: module 'pandasai' has no attribute 'read_csv'

In [None]:
# Assuming you have already imported pandas and pandasai
import pandas as pd
from pandasai import PandasAI
from pandasai.helpers.openai_info import get_openai_callback
from pandasai.llm.openai import OpenAI
from pandasai_litellm

# Load CSV data
df = pd.read_csv("your_sales_data.csv")  # Replace with your actual CSV path

# Initialize PandasAI with your LLM
llm = OpenAI(api_token="your-api-key")  # Replace with your key
pai = PandasAI(llm)

# Create the dataset with schema
sales_dataset = pai.create(
    path="my-org/sales-data",
    df=df,
    description="Sales invoices dataset containing transaction-level data",
    columns=[
        {"name": "InvoiceNo", "type": "string", "description": "Unique invoice number"},
        {"name": "StoreCode", "type": "string", "description": "Store where the transaction took place"},
        {"name": "Date", "type": "date", "description": "Date of the transaction"},
        {"name": "ReceiptType", "type": "string", "description": "Type of receipt (SALE, RETURN, EXCHANGE)"},
        {"name": "Product_Desc", "type": "string", "description": "Product description"},
        {"name": "EAN", "type": "string", "description": "European Article Number (barcode)"},
        {"name": "POSItemID", "type": "string", "description": "POS item ID"},
        {"name": "Article_Name", "type": "string", "description": "Name of the product"},
        {"name": "Colour", "type": "string", "description": "Product color"},
        {"name": "Size", "type": "string", "description": "Size of the product"},
        {"name": "Brand", "type": "string", "description": "Brand name"},
        {"name": "P_Group", "type": "string", "description": "Product group/category"},
        {"name": "HSN", "type": "integer", "description": "HSN code for taxation"},
        {"name": "MRP", "type": "float", "description": "Maximum retail price"},
        {"name": "Quantity", "type": "integer", "description": "Quantity sold"},
        {"name": "Discount", "type": "float", "description": "Discount given on the item"},
        {"name": "Amount", "type": "float", "description": "Total price before discount"},
        {"name": "NetAmount", "type": "float", "description": "Price after discount"},
        {"name": "GrossAmount", "type": "float", "description": "Final price with taxes"},
        {"name": "Year", "type": "integer", "description": "Year of transaction"}
    ]
)
