# Financial CSV Agent with Native Multi-Step Cohere API

In [11]:
import os
from typing import List

import numpy as np
import cohere
import langchain
import langchain_core
import langchain_experimental
import pandas as pd
from langchain.agents import Tool
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_experimental.utilities import PythonREPL
from dotenv import load_dotenv

# utility function
import src.utils.feedback as feedback
import src.financial_calculator as fc

In [2]:
# versions
print('cohere version:', cohere.__version__)
print('langchain version:', langchain.__version__)
print('langchain_core version:', langchain_core.__version__)
print('langchain_experimental version:', langchain_experimental.__version__)

cohere version: 5.6.2
langchain version: 0.2.11
langchain_core version: 0.2.24
langchain_experimental version: 0.0.63


### setup variables envi

In [3]:
load_dotenv()

True

In [4]:
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
CHAT_URL= "https://api.cohere.ai/v1/chat"
COHERE_MODEL = 'command-r-plus'
co = cohere.Client(api_key=COHERE_API_KEY)

In [7]:
df_grm = fc.process_data('data/food_dataset_v1.csv')

In [8]:
df_grm.head()

Unnamed: 0,merchant_id,date,product,weekly_total_price,weekly_total_discount_price,weekly_total_discounts,merchant_name,merchant_area,category,display,description,total_orders,total_revenue,average_discount,most_ordered_item,least_ordered_item,most_ordered_item_category
0,M001,2024-07-28,"Ice Americano, Hot Latte, Ice Cappucino, Ice M...",1538500.0,0.0,0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Coffee,Sajian Kopi Hitam Dengan Beans Pilihan (Arabica),64,1538500.0,0.0,Ice Americano,Ice Americano,Kentang Goreng
1,M002,2024-07-28,"Hemat Combo 3, Hemat Combo 1, Hemat Super 5, C...",1776000.0,0.0,0,"Abe Steak, Cibubur",jakarta,Aneka nasi/Minuman/Barat,Paket Hemat,1 Chicken + 1 Sirloin + 1 Tenderloin + 3 Es Te...,54,1776000.0,0.0,Chicken Drumstick,Hemat Combo 3,Chicken Drumstick
2,M003,2024-07-28,"Tas ABUBA ( Max 5 Box ), Mango Juice, Vanilla ...",6956400.0,0.0,0,"Abuba Steak, Cikarang",jakarta,Barat,Kantong Belanja,"Kandungan gizi dengan Gramasi 1 gelas, Total ...",99,6956400.0,0.0,Lychee Popping Boba,Tas ABUBA ( Max 5 Box ),Coca Cola
3,M004,2024-07-28,"tempe Crispi, Lonsay, Es Nutri Sari,Sweet Guav...",186000.0,0.0,0,Aneka Gorengan Siaga,jakarta,Jajanan/Minuman,Aneka Gorengan Siaga,tempe + cabe,36,186000.0,0.0,tempe Crispi,tempe Crispi,Prinia Orange Lychee
4,M005,2024-07-28,"Nasi Goreng Sapi, Tiam MIE, Telur CEPLOK, Tia...",204000.0,0.0,0,Arangsimpur,jakarta,Chinese,Nasi Goreng,"Nasi Goreng, Dengan Daging Sapi, Digoreng Meng...",6,204000.0,0.0,Nasi Goreng Sapi,Nasi Goreng Sapi,Koloke


In [12]:
df_review = feedback.feedback_analysis('data/merchant_reviews_sample.csv')

  Merchant_ID    rating                                            reviews
0        M001  4.168409  [["Honey Dijon flavor: okay flavor, but both m...
1        M002  4.247874  [["I'm not sure why, but this is our favorite ...
2        M003  4.096031  [["OMG these things are delicious!@ I'm not a ...
3        M004  4.188612  [['A good tasting light chip that has some val...
4        M005  4.209476  [["I wouldn't even think of buying this produc...
Processing reviews...
Processing sentiment...


In [13]:
df_review

Unnamed: 0,Merchant_ID,rating,reviews,summary,sentiment
0,M001,4.168409,"[[""Honey Dijon flavor: okay flavor, but both m...",A customer describes their experience with dif...,Negative
1,M002,4.247874,"[[""I'm not sure why, but this is our favorite ...",A selection of products which have received en...,Negative
2,M003,4.096031,"[[""OMG these things are delicious!@ I'm not a ...",The reactions to the products in this review a...,Negative
3,M004,4.188612,[['A good tasting light chip that has some val...,These customer reviews vary on their opinion o...,Negative
4,M005,4.209476,"[[""I wouldn't even think of buying this produc...",Some customers focus on the price of the produ...,Positive
5,M006,4.205811,"[[""While the restaurants always do it best, th...",The products reviewed here include curry paste...,Negative
6,M007,4.234375,[['A nice case of chips that are quite tasty. ...,These reviews show how much people love Lay's ...,Negative
7,M008,4.097403,"[['Green Mountain ""Nantucket Blend"" K-Cups mak...","Green Mountain ""Nantucket Blend"" K-Cups make a...",Negative
8,M009,4.290323,[['4 bags of chips were open when I received t...,The customer is happy with the chips they rece...,Positive
9,M010,4.164482,[['I got a wild hair for taffy and ordered thi...,A customer ordered a five pound bag of taffy a...,Negative


## Define Python Tool
Here we define the python tool using langchain's PythonREPL. We also define functions_map that will later be used by the Cohere Agent to correctly map function name to the actual function. Lastly, we define the tools that will be passed in the Cohere API.

In [15]:
# python tool
python_repl = PythonREPL()
python_tool = Tool(
    name="python_repl",
    description="Executes python code and returns the result. The code runs in a static sandbox without interactive mode, so print output or save output to a file.",
    func=python_repl.run,
)
python_tool.name = "python_interpreter"

class ToolInput(BaseModel):
    code: str = Field(description="Python code to execute.")
python_tool.args_schema = ToolInput

def run_python_code(code: str) -> dict:
    """
    Function to run given python code
    """
    input_code = ToolInput(code=code)
    return {'python_answer': python_tool.func(input_code.code)}

functions_map = {
    "run_python_code": run_python_code,
}

tools = [
    {
        "name": "run_python_code",
        "description": "given a python code, runs it",
        "parameter_definitions": {
            "code": {
                "description": "executable python code",
                "type": "str",
                "required": True
            }
        }
    },]


# create cohere agent
def cohere_agent(
    message: str,
    preamble: str,
    tools: List[dict],
    force_single_step=False,
    verbose: bool = False,
) -> str:
    """
    Function to handle multi-step tool use api.

    Args:
        message (str): The message to send to the Cohere AI model.
        preamble (str): The preamble or context for the conversation.
        tools (list of dict): List of tools to use in the conversation.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.

    Returns:
        str: The final response from the call.
    """

    counter = 1

    response = co.chat(
        model=COHERE_MODEL,
        message=message,
        preamble=preamble,
        tools=tools,
        force_single_step=force_single_step,
    )

    if verbose:
        print(f"\nrunning 0th step.")
        print(response.text)

    while response.tool_calls:
        tool_results = []

        if verbose:
            print(f"\nrunning {counter}th step.")

        for tool_call in response.tool_calls:
            output = functions_map[tool_call.name](**tool_call.parameters)
            outputs = [output]
            tool_results.append({"call": tool_call, "outputs": outputs})

            if verbose:
                print(
                    f"= running tool {tool_call.name}, with parameters: {tool_call.parameters}"
                )
                print(f"== tool results: {outputs}")

        response = co.chat(
            model=COHERE_MODEL,
            message="",
            chat_history=response.chat_history,
            preamble=preamble,
            tools=tools,
            force_single_step=force_single_step,
            tool_results=tool_results,
        )

        if verbose:
            print(response.text)

            counter += 1

    return response.text

In [47]:
# df_completed = pd.merge(df_gmr, df_review, left_on= 'merchant_id', right_on='Merchant_ID', how='inner')
# df.drop(['Merchant_ID'], axis=1, inplace=True)
# df_completed = df_completed.drop_duplicates(subset=['merchant_id'])

Unnamed: 0,merchant_id,date,product,weekly_total_price,weekly_total_discount_price,weekly_total_discounts,merchant_name,merchant_area,category,display,description,total_orders,total_revenue,average_discount,most_ordered_item,least_ordered_item
0,M001,2024-07-21,"Hot Millbro, Hot 330 Helateh, Hot Ilusi Kopi, ...",516500.0,0.0,0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Sajian Susu Coklat Milo Plus Espresso Dengan R...,20,516500.0,0.0,Hot Millbro,Hot Millbro
1,M001,2024-07-28,"Ice Almara Kopi (kopi Susu Gula Aren), Ice Haz...",1022000.0,0.0,0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...,44,1022000.0,0.0,Ice Almara Kopi (kopi Susu Gula Aren),Ice Almara Kopi (kopi Susu Gula Aren)
2,M002,2024-07-21,"Chicken Steak Pops, Tenderloin Steak Beef Pop,...",482000.0,0.0,0,"Abe Steak, Cibubur",jakarta,Aneka nasi/Minuman/Barat,New Varian,"Chicken, Kentang, Mix Vegetable, Brown Sauce",19,482000.0,0.0,Chicken Steak Pops,Chicken Steak Pops
3,M002,2024-07-28,"Hemat Super 5, Rib Eye Import 175 Gram, Lemon ...",1294000.0,0.0,0,"Abe Steak, Cibubur",jakarta,Aneka nasi/Minuman/Barat,Paket Hemat,5 Chicken Steak + 5 Es Teh Manis,35,1294000.0,0.0,Hemat Super 5,Hemat Super 5
4,M003,2024-07-21,"Lychee Popping Boba, Mango Juice, Ice Age Melt...",2690600.0,0.0,0,"Abuba Steak, Cikarang",jakarta,Barat,New Menu,"Kandungan gizi dengan Gramasi 1 gelas, Total ...",33,2690600.0,0.0,Lychee Popping Boba,Mango Juice


In [26]:
# test
question_dict ={
    "question1": "What is the average weekly revenue for the merchant?",
    "question2": "What is the common sentiment for the merchant?",
    "question3": "What is the average weekly review for the merchant?",
    "question4": "What is the average weekly revenue for the merchant?",
    "question5": "What is the average weekly sentiment for the merchant?",
    "question6": "Which item has the highest revenue?",
    "question7": "Which merchant has the highest revenue?",

}

preamble = """
You are an expert who answers the user's question. You are working with two pandas dataframe in Python. The name of the dataframe is `data/gmr_metrics.csv` 
for weekly report for merchant and `data/review_summary.csv` for weekly summarization of review and weekly sentiment for each merchant.
Here is a preview of the dataframe of GRM:
{head_df} 
Here is a preview of the dataframe of Review: 
{head_df_review}
""".format(head_df=df_grm.drop('product', axis = 1).head().to_markdown(), 
           head_df_review=df_review.drop('reviews', axis = 1).head().to_markdown())

print(preamble)


You are an expert who answers the user's question. You are working with two pandas dataframe in Python. The name of the dataframe is `data/gmr_metrics.csv` 
for weekly report for merchant and `data/review_summary.csv` for weekly summarization of review and weekly sentiment for each merchant.
Here is a preview of the dataframe of GRM:
|    | merchant_id   | date                |   weekly_total_price |   weekly_total_discount_price |   weekly_total_discounts | merchant_name         | merchant_area   | category                 | display              | description                                                                                                           |   total_orders |   total_revenue |   average_discount | most_ordered_item   | least_ordered_item      | most_ordered_item_category   |
|---:|:--------------|:--------------------|---------------------:|------------------------------:|-------------------------:|:----------------------|:----------------|:--------------------

In [31]:
# import pandas as pd

# # Load dataframes (assuming you have these CSV files in the specified paths)
# df_grm = pd.read_csv('../data/gmr_metrics.csv')
# df_review = pd.read_csv('../data/review_summary.csv')

def run_python_code(code: str) -> dict:
    """
    Function to run given python code
    """
    try:
        input_code = ToolInput(code=code)
        result = python_tool.func(input_code.code)
        return {'python_answer': result}
    except Exception as e:
        return {'error': str(e)}

functions_map = {
    "run_python_code": run_python_code,
}

tools = [
    {
        "name": "run_python_code",
        "description": "given a python code, runs it",
        "parameter_definitions": {
            "code": {
                "description": "executable python code",
                "type": "str",
                "required": True
            }
        }
    },
]

# Create a function to handle the cohere agent interaction
def cohere_agent(
    message: str,
    preamble: str,
    tools: List[dict],
    force_single_step=False,
    verbose: bool = False,
) -> str:
    counter = 0
    response = co.chat(
        model=COHERE_MODEL,
        message=message,
        preamble=preamble,
        tools=tools,
        force_single_step=force_single_step,
    )

    if verbose:
        print(f"\nrunning 0th step.")
        print(response.text)

    while response.tool_calls and counter < 5:
        tool_results = []

        if verbose:
            print(f"\nrunning {counter + 1}th step.")

        for tool_call in response.tool_calls:
            try:
                output = functions_map[tool_call.name](**tool_call.parameters)
                outputs = [output]
                tool_results.append({"call": tool_call, "outputs": outputs})

                if verbose:
                    print(
                        f"= running tool {tool_call.name}, with parameters: {tool_call.parameters}"
                    )
                    print(f"== tool results: {outputs}")
            except Exception as e:
                if verbose:
                    print(f"Error running tool {tool_call.name}: {e}")
                tool_results.append({"call": tool_call, "outputs": [{"error": str(e)}]})

        response = co.chat(
            model=COHERE_MODEL,
            message="",
            chat_history=response.chat_history,
            preamble=preamble,
            tools=tools,
            force_single_step=force_single_step,
            tool_results=tool_results,
        )

        if verbose:
            print(response.text)

        counter += 1

    return response.text

# Questions dictionary
question_dict = {
    "question1": "What is the average weekly revenue for the merchant?",
    "question2": "What is the common sentiment for the merchant?",
    "question3": "What is the average weekly review for the merchant?",
    "question4": "What is the average weekly revenue for the merchant?",
    "question6": "Which item has the highest revenue?",
    "question7": "Which merchant has the highest revenue?",
}

# Create the preamble
preamble = """
You are an expert who answers the user's question. You are working with two pandas dataframe in Python. Also, you are financial analyst that can analyze both 
numerical and categorical data. The name of the dataframe is `data/gmr_metrics.csv`. When you are asked a question, you will analyze it in details and provide
the answer. If the question is not clear, you can ask for clarification. If the user asks a question that is not in the list, you can ask the user to rephrase.
for weekly report for merchant and `data/review_summary.csv` for weekly summarization of review and weekly sentiment for each merchant.
When user ask about which item has highest/lowest/common, you provide the item name along with the value. 
Here is a preview of the dataframe of GRM:
{head_df} 
Here is a preview of the dataframe of Review: 
{head_df_review}
""".format(head_df=df_grm.drop('product', axis=1).head().to_markdown(), 
           head_df_review=df_review.drop('reviews', axis=1).head().to_markdown())

print(preamble)

# Iterate through the questions and get answers
for key, question in question_dict.items():
    print(f"Question: {question}")
    try:
        response = cohere_agent(
            message=question,
            preamble=preamble,
            tools=tools,
            verbose=True,
        )
        print(f"Answer: {response}")
    except Exception as e:
        print(f"Error handling question {key}: {e}")



You are an expert who answers the user's question. You are working with two pandas dataframe in Python. Also, you are financial analyst that can analyze both 
numerical and categorical data. The name of the dataframe is `data/gmr_metrics.csv`. When you are asked a question, you will analyze it in details and provide
the answer. If the question is not clear, you can ask for clarification. If the user asks a question that is not in the list, you can ask the user to rephrase.
for weekly report for merchant and `data/review_summary.csv` for weekly summarization of review and weekly sentiment for each merchant.
When user ask about which item has highest/lowest/common, you provide the item name along with the value. 
Here is a preview of the dataframe of GRM:
|    | merchant_id   | date                |   weekly_total_price |   weekly_total_discount_price |   weekly_total_discounts | merchant_name         | merchant_area   | category                 | display              | description     

In [34]:
question_dict = {
    "question1": "Give me analysis  and insights of the merchant revenue and sentiment, for the sentiment, provide the common sentiment, and what word is used the most in the reviews?",
}

In [35]:
for key, question in question_dict.items():
    print(f"Question: {question}")
    try:
        response = cohere_agent(
            message=question,
            preamble=preamble,
            tools=tools,
            verbose=True,
        )
        print(f"Answer: {response}")
    except Exception as e:
        print(f"Error handling question {key}: {e}")

Question: Give me analysis  and insights of the merchant revenue and sentiment, for the sentiment, provide the common sentiment, and what word is used the most in the reviews?

running 0th step.
I will use Python to analyse the data and provide insights on merchant revenue and sentiment.

running 1th step.
= running tool run_python_code, with parameters: {'code': 'import pandas as pd\n\ndf_grm = pd.read_csv(\'data/gmr_metrics.csv\')\ndf_review = pd.read_csv(\'data/review_summary.csv\')\n\n# Calculate total revenue for each merchant\ntotal_revenue = df_grm.groupby(\'merchant_id\')[\'total_revenue\'].sum()\n\n# Calculate average sentiment score for each merchant\navg_sentiment = df_review.groupby(\'Merchant_ID\')[\'sentiment\'].mean()\n\n# Common sentiment\ncommon_sentiment = df_review[\'sentiment\'].mode()[0]\n\n# Most common words in reviews\nmost_common_words = df_review[\'summary\'].str.split().explode().value_counts()\n\nprint("Total revenue for each merchant:\\n", total_revenue)\np

In [8]:
import src.financial_calculator as fc

In [17]:
df = pd.read_csv('data/food_dataset_v1.csv')

In [18]:
weekly_report =fc.process_data('data/food_dataset_v1.csv')

In [20]:
weekly_report.tail()

Unnamed: 0,merchant_id,date,product,weekly_total_price,weekly_total_discount_price,weekly_total_discounts,merchant_name,merchant_area,category,display,description,total_orders,total_revenue,average_discount,most_ordered_item,least_ordered_item,most_ordered_item_category
682,M683,2024-07-28,"Crunchy Signature Kakao, Crunchy Dirty Mokao, ...",612000.0,25000.0,1,"XIBOBA, Tenggilis Surabaya",surabaya,Minuman,Kakao Series,Cadburry chocolate milk dipadukan dengan choco...,25,587000.0,0.04085,Crunchy Signature Kakao,Crunchy Signature Kakao,Brown Sugar Boba Milk Tea
683,M684,2024-07-28,Dirty Matcha Dalgona With Hokkaido Milk Puddin...,612000.0,25000.0,1,"XIBOBA, Tunjungan Plaza",surabaya,Minuman,Dalgona Matcha Series,Signature biscoff sauce dipadukan dengan susu ...,25,587000.0,0.04085,Dirty Matcha Dalgona With Hokkaido Milk Pudding,Dirty Matcha Dalgona With Hokkaido Milk Pudding,Brown Sugar Boba Milk Tea
684,M685,2024-07-28,Promo Opening Beli 2 Paket Yummy 1 Gratis 1 Ay...,221500.0,171500.0,11,"Yasaka Fried Chicken, Sukolegok",surabaya,Aneka nasi/Ayam & bebek/Cepat saji,Promo Opening,2 Paket Yummy 1 (Paha Bawah/Sayap + Nasi) Grat...,12,50000.0,0.774266,Promo Opening Beli 2 Paket Yummy 1 Gratis 1 Ay...,Promo Opening Beli 2 Paket Yummy 1 Gratis 1 Ay...,Promo Opening Cuman 10 Ribu Dapat 1 Ayam Crisp...
685,M686,2024-07-28,"ICE OVALTINE, ICE BUBLEGUM, ICE TEMUMANTAN, BA...",1240000.0,0.0,0,"Yakini, Krian",surabaya,Bakmie/Kopi/Minuman,ORIGINAL,Ice Creamy Choco Chocolate ( Gelas Jumbo ),99,1240000.0,0.0,ICE OVALTINE,ICE OVALTINE,ICE GREEN TEA
686,M687,2024-07-28,"Black Oreo, Lemon Tea, Ice Cream Vanilla Caram...",2557550.0,0.0,0,"Zeger!, Sidoarjo",surabaya,Minuman/Kopi,Flavour Series,Nikmatnya Premium Black Oreo bikin kamu makin...,79,2557550.0,0.0,Black Oreo,Black Oreo,Lemon Tea


In [22]:
df.head()

Unnamed: 0,merchant_id,date,product,weekly_total_price,weekly_total_discount_price,weekly_total_discounts,merchant_name,merchant_area,category,display,description,total_orders,total_revenue,average_discount,most_ordered_item,least_ordered_item
0,M001,2024-07-21,"Hot Millbro, Hot 330 Helateh, Hot Ilusi Kopi, ...",516500.0,0.0,0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Sajian Susu Coklat Milo Plus Espresso Dengan R...,20,516500.0,0.0,Hot Millbro,Hot Millbro
1,M001,2024-07-28,"Ice Almara Kopi (kopi Susu Gula Aren), Ice Haz...",1022000.0,0.0,0,"330 Kopi, Ciledug",jakarta,Kopi/Minuman/Roti,Signature,Sajian Kopi Susu Gula Aren Yang Berbeda Dari K...,44,1022000.0,0.0,Ice Almara Kopi (kopi Susu Gula Aren),Ice Almara Kopi (kopi Susu Gula Aren)
2,M002,2024-07-21,"Chicken Steak Pops, Tenderloin Steak Beef Pop,...",482000.0,0.0,0,"Abe Steak, Cibubur",jakarta,Aneka nasi/Minuman/Barat,New Varian,"Chicken, Kentang, Mix Vegetable, Brown Sauce",19,482000.0,0.0,Chicken Steak Pops,Chicken Steak Pops
3,M002,2024-07-28,"Hemat Super 5, Rib Eye Import 175 Gram, Lemon ...",1294000.0,0.0,0,"Abe Steak, Cibubur",jakarta,Aneka nasi/Minuman/Barat,Paket Hemat,5 Chicken Steak + 5 Es Teh Manis,35,1294000.0,0.0,Hemat Super 5,Hemat Super 5
4,M003,2024-07-21,"Lychee Popping Boba, Mango Juice, Ice Age Melt...",2690600.0,0.0,0,"Abuba Steak, Cikarang",jakarta,Barat,New Menu,"Kandungan gizi dengan Gramasi 1 gelas, Total ...",33,2690600.0,0.0,Lychee Popping Boba,Mango Juice


In [21]:
text = (
    "Ice cream is a sweetened frozen food typically eaten as a snack or dessert. "
    "It may be made from milk or cream and is flavoured with a sweetener, "
    "either sugar or an alternative, and a spice, such as cocoa or vanilla, "
    "or with fruit such as strawberries or peaches. "
    "It can also be made by whisking a flavored cream base and liquid nitrogen together. "
    "Food coloring is sometimes added, in addition to stabilizers. "
    "The mixture is cooled below the freezing point of water and stirred to incorporate air spaces "
    "and to prevent detectable ice crystals from forming. The result is a smooth, "
    "semi-solid foam that is solid at very low temperatures (below 2 °C or 35 °F). "
    "It becomes more malleable as its temperature increases.\n\n"
    'The meaning of the name "ice cream" varies from one country to another. '
    'In some countries, such as the United States, "ice cream" applies only to a specific variety, '
    "and most governments regulate the commercial use of the various terms according to the "
    "relative quantities of the main ingredients, notably the amount of cream. "
    "Products that do not meet the criteria to be called ice cream are sometimes labelled "
    '"frozen dairy dessert" instead. In other countries, such as Italy and Argentina, '
    "one word is used fo\r all variants. Analogues made from dairy alternatives, "
    "such as goat's or sheep's milk, or milk substitutes "
    "(e.g., soy, cashew, coconut, almond milk or tofu), are available for those who are "
    "lactose intolerant, allergic to dairy protein or vegan."
)

response = co.summarize(
    text=text,
)
print(response)



In [24]:
# only get summary from response

response.summary

'Ice cream is a frozen dessert or snack usually made with cream or milk and flavoured with sweeteners, fruit or spices. It is semi-solid foam with ice crystals and air spaces. It becomes more malleable as its temperature increases. The meaning of the name "ice cream" varies and in some countries, such as Italy, it is used to refer to all variants. Dairy alternatives are available for those allergic to or intolerant of lactose.'

In [25]:
df = pd.read_csv('data/reviews.csv')

In [26]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [27]:
df_review = pd.DataFrame()
df_review['rating'] = df['Score']
df_review['review'] = df['Text']

In [28]:
df_review.head()

Unnamed: 0,rating,review
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [31]:
num_merchants = 687
merchant_ids = [f'M{str(i).zfill(3)}' for i in range(1, num_merchants + 1)]

np.random.seed(0)  # For reproducibility
df_review['Merchant_ID'] = np.random.choice(merchant_ids, size=len(df_review))

In [32]:
df_review

Unnamed: 0,rating,review,Merchant_ID
0,5,I have bought several of the Vitality canned d...,M685
1,1,Product arrived labeled as Jumbo Salted Peanut...,M560
2,4,This is a confection that has been around a fe...,M630
3,2,If you are looking for the secret ingredient i...,M193
4,5,Great taffy at a great price. There was a wid...,M360
...,...,...,...
568449,5,Great for sesame chicken..this is a good if no...,M350
568450,2,I'm disappointed with the flavor. The chocolat...,M569
568451,5,"These stars are small, so you can give 10-15 o...",M444
568452,5,These are the BEST treats for training and rew...,M168


In [33]:
#rearrage Merchant_ID column to the first column

cols = df_review.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_review = df_review[cols]

In [34]:
df_review.head()

Unnamed: 0,Merchant_ID,rating,review
0,M685,5,I have bought several of the Vitality canned d...
1,M560,1,Product arrived labeled as Jumbo Salted Peanut...
2,M630,4,This is a confection that has been around a fe...
3,M193,2,If you are looking for the secret ingredient i...
4,M360,5,Great taffy at a great price. There was a wid...


In [35]:
# group by merchant_id and get the average rating, and list of reviews

df_review_grouped = df_review.groupby('Merchant_ID').agg(
    rating=('rating', 'mean'),
    reviews=('review', list),
).reset_index()

In [36]:
df_review_grouped.head()

Unnamed: 0,Merchant_ID,rating,reviews
0,M001,4.168409,"[Honey Dijon flavor: okay flavor, but both my ..."
1,M002,4.247874,"[I'm not sure why, but this is our favorite pa..."
2,M003,4.096031,[OMG these things are delicious!@ I'm not a ca...
3,M004,4.188612,[A good tasting light chip that has some value...
4,M005,4.209476,[I wouldn't even think of buying this product ...


In [37]:
df_review_grouped.shape

(687, 3)

In [38]:
# for each merchant, applying cohere summarization to the list of reviews,and get summary and store in a new column named 'review_summary'

df_review_grouped['review_summary'] = df_review_grouped['reviews'].apply(
    lambda reviews: co.summarize(text=' '.join(reviews)).summary
)


BadRequestError: status_code: 400, body: {'message': 'invalid request: text size limit exceeded by 281856 characters.'}

In [39]:
def summarize_reviews(reviews):
    text = ' '.join(reviews)
    max_text_size = 3000
    # Truncate the text if it's too long
    if len(text) > max_text_size:
        text = text[:max_text_size]
    # Get the summary
    summary = co.summarize(text=text).summary
    return summary

df_review_grouped['review_summary'] = df_review_grouped['reviews'].apply(summarize_reviews)

In [41]:
df_review_grouped.head()

Unnamed: 0,Merchant_ID,rating,reviews,review_summary
0,M001,4.168409,"[Honey Dijon flavor: okay flavor, but both my ...",I bought the Honey Dijon flavor of Kettle Chip...
1,M002,4.247874,"[I'm not sure why, but this is our favorite pa...",The best tasting pancake & waffle mix and an e...
2,M003,4.096031,[OMG these things are delicious!@ I'm not a ca...,This brand of stevia is the best tasting brand...
3,M004,4.188612,[A good tasting light chip that has some value...,These are the healthiest potato chips you can ...
4,M005,4.209476,[I wouldn't even think of buying this product ...,I wouldn't purchase a product unless I knew th...
