In [1]:
#%pip install python-decouple

In [2]:
#import libraries
from decouple import config
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.prompts import PromptTemplate, PromptTemplateManager
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
import pandas as pd
from sklearn.metrics import classification_report 
import import_ipynb
import sys
sys.path.append('../')
from Preprocessing import Train_Test_split as ts

Training data shape: (452068, 3)
Test data shape: (113018, 3)
Training samples: 361654, Validation samples: 90414, Test samples: 113018


In [3]:
# Read the watsonx.ai API key and the project ID through python-decouple's
# config() so neither value has to be edited inside the notebook.
WX_API_KEY = config('WX_API_KEY')
# WX_PROJECT_ID is optional: when it is not configured, the project that was
# previously hard-coded here is used, so existing setups keep working.
PROJECT_ID = config('WX_PROJECT_ID', default="f79c2d38-7ee2-4de6-931a-dad71b72d34f")

In [4]:
# Connection settings: the us-south watsonx.ai endpoint plus the API key
# held in WX_API_KEY.
credentials = Credentials(api_key=WX_API_KEY, url="https://us-south.ml.cloud.ibm.com")

# API client bound to the project identified by PROJECT_ID.
client = APIClient(project_id=PROJECT_ID, credentials=credentials)

In [5]:
# Disabled alternative configuration (temperature=0.2, max_new_tokens=3).
# It is wrapped in a string literal rather than commented out, so running this
# cell defines nothing and only echoes the string as the cell's output.
'''PARAMS = TextGenParameters(
    temperature=0.2,      # Higher temperature means more randomness
    max_new_tokens=3, # Maximum number of tokens to generate
    min_new_tokens=1, # Minimum number of tokens to generate
)

model = ModelInference(
    api_client=client,
    model_id="mistralai/mistral-small-3-1-24b-instruct-2503",
    params=PARAMS
)
TextGenParameters.show()'''

'PARAMS = TextGenParameters(\n    temperature=0.2,      # Higher temperature means more randomness\n    max_new_tokens=3, # Maximum number of tokens to generate\n    min_new_tokens=1, # Minimum number of tokens to generate\n)\n\nmodel = ModelInference(\n    api_client=client,\n    model_id="mistralai/mistral-small-3-1-24b-instruct-2503",\n    params=PARAMS\n)\nTextGenParameters.show()'

In [6]:
# Generation settings for this run: low temperature and a 1-2 token reply,
# since the prompt asks for a single-character answer.
# NOTE(review): temperature presumably only has an effect when the service
# uses sampling decoding; no decoding_method is set here - TODO confirm.
PARAMS = TextGenParameters(temperature=0.1, min_new_tokens=1, max_new_tokens=2)

# Inference handle for the Mistral instruct model, used by the cells below.
model = ModelInference(
    model_id="mistralai/mistral-small-3-1-24b-instruct-2503",
    params=PARAMS,
    api_client=client,
)

# Print the table of available text-generation parameters.
TextGenParameters.show()

+-----------------------+----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| PARAMETER             | TYPE                                   | EXAMPLE VALUE                                                                                                                             |
| decoding_method       | str, TextGenDecodingMethod, NoneType   | sample                                                                                                                                    |
+-----------------------+----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
| length_penalty        | dict, TextGenLengthPenalty, NoneType   | {'decay_factor': 2.5, 'start_index': 5}                                                                  

In [7]:
#This is partly generated using Copilot suggestions
#This prompt instructs the model to return the sentiment of a single review as one number from 1 to 3

def sentiment_prompt(text):
    """Build the single-line classification prompt for one review.

    The instructions ask the model for exactly one character:
    1 (Negative), 2 (Neutral) or 3 (Positive).

    Parameters
    ----------
    text : str
        Review text to classify. Other values are converted with ``str``.

    Returns
    -------
    str
        The instructions followed by the review, containing no line breaks.
    """
    prompt = """ 
    Return only a number based on the sentiment for the text. Do not give an explanation. Your response should only be a single character:
    1 for Negative
    2 for Neutral
    3 for Positive.

    text: {}

    """
    # The template lines are indented, so removing their newlines still leaves
    # whitespace between the individual instructions.
    instructions = prompt.replace('\n', '')
    # Line breaks inside the review become spaces; deleting them outright would
    # glue the last word of one line to the first word of the next.
    review = ' '.join(str(text).splitlines())
    # Format last so the review text is inserted verbatim (braces included).
    return instructions.format(review)

In [8]:
#apply the LLM to a single review

def get_sentiment_llm(review):
    """Classify one review with the LLM and return the model's answer.

    Parameters
    ----------
    review : str
        Text of a single review.

    Returns
    -------
    str
        The generated text with surrounding whitespace removed. The prompt
        asks the model to answer with "1", "2" or "3".
    """
    prompt = sentiment_prompt(review)  # build the instruction prompt for this review
    response = model.generate_text(prompt)  # send the prompt to the module-level model
    # The model may emit a space or newline around the digit; strip it so the
    # stored label is exactly the character that was asked for.
    if isinstance(response, str):
        response = response.strip()
    return response

In [9]:
# Keep only the columns used in this notebook, then display the selection.
df = ts.BI_df[
    [
        "business_id",
        "text",
        "city",
        "date",
        "stars_3_scale",
        "address",
    ]
]
df

Unnamed: 0,business_id,text,city,date,stars_3_scale,address
31,aJvxWyQIG5OLfBw3qAe8xA,"Ordered Caramel frappe at Drive thru, BIG MIST...",Nashville,2017-12-29 19:38:31,1,2308 Murfreesboro Pike
46,MjZQqZAmJeMco_Vq-Y9h-g,Drum-roll please! Review #100 coming right up!...,Carmel,2014-02-05 19:38:24,3,13170 North Hazel Dell Pkwy
54,u7MJKcNdZXYyTeb67vD5jw,We stopped here for my Chai and Hubby's coffee...,Mount Laurel,2017-02-09 04:35:39,3,5 Hartford Rd
89,saJFbz12EnzanelpD8_xXQ,There's been three times that I've ordered a g...,Riverview,2016-08-25 14:08:18,1,10716 Big Bend Rd
104,KiE0h68HGOO7ZXAqkMBdiw,"I went in when they had 4 people working, wait...",Santa Barbara,2016-01-30 01:10:42,1,1015 De La Vina St
...,...,...,...,...,...,...
586567,-85kJMtb9wqNWDT8yLbitw,Fair service- messed up my drink cuppa times....,Franklin,2021-12-13 22:08:14,2,"438 Main St, Space 168, Space 168"
586568,Ddg-J_j0YFErk7wpMtH_0A,On my way out of Reno last week I noted a new ...,Reno,2021-12-28 19:26:37,3,986 Ambassador Dr
586638,2rmpfdyV2POqpXtmPHO_IQ,always closing drive thru. i've driven here so...,Tucson,2021-12-28 18:12:04,1,6363 N La Cholla
586646,GxuxCctcz3Hyk0wnuly7vQ,This is now one of two Starbucks in the shoppi...,Tucson,2017-01-21 00:19:27,3,4811 E Grant


In [10]:
# First chunk (rows 0-4999): one LLM call per review, stored in a new
# "llm_sentiment" column on an independent copy of the slice.
df1 = df.iloc[:5000].assign(
    llm_sentiment=lambda chunk: chunk["text"].apply(get_sentiment_llm)
)

In [11]:
# Display the first chunk to check the new llm_sentiment column.
df1

Unnamed: 0,business_id,text,city,date,stars_3_scale,address,llm_sentiment
31,aJvxWyQIG5OLfBw3qAe8xA,"Ordered Caramel frappe at Drive thru, BIG MIST...",Nashville,2017-12-29 19:38:31,1,2308 Murfreesboro Pike,1
46,MjZQqZAmJeMco_Vq-Y9h-g,Drum-roll please! Review #100 coming right up!...,Carmel,2014-02-05 19:38:24,3,13170 North Hazel Dell Pkwy,3
54,u7MJKcNdZXYyTeb67vD5jw,We stopped here for my Chai and Hubby's coffee...,Mount Laurel,2017-02-09 04:35:39,3,5 Hartford Rd,3
89,saJFbz12EnzanelpD8_xXQ,There's been three times that I've ordered a g...,Riverview,2016-08-25 14:08:18,1,10716 Big Bend Rd,1
104,KiE0h68HGOO7ZXAqkMBdiw,"I went in when they had 4 people working, wait...",Santa Barbara,2016-01-30 01:10:42,1,1015 De La Vina St,1
...,...,...,...,...,...,...,...
136027,RpxJ4-rYuqAI2an7WGELFQ,Now OPEN inside Silver Legacy on the casino fl...,Reno,2014-01-12 18:05:32,2,407 N Virginia St,2
136094,4QnaPR1FJlONov3ANyuOdg,Plenty places to sit inside and outside well d...,Valrico,2015-01-26 03:54:07,3,3482 Lithia Pinecrest Rd,3
136137,B6k9V9q3BaSCjWtDyNipfA,There is a reason this is one of the two locat...,Bala Cynwyd,2011-09-12 18:33:50,3,138 Montogomery Ave,3
136146,_T8tOBDr3NKNxd0yl5GwjA,I think this is the best Starbucks in the Sant...,Goleta,2018-03-22 01:51:44,3,7030 Marketplace Dr,3


In [12]:
# Second chunk (rows 5000-9999): one LLM call per review, stored in a new
# "llm_sentiment" column on an independent copy of the slice.
df2 = df.iloc[5000:10000].assign(
    llm_sentiment=lambda chunk: chunk["text"].apply(get_sentiment_llm)
)

In [13]:
# Third chunk (rows 10000-14999): one LLM call per review, stored in a new
# "llm_sentiment" column on an independent copy of the slice.
df3 = df.iloc[10000:15000].assign(
    llm_sentiment=lambda chunk: chunk["text"].apply(get_sentiment_llm)
)

In [14]:
# Final chunk (row 15000 to the end): one LLM call per review, stored in a
# new "llm_sentiment" column on an independent copy of the slice.
df4 = df.iloc[15000:].assign(
    llm_sentiment=lambda chunk: chunk["text"].apply(get_sentiment_llm)
)

In [15]:
# Stack the four scored chunks row-wise, in order, keeping their index labels.
final_df = pd.concat(objs=[df1, df2, df3, df4], axis=0)

In [16]:
# Print row count, dtypes and non-null counts of the combined frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21575 entries, 31 to 586656
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   business_id    21575 non-null  object        
 1   text           21575 non-null  object        
 2   city           21575 non-null  object        
 3   date           21575 non-null  datetime64[ns]
 4   stars_3_scale  21575 non-null  int64         
 5   address        21575 non-null  object        
 6   llm_sentiment  21575 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.3+ MB


In [17]:
# Preview the first ten scored reviews.
final_df.head(n=10)

Unnamed: 0,business_id,text,city,date,stars_3_scale,address,llm_sentiment
31,aJvxWyQIG5OLfBw3qAe8xA,"Ordered Caramel frappe at Drive thru, BIG MIST...",Nashville,2017-12-29 19:38:31,1,2308 Murfreesboro Pike,1
46,MjZQqZAmJeMco_Vq-Y9h-g,Drum-roll please! Review #100 coming right up!...,Carmel,2014-02-05 19:38:24,3,13170 North Hazel Dell Pkwy,3
54,u7MJKcNdZXYyTeb67vD5jw,We stopped here for my Chai and Hubby's coffee...,Mount Laurel,2017-02-09 04:35:39,3,5 Hartford Rd,3
89,saJFbz12EnzanelpD8_xXQ,There's been three times that I've ordered a g...,Riverview,2016-08-25 14:08:18,1,10716 Big Bend Rd,1
104,KiE0h68HGOO7ZXAqkMBdiw,"I went in when they had 4 people working, wait...",Santa Barbara,2016-01-30 01:10:42,1,1015 De La Vina St,1
135,saJFbz12EnzanelpD8_xXQ,Most of the time I go through the drive thru h...,Riverview,2015-04-01 16:20:54,3,10716 Big Bend Rd,3
165,BauybYsfqd0y6KDrJ6ZxjQ,i dont know what has happened to the in store ...,Tucson,2013-11-14 03:36:28,1,"7288 E Broadway Blvd, Ste 150",1
169,BauybYsfqd0y6KDrJ6ZxjQ,Nothing makes my busy day easy like my iced co...,Tucson,2013-10-24 07:10:38,1,"7288 E Broadway Blvd, Ste 150",1
205,RCy4M2ND4YK0uRbodV_v8g,Starbucks...so aren't they all just clones? a...,New Orleans,2011-08-24 21:28:36,2,800 Harrison Ave,1
219,aJvxWyQIG5OLfBw3qAe8xA,Much better than the one on Bell Rd. near Red ...,Nashville,2012-11-06 19:48:05,3,2308 Murfreesboro Pike,3


In [18]:
# Install pyarrow into the kernel's environment.
# NOTE(review): presumably needed as the engine for the to_parquet call below - confirm;
# the version is not pinned.
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [19]:
# Persist the scored reviews as Parquet under the temperature-0.1 file name.
# Disabled call with the other file name, left as found:
#final_df.to_parquet('llm_sentiment.parquet')
final_df.to_parquet(path='llm_sentiment_temp0_1.parquet')

In [20]:
# Persist the scored reviews as JSON Lines (one record per line) under the
# temperature-0.1 file name.
# Disabled call with the other file name, left as found:
#final_df.to_json("llm_sentiment.json", orient="records", lines=True)
final_df.to_json(
    path_or_buf="llm_sentiment_temp0_1.json",
    lines=True,
    orient="records",
)