In [None]:
!pip install google-search-results cohere pandas numpy sklearn matplotlib ntlk

In [None]:
from serpapi import GoogleSearch
import cohere
from cohere.responses.classify import Example
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import collections
from collections import Counter
import sklearn

In [None]:
import os

# Credentials and identifiers are read from the environment so real keys never
# have to be saved inside the notebook. The fallbacks are the original blank
# placeholders, so the values can still be filled in by hand here.
cohere_key = os.environ.get("COHERE_API_KEY", '')
place_id = os.environ.get("SERPAPI_PLACE_ID", " ")
serp_key = os.environ.get("SERPAPI_API_KEY", " ")

# Cohere client shared by the classify / summarize / generate calls below.
co = cohere.Client(cohere_key)

# Request parameters handed to GoogleSearch in the review-fetching cell.
# NOTE(review): "engine" is still a placeholder — set it to the SerpApi engine
# whose response carries a top-level "reviews" list with comment.text entries.
params = {
  "engine": " ",
  "place_id": place_id,
  "api_key": serp_key,
}

In [None]:
# Load the labelled reviews; later cells read its "Review" and "Liked" columns.
training_data = pd.read_csv('a1_restaurantReviews_HistoricDump.csv')

In [None]:
# Swap the numeric "Liked" flag for the label text used by the classifier.
mapping = {0: "Negative", 1: "Positive"}
training_data["Liked"] = training_data["Liked"].replace(mapping)

# One classification Example per labelled review, in row order.
examples = [
    Example(review_text, label)
    for review_text, label in zip(training_data["Review"], training_data["Liked"])
]
print(training_data)

In [None]:
# Fetch the reviews through SerpApi using the request parameters defined above.
search = GoogleSearch(params)
results = search.get_dict()

# SerpApi reports failures (bad key, unknown place, nothing found) through an
# "error" field in the payload; surface it directly instead of failing later
# with an opaque KeyError on "reviews".
if "error" in results:
    raise RuntimeError(f"SerpApi request failed: {results['error']}")

reviews = results.get("reviews", [])

# Collect the text of every review, skipping entries that carry no comment
# text (nothing to classify for those).
reviews_array = [
    review["comment"]["text"]
    for review in reviews
    if (review.get("comment") or {}).get("text")
]



In [None]:
# Classify every fetched review using the labelled training examples.
response_1 = co.classify(
  inputs=reviews_array,
  examples=examples,
)

# Collect the classifier output column by column: the review text, the
# predicted label, and the confidence scaled by 100.
classifications = response_1.classifications
reviews_dict = {
    "text": [item.input for item in classifications],
    "sentiment": [item.prediction for item in classifications],
    "confidence": [item.confidence * 100 for item in classifications],
}

reviews_sentiment_df = pd.DataFrame(reviews_dict)
print(reviews_sentiment_df)

In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

def extract_adj_noun(text):
    """Return the adjectives and nouns found in ``text`` as one string.

    The text is lower-cased, split with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens tagged JJ, JJR, JJS, NN, NNS, NNP or NNPS are kept in
    their original order and joined with single spaces.
    """
    wanted_tags = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tagged_tokens = pos_tag(word_tokenize(text.lower()))
    kept_words = []
    for token, tag in tagged_tokens:
        if tag in wanted_tags:
            kept_words.append(token)
    return ' '.join(kept_words)

# word_tokenize and pos_tag rely on NLTK data packages that are not bundled
# with the library; fetch them up front so a fresh environment does not stop
# with LookupError. Both the older and the newer resource names are requested
# because the name NLTK looks for depends on the installed version.
# quiet=True keeps the downloader non-interactive.
for nltk_resource in ('punkt', 'punkt_tab',
                      'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource, quiet=True)

# Apply the function to the review text column
reviews_sentiment_df['adj_noun'] = reviews_sentiment_df['text'].apply(extract_adj_noun)
print(reviews_sentiment_df)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the adjective/noun strings. fit() hands back the
# vectorizer itself, so construction and fitting are chained.
vectorizer = CountVectorizer().fit(reviews_sentiment_df['adj_noun'])

# Encode each review as a row of term counts (sparse matrix).
vector = vectorizer.transform(reviews_sentiment_df['adj_noun'])

# Show the resulting matrix
print(vector)

In [None]:
# value_counts() sorts by frequency, so the wedge order depends on the data.
# Build the legend from the counted index instead of a fixed
# ["Positive", "Negative"] list, which mislabels the chart whenever negative
# reviews are the majority and breaks when only one sentiment is present.
sentiment_counts = reviews_sentiment_df['sentiment'].value_counts()
ax = sentiment_counts.plot.pie(
    figsize=(6, 6),
    title="Distribution of reviews per sentiment",
    labels=[''] * len(sentiment_counts),  # wedges stay unlabeled; the legend names them
    autopct='%1.1f%%',
)
labels = list(sentiment_counts.index)
ax.legend(labels, loc=3)
ax.set_aspect('equal')

In [None]:
from sklearn.model_selection import train_test_split
# NOTE: X (the raw text) is kept for reference only; the split below uses the
# count matrix `vector`, whose rows line up with `y`.
X = reviews_sentiment_df['text']
y = reviews_sentiment_df['sentiment']

# A fixed random_state makes the 80/20 split reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(vector, y, test_size=0.2, random_state=42)

print(y_train)

# value_counts() orders the classes by frequency, so take the wedge labels
# from its index rather than assuming "Negative" comes first (or that both
# classes are present in the training split).
train_counts = y_train.value_counts()
plt.pie(train_counts,
        labels=[f"{sentiment} Review" for sentiment in train_counts.index],
        autopct='%0.1f%%')
plt.axis('equal')
plt.title("Vector data")
plt.show()

In [None]:
# Only predictions above this confidence are treated as reliable.
CONFIDENCE_THRESHOLD = 0.6
# NOTE(review): the Cohere summarize endpoint is documented as requiring at
# least 250 characters of input — confirm against the current API reference.
MIN_SUMMARIZE_CHARS = 250


def _generate_keywords(prompt):
    """Send ``prompt`` to the Cohere generate endpoint and return the response.

    Both keyword requests share the same model and sampling settings, so they
    go through this single helper.
    """
    return co.generate(
        model='command-nightly',
        prompt=prompt,
        max_tokens=200,
        temperature=0.750)


def _first_generation(advice, fallback):
    """Return the first generation of ``advice``, or ``fallback`` when ``advice`` is None."""
    return advice.generations[0] if advice is not None else fallback


bad_reviews = []
good_reviews = []

# Sort the confidently classified reviews into positive and negative buckets.
for data in response_1.classifications:
    if float(data.confidence) > CONFIDENCE_THRESHOLD:
        if data.prediction == "Positive":
            good_reviews.append(data.input)
        elif data.prediction == "Negative":
            bad_reviews.append(data.input)

# Join each bucket into a single string
bad_reviews_string = " ".join(bad_reviews)
good_reviews_string = " ".join(good_reviews)

# The summary is produced for the negative reviews
text = bad_reviews_string

# What most of the negative responses talk about. Skipped (left as None) when
# there is too little text to summarize, so the keyword step below still runs.
response_2 = None
if len(text) >= MIN_SUMMARIZE_CHARS:
    response_2 = co.summarize(
      text=text,
      length="short",
      extractiveness="high",
      format="bullets",
      additional_command="give only keywords"
    )

prompt1 = "get key words from the text: " + bad_reviews_string
prompt2 = "get key words from the text: " + good_reviews_string

# Only ask for keywords when a bucket actually contains reviews; an empty
# prompt would make the model invent keywords out of nothing.
advice1 = _generate_keywords(prompt1) if bad_reviews else None
advice2 = _generate_keywords(prompt2) if good_reviews else None

print("Bad reviews keywords\n")
print(_first_generation(advice1, "(no confidently negative reviews)"))
print("\n")
print("Good reviews keywords\n")
print(_first_generation(advice2, "(no confidently positive reviews)"))

In [None]:
!pip install imblearn

In [None]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [None]:
# Fit a logistic-regression classifier on the vectorised training split.
# (No SMOTE or other resampling is performed in this cell.)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` and `logreg` are the same object.
model = logreg

In [None]:
import shap

# Label each column of the count matrix with its vocabulary term so the SHAP
# output refers to words rather than column indices.
feature_names = vectorizer.get_feature_names_out()
explainer = shap.Explainer(model, X_train, feature_names=feature_names)

# Compute SHAP values for the held-out split.
shap_values = explainer(X_test)

In [None]:
# Beeswarm plot of the SHAP values computed above; max_display=10 caps how many features are shown.
shap.plots.beeswarm(shap_values,max_display=10)