# Solution
# (7) Practice Learning Activity: Monitor and improve Virtual Agent performance through user satisfaction ratings and feedback
##### (GenAI Life Cycle Phase 7: Monitoring and Improvement self-practice)
---

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from detoxify import Detoxify
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Absolute path of the Yelp dataset CSV that the rest of this script analyses
csv_path = "/home/ailtk-learner/Documents/GitHub/capstone-ailtk/case-navigation-module/case-files/yelp_academic_dataset_business.csv"

# Read the file into a DataFrame, then print its first five rows as a quick
# sanity check of the columns that were loaded
df = pd.read_csv(filepath_or_buffer=csv_path)
print(df.head(n=5))

# ---- WORD CLOUD: REVIEWS ----
plt.figure(figsize=(8, 6))

# Drop missing reviews, force everything to text and merge it into a single
# space-separated string, which is the input WordCloud.generate() is given.
non_missing_reviews = df["review"].dropna().astype(str)  # Adjust column name if necessary
review_text = " ".join(non_missing_reviews)

# Configure the cloud first, then populate it from the merged text
wordcloud_reviews = WordCloud(
    width=600,
    height=400,
    background_color="white",
    colormap="viridis",
)
wordcloud_reviews.generate(review_text)

# Render the cloud as an image without axis ticks
plt.imshow(wordcloud_reviews, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud: Reviews")
plt.show()

# ---- TOXICITY ANALYSIS ----
# Score every non-missing review with the Detoxify 'original' model and warn
# about any review whose score in at least one category exceeds the threshold.
detoxify_model = Detoxify('original')
toxicity_threshold = 0.5
toxicity_scores_list = []

# Keep the DataFrame's own row labels: after dropna() a plain enumerate()
# counter no longer lines up with the rows of `df`, so warnings would point
# at the wrong review. Values are cast to str (as in the word cloud section)
# so a stray non-string cell is still passed to the model as text.
reviews_to_score = df['review'].dropna().astype(str)  # Adjust column name if necessary

for i, review in reviews_to_score.items():
    toxicity_scores = detoxify_model.predict(review)
    # Convert the returned values to plain Python floats so they print cleanly
    # and load into a numeric DataFrame.
    toxicity_scores = {key: float(value) for key, value in toxicity_scores.items()}
    toxicity_scores_list.append(toxicity_scores)

    if any(score > toxicity_threshold for score in toxicity_scores.values()):
        print(f"Warning: Potentially unsafe content detected in review {i}.")
        print(f"Details: {toxicity_scores}")

# Convert the toxicity scores list to a DataFrame (one row per scored review,
# one column per score key), indexed by the original row label so each row
# can be traced back to `df`.
toxicity_df = pd.DataFrame(toxicity_scores_list, index=reviews_to_score.index)

# ---- HEATMAP: TOXICITY SCORES ----
plt.figure(figsize=(10, 8))

# One row per scored review, one column per toxicity category; the colour
# scale is pinned to the 0-1 range and every cell is annotated with its value.
score_cmap = sns.color_palette("coolwarm", as_cmap=True)
heatmap_ax = sns.heatmap(
    toxicity_df,
    annot=True,
    cmap=score_cmap,
    vmin=0,
    vmax=1,
    cbar=True,
)

# Label the axes object returned by seaborn
heatmap_ax.set_title('Toxicity Scores Heatmap')
heatmap_ax.set_xlabel('Toxicity Categories')
heatmap_ax.set_ylabel('Reviews')
plt.show()

# ---- FEEDBACK DISTRIBUTION ----
plt.figure(figsize=(6, 4))

# Bar chart of how many rows fall under each distinct "stars" value
rating_ax = sns.countplot(data=df, x="stars", palette="coolwarm")  # Adjust column name if necessary
rating_ax.set_title("Review Rating Distribution")
rating_ax.set_xlabel("Stars")
rating_ax.set_ylabel("Count")
plt.show()

# ---- TEXT PREPROCESSING ----
# Fetch the NLTK resources used below. 'punkt_tab' is requested in addition to
# 'punkt' because newer NLTK releases (3.9 and later) make word_tokenize load
# the punkt_tab data and raise LookupError when only 'punkt' is installed.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers for preprocess_text: a WordNet lemmatizer and the English
# stopword list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* lowercased, tokenized, filtered and lemmatized.

    Tokens that are not purely alphanumeric, or that appear in the
    module-level ``stop_words`` set, are discarded. The surviving tokens are
    lemmatized with the module-level ``lemmatizer`` and joined with single
    spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/mixed tokens and stopwords
        if not token.isalnum() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every non-missing review. Values are cast to str first (as the word
# cloud section does) so a stray non-string cell cannot break the `.lower()`
# call inside preprocess_text.
df_cleaned = df['review'].dropna().astype(str).apply(preprocess_text)  # Adjust column name if necessary

# ---- BIGRAM ANALYSIS ----
# Count every two-word sequence in the cleaned reviews.
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
X = vectorizer.fit_transform(df_cleaned)

# Sum each column directly on the sparse matrix. Calling X.toarray() first
# would allocate a dense (reviews x bigrams) array only to add it up, which
# can exhaust memory on a large corpus. np.asarray(...).ravel() flattens the
# 1 x n result into a plain 1-D array of per-bigram totals.
ngram_freq = np.asarray(X.sum(axis=0)).ravel()
ngram_terms = vectorizer.get_feature_names_out()

# Pair each bigram with its total count and rank from most to least frequent
ngram_df = pd.DataFrame({"Bigram": ngram_terms, "Frequency": ngram_freq})
ngram_df = ngram_df.sort_values(by="Frequency", ascending=False)

# Show the ten most frequent bigrams
print(ngram_df.head(10))

# ---- FILTER REVIEWS WITH SPECIFIC BIGRAMS ----
# Phrases to look for in the raw review text
bigrams_to_check = ['customer service', 'needs better']

def contains_bigram(text, bigrams):
    """Return True if *text* contains any phrase from *bigrams*.

    Matching is a case-insensitive substring test, so a lowercase phrase such
    as 'customer service' also matches "Customer Service" in a raw review.

    Args:
        text: The value to search. Anything that is not a ``str`` (for
            example ``None`` or a NaN float) is treated as containing nothing.
        bigrams: Iterable of phrases to look for.

    Returns:
        ``True`` when at least one phrase occurs in *text*, else ``False``.
    """
    if not isinstance(text, str):
        return False
    # Fold case once for the text; each phrase is folded as it is compared
    haystack = text.casefold()
    return any(bigram.casefold() in haystack for bigram in bigrams)

# Build the two row masks separately: rows that have a review at all, and rows
# whose review mentions at least one of the phrases in bigrams_to_check.
has_review = df['review'].notna()
mentions_bigram = df['review'].apply(contains_bigram, args=(bigrams_to_check,))

# Keep only the rows that satisfy both conditions
filtered_df = df.loc[has_review & mentions_bigram]

# Display filtered entries
print(filtered_df)
