In [85]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from openai import OpenAI
from wordcloud import WordCloud

In [86]:
# Show review text in full rather than truncating long cells.
pd.options.display.max_colwidth = None

# Load the review dataset. Despite the variable name, the frame is not limited
# to negative reviews: the recorded outputs include 4- and 5-star rows.
bad_review = pd.read_json('data.json')
bad_review.head(5)

Unnamed: 0,rating,review
0,1,"For the first 2 weeks, it was working perfectly but now I feel like I wasted money."
1,1,It defrosts very quickly
2,5,It is quite good
3,5,Everything about it is perfect ????
4,5,"Skyrun did a great job, it freezes very last I love it"


In [87]:
# Display the last five rows of the loaded reviews.
bad_review.tail()

Unnamed: 0,rating,review
255,5,nice made in nigeria
256,5,I love it so so much
257,5,This product is magical ????????????. I had to order for the deep freezer ????
258,5,"I totally love it, it is very cold when plugged and the freezer part is pretty big for a small sized fridge. Money well spent"
259,4,Very nice


In [88]:
# Print the frame summary: index range, column dtypes and non-null counts.
bad_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rating  260 non-null    int64 
 1   review  260 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.2+ KB


In [89]:
# Element-wise missing-value mask: True wherever a cell is null.
# NOTE(review): this renders one boolean row per review; `bad_review.isnull().sum()`
# would give a per-column count instead - consider switching.
bad_review.isnull()

Unnamed: 0,rating,review
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
255,False,False
256,False,False
257,False,False
258,False,False


In [90]:
# Display only the one-star reviews; `bad_review` itself is left untouched.
one_star_mask = bad_review['rating'].eq(1)
bad_review.loc[one_star_mask]

Unnamed: 0,rating,review
0,1,"For the first 2 weeks, it was working perfectly but now I feel like I wasted money."
1,1,It defrosts very quickly
17,1,"It is not cooling.\nIn fact, it does not work at all"
38,1,"I disappointed with this product you sent to me, very poor quality and high price"
49,1,"It cools but the freezer don't freeze, I don't like it"
69,1,"item is way smaller than it appeared on screen, a total disappointment"
70,1,Smaller than it appeared. Total waste of money!
74,1,very small
76,1,It's not what I expected
78,1,"It’s not working well, how do I refund and collect another one?"


In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic-model the review text to surface recurring themes.
# `bad_review` is a DataFrame; only its "review" column (one string per row)
# is fed to the vectorizer.

# Turn the raw text into TF-IDF features, dropping English stop words and
# keeping at most 1000 vocabulary terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(bad_review["review"])

# Fit a 5-topic LDA model; random_state pins the result so reruns match.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Fetch the vocabulary once instead of rebuilding it for every printed word.
feature_names = vectorizer.get_feature_names_out()

# Print the 10 highest-weighted words of each topic. argsort() is ascending,
# so the strongest word of a topic is the LAST item in its printed list.
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx+1}:")
    print([feature_names[i] for i in topic.argsort()[-10:]])

Topic 1:
['picture', 'size', 'okay', 'bigger', 'looks', 'quality', 'good', 'smaller', 'expected', 'perfect']
Topic 2:
['fast', 'serving', 'really', 'use', 'cool', 'good', 'portable', 'nice', 'product', 'love']
Topic 3:
['cold', 'really', 'freezer', 'exactly', 'expected', 'fridge', 'cools', 'like', 'good', 'fast']
Topic 4:
['good', 'quite', 'product', 'bigger', 'fine', 'size', 'picture', 'just', 'small', 'works']
Topic 5:
['quickly', 'sides', 'work', 'ok', 'hot', 'product', 'great', 'fine', 'perfectly', 'working']


In [92]:
from collections import Counter

# Phrases treated as complaint signals. Matching is plain substring
# containment, so a review containing 'poor quality' is also tallied
# under 'quality'.
complaints_keywords = ['poor quality', 'defective', 'bad', 'quality', 'return', 'refund']

# For each keyword, count the reviews whose lower-cased text contains it.
# A review adds at most one to any given keyword, however often it repeats it.
complaints_count = Counter(
    keyword
    for text in (entry.lower() for entry in bad_review["review"])
    for keyword in complaints_keywords
    if keyword in text
)

# Report the tallies.
print("Common Complaints Count:", complaints_count)

Common Complaints Count: Counter({'quality': 4, 'bad': 2, 'poor quality': 1, 'refund': 1, 'return': 1})


In [None]:
# Tabulate the keyword tallies: one row per keyword with its review count.
complaints_df = pd.DataFrame.from_records(
    list(complaints_count.items()),
    columns=['Keyword', 'Frequency'],
)

# Draw the tallies as a bar chart using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(complaints_df['Keyword'], complaints_df['Frequency'], color='skyblue')
ax.set_xlabel('Complaint Keywords')
ax.set_ylabel('Frequency')
ax.set_title('Most Frequent Complaint Keywords in Reviews')
ax.tick_params(axis='x', labelrotation=45)  # slant labels so phrases do not collide
fig.tight_layout()  # keep the rotated labels inside the figure
plt.show()

In [None]:
def visualize_word_cloud():
    """Render a word cloud built from every review in the global ``bad_review`` frame.

    Each review is lower-cased and the texts are combined into a single
    space-separated string, from which the cloud is generated and shown with
    matplotlib. Returns nothing; the figure is displayed as a side effect.
    """
    # Coerce to str BEFORE lower-casing so a non-string value cannot raise
    # AttributeError, and join once rather than growing a string in a loop.
    words = " ".join(str(msg).lower() for msg in bad_review["review"])

    # A fixed random_state keeps the generated cloud repeatable between runs.
    word_cloud = WordCloud(width=3000, height=2500, random_state=1, background_color="white").generate(words)

    # Show the generated image without axes or padding.
    plt.figure(figsize=(14, 7), facecolor=None)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()


visualize_word_cloud()

In [None]:
# Read the API key from the environment instead of writing it into the
# notebook, so the secret never ends up in a saved or shared copy.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    # Fail here with a clear message rather than later inside an API call.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai_client = OpenAI(api_key=openai_api_key)


def analyze_reviews_with_openai(max_rating=None, model="gpt-4"):
    """Ask an OpenAI chat model to analyse complaints found in the reviews.

    Builds one prompt from the hard-coded product description plus the review
    texts taken from the global ``bad_review`` DataFrame (one review per
    line), and sends it as a single user message through ``openai_client``.

    Args:
        max_rating: Optional upper bound on the ``rating`` column. When set,
            only reviews with ``rating <= max_rating`` are sent. The default
            ``None`` sends every row, as before. ``bad_review`` also contains
            highly rated reviews, so pass e.g. ``max_rating=2`` to make the
            "Bad Reviews" section of the prompt hold only negative ones.
        model: Name of the chat-completions model to query.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        ValueError: If there are no reviews to send after filtering.
    """
    reviews = bad_review
    if max_rating is not None:
        reviews = reviews[reviews["rating"] <= max_rating]

    # Drop missing entries and coerce to str so the join below cannot fail
    # on a non-string value.
    review_texts = reviews["review"].dropna().astype(str)
    if review_texts.empty:
        raise ValueError("No reviews to analyze.")

    # Define your dataset (example format)
    product_details = """
    Product Name: Skyrun 70 Litres Double Door Top Mount Fridge (BCD-85HC) - Silver
    Features:   Large capacity:  Easily meet the needs of family life.
                Safety lock: Availability of lock with key to keep your food safe-stored even when placing outhouse.
                Fast cooling: Refrigerated, ingredients keep fresh; Freezing, ingredients remain good.
                Interior LED Lighting: Eco & long-lasting LED lighting with stylish cold white feature, casting bright & even illumination for great visibility inside.
                Energy-saving: Fulfill all the European regulations with energy Class up to A++, save electricity and save your money.
                Anti-rust cabinet: Anti-rust design, suitable for changing climate.
    """

    # One review per line in the prompt.
    bad_reviews = "\n".join(review_texts)

    # Define your prompt with the dataset
    prompt = f"""
    Your input is a dataset of product details and a dataset of the bad reviews of that product. Your task is to first learn and understand the product, then go over the bad reviews and analyze them based on the following requirements:
    1. What are the most common complaints, including a statistical analysis and examples?
    2. What are the most common problems people experience with the product, including a statistical analysis and examples?
    3. What people suggested that can be improved, including a statistical analysis and examples.

    Product Details:
    {product_details}

    Bad Reviews:
    {bad_reviews}
    """
    messages = [{"role": "user", "content": prompt}]

    # Send request to OpenAI API
    response = openai_client.chat.completions.create(
        model=model,  # must be a chat model; this call uses the chat-completions API
        messages=messages,
        max_tokens=1500,  # Adjust based on output length requirements
        temperature=0.5,  # Adjust creativity level
    )

    # return the response
    return response.choices[0].message.content