In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# Notebook-wide display configuration (runs once, right after the imports).
# Raise pandas' maximum displayed column width to 200 so long text values
# are not cut short when DataFrames are rendered.
pd.set_option('display.max_colwidth', 200)
# Apply seaborn's 'whitegrid' style to plots drawn after this point.
sns.set_style('whitegrid')
# Visible confirmation that the import/config cell ran to completion.
print("Libraries imported successfully.")


Libraries imported successfully.


In [4]:
DATA_FILEPATH = '../data/treehut_comments.csv'

# --- Data Loading ---
try:
    # NOTE: on_bad_lines='skip' drops malformed CSV rows without reporting them,
    # so the row count below may be lower than the number of lines in the file.
    df_raw = pd.read_csv(DATA_FILEPATH, sep=',', on_bad_lines='skip')
except FileNotFoundError:
    print(f"ERROR: Data file not found at {DATA_FILEPATH}")
    # Re-raise so the cell stops here. Continuing would either fail with a
    # confusing NameError below or silently reuse a stale `df` left in the kernel.
    raise
print("Data loaded successfully.")

# --- Data Cleaning & Preparation ---
print("Original Data Shape:", df_raw.shape)

# Build the cleaned frame as a new object (the raw frame stays available as
# `df_raw`), so re-running this cell always yields the same `df`:
#   1. drop rows that have no comment text,
#   2. force comment_text to str,
#   3. parse the ISO 8601 timestamp strings into datetimes,
#   4. index the frame by timestamp.
df = (
    df_raw
    .dropna(subset=['comment_text'])
    .assign(
        comment_text=lambda frame: frame['comment_text'].astype(str),
        timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='ISO8601'),
    )
    .set_index('timestamp')
)

print("\nData Info After Cleaning:")
df.info()

print("\n--- Data Head ---")
display(df.head())


Data loaded successfully.
Original Data Shape: (17841, 4)

Data Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17812 entries, 2025-03-01 00:13:57.153000+00:00 to 2025-04-02 18:29:59.086000+00:00
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   media_id       17812 non-null  int64 
 1   media_caption  17804 non-null  object
 2   comment_text   17812 non-null  object
dtypes: int64(1), object(2)
memory usage: 556.6+ KB

--- Data Head ---


Unnamed: 0_level_0,media_id,media_caption,comment_text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-01 00:13:57.153000+00:00,1090986906404998,"Soft skin, soft life 🩷🌸🫧 get your hands on this self-care gem!",I bet this is good
2025-03-01 00:23:06.879000+00:00,17950254656929862,Why use one scrub when you can use them all at once 🤩\n\n#treehut #treehutcollection #megascrub,i know this smells so good
2025-03-01 00:04:05.094000+00:00,1090109319826090,Morning routine with Tree Hut 🍊🫧 Now available online and in-store!,Love it
2025-03-01 00:41:59.467000+00:00,1098364052333950,Why use one scrub when you can use them all at once 🤩,Please carry these in Canada! I miss them so much!
2025-03-01 02:21:29.715000+00:00,1083943630442659,Vanilla Serum-Infused Hand Wash: A sweet escape for dry hands.,I love it ..✌️


In [5]:
# Count comments per caption text; value_counts() orders captions from most
# to least frequent, so the first ten entries are the ten most-commented captions.
# Comments are grouped by caption string, not by media_id.
comments_per_caption = df['media_caption'].value_counts()

print("--- Top 10 Most Commented-On Posts (by caption) ---")
display(comments_per_caption.head(10))


--- Top 10 Most Commented-On Posts (by caption) ---


media_caption
APPLICATIONS ARE NOW CLOSED! 🚨 \nThank you to everyone who applied! 💖\n\nBIG NEWS! 🚨 \n\nOur 2025 PR List Applications are officially OPEN! 🎉 Want in? Drop “TreeHut PR” in the comments & we’ll send you the link! 💌✨                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        5762
✨🌴SPRING BREAK GIVEAWAY 🌴✨\n\nWith spring break around the corner, we’r

In [6]:
# --- INVESTIGATION A: OVERALL COMMUNITY SENTIMENT ---

# Shared VADER analyzer: built once here and reused for every comment.
analyzer = SentimentIntensityAnalyzer()

# Cutoff applied to VADER's compound score: scores at or above +cutoff are
# 'positive', at or below -cutoff are 'negative', anything between is 'neutral'.
SENTIMENT_THRESHOLD = 0.05


def get_sentiment_category(text: str, threshold: float = SENTIMENT_THRESHOLD) -> str:
    """
    Classify a text as 'positive', 'negative', or 'neutral'.

    The text is scored with the module-level VADER ``analyzer``; the
    'compound' entry of the returned score dictionary is compared against
    a symmetric cutoff.

    Args:
        text: The text to score.
        threshold: Non-negative cutoff for the compound score. Defaults to
            SENTIMENT_THRESHOLD (0.05), which reproduces the original
            hard-coded behaviour.

    Returns:
        'positive' if compound >= threshold, 'negative' if
        compound <= -threshold, otherwise 'neutral'. With threshold == 0 a
        compound score of exactly 0 is reported as 'positive', because the
        positive check runs first.

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold}")

    # polarity_scores returns a dictionary of scores; only 'compound' is used.
    compound = analyzer.polarity_scores(text)['compound']

    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

# Label every comment with its sentiment bucket in a new 'sentiment' column.
df['sentiment'] = df['comment_text'].apply(get_sentiment_category)
print("Sentiment analysis complete.")

# Share of comments per sentiment label, expressed as a percentage.
sentiment_share = df['sentiment'].value_counts(normalize=True)
sentiment_distribution = sentiment_share * 100
print("\nSentiment Distribution (%):")
display(sentiment_distribution)

# Sanity check: show the first few comments carrying each of these labels.
for heading, label in (
    ("\n--- Positive Comment Examples ---", 'positive'),
    ("\n--- Negative Comment Examples ---", 'negative'),
):
    print(heading)
    display(df.loc[df['sentiment'] == label, 'comment_text'].head())

Sentiment analysis complete.

Sentiment Distribution (%):


sentiment
neutral     68.835616
positive    26.914440
negative     4.249944
Name: proportion, dtype: float64


--- Positive Comment Examples ---


timestamp
2025-03-01 00:13:57.153000+00:00                                    I bet this is good
2025-03-01 00:23:06.879000+00:00                            i know this smells so good
2025-03-01 00:04:05.094000+00:00                                               Love it
2025-03-01 00:41:59.467000+00:00    Please carry these in Canada! I miss them so much!
2025-03-01 02:21:29.715000+00:00                                        I love it ..✌️
Name: comment_text, dtype: object


--- Negative Comment Examples ---


timestamp
2025-03-01 02:49:52.965000+00:00                               Bad bad bad!!!!
2025-03-01 02:30:07.229000+00:00    Wasteful! And this actually pisses me off!
2025-03-01 09:49:34.924000+00:00                         What's with the eyes😭
2025-03-01 09:35:30.464000+00:00                         🍊🔥🧡@maiden_butterfly_
2025-03-01 18:26:36.091000+00:00                       Stop showing armpits!!!
Name: comment_text, dtype: object

In [7]:
# --- INVESTIGATION B: PRODUCT & SCENT BUZZ ---

# Product names and scents to look for in the comments.
PRODUCT_KEYWORDS = [
    'tropic glow', 'sunlit glow', 'coco colada', 'vanilla', 'pink champagne',
    'ocean glow', 'santal haze', 'palm grove', 'jelly bear', 'tangerine',
    'moroccan rose', 'watermelon', 'passionfruit', 'lychee', 'strawberry', 'peach'
]

# Number of comments mentioning each keyword.
#   - str.contains() does a substring search (the keyword is treated as a
#     regular expression by default), so 'peach' also matches 'peaches'.
#   - case=False ignores letter case; na=False counts missing text as "no match".
#   - Each comment counts at most once per keyword, however often it repeats it.
comment_text = df['comment_text']
product_mentions = {
    keyword: comment_text.str.contains(keyword, case=False, na=False).sum()
    for keyword in PRODUCT_KEYWORDS
}

# Rank keywords from most to least mentioned.
product_buzz = pd.Series(product_mentions).sort_values(ascending=False)

print("--- Product & Scent Mention Counts ---")
display(product_buzz)

--- Product & Scent Mention Counts ---


vanilla           96
moroccan rose     70
coco colada       66
watermelon        43
tangerine         36
tropic glow       26
strawberry        18
jelly bear        13
ocean glow         9
peach              8
passionfruit       2
pink champagne     1
lychee             1
palm grove         0
sunlit glow        0
santal haze        0
dtype: int64

In [8]:
# --- INVESTIGATION C: GEOGRAPHIC EXPANSION OPPORTUNITIES ---

LOCATION_KEYWORDS = ['canada', 'uk', 'europe', 'australia', 'germany', 'mexico', 'brazil', 'netherlands', 'sweden', 'shipping to']


def _whole_word_pattern(keyword: str) -> str:
    """Return a regex that matches `keyword` only as a whole word or phrase.

    The keyword is escaped so it is matched literally, then wrapped in word
    boundaries. Without the boundaries a short keyword such as 'uk' would also
    match inside unrelated words (e.g. "Luke", "yuk"), inflating its count.
    """
    return rf'\b{re.escape(keyword)}\b'


# Number of comments mentioning each location keyword as a whole word.
# case=False ignores letter case; na=False counts missing text as "no match".
location_mentions = {
    loc: df['comment_text'].str.contains(_whole_word_pattern(loc), case=False, na=False).sum()
    for loc in LOCATION_KEYWORDS
}

# Rank locations from most to least mentioned.
location_demand = pd.Series(location_mentions).sort_values(ascending=False)

print("--- International Demand Signals ---")
# We only care about locations that were actually mentioned, so filter for counts > 0
display(location_demand[location_demand > 0])

--- International Demand Signals ---


uk             20
canada         11
brazil          5
australia       2
germany         1
europe          1
netherlands     1
dtype: int64

In [9]:
# --- INVESTIGATION D: GIVEAWAY VS. ORGANIC CONTENT ---

# Flag comments whose post caption contains "giveaway" (any letter case).
# na=False means comments with a missing caption are not flagged, so they
# fall into the organic segment.
# NOTE(review): comment-soliciting posts whose caption lacks the word
# "giveaway" (such as the PR-list post in the top-captions output) are also
# treated as organic here — confirm that is intended.
is_giveaway = df['media_caption'].str.contains("giveaway", case=False, na=False)
df_giveaway = df[is_giveaway]
df_organic = df[~is_giveaway]

print(f"Identified {len(df_giveaway)} comments on giveaway posts.")
print(f"Identified {len(df_organic)} comments on organic posts.")

# Percentage of comments per sentiment label within each segment.
giveaway_share = df_giveaway['sentiment'].value_counts(normalize=True)
organic_share = df_organic['sentiment'].value_counts(normalize=True)
giveaway_sentiment = giveaway_share * 100
organic_sentiment = organic_share * 100

print("\n--- Giveaway Post Sentiment Distribution (%) ---")
display(giveaway_sentiment)

print("\n--- Organic Post Sentiment Distribution (%) ---")
display(organic_sentiment)

Identified 6154 comments on giveaway posts.
Identified 11658 comments on organic posts.

--- Giveaway Post Sentiment Distribution (%) ---


sentiment
neutral     82.027949
positive    16.850829
negative     1.121222
Name: proportion, dtype: float64


--- Organic Post Sentiment Distribution (%) ---


sentiment
neutral     61.871676
positive    32.226797
negative     5.901527
Name: proportion, dtype: float64