In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# Notebook-wide display configuration:
#   * seaborn draws figures using its 'whitegrid' style
#   * pandas shows up to 200 characters per column value before truncating
sns.set_style('whitegrid')
pd.options.display.max_colwidth = 200
print("Libraries imported successfully.")


Libraries imported successfully.


In [4]:
DATA_FILEPATH = '../data/treehut_comments.csv'

# --- Data Loading ---
# Only the file read is guarded. If the CSV is missing we report it and then
# re-raise: continuing would either hit a confusing NameError on `df` (fresh
# kernel) or silently analyse a stale `df` left over from an earlier run.
# NOTE: on_bad_lines='skip' drops malformed CSV rows without reporting them.
try:
    raw_df = pd.read_csv(DATA_FILEPATH, sep=',', on_bad_lines='skip')
except FileNotFoundError:
    print(f"ERROR: Data file not found at {DATA_FILEPATH}")
    raise
print("Data loaded successfully.")

# --- Data Cleaning & Preparation ---
print("Original Data Shape:", raw_df.shape)

# Build the cleaned frame as a new object so `raw_df` stays untouched:
#   1. drop rows that have no comment text,
#   2. make sure every remaining comment is a Python str,
#   3. parse the ISO 8601 timestamp strings into datetimes,
#   4. index the frame by timestamp.
df = (
    raw_df
    .dropna(subset=['comment_text'])
    .assign(
        comment_text=lambda frame: frame['comment_text'].astype(str),
        timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='ISO8601'),
    )
    .set_index('timestamp')
)

print("\nData Info After Cleaning:")
df.info()

print("\n--- Data Head ---")
display(df.head())


Data loaded successfully.
Original Data Shape: (17841, 4)

Data Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17812 entries, 2025-03-01 00:13:57.153000+00:00 to 2025-04-02 18:29:59.086000+00:00
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   media_id       17812 non-null  int64 
 1   media_caption  17804 non-null  object
 2   comment_text   17812 non-null  object
dtypes: int64(1), object(2)
memory usage: 556.6+ KB

--- Data Head ---


Unnamed: 0_level_0,media_id,media_caption,comment_text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-01 00:13:57.153000+00:00,1090986906404998,"Soft skin, soft life 🩷🌸🫧 get your hands on this self-care gem!",I bet this is good
2025-03-01 00:23:06.879000+00:00,17950254656929862,Why use one scrub when you can use them all at once 🤩\n\n#treehut #treehutcollection #megascrub,i know this smells so good
2025-03-01 00:04:05.094000+00:00,1090109319826090,Morning routine with Tree Hut 🍊🫧 Now available online and in-store!,Love it
2025-03-01 00:41:59.467000+00:00,1098364052333950,Why use one scrub when you can use them all at once 🤩,Please carry these in Canada! I miss them so much!
2025-03-01 02:21:29.715000+00:00,1083943630442659,Vanilla Serum-Infused Hand Wash: A sweet escape for dry hands.,I love it ..✌️


In [None]:
# Count how many comment rows carry each caption, then show the ten
# captions with the highest counts.
print("--- Top 10 Most Commented-On Posts (by caption) ---")
caption_counts = df['media_caption'].value_counts()
display(caption_counts.head(10))
