In [None]:
# Setup: installs (idempotent) and NLTK data
import sys, subprocess

def ensure(package, import_name=None):
    """Install ``package`` with pip only when it cannot already be imported.

    Parameters
    ----------
    package : str
        Distribution name passed to ``pip install``.
    import_name : str, optional
        Module name to try importing. When omitted, a small table of
        distributions whose module name differs from the pip name is consulted
        (e.g. ``scikit-learn`` -> ``sklearn``); otherwise ``package`` itself is
        used.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip subprocess exits with a non-zero status.
    """
    import importlib

    # Distributions whose importable module is named differently from the pip
    # package. Without this, `scikit-learn` can never be imported by its pip
    # name and pip would be invoked on every run.
    known_import_names = {'scikit-learn': 'sklearn'}
    module_name = import_name or known_import_names.get(package, package)

    try:
        importlib.import_module(module_name)
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Make the freshly installed package visible to the running kernel.
        importlib.invalidate_caches()

# Third-party distributions this notebook relies on, by their pip names.
required_packages = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'nltk', 'scikit-learn']
for pkg in required_packages:
    ensure(pkg)

import nltk
# Download the VADER lexicon only when NLTK cannot find it locally.
vader_resource = 'sentiment/vader_lexicon.zip'
try:
    nltk.data.find(vader_resource)
except LookupError:
    lexicon_missing = True
else:
    lexicon_missing = False

if lexicon_missing:
    nltk.download('vader_lexicon')



In [None]:
# Imports and data loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

DATA_PATH = Path('climate_nasa.csv')

# Load the raw comments; this frame is left untouched by later cells.
raw_df = pd.read_csv(DATA_PATH)
n_rows, n_cols = raw_df.shape
print(f"Loaded {n_rows:,} rows, {n_cols} columns")
raw_df.head()


In [None]:
# Basic cleaning and typing

# Work on a copy so the raw frame stays available for re-runs.
df = raw_df.copy()

# Standardize column names (strip stray whitespace from the CSV header)
expected_cols = ['date', 'likesCount', 'profileName', 'commentsCount', 'text']
df.columns = [c.strip() for c in df.columns]

# Report schema drift up front instead of failing somewhere downstream.
missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    print(f"Warning: expected columns not found: {missing_cols}")

# Every later step (sentiment, topics) needs the comment text, so fail with a
# clear message rather than an opaque KeyError further down.
if 'text' not in df.columns:
    raise KeyError("required column 'text' not found in the loaded data")

# Parse datetime, coerce errors to NaT
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Ensure numeric types; unparsable values become NaN
for col in ['likesCount', 'commentsCount']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Text normalization. Missing values must be filled BEFORE casting to str:
# astype(str) turns NaN into the literal string 'nan', which would then
# survive the empty-text filter below and be scored as a real comment.
df['text'] = (
    df['text']
    .fillna('')
    .astype(str)
    .str.replace(r'\s+', ' ', regex=True)  # collapse runs of whitespace
    .str.strip()
)

# Drop rows with no text
df = df[df['text'].str.len() > 0].reset_index(drop=True)

print(df.dtypes)
print(df.isna().sum())
print(f"Remaining rows: {len(df):,}")
df.head()


In [None]:
# Sentiment analysis with VADER
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Score every comment. Building the frame from a list of dicts avoids the slow
# row-by-row `.apply(pd.Series)` expansion and yields the columns even when
# there are no rows.
score_cols = ['neg', 'neu', 'pos', 'compound']
sent_scores = pd.DataFrame(
    [sia.polarity_scores(text) for text in df['text']],
    index=df.index,
    columns=score_cols,
)

# Drop any scores left over from a previous run of this cell so re-running it
# does not create duplicate 'compound' columns (which would break labelling).
df = pd.concat(
    [df.drop(columns=score_cols + ['sentiment'], errors='ignore'), sent_scores],
    axis=1,
)

# Label sentiment with the conventional VADER thresholds:
# compound >= 0.05 -> positive, compound <= -0.05 -> negative, else neutral.
# Both boundaries are inclusive, so a score of exactly 0.05 counts as positive.
df['sentiment'] = np.select(
    [df['compound'] >= 0.05, df['compound'] <= -0.05],
    ['positive', 'negative'],
    default='neutral',
)

df[['compound', 'neg', 'neu', 'pos', 'sentiment']].describe(include='all')


In [None]:
# Trend analysis over time

if 'date' in df.columns and df['date'].notna().any():
    # Rows whose date failed to parse (NaT) cannot be placed in any month.
    dated = df.dropna(subset=['date']).set_index('date')

    # Engagement columns are treated as optional elsewhere in this notebook,
    # so aggregate only the ones that are actually present.
    trend_eng_cols = [c for c in ('likesCount', 'commentsCount') if c in dated.columns]
    agg_spec = {'compound': 'mean', 'text': 'count'}
    agg_spec.update({c: 'mean' for c in trend_eng_cols})

    # 'MS' = calendar-month bins labelled by month start.
    monthly = dated.resample('MS').agg(agg_spec).rename(columns={'text': 'num_comments'})

    # The third panel exists only when there is engagement data to draw.
    n_panels = 3 if trend_eng_cols else 2
    fig, axes = plt.subplots(n_panels, 1, figsize=(10, 10 if trend_eng_cols else 7), sharex=True)
    monthly['compound'].plot(ax=axes[0], color='tab:green', title='Average monthly sentiment (compound)')
    monthly['num_comments'].plot(ax=axes[1], color='tab:blue', title='Monthly comment volume')
    if trend_eng_cols:
        monthly[trend_eng_cols].plot(ax=axes[2], title='Monthly avg engagement (likes, replies)')
    plt.tight_layout()
    plt.show()
else:
    print('Date column missing or unparsable; skipping time trends.')


In [None]:
# Engagement analysis: correlations and simple visuals

# Text length as a proxy for effort/verbosity
# Character count of each comment, used as a rough verbosity measure.
df['text_len'] = df['text'].str.len()

eng_cols = ['likesCount', 'commentsCount']
avail_eng = [name for name in eng_cols if name in df.columns]

if not avail_eng:
    print('Engagement columns not present; skipping engagement analysis.')
else:
    analysis_cols = avail_eng + ['compound', 'text_len']

    # Pairwise correlations between engagement, sentiment and length.
    corr = df[analysis_cols].corr(numeric_only=True)
    print(corr)

    # Scatter matrix with regression fits; translucent points to cope with overplotting.
    sns.pairplot(
        df,
        vars=analysis_cols,
        kind='reg',
        plot_kws={'scatter_kws': {'alpha': 0.2}},
    )
    plt.suptitle('Engagement vs sentiment and text length', y=1.02)
    plt.show()


In [None]:
# Topic modeling with LDA (scikit-learn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

NUM_TOPICS = 5
MAX_FEATURES = 5000

# Bag-of-words counts: lowercased, English stop words removed, tokens limited
# to runs of three or more ASCII letters, vocabulary capped at MAX_FEATURES.
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=MAX_FEATURES,
    token_pattern=r'(?u)\b[a-zA-Z]{3,}\b',
)
X = vectorizer.fit_transform(df['text'])

# Fixed random_state keeps the fitted topics reproducible across runs.
lda = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    learning_method='batch',
    random_state=42,
).fit(X)

feature_names = np.array(vectorizer.get_feature_names_out())

def top_words_per_topic(model, feature_names, n_top_words=12):
    """Return the highest-weighted words for every topic of a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as a 2-D array of per-topic word weights.
    feature_names : numpy.ndarray
        Vocabulary array indexable by column position of ``components_``.
    n_top_words : int, default 12
        How many words to keep per topic.

    Returns
    -------
    list of (int, numpy.ndarray)
        One ``(topic_index, words)`` pair per topic, words ordered from the
        largest weight to the smallest.
    """
    return [
        # argsort is ascending: take the tail, then flip to descending order.
        (topic_idx, feature_names[np.argsort(weights)[-n_top_words:][::-1]])
        for topic_idx, weights in enumerate(model.components_)
    ]

topics = top_words_per_topic(lda, feature_names)
for topic_idx, words in topics:
    print(f"Topic {topic_idx}:", ', '.join(words))

# Assign dominant topic to each document
doc_topic = lda.transform(X)
df['topic'] = doc_topic.argmax(axis=1)

# A comment with no in-vocabulary tokens (all-zero row in X) gives the model no
# evidence, so its topic distribution is flat and argmax falls back to index 0.
# Mark those rows as unassigned (-1) instead of silently inflating topic 0.
empty_docs = np.asarray(X.sum(axis=1)).ravel() == 0
df.loc[empty_docs, 'topic'] = -1

# Cross-tabulate dominant topic against sentiment label.
df[['topic', 'sentiment']].groupby(['topic', 'sentiment']).size().unstack(fill_value=0)


In [None]:
# Save enriched dataset and some figures
# All artefacts of this notebook go into one folder next to it.
out_dir = Path('analysis_outputs')
out_dir.mkdir(exist_ok=True)

# Persist the comments together with the derived sentiment/topic columns.
df_out_path = out_dir.joinpath('climate_comments_enriched.csv')
df.to_csv(df_out_path, index=False)
print(f'Saved enriched CSV to {df_out_path.resolve()}')

# Bar plot: sentiment distribution
fig_sent, ax_sent = plt.subplots(figsize=(6, 4))

# Fixed label order so each bar always gets the same colour.
sentiment_order = ['negative', 'neutral', 'positive']
sentiment_counts = df['sentiment'].value_counts().reindex(sentiment_order)
sentiment_counts.plot(kind='bar', color=['#d62728', '#7f7f7f', '#2ca02c'], ax=ax_sent)

ax_sent.set_title('Sentiment distribution')
plt.setp(ax_sent.get_xticklabels(), rotation=0)  # keep category labels horizontal
fig_sent.tight_layout()

fig1 = out_dir / 'sentiment_distribution.png'
fig_sent.savefig(fig1, dpi=150)
print(f'Saved {fig1.resolve()}')
plt.show()

# Bar plot: topics top words
topic_top_words = dict(topics)
n_topics = len(topic_top_words)

# Map each vocabulary word to its column in lda.components_ so the bars can
# show the fitted topic-word weights rather than a constant height.
word_to_col = {word: col for col, word in enumerate(feature_names)}

# squeeze=False always returns a 2-D array, so a single topic needs no special case.
fig, axes = plt.subplots(n_topics, 1, figsize=(10, 2*n_topics), squeeze=False)
axes = axes.ravel()
for t, ax in enumerate(axes):
    words = topic_top_words[t]
    weights = lda.components_[t][[word_to_col[w] for w in words]]
    positions = range(len(words))
    ax.bar(positions, weights)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=45, ha='right')
    ax.set_ylabel('weight')
    ax.set_title(f'Topic {t} top words')
plt.tight_layout()
fig2 = out_dir / 'topics_top_words.png'
plt.savefig(fig2, dpi=150)
print(f'Saved {fig2.resolve()}')
plt.show()
