In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Download and cache the full model locally
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Optional: Save locally if you want to avoid internet next time
model.save_pretrained("./local_roberta_sentiment")
tokenizer.save_pretrained("./local_roberta_sentiment")


('./local_roberta_sentiment\\tokenizer_config.json',
 './local_roberta_sentiment\\special_tokens_map.json',
 './local_roberta_sentiment\\vocab.json',
 './local_roberta_sentiment\\merges.txt',
 './local_roberta_sentiment\\added_tokens.json',
 './local_roberta_sentiment\\tokenizer.json')

In [3]:
from transformers import pipeline

model_path = "./local_roberta_sentiment"  # or the full Windows path
sentiment_pipeline = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, framework="pt")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd

# Load model from local path
model_path = "./local_roberta_sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Create the pipeline with truncation and batch size
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
    padding=True,
    max_length=512,
    batch_size=16  # helps avoid memory errors too
)

# Load your data
df = pd.read_csv("data/reddit_posts.csv")
df["full_text"] = df["title"].fillna('') + " " + df["selftext"].fillna('')

# Run prediction safely
results = sentiment_pipeline(df["full_text"].tolist())

# Extract results
df["label"] = [r["label"].lower() for r in results]

# Save
df.to_csv("reddit_sentiment_labeled.csv", index=False)
print("✅ Saved reddit_sentiment_labeled.csv")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


✅ Saved reddit_sentiment_labeled.csv


In [8]:
# Load the Reddit posts
df = pd.read_csv("data/reddit_posts.csv")
df["full_text"] = df["title"].fillna('') + " " + df["selftext"].fillna('')

# Truncate long texts manually
def truncate_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text, max_length=max_length, truncation=True)
    return tokenizer.decode(tokens, skip_special_tokens=True)

df["full_text"] = df["full_text"].apply(lambda x: truncate_text(x, tokenizer))


In [9]:
# Run the predictions
results = sentiment_pipeline(df["full_text"].tolist())

# Map labels
label_mapping = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}
df["label"] = [label_mapping[r["label"]] for r in results]

# Save results
df.to_csv("reddit_sentiment_labeled.csv", index=False)
print("Saved reddit_sentiment_labeled.csv with readable labels.")


✅ Saved reddit_sentiment_labeled.csv with readable labels.


In [10]:
import dash
from dash import dcc, html
import pandas as pd
import plotly.graph_objs as go
from wordcloud import WordCloud
import base64
from io import BytesIO

# Load sentiment-labeled data
df = pd.read_csv("reddit_sentiment_labeled.csv")

# Function to generate word cloud base64
def generate_wordcloud_base64(text_series):
    if text_series.empty or text_series.str.strip().str.len().sum() == 0:
        return None
    wordcloud = WordCloud(width=1200, height=700, background_color='white').generate(' '.join(text_series.dropna()))
    img = BytesIO()
    wordcloud.to_image().save(img, format='PNG')
    return base64.b64encode(img.getvalue()).decode()

# Generate word clouds
positive_wc = generate_wordcloud_base64(df[df['label'] == 'positive']["full_text"])
neutral_wc = generate_wordcloud_base64(df[df['label'] == 'neutral']["full_text"])
negative_wc = generate_wordcloud_base64(df[df['label'] == 'negative']["full_text"])


In [11]:
app = dash.Dash(__name__)
app.title = "Reddit Sentiment Dashboard"

app.layout = html.Div([
    html.H1("Reddit Depression Post Sentiment Dashboard", style={'textAlign': 'center'}),

    # Sentiment Distribution Bar Chart
    dcc.Graph(
        id='sentiment-distribution',
        figure={
            'data': [go.Bar(
                x=df['label'].value_counts().index,
                y=df['label'].value_counts().values,
                marker=dict(color=['#1f77b4', '#ff7f0e', '#2ca02c'])
            )],
            'layout': go.Layout(title="Sentiment Distribution", xaxis={'title': 'Sentiment'}, yaxis={'title': 'Count'})
        }
    ),

    # Polarity Histogram (optional, if polarity column exists)
    dcc.Graph(
        id='polarity-distribution',
        figure={
            'data': [go.Histogram(
                x=df.get('polarity', pd.Series(dtype=float)),
                nbinsx=20,
                marker=dict(color='skyblue'),
                opacity=0.75
            )],
            'layout': go.Layout(title="Polarity Distribution", xaxis={'title': 'Polarity'}, yaxis={'title': 'Frequency'})
        }
    ),

    # Wordclouds
    html.Div([
        html.Div([
            html.H3("Wordcloud - Positive", style={'textAlign': 'center'}),
            html.Img(src=f'data:image/png;base64,{positive_wc}', style={'width': '100%', 'height': '500px'}) if positive_wc else html.P("No positive text found.")
        ], style={'width': '32%', 'display': 'inline-block'}),

        html.Div([
            html.H3("Wordcloud - Neutral", style={'textAlign': 'center'}),
            html.Img(src=f'data:image/png;base64,{neutral_wc}', style={'width': '100%', 'height': '500px'}) if neutral_wc else html.P("No neutral text found.")
        ], style={'width': '32%', 'display': 'inline-block'}),

        html.Div([
            html.H3("Wordcloud - Negative", style={'textAlign': 'center'}),
            html.Img(src=f'data:image/png;base64,{negative_wc}', style={'width': '100%', 'height': '500px'}) if negative_wc else html.P("No negative text found.")
        ], style={'width': '32%', 'display': 'inline-block'}),
    ], style={'display': 'flex', 'justify-content': 'space-around', 'padding': '30px'})
])

if __name__ == "__main__":
    app.run(debug=True)
