<a href="https://colab.research.google.com/github/elixirutkarsh/Do-Preprocessing-and-Sentiment-Analysis-Using-pre-trained-models-on-the-Diabetes-Reddit-Dataset/blob/main/Do_Preprocessing_and_Sentiment_Analysis_(Using_pre_trained_models)_on_the_Diabetes_Reddit_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

# Step 1: Load the Diabetes Reddit Dataset
df = pd.read_csv("diabetes_reddit_dataset.csv")  # Replace with your dataset path
# Rows with a missing comment are read as NaN (a float), which the tokenizer
# in the preprocessing step cannot process. Replace them with empty strings
# and coerce every value to str so each row still yields one (possibly empty)
# comment and the row count stays aligned with df.
comments = df["comment"].fillna("").astype(str).values

# Step 2: Preprocessing
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.8.2+ loads the Punkt sentence model from the separate 'punkt_tab'
# resource, so word_tokenize raises LookupError if only 'punkt' is present.
# Older releases that do not know this id are expected to report the failure
# and return False rather than raise, so requesting both should be safe.
nltk.download('punkt_tab')
stop_words = set(stopwords.words("english"))

# NOTE(review): the NLTK English stopword list contains negations such as
# "not", "no" and "nor"; dropping them can change the sentiment of a comment
# before it is scored. Confirm this is intended.
processed_comments = []
for comment in comments:
    # Tokenization
    tokens = word_tokenize(comment)

    # Removing stopwords (case-insensitive match against the stopword set)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Joining the surviving tokens back into a single space-separated string
    processed_comment = " ".join(filtered_tokens)

    processed_comments.append(processed_comment)

# Step 3: Sentiment Analysis using pre-trained models
# Sentiment Analysis with VADER
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Score every preprocessed comment lazily, keeping only VADER's compound score.
compound_scores = (
    sia.polarity_scores(text)['compound'] for text in processed_comments
)

# Binary labelling: a compound score of zero or above is "positive",
# anything else is "negative" (there is no separate neutral class).
sentiments_vader = [
    "positive" if score >= 0 else "negative" for score in compound_scores
]

# Sentiment Analysis with Transformers
classifier = pipeline("sentiment-analysis")

# Classify each preprocessed comment and keep only the predicted label.
# truncation=True makes the tokenizer cut inputs down to the model's maximum
# sequence length; without it a long Reddit comment raises an error at
# inference time and aborts the whole run.
sentiments_transformers = []
for comment in processed_comments:
    result = classifier(comment, truncation=True)[0]
    sentiment = result['label']
    sentiments_transformers.append(sentiment)

# Step 4: Combine results with the original dataset
# Attach the derived columns to df in place, in a fixed order: the cleaned
# text first, then the label from each sentiment model.
new_columns = {
    "processed_comment": processed_comments,
    "sentiment_vader": sentiments_vader,
    "sentiment_transformers": sentiments_transformers,
}
for column_name, column_values in new_columns.items():
    df[column_name] = column_values

# Step 5: Save the updated dataset
# index=False keeps the pandas row index out of the written file.
df.to_csv(
    "diabetes_reddit_dataset_sentiment.csv",  # Replace with your desired output filename
    index=False,
)
