In [None]:
%pip install vaderSentiment
%pip install gensim

In [None]:
import praw, datetime, requests, json, time, numpy as np, pandas as pd

import nltk.sentiment.vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import seaborn as sns
from IPython import display
from pprint import pprint
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [None]:
def _read_credential(path):
    """Return the text stored in a credential file, without surrounding whitespace.

    Parameters
    ----------
    path : str
        Location of a text file holding a single credential value.

    Returns
    -------
    str
        The file contents with leading/trailing whitespace removed. Editors
        usually end a file with a newline; passing that newline on would
        corrupt the credential or the user-agent value.
    """
    # `with` closes the handle immediately instead of leaving it to the GC.
    with open(path) as handle:
        return handle.read().strip()


# Credentials live in local text files so they are never embedded in the notebook.
reddit = praw.Reddit(client_id = _read_credential('cred/client_id.txt'),
                     client_secret = _read_credential('cred/client_secret.txt'),
                     user_agent = _read_credential('cred/user_agent.txt'),
                     )

## Scrape Reddit.

In [None]:
# Thread to analyse
reddit_post_url = "https://www.reddit.com/r/electricvehicles/comments/1e7x13p/it_is_not_the_evs_that_are_lacking_in_the_us_its/"

# Look the submission up by its URL
submission = reddit.submission(url=reddit_post_url)

# Summarise the post itself: one "label value" line per field
post_fields = [
    ("Title:", submission.title),
    ("Author:", submission.author),
    ("Score:", submission.score),
    ("Number of Comments:", submission.num_comments),
    ("Post Content:", submission.selftext),
    ("URL:", submission.url),
]
for label, value in post_fields:
    print(label, value)

# Expand every "More Comments" placeholder, then print each comment body
# followed by an 80-character rule
print("\nComments:")
submission.comments.replace_more(limit=None)
separator = "-" * 80
for reply in submission.comments.list():
    print(reply.body)
    print(separator)

## Preprocess data.

In [None]:
# Columns of the comments table. Declared explicitly so the DataFrame has
# these columns even when the thread has no comments; otherwise the later
# cells fail with KeyError on 'comment_text'.
comment_columns = ['comment_id', 'author', 'score', 'comment_text']

# Replace "More Comments" placeholders so the flattened list is complete
submission.comments.replace_more(limit=None)

# One record per comment; `comment.author` is falsy for removed accounts,
# which are recorded under the placeholder name "Deleted".
comments_data = [
    {
        'comment_id': comment.id,
        'author': comment.author.name if comment.author else "Deleted",
        'score': comment.score,
        'comment_text': comment.body,
    }
    for comment in submission.comments.list()
]

# Build the frame once from the list of records
comments_df = pd.DataFrame(comments_data, columns=comment_columns)

# Display basic info
print(comments_df.head())


In [None]:
# Load the English stopword list once instead of on every call.
# If the NLTK corpus is not installed yet, fetch it and retry.
try:
    english_stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    english_stop_words = set(stopwords.words('english'))


# Define cleaning function
def clean_comment(comment):
    """Normalise one raw comment into a space-separated string of content words.

    The text is lowercased, URLs are removed, every character other than
    a-z and whitespace is dropped, the result is split on whitespace and
    English stopwords are filtered out.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The remaining words joined by single spaces (possibly empty).
    """
    comment = comment.lower()  # Lowercase
    comment = re.sub(r'http\S+|www\S+|https\S+', '', comment)  # Remove URLs
    comment = re.sub(r'[^a-z\s]', '', comment)  # Remove special characters
    # Only letters and whitespace remain, so a whitespace split is a complete
    # tokenisation and matches how later cells re-tokenise this column.
    words = comment.split()
    words = [word for word in words if word not in english_stop_words]  # Remove stopwords
    return ' '.join(words)


# Apply cleaning to the DataFrame
comments_df['cleaned_comment'] = comments_df['comment_text'].apply(clean_comment)

print(comments_df[['comment_text', 'cleaned_comment']].head())

# Phrase detection runs after 'cleaned_comment' exists, since it reads that column.
# Tokenize comments into words
tokenized_comments = [comment.split() for comment in comments_df['cleaned_comment']]
# Identify bigrams and trigrams
bigram = Phrases(tokenized_comments, min_count=5, threshold=10)  # Adjust thresholds as needed
trigram = Phrases(bigram[tokenized_comments], threshold=10)
# Apply Phraser for efficiency
bigram_model = Phraser(bigram)
trigram_model = Phraser(trigram)
# Apply the bigram and trigram models to the tokenized comments
comments_with_phrases = [trigram_model[bigram_model[comment]] for comment in tokenized_comments]
# Example output (skipped when there are no comments to show)
if comments_with_phrases:
    print("Example tokenized comment with phrases:", comments_with_phrases[0])









In [None]:
analyzer = SentimentIntensityAnalyzer()

# Analyze sentiment on the RAW comment text, not 'cleaned_comment'.
# Cleaning lowercases the text, strips punctuation and removes stopwords,
# and the NLTK stopword list includes negations such as "not", so
# "not good" would be scored as "good". VADER is designed to read those
# cues (negation, punctuation, capitalisation) itself.
comments_df['sentiment_score'] = comments_df['comment_text'].apply(
    lambda text: analyzer.polarity_scores(text)['compound']
)

# Map the compound score to a label: strictly positive, strictly negative,
# or exactly zero (neutral).
# NOTE(review): the vaderSentiment README suggests +/-0.05 cut-offs for the
# compound score; the zero cut-off is kept here to preserve existing labels.
comments_df['sentiment_label'] = comments_df['sentiment_score'].apply(
    lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
)

print(comments_df[['comment_text', 'sentiment_score', 'sentiment_label']].head())

## Identify themes with LDA topic modelling.

In [None]:
# Tokenize the cleaned comments and build the bag-of-words corpus for LDA
tokenized_comments = [comment.split() for comment in comments_df['cleaned_comment']]
dictionary = corpora.Dictionary(tokenized_comments)
corpus = [dictionary.doc2bow(text) for text in tokenized_comments]

# Train LDA model.
# LDA training is randomised; fixing `random_state` makes the topics
# reproducible across "Restart & Run All".
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=15,
    random_state=42,
)

# Display the five highest-weighted words of every topic
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

## Visualise results.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many comments carry each sentiment label, in a fixed
# positive / neutral / negative order.
fig, ax = plt.subplots()
sns.countplot(
    data=comments_df,
    x='sentiment_label',
    order=['positive', 'neutral', 'negative'],
    ax=ax,
)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
from wordcloud import WordCloud

# Join every cleaned comment into one string and build the cloud from it
all_comments = ' '.join(comments_df['cleaned_comment'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_comments)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Frequent Words')
plt.show()
