In [None]:
import datetime, time
import requests, json
import re, string
import numpy as np
import pandas as pd
import seaborn as sns

# Reddit API
import praw

# Natural Language Processing (NLP)
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Topic Modeling
from gensim import corpora
from gensim.models import LdaModel, Phrases
from gensim.models.phrases import Phraser

# Visualization
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Transformers
from transformers import pipeline

# Miscellaneous
from pprint import pprint
from IPython import display


In [2]:
# Initialize Reddit API
def _read_credential(path):
    """Return the contents of the text file at ``path``, stripped of surrounding whitespace.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as the value has been read, rather than being left open until the garbage
    collector happens to reclaim it.
    """
    with open(path) as handle:
        return handle.read().strip()


# Credentials are read from local files under cred/ so they never appear in the notebook.
reddit = praw.Reddit(
    client_id=_read_credential('cred/client_id.txt'),
    client_secret=_read_credential('cred/client_secret.txt'),
    user_agent=_read_credential('cred/user_agent.txt'),
)

# Hard-coded list of Reddit post URLs to scrape.
# Each entry is handed to reddit.submission(url=...) by the fetch loop, one thread per URL.
# NOTE(review): the second entry has no trailing slash, unlike the others —
# presumably it still resolves to the same submission; confirm.
posts_to_scrape = [
    "https://www.reddit.com/r/electriccars/comments/1cr9w0q/32_of_consumers_were_considering_an_ev_but_cited/",
    "https://www.reddit.com/r/electricvehicles/comments/1e7x13p/it_is_not_the_evs_that_are_lacking_in_the_us_its",
    "https://www.reddit.com/r/electriccars/comments/1c1gtn9/wait_its_an_ev_details_in_comments/",
    "https://www.reddit.com/r/science/comments/4xym1e/range_anxiety_is_scaring_people_away_from/",
    "https://www.reddit.com/r/cars/comments/10wfm08/this_is_ruining_electric_cars_the_charging/",
    "https://www.reddit.com/r/electricvehicles/comments/11ztuhi/ive_owned_an_electric_car_for_four_months_and_not/",
    "https://www.reddit.com/r/electricvehicles/comments/1gsqza5/four_dead_in_fire_as_tesla_doors_fail_to_open/",
    "https://www.reddit.com/r/cars/comments/rshvy0/why_i_sold_my_tesla_model_3_performance_went_back/"
]

# Accumulates one dict per output row: a post-only row, then one row per comment
data = []

# Visit every thread listed in posts_to_scrape
for post_url in posts_to_scrape:
    submission = reddit.submission(url=post_url)

    # Post-level columns; the same values are repeated on every row of this thread
    post_fields = {
        'post_id': submission.id,
        'post_url': submission.url,
        'post_title': submission.title,
        'post_text': submission.selftext,
        'post_score': submission.score,
        'post_author': submission.author.name if submission.author else "Deleted",
    }

    # The post itself is stored as a row whose comment columns are left empty
    post_info = dict(
        post_fields,
        comment_id=None,
        comment_author=None,
        comment_text=None,
        comment_score=None,
    )
    data.append(post_info)

    # Expand the comment tree (limit=None), then walk the flattened list
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        commenter = comment.author
        comment_info = {
            **post_fields,
            'comment_id': comment.id,
            # A missing author object is recorded as "Deleted"
            'comment_author': commenter.name if commenter else "Deleted",
            'comment_text': comment.body,
            'comment_score': comment.score,
        }
        data.append(comment_info)

# Turn the collected rows into a table, write it to CSV and report the destination
output_file = "reddit_posts_and_comments.csv"
df = pd.DataFrame(data)
df.to_csv(output_file, index=False, encoding="utf-8")
print(f"Data saved to {output_file}")

Data saved to reddit_posts_and_comments.csv


In [None]:
# # Load the CSV file
# df = pd.read_csv("reddit_posts_and_comments.csv")

# # 1. Drop rows with missing content (e.g., deleted posts/comments)
# df = df.dropna(subset=['post_title', 'post_text', 'comment_text'], how='all').reset_index(drop=True)

# # 2. Remove duplicates
# df = df.drop_duplicates()

# # 3. Handle special characters in text fields (e.g., emojis or non-ASCII characters)
# def clean_text(text):
#     if pd.isna(text):  # Check for NaN
#         return text
#     text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
#     text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with one
#     return text

# df['post_title'] = df['post_title'].apply(clean_text)
# df['post_text'] = df['post_text'].apply(clean_text)
# df['comment_text'] = df['comment_text'].apply(clean_text)


# # 4. Save the preprocessed data to a new file
# output_file_cleaned = "reddit_posts_and_comments_cleaned.csv"
# df.to_csv(output_file_cleaned, index=False, encoding="utf-8")
# print(f"Cleaned data saved to {output_file_cleaned}")

   post_id                                           post_url  \
0  1cr9w0q  https://thefutureeconomy.ca/op-eds/vehicle-to-...   
1  1cr9w0q  https://thefutureeconomy.ca/op-eds/vehicle-to-...   
2  1cr9w0q  https://thefutureeconomy.ca/op-eds/vehicle-to-...   
3  1cr9w0q  https://thefutureeconomy.ca/op-eds/vehicle-to-...   
4  1cr9w0q  https://thefutureeconomy.ca/op-eds/vehicle-to-...   

                                          post_title post_text  post_score  \
0  32% of consumers were considering an EV but ci...       NaN         574   
1  32% of consumers were considering an EV but ci...       NaN         574   
2  32% of consumers were considering an EV but ci...       NaN         574   
3  32% of consumers were considering an EV but ci...       NaN         574   
4  32% of consumers were considering an EV but ci...       NaN         574   

     post_author comment_id comment_author  \
0  northstrong87        NaN            NaN   
1  northstrong87    l3wwg7h    XxFezzgigxX   
2 

In [7]:
# Reload the scraped rows from disk, then in a single chain:
#   1. drop rows where title, post body and comment body are all missing
#      (e.g., deleted posts/comments) and renumber the index
#   2. drop exact duplicate rows
df = (
    pd.read_csv("reddit_posts_and_comments.csv")
    .dropna(subset=['post_title', 'post_text', 'comment_text'], how='all')
    .reset_index(drop=True)
    .drop_duplicates()
)

# 3. Clean text: Remove non-ASCII characters, URLs, punctuation, and normalize text
# Translation table that deletes every ASCII punctuation character. Built once
# here so clean_text does not rebuild it for every row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize a free-text field for downstream matching.

    Steps, in order: lowercase, delete non-ASCII characters, delete URLs,
    delete ASCII punctuation, collapse whitespace runs to a single space and
    trim both ends.

    Parameters
    ----------
    text : str or missing value
        The raw text. Missing values (anything ``pd.isna`` reports as missing,
        e.g. NaN or None) are passed through untouched.

    Returns
    -------
    str or missing value
        The cleaned string, or the original missing value.
    """
    if pd.isna(text):
        return text
    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Only real URLs are removed: the scheme must be followed by "://" and the
    # dot after "www" is escaped. The looser pattern `http\S+|www.\S+` also
    # deleted ordinary words such as "https" or "wwwhat".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run each free-text column through clean_text, overwriting the column
for text_column in ('post_title', 'post_text', 'comment_text'):
    df[text_column] = df[text_column].apply(clean_text)

# 5. Extract themes
# Maps a theme label (the dict key) to the list of keywords/phrases associated
# with that theme.
# NOTE(review): the text columns are lowercased and stripped of punctuation by
# clean_text, yet several entries here contain capitals or hyphens (e.g.
# 'Tesla', 'EV', 'CCS', 'fast-charging') — confirm how matching should treat them.
# NOTE(review): 'vehicle model' is the only key written with a space instead of
# an underscore — confirm whether that is intentional.
# NOTE(review): some keywords are listed under more than one theme (e.g.
# 'location', 'range', 'supercharger'), so themes can overlap.
theme_words = {
    'charging_stations': ['station', 'stations', 'location', 'public station', 'charging point', 'infrastructure', 'charger', 'kiosk', 'EV station', 'public chargers', 'station network', 'station availability'],
    'charging_network': ['network', 'connected', 'networked', 'charging grid', 'network coverage', 'roaming', 'charging locations', 'map', 'network reliability', 'network expansion', 'partner network'],
    'range_anxiety': ['range', 'range anxiety', 'range fear', 'battery life', 'battery capacity', 'distance', 'travel range', 'anxiety', 'worry', 'trip range', 'unable to charge', 'running out of charge', 'mileage'],
    'charging_speed': ['fast', 'slow', 'speed', 'charging rate', 'fast-charging', 'quick', 'fast charging', 'charging speed', 'time to charge', 'slow charging', 'quick charge', 'fast charger', 'fast charging stations'],
    'availability_of_chargers': ['available', 'availability', 'location', 'access', 'scarce', 'scarce charging', 'find chargers', 'nearby', 'accessible', 'not available', 'out of service', 'open station', 'charger access', 'charger shortage'],
    'cost_of_charging': ['cost', 'price', 'expensive', 'affordable', 'cheap', 'price per kWh', 'electricity cost', 'charging fees', 'rates', 'price of charging', 'cost to charge', 'free charging', 'charging cost', 'pricing model', 'pricing scheme', 'cost per session'],
    'maintenance_issues': ['maintenance', 'repair', 'broken', 'malfunction', 'service', 'failure', 'out of service', 'maintenance required', 'charger broken', 'charger error', 'service required', 'maintenance costs', 'down time', 'maintenance issues'],
    'tesla_charging_network': ['Tesla', 'supercharger', 'Tesla chargers', 'Tesla network', 'Tesla charging', 'supercharger station', 'Tesla charging stations', 'Tesla infrastructure', 'Tesla owners', 'Tesla charging speed'],
    'ev_range': ['EV', 'range', 'battery', 'miles', 'distance', 'range per charge', 'battery life', 'vehicle range', 'driving range', 'range capacity', 'charge range', 'range efficiency'],
    'vehicle model': ['tesla model 3', 'hyundai kona', 'nissan leaf', 'chevy bolt', 'bmw i3'],
    'charging_station_location': ['houston', 'san francisco', 'los angeles', 'chicago', 'new york'],
    'charger_type': ['charger', 'type', 'level 1', 'level 2', 'DC fast charging', 'DC fast charger', 'supercharger', 'home charger', 'wall box', 'charging adapter', 'charging port', 'connector', 'plug type', 'Type 1', 'Type 2', 'CCS', 'CHAdeMO', 'L2', 'L1'],
}

# Compiled keyword pattern per keyword list, so the regex for a theme is built
# once rather than once per row. Keyed by the tuple of keywords.
_THEME_PATTERN_CACHE = {}


def _normalize_for_matching(value):
    """Lowercase ``value``, delete ASCII punctuation and collapse whitespace runs."""
    stripped = str(value).lower().translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', stripped).strip()


def _theme_pattern(keywords):
    """Return a compiled whole-word pattern matching any keyword, or None if there are none."""
    cache_key = tuple(keywords)
    if cache_key not in _THEME_PATTERN_CACHE:
        needles = [_normalize_for_matching(keyword) for keyword in cache_key]
        needles = [needle for needle in needles if needle]
        _THEME_PATTERN_CACHE[cache_key] = (
            re.compile(r'\b(?:' + '|'.join(map(re.escape, needles)) + r')\b')
            if needles else None
        )
    return _THEME_PATTERN_CACHE[cache_key]


def extract_themes(text, themes=None):
    """Return the labels of every theme that has at least one keyword in ``text``.

    The previous implementation iterated over the dict keys and tested whether
    the theme *label* itself (e.g. ``'charging_stations'``) occurred in the
    text, so the keyword lists were never consulted. Here each theme's keywords
    are what is searched for.

    Matching is case-insensitive, ignores ASCII punctuation on both sides and
    only accepts whole words/phrases. That way ``'EV'`` does not fire on
    "never", and keywords such as ``'Tesla'`` or ``'fast-charging'`` still
    match text that has already been lowercased and stripped of punctuation.
    Plural forms match only if listed explicitly.

    Parameters
    ----------
    text : str or missing value
        Text to scan. Missing values (per ``pd.isna``) yield an empty list.
    themes : dict[str, list[str]], optional
        Mapping of theme label to keywords. Defaults to the module-level
        ``theme_words``.

    Returns
    -------
    list[str]
        Matching theme labels, in the mapping's iteration order.
    """
    if pd.isna(text):
        return []
    if themes is None:
        themes = theme_words

    haystack = _normalize_for_matching(text)
    matched = []
    for label, keywords in themes.items():
        pattern = _theme_pattern(keywords)
        if pattern is not None and pattern.search(haystack):
            matched.append(label)
    return matched

# Tag every row with the themes found in its post body and in its comment body
for source_column, theme_column in (
    ('post_text', 'themes_post'),
    ('comment_text', 'themes_comment'),
):
    df[theme_column] = df[source_column].apply(extract_themes)

# 6. Sentiment analysis
# A single analyzer instance, shared by every sentiment_analysis call
analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(text):
    """Return the analyzer's 'compound' polarity score for ``text``.

    Missing values (anything ``pd.isna`` reports as missing) are not scored and
    give ``None``. Everything else is passed to the module-level ``analyzer``.
    """
    return None if pd.isna(text) else analyzer.polarity_scores(text)['compound']

# Score the post body and the comment body of every row.
# NOTE(review): both columns were already lowercased and stripped of punctuation
# by clean_text; VADER's documentation lists capitalization and punctuation
# among its intensity cues — confirm that scoring the cleaned text is intended.
for source_column, sentiment_column in (
    ('post_text', 'sentiment_post'),
    ('comment_text', 'sentiment_comment'),
):
    df[sentiment_column] = df[source_column].apply(sentiment_analysis)

# 7. Write the table, including the theme and sentiment columns, to a new CSV file
output_file_enhanced = "reddit_posts_and_comments_enhanced.csv"
df.to_csv(output_file_enhanced, index=False, encoding="utf-8")
print(f"Enhanced data saved to {output_file_enhanced}")


Enhanced data saved to reddit_posts_and_comments_enhanced.csv
