In [1]:
import datetime, time
import requests, json
import re, string
import numpy as np
import pandas as pd
import seaborn as sns
from time import sleep

# Reddit API
import praw

# Natural Language Processing (NLP)
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline
from praw import Reddit
import concurrent.futures

# Topic Modeling
from gensim import corpora
from gensim.models import LdaModel, Phrases
from gensim.models.phrases import Phraser

# Visualization
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Transformers
from transformers import pipeline

# Miscellaneous
from pprint import pprint
from IPython import display

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _read_credential(path):
    """Return the whitespace-stripped contents of a credential file.

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left for the garbage collector.
    """
    with open(path) as handle:
        return handle.read().strip()


# Reddit API client; credentials are kept out of the notebook in local files under cred/.
reddit = praw.Reddit(
    client_id=_read_credential('cred/client_id.txt'),
    client_secret=_read_credential('cred/client_secret.txt'),
    user_agent=_read_credential('cred/user_agent.txt'),
)

# Submission URLs whose comment threads are scraped below.
posts_to_scrape = [
    "https://www.reddit.com/r/electriccars/comments/1cr9w0q/32_of_consumers_were_considering_an_ev_but_cited/",
    "https://www.reddit.com/r/electricvehicles/comments/1e7x13p/it_is_not_the_evs_that_are_lacking_in_the_us_its",
    "https://www.reddit.com/r/science/comments/4xym1e/range_anxiety_is_scaring_people_away_from/",
    "https://www.reddit.com/r/cars/comments/10wfm08/this_is_ruining_electric_cars_the_charging/"
]


def preprocess_text(text):
    """Normalise a raw comment body into a single lowercase line.

    Steps, in order:
      1. lowercase the text;
      2. drop every character that is neither a word character nor whitespace;
      3. turn each newline / carriage return into a single space.

    Args:
        text: The comment body as a string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' '})
    return without_punctuation.translate(line_breaks_to_spaces)

# Accumulates one record (dict) per scraped comment across all posts.
data = []

# Loop through each post URL
for post_url in posts_to_scrape:
    try:
        submission = reddit.submission(url=post_url)
        # Resolve the "load more comments" placeholders so the forest holds only real comments.
        submission.comments.replace_more(limit=None)

        # Flatten the comment forest once and reuse the result for both the
        # emptiness check and the extraction (previously it was flattened twice).
        post_comments = submission.comments.list()
        if not post_comments:
            print(f"No comments found for post: {post_url}")
            continue

        # Build the full list before extending `data`, so a failure part-way
        # through a post contributes no partial records.
        post_data = [
            {
                'comment_id': comment.id,
                # `author` is falsy for comments without an author; label those "Deleted".
                'comment_author': comment.author.name if comment.author else "Deleted",
                'comment_score': comment.score,
                'comment_text': preprocess_text(comment.body)
            }
            for comment in post_comments
        ]

        # Append the processed comments to the data list
        data.extend(post_data)
        # Pause between posts (presumably to stay gentle on the API).
        sleep(2)

    except Exception as e:
        # Best-effort scrape: report the failing post and move on to the next one.
        print(f"Error processing post {post_url}: {e}")

# Build the comment table once and reuse it for both the preview and the export.
# (The previous chunked append to the same CSV was overwritten by the final
# full write, so it only added redundant I/O; `data` is already fully in memory.)
comments_df = pd.DataFrame(data)

print("Comments Data:")
print(comments_df.head())

# Overwrite (not append) so re-running the cell never duplicates rows.
comments_df.to_csv("reddit_comments.csv", index=False)


Comments Data:
  comment_id     comment_author  comment_score  \
0    l3wwg7h        XxFezzgigxX             14   
1    l3wufnj       mickthomas68             36   
2    l3wochb        Betanumerus             30   
3    l3wqcrs        bhilliardga              6   
4    l3wuwj1  NotAcutallyaPanda              8   

                                        comment_text  
0  no worries as soon as batteries become more ef...  
1  i was skeptical at first but as i already had ...  
2  people who can charge at home have no excuse r...  
3  ive had my ford lightning for 2 months and for...  
4  for folks who live in or travel to rural areas...  


## All data collected and cleaned.

In [8]:
# Theme taxonomy: maps a theme name to a list of related keywords / phrases.
# In this cell only the KEYS are consumed (they are passed as candidate labels
# to the zero-shot classifier); the keyword lists themselves are not read here.
# NOTE(review): 'vehicle model' is the only key written with a space instead of
# an underscore — confirm whether that is intentional, since the key text is
# what the classifier sees as the label.
theme_words = {
    'charging_stations': ['station', 'stations', 'location', 'public station', 'charging point', 'infrastructure', 'charger', 'kiosk', 'EV station', 'public chargers', 'station network', 'station availability'],
    'charging_network': ['network', 'connected', 'networked', 'charging grid', 'network coverage', 'roaming', 'charging locations', 'map', 'network reliability', 'network expansion', 'partner network'],
    'range_anxiety': ['range', 'range anxiety', 'range fear', 'battery life', 'battery capacity', 'distance', 'travel range', 'anxiety', 'worry', 'trip range', 'unable to charge', 'running out of charge', 'mileage'],
    'charging_speed': ['fast', 'slow', 'speed', 'charging rate', 'fast-charging', 'quick', 'fast charging', 'charging speed', 'time to charge', 'slow charging', 'quick charge', 'fast charger', 'fast charging stations'],
    'availability_of_chargers': ['available', 'availability', 'location', 'access', 'scarce', 'scarce charging', 'find chargers', 'nearby', 'accessible', 'not available', 'out of service', 'open station', 'charger access', 'charger shortage'],
    'cost_of_charging': ['cost', 'price', 'expensive', 'affordable', 'cheap', 'price per kWh', 'electricity cost', 'charging fees', 'rates', 'price of charging', 'cost to charge', 'free charging', 'charging cost', 'pricing model', 'pricing scheme', 'cost per session'],
    'maintenance_issues': ['maintenance', 'repair', 'broken', 'malfunction', 'service', 'failure', 'out of service', 'maintenance required', 'charger broken', 'charger error', 'service required', 'maintenance costs', 'down time', 'maintenance issues'],
    'tesla_charging_network': ['Tesla', 'supercharger', 'Tesla chargers', 'Tesla network', 'Tesla charging', 'supercharger station', 'Tesla charging stations', 'Tesla infrastructure', 'Tesla owners', 'Tesla charging speed'],
    'ev_range': ['EV', 'range', 'battery', 'miles', 'distance', 'range per charge', 'battery life', 'vehicle range', 'driving range', 'range capacity', 'charge range', 'range efficiency'],
    'vehicle model': ['tesla model 3', 'hyundai kona', 'nissan leaf', 'chevy bolt', 'bmw i3'],
    'charging_station_location': ['houston', 'san francisco', 'los angeles', 'chicago', 'new york'],
    'charger_type': ['charger', 'type', 'level 1', 'level 2', 'DC fast charging', 'DC fast charger', 'supercharger', 'home charger', 'wall box', 'charging adapter', 'charging port', 'connector', 'plug type', 'Type 1', 'Type 2', 'CCS', 'CHAdeMO', 'L2', 'L1'],
}


# Zero-shot classification pipeline. The checkpoint is facebook/bart-large-mnli,
# i.e. a BART model (not BERT) fine-tuned on MNLI.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Function to categorize text (both post and comment)
def categorize_text_bert(text, candidate_labels):
    """Rank candidate labels for one piece of text with the zero-shot classifier.

    Args:
        text: The text to classify. Missing values (None / NaN), empty strings
            and whitespace-only strings are treated as "nothing to classify".
        candidate_labels: Sequence of label names to score the text against.

    Returns:
        The ``'labels'`` entry of the classifier result. Per the Hugging Face
        zero-shot pipeline documentation this is every candidate label ordered
        from most to least likely, not a filtered subset. An empty list is
        returned when there is no usable text or no candidate labels.
    """
    # Cleaned comments can collapse to "" or pure whitespace (e.g. a
    # punctuation-only comment); scoring those would yield an arbitrary ranking.
    if pd.isna(text) or not str(text).strip():
        return []
    # With no labels there is nothing to rank; skip the model call entirely.
    if len(candidate_labels) == 0:
        return []
    result = classifier(text, candidate_labels)
    return result['labels']

# Function to categorize posts and comments
def categorize_comments(df, theme_words):
    """Attach ranked theme labels to every comment in ``df``.

    Args:
        df: DataFrame with the columns ``comment_id``, ``comment_author``,
            ``comment_score`` and ``comment_text``.
        theme_words: Mapping whose keys are the theme names; only the keys are
            used, as candidate labels for ``categorize_text_bert``.

    Returns:
        A new DataFrame with the four input columns plus ``comment_themes``
        (the value returned by ``categorize_text_bert`` for that comment).
        The columns are always present, even when ``df`` has no rows, so
        downstream column access does not fail on an empty scrape.
    """
    output_columns = [
        'comment_id',
        'comment_author',
        'comment_score',
        'comment_text',
        'comment_themes',
    ]
    # The label list does not change between rows, so build it once.
    candidate_labels = list(theme_words.keys())

    categorized_data = [
        {
            'comment_id': row['comment_id'],
            'comment_author': row['comment_author'],
            'comment_score': row['comment_score'],
            'comment_text': row['comment_text'],
            'comment_themes': categorize_text_bert(row['comment_text'], candidate_labels),
        }
        for _, row in df.iterrows()
    ]

    return pd.DataFrame(categorized_data, columns=output_columns)


# Tabulate the scraped comment records.
df = pd.DataFrame(data)

# Label each comment against the theme names defined in `theme_words`.
categorized_df = categorize_comments(df, theme_words)

# Preview the first rows of the labelled comments.
preview = categorized_df.head()
print("Categorized Data:", preview, sep="\n")

# Persist the labelled comments for later analysis.
categorized_df.to_csv("categorized_comments_data.csv", index=False)

Categorized Data:
  comment_id     comment_author  comment_score  \
0    l3wwg7h        XxFezzgigxX             14   
1    l3wufnj       mickthomas68             36   
2    l3wochb        Betanumerus             30   
3    l3wqcrs        bhilliardga              6   
4    l3wuwj1  NotAcutallyaPanda              8   

                                        comment_text  \
0  no worries as soon as batteries become more ef...   
1  i was skeptical at first but as i already had ...   
2  people who can charge at home have no excuse r...   
3  ive had my ford lightning for 2 months and for...   
4  for folks who live in or travel to rural areas...   

                                      comment_themes  
0  [vehicle model, charger_type, charging_speed, ...  
1  [vehicle model, charger_type, ev_range, availa...  
2  [availability_of_chargers, charger_type, charg...  
3  [vehicle model, charger_type, charging_speed, ...  
4  [availability_of_chargers, ev_range, vehicle m...  
