In [135]:
#imports
import praw
import json
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import logging
logging.set_verbosity_error()



In [136]:

# Opening JSON file
f = open('client_secrets_liz.json')
# returns JSON object as 
# a dictionary
data = json.load(f)


reddit = praw.Reddit(
    client_id=data['client_id'],
    client_secret=data['client_secret'],
    user_agent=data['user_agent']
)

url_link = input("Enter your Reddit thread URL: ")
#url = "https://www.reddit.com/r/esist/comments/6g18xv/theres_so_much_more_about_trump_to_investigate/"
submission = reddit.submission(url=url_link)
title = submission.title
rawComments = []
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    rawComments.append(comment.body)


df = pd.DataFrame(rawComments)


In [137]:
classifier = pipeline(task="text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
analyzedData = []
for index in df.index:
    comment = df[0][index]
    analysis = classifier(comment, truncation=True)
    analyzedData.append([comment, analysis])

sentiment_comments = pd.DataFrame(analyzedData)

In [138]:
def has_allowed_top_label(sentiment_analysis, allowed_labels):
    if isinstance(sentiment_analysis, list) and len(sentiment_analysis) > 0 and isinstance(sentiment_analysis[0][0], dict):
        top_label = sentiment_analysis[0][0].get('label')
        return any(top_label == allowed_label for allowed_label in allowed_labels)
    return False

def has_allowed_keywords(comment_text, keywords):
    return any(keyword in comment_text.lower() for keyword in keywords)


tokenizer = AutoTokenizer.from_pretrained("ZachBeesley/toxic-comments")
model = AutoModelForSequenceClassification.from_pretrained("ZachBeesley/toxic-comments", from_tf=True)
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, max_length=512)

def is_toxic(comment_text,classifier=classifier):
    result = classifier(comment_text)[0]
    return result['label'] == 'Toxic'


In [139]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_with_lemmatization(text):
    # convert text to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # split text into words
    tokens = word_tokenize(text)
    # remove stopwords - remove words like a, and, at, etc
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # lemmatization - break down a word to its root meaning
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    text = ' '.join(lemmatized_tokens)
    return text

allowed_keywords = preprocess_with_lemmatization(title)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Elisabeth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elisabeth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Elisabeth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [140]:
labels = ['disapproval', 'disappointment', 'anger', 'disgust']
filtered_data = []
for index, row in sentiment_comments.iterrows():
    sentiment_analysis = row[1] if pd.notna(row[1]).any else []
    result = has_allowed_top_label(sentiment_analysis, allowed_labels=labels)
    filtered_data.append(result)

filtered_data_keywords = sentiment_comments[0].apply(lambda x: has_allowed_keywords(x.lower() if pd.notna(x) else "", allowed_keywords))
# remove the comments without relevant keywords from title as well as comments with labels not under the labels listed above 
filtered_df = sentiment_comments[filtered_data & filtered_data_keywords & ~(sentiment_comments[0].astype(str).isin(['[removed]', '[deleted]']))]
#check for toxic comments - done on smaller subset of comments as to improve efficiency
toxicity_data = filtered_df[0].apply(lambda x: is_toxic(x) if pd.notna(x) else False)
filtered_df = filtered_df[~toxicity_data]



In [141]:
reasons_list = filtered_df[0].tolist()
full_corpus = "\n".join(reasons_list)
summarizer = pipeline("summarization", model="Falconsai/text_summarization")
summarized = summarizer(full_corpus, max_length=1000, min_length=50, do_sample=False)
print(summarized[0].get('summary_text'))

#some urls to use:
#https://www.reddit.com/r/entertainment/comments/zd5n4x/kanye_west_antisemitism_rapper_akon_backs_west/
#akon backed kanye west on new album drop


Fckin Akon says he doesn't agree with Kanye's comments on Hitler . he says if you overlook the fact that he didn't seem to take issue with antisemitic conspiracy theories or the Holocaust denial, Akon is being pretty reasonable here . He says anyone using the excuse is racist because they ignore the how complex racism is .
