# Company Discourse

### 1. Dependencies (i.e., libraries we use)

In [None]:
import praw # https://praw.readthedocs.io/ -- PRAW for scraping Reddit comments
from praw.models import MoreComments

from sentence_transformers import SentenceTransformer # https://sbert.net/ -- SBERT for sentence comparison

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm, tqdm_notebook # https://tqdm.github.io/ -- measures runtime of loops
from datetime import datetime # https://docs.python.org/3/library/datetime.html -- to manipulate time units
import os.path # https://docs.python.org/3/library/os.path.html -- this lets us tell whether to load or create/save dataframe

from credentials import reddit_credential

### 2. Functions for Scraping Reddit Comments

In [None]:
# Build the PRAW client from the account details stored in the local `credentials` module.
# TODO: we should find some way to hide this part of the code eventually (quasi-personal info)
reddit = praw.Reddit(
    **{
        field: reddit_credential[field]
        for field in ("client_id", "client_secret", "password", "user_agent", "username")
    }
)

# We are only going to read data, so keep the client in read-only mode.
reddit.read_only = True

In [None]:
# submission.comments consists of one of the following objects
# Comment
# MoreComments -- which may contain Comment objects or MoreComment objects
# The following function breaks all MoreComments objects into Comment objects

def break_into_comments(submission):
    """Return the top-level comments of `submission` as plain Comment objects.

    A freshly fetched comment forest can contain MoreComments placeholders
    next to real Comment objects. Re-queueing those placeholders without
    expanding them never makes progress, so the forest is asked to resolve
    them itself before the comments are collected.

    Args:
        submission: a PRAW submission (anything whose ``comments`` attribute
            is iterable and offers ``replace_more``).

    Returns:
        A list with the top-level comments of the submission.
    """
    # limit=None keeps replacing until no MoreComments placeholder is left in
    # the forest; each replacement is a network request, so large threads are slow.
    submission.comments.replace_more(limit=None)

    # list() -- makes the result concrete so it is easy to inspect and concatenate
    return list(submission.comments)

In [None]:
# input: either submission or comment
# output: time of the submission in local time as datetime object

def time_of(submission):
    """Return the creation time of a submission or comment in local time.

    Args:
        submission: any object exposing ``created_utc`` (a Unix timestamp);
            this notebook passes both submissions and comments.

    Returns:
        A naive ``datetime`` in the machine's local timezone, truncated to
        whole seconds.
    """
    # fromtimestamp() turns the Unix timestamp into local time. Whole-second
    # resolution matches the '%Y-%m-%d %H:%M:%S' bounds used for filtering,
    # so the fractional part is dropped directly instead of going through a
    # string and parsing it back.
    return datetime.fromtimestamp(submission.created_utc).replace(microsecond=0)

In [None]:
# input: subreddit
# output: comments from hot N submissions
# within [time_1, time_2] if time_1 and time_2 are typed in
# time_1, time_2 are strings of the form '2024-05-09 23:10:02'
# time_1 and time_2 only seem to work if it is within the day of the current time

def extract_comments(subreddit, N, time_1=None, time_2=None):
    """Collect the comments of the hot `N` submissions of `subreddit`.

    Args:
        subreddit: object whose ``hot(limit=N)`` yields submissions.
        N: maximum number of hot submissions to look at.
        time_1: optional inclusive lower bound on the submission time, as a
            string of the form '2024-05-09 23:10:02'.
        time_2: optional inclusive upper bound, same format.

    Returns:
        A flat list made of the ``break_into_comments`` output of every
        submission that passes the time filter (all of them when no bound
        is given).

    Either bound may be given on its own; a missing bound leaves that side
    of the interval open. Bounds are compared against ``time_of``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    earliest = datetime.strptime(time_1, time_format) if time_1 else None
    latest = datetime.strptime(time_2, time_format) if time_2 else None
    filter_by_time = earliest is not None or latest is not None

    saved_comments = []
    # tqdm shows progress in every case, since scraping is the slow part.
    for submission in tqdm(subreddit.hot(limit=N)):
        if filter_by_time:
            posted = time_of(submission)  # computed once per submission
            if earliest is not None and posted < earliest:
                continue
            if latest is not None and posted > latest:
                continue
        saved_comments.extend(break_into_comments(submission))

    return saved_comments

### 3. Functions for Vectorizing Reddit Comments

In [None]:
# Defining SBERT model for generating sentence embeddings
# (created once here and reused by get_sentence_embedding for every sentence)
sentence_model = SentenceTransformer("thenlper/gte-large")

def get_sentence_embedding(text):
    """Vectorize one sentence with the shared SBERT model.

    Args:
        text: the sentence to embed.

    Returns:
        The embedding as a plain list, or an empty list (after printing a
        notice) when `text` contains nothing but whitespace.
    """
    # .strip() removes surrounding whitespace, including new lines
    has_content = bool(text.strip())
    if has_content:
        return sentence_model.encode(text).tolist()

    print("Attempted to get embedding for empty text.")
    return []

In [None]:
# input: comments
# output: vectors in 1024-dim space corresponding to comments

def comment_embeddings(comments):
    """Embed the body of every comment.

    Args:
        comments: iterable of objects exposing a ``body`` string.

    Returns:
        A list with one ``get_sentence_embedding`` result per comment, in
        the same order as `comments`.
    """
    # Wrapping `comments` itself (rather than a lazy map) lets tqdm read its
    # length when it has one, so the progress bar can show a total and an ETA.
    return [get_sentence_embedding(comment.body) for comment in tqdm(comments)]

### 4. Constructing and Saving/Loading Data Frames

In [None]:
# Register tqdm with pandas, which adds the `progress_apply` family of methods
# (an `apply` that displays a progress bar).
# NOTE(review): no visible cell calls `progress_apply` -- confirm this is still needed.
tqdm.pandas()

In [None]:
# The following generates df from comments
# Generating the vectors together with the other columns takes forever, so we may want to add the 'Vectors' column in a separate, later step

def comments_into_df(comments):
    """Build a dataframe with one row per comment.

    Args:
        comments: list of comment objects exposing ``author``, ``body`` and
            whatever ``time_of`` reads.

    Returns:
        A ``pd.DataFrame`` with the columns 'Author', 'Time (PDT)',
        'Comment', 'Vectors' (from ``comment_embeddings``) and 'File' (the
        comment objects themselves).
    """
    authors = [comment.author for comment in comments]
    # NOTE(review): the column is labelled PDT but holds whatever time_of returns -- confirm
    timestamps = [time_of(comment) for comment in comments]
    bodies = [comment.body for comment in comments]
    vectors = comment_embeddings(comments)

    columns = {
        'Author': authors,
        'Time (PDT)': timestamps,
        'Comment': bodies,
        'Vectors': vectors,
        'File': comments,
    }
    return pd.DataFrame(columns)

In [None]:
# input: subreddit
# output: dataframe from hot N submissions
# within [time_1, time_2] if they are typed in
# time_1, time_2 are strings of the form '2024-05-09 23:10:02'

def extract_df(subreddit, N, time_1=None, time_2=None):
    """Scrape the hot `N` submissions of `subreddit` and return them as a dataframe.

    The arguments are handed to ``extract_comments`` unchanged and its result
    is converted with ``comments_into_df``.
    """
    scraped_comments = extract_comments(subreddit, N, time_1, time_2)
    return comments_into_df(scraped_comments)

In [None]:
# input: subreddit_name (string)
# execution: creates (and stores) or loads dataframes from hot N submissions of the given subreddit
# within [time_1, time_2] if they are typed in
# output: data_frame
# time_1, time_2 are strings of the form '2024-05-09 23:10:02'
# the name of the saved file: df_subreddit_name.csv

def load_or_save_df(subreddit_name, N, time_1=None, time_2=None):
    """Load the cached dataframe of a subreddit, or scrape and cache it.

    Args:
        subreddit_name: name of the subreddit (string); also determines the
            cache file, ``./csvs/df_<subreddit_name>.csv``.
        N: number of hot submissions to scrape when no cache file exists.
        time_1: optional lower time bound, '2024-05-09 23:10:02' format.
        time_2: optional upper time bound, same format.

    Returns:
        The dataframe, either read back from the CSV or freshly scraped.

    The cache file name depends only on `subreddit_name`, so once a file
    exists `N`, `time_1` and `time_2` are ignored; delete the file to scrape
    again with different settings.
    """
    dataframe_name = 'df_' + subreddit_name + '.csv'
    path = os.path.join('.', 'csvs', dataframe_name)

    if os.path.exists(path):  # the file already exists in the directory: just load it
        # converters -- needed because otherwise vectors are loaded as str
        df = pd.read_csv(path, converters={'Vectors': pd.eval})

        # drop the unnamed index column that to_csv wrote and read_csv brings back
        df = df.drop(columns="Unnamed: 0")
    else:
        # Create the output directory up front: to_csv does not create missing
        # directories, and failing after a long scrape would throw the work away.
        os.makedirs(os.path.dirname(path), exist_ok=True)

        subreddit = reddit.subreddit(subreddit_name)
        df = extract_df(subreddit, N, time_1, time_2)
        df.to_csv(path)  # save dataframe

    return df

In [None]:
# Build (or load from the saved CSV) one dataframe per company subreddit,
# using the hot 50 submissions of each.
# It seems to get a lot slower as either time_2 - time_1 or N gets larger.

df_costco = load_or_save_df("Costco", 50)
df_mcdonalds = load_or_save_df("McDonalds", 50)
df_samsung = load_or_save_df("samsung", 50)
df_open_ai = load_or_save_df("OpenAI", 50)

# Disabled because these calls never finished:
# df_microsoft = load_or_save_df("microsoft", N) -- Didn't stop for some reason
# df_apple = load_or_save_df("apple", 50) -- Didn't stop for some reason

In [None]:
# Show the dataframe built from the Costco subreddit
df_costco

In [None]:
# Show the dataframe built from the McDonalds subreddit
df_mcdonalds

In [None]:
# Show the dataframe built from the samsung subreddit
df_samsung

In [None]:
# Show the dataframe built from the OpenAI subreddit
df_open_ai

### 5. Creating Features of Comments

In [None]:
# function to compare (sentence) vectors
# inputs v, w are vectors in list object forms
# output can be between -1 and 1, where 1 means the best
# for sentence vectors, output seems to be always between 0 and 1

def cos_angle(v, w):
    """Return the cosine similarity of two vectors.

    Args:
        v: first vector, as a list (or anything ``np.array`` accepts).
        w: second vector, same form.

    Returns:
        The single similarity value computed by ``cosine_similarity``.
    """
    # cosine_similarity works on 2-D inputs, so each vector becomes one row.
    row_v = np.array(v).reshape(1, -1)
    row_w = np.array(w).reshape(1, -1)

    similarity_matrix = cosine_similarity(row_v, row_w)  # one row, one column
    return similarity_matrix[0][0]

In [None]:
def insert_feature(df, feature_query, feature_name):
    """Add a column scoring every comment against a query sentence.

    Args:
        df: dataframe with a 'Vectors' column holding one embedding per row.
        feature_query: sentence describing the feature, e.g.
            "The quality was very good."
        feature_name: name of the column to create (or overwrite).

    Returns:
        The same dataframe object that was passed in; the column is added in
        place, so every other reference to `df` sees it too.
    """
    query_vector = get_sentence_embedding(feature_query)

    # A plain list is assigned row by row in order, so the scores land on the
    # right rows even when df does not have the default 0..n-1 index.
    df[feature_name] = [cos_angle(vector, query_vector) for vector in df.Vectors]

    return df


In [None]:
# Score every Costco comment against a positive statement about quality.
# NOTE: insert_feature adds the column to the dataframe it receives and returns
# that same object, so from here on `df` and `df_costco` are the same dataframe.
df = insert_feature(df_costco, "The quality was very good.", "Quality")
df

In [None]:
# Score every comment against a positive statement about price.
df = insert_feature(df, "The price is very reasonable.", "Price")
df

In [None]:
# Score every comment against a negative statement about quality.
# Passing df_costco here still keeps the earlier columns, because it is the
# same object as `df`.
df = insert_feature(df_costco, "The quality was horrible.", "Quality (negative)")
df

In [None]:
# Score every comment against a negative statement about price.
df = insert_feature(df, "The price is exorbitant.", "Price (negative)")
df

### 6. Some Demonstrations of Data Frames

First, we consider Price vs. Price (negative) features:

In [None]:
# top 5 price: sort by 'Price' in descending order and show the first rows

df.sort_values(by='Price', ascending = False).head()

In [None]:
# bottom 5 price: same descending sort, showing the last rows

df.sort_values(by='Price', ascending = False).tail()

In [None]:
# top 5 price (negative): sort by 'Price (negative)' in descending order and show the first rows

df.sort_values(by='Price (negative)', ascending = False).head()

In [None]:
# bottom 5 price (negative): same descending sort, showing the last rows

df.sort_values(by='Price (negative)', ascending = False).tail()

Now, we consider Quality vs. Quality (negative) features:

In [None]:
# top quality: sort by 'Quality' in descending order and show the first rows
df.sort_values(by='Quality', ascending = False).head()

In [None]:
# bottom quality: same descending sort, showing the last rows
df.sort_values(by='Quality', ascending = False).tail()

In [None]:
# top quality (negative): sort by 'Quality (negative)' in descending order and show the first rows
df.sort_values(by='Quality (negative)', ascending = False).head()

In [None]:
# bottom quality (negative): same descending sort, showing the last rows
df.sort_values(by='Quality (negative)', ascending = False).tail()

**Remark**. It does NOT seem that each feature and its negative version have a high-magnitude correlation (either positive or negative).

In [None]:
# Correlation matrix between the 'Price' feature and its negative counterpart
df[['Price','Price (negative)']].corr()

In [None]:
# Correlation matrix between the 'Quality' feature and its negative counterpart
df[['Quality','Quality (negative)']].corr()

**Remark**. From the pairplot below, it seems that negative questions may be better in creating features.

In [None]:
# Draw seaborn's pairplot for df and render the figure.
# NOTE(review): presumably only the numeric feature columns end up in the plot -- confirm.
sns.pairplot(df)
plt.show()