## NLP Analysis of Reddit Comments

#### Semantic Search and Topic Modeling

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# later cells can read the data files by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install -q sentence_transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m915.9 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd

# Load every scraped subreddit comment from the mounted Drive folder.
# NOTE(review): the folder name is written with a trailing space
# ("Applied Paper "); keep it in sync with the actual Drive folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; the recorded output shows a warning on this line,
# presumably the mixed-type DtypeWarning that chunked inference produces.
all_comments = pd.read_csv(
    '/content/drive/MyDrive/Applied Paper /data/all_subreddit_comments.csv',
    low_memory=False,
)

  all_comments = pd.read_csv('/content/drive/MyDrive/Applied Paper /data/all_subreddit_comments.csv')


In [7]:
# The corpus is the comment text column as a plain Python list; its positions
# line up one-to-one with the rows of all_comments.
corpus = all_comments['comment_body'].tolist()

# Preview the first rows of the raw data.
all_comments.head()

Unnamed: 0,comment_id,comment_body,comment_link_id,comment_utc,comment_subreddit,comment_upvotes,date
0,dr324ax,[deleted],t3_7j1w9l,,,,
1,dr34t24,"NASA : ""So instead of finding X planets every ...",t3_7j1w9l,,,,
2,dr34plh,I also read that Google is involved and taking...,t3_7j1w9l,,,,
3,dr342uy,This is like taking a little kid to the beach....,t3_7j1w9l,,,,
4,dr32wu2,More details here:\n\nhttps://science.slashdot...,t3_7j1w9l,,,,


### Encoding

In [13]:
# Encode the corpus in fixed-size chunks and stitch the chunks together.
import torch
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence transformer (small and fast).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Number of comments handed to model.encode per call.
batch_size = 1000

# One embedding tensor per chunk of `batch_size` consecutive comments,
# produced in corpus order.
encoded_embeddings_list = [
    model.encode(corpus[start:start + batch_size], convert_to_tensor=True)
    for start in range(0, len(corpus), batch_size)
]

# Stack the per-chunk tensors row-wise so row i is the embedding of corpus[i].
corpus_embeddings = torch.cat(encoded_embeddings_list, dim=0)


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [26]:
# Number of embedded comments (rows of the embedding matrix).
corpus_embeddings.shape[0]

187029

### Cosine Similarity

In [49]:
import torch

# Natural-language question whose closest comments we want to retrieve.
search_query = "what is the impact of AI and technology on employee trust uncertainty and vulnerability"

# The query is passed to the encoder as a one-element list, built from
# search_query so the text is defined in exactly one place.
query = [search_query]

# Embed the query with the same model that embedded the corpus.
query_embedding = model.encode(query, convert_to_tensor=True)

# Keep at most 1500 hits, or the whole corpus if it is smaller than that.
top_k = min(1500, len(corpus))

# Cosine similarity between the query and every corpus embedding; [0] selects
# the score row belonging to the single query.
cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

# Values and corpus positions of the top_k highest scores; torch.topk returns
# them sorted from most to least similar by default.
top_results = torch.topk(cos_scores, k=top_k)


In [50]:
# Assemble the search hits into one DataFrame in a single step instead of
# concatenating one single-row DataFrame per hit.
# top_results.indices holds corpus positions and top_results.values holds the
# matching cosine scores, both in the same (descending-score) order.
top_indices = top_results.indices.tolist()
top_scores = top_results.values.tolist()

result_df = pd.DataFrame({
    'Comment': [corpus[position] for position in top_indices],
    'Cosine_Similarity': top_scores,
})

In [51]:
# Display the ranked hits (pandas truncates the rendered table).
result_df

Unnamed: 0,Comment,Cosine_Similarity
0,How is that different than AI’s impact on any ...,0.582515
1,"To trust a AI, you need to trust the best cand...",0.574457
2,"I'll be honest, I don't know if we can trust t...",0.564018
3,If employers have to choose between productivi...,0.561035
4,Super interesting perspective and I think this...,0.558038
...,...,...
1495,And the robots will be taken care of much bett...,0.405870
1496,In my opinion they’re best to solve two differ...,0.405810
1497,People tend to overestimate the impacts of tec...,0.405794
1498,AI doomsday scenarios are blown way out of pro...,0.405768


In [52]:
# Finally, attach the original metadata to every hit.
#
# The join is done by corpus position rather than by comment text: a text join
# emits one row per matching row in all_comments, so any comment body that
# occurs more than once would multiply the hits. corpus was built from
# all_comments['comment_body'] in row order, so the positions returned by
# torch.topk index all_comments directly.
top_positions = top_results.indices.tolist()
assert len(top_positions) == len(result_df), \
    "result_df is out of sync with top_results; re-run the search cells"

matched_rows = all_comments.iloc[top_positions].reset_index(drop=True)

# Same column layout as before: Comment, Cosine_Similarity, then every
# column of the original data set.
merged_data = pd.concat([result_df.reset_index(drop=True), matched_rows], axis=1)

# Guard the alignment assumption: each hit must sit next to its own source row.
assert (merged_data['Comment'] == merged_data['comment_body']).all(), \
    "corpus positions no longer line up with all_comments"

# Display the merged DataFrame
merged_data.head()


Unnamed: 0,Comment,Cosine_Similarity,comment_id,comment_body,comment_link_id,comment_utc,comment_subreddit,comment_upvotes,date
0,How is that different than AI’s impact on any ...,0.582515,irn35hr,How is that different than AI’s impact on any ...,t3_xzf2m4,1665327697.0,tech,1.0,2022-10-09 15:01:37+00:00
1,"To trust a AI, you need to trust the best cand...",0.574457,i7lzdye,"To trust a AI, you need to trust the best cand...",t3_ujjg7q,1651876667.0,recruiting,3.0,2022-05-06 22:37:47+00:00
2,"I'll be honest, I don't know if we can trust t...",0.564018,g69f0yc,"I'll be honest, I don't know if we can trust t...",t3_ixqmym,,,,
3,If employers have to choose between productivi...,0.561035,fsz9k1w,If employers have to choose between productivi...,t3_gwz6go,,,,
4,Super interesting perspective and I think this...,0.558038,h374r2y,Super interesting perspective and I think this...,t3_o8ct3v,1624788103.0,humanresources,4.0,2021-06-27 10:01:43+00:00


In [53]:
# Store the matched records on Drive as a CSV file.
# The write is currently disabled (commented out); uncomment the line below to
# export merged_data.
#merged_data.to_csv('/content/drive/MyDrive/Applied Paper /data/semantic_search_comments.csv')
