In [None]:
# pip install transformers

In [1]:
# Import libraries

import requests
import time
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import string

In [2]:
# Subreddit to scrape. This name is inserted into the listing URL
# ('https://www.reddit.com/r/' + thread + '/.json') and is also used as the
# prefix of the CSV files written later in the notebook.
thread = 'Crypto_com' # replace with the subreddit name to track

In [3]:
# Scraper
# Pages through the subreddit's JSON listing, collects the raw entries and
# builds `scrapped_thread`, a DataFrame with one row per unique post
# (columns: post_title, content, name).

header = {'User-agent': 'ep 0.1.1'}
url = 'https://www.reddit.com/r/'+thread+'/.json'

# Raw listing entries collected across all fetched pages
posts = []

# Pagination cursor taken from the previous response; None requests the first page
after = None

# Iterate through up to 5 pages (25 posts per page by default)
for i in tqdm(range(5)):
    param = {} if after is None else {'after': after}
    # A timeout keeps a stalled connection from hanging the notebook indefinitely
    results = requests.get(url, params=param, headers=header, timeout=30)
    if results.status_code != 200: # Stop on any unsuccessful request
        print(results.status_code)
        break
    res_json = results.json()
    posts.extend(res_json['data']['children'])
    after = res_json['data']['after']
    if after is None:
        # No cursor means there is no next page; another request without
        # 'after' would only download the first page again
        break
    # Rest time in seconds
    time.sleep(1)

# Keep each post once, keyed on the 'name' field of the listing entry
# (presumably Reddit's unique identifier for the post - see the Reddit API docs).
# Iterating the raw list also works when nothing was scraped, whereas indexing
# an empty DataFrame by 'data' would raise a KeyError.
seen_names = set()
lst = {'post_title': [], 'content': [], 'name': []}
for entry in posts:
    post = entry['data']
    if post['name'] in seen_names:
        continue
    seen_names.add(post['name'])
    lst['post_title'].append(post['title'])
    lst['content'].append(post['selftext'])
    lst['name'].append(post['name'])

# Preserve the original end state of this cell, where `posts` is a DataFrame
posts = pd.DataFrame(posts)

print('Successfully scrapped {} unique posts'.format(len(lst['post_title'])))
scrapped_thread = pd.DataFrame(lst)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(5)):


  0%|          | 0/5 [00:00<?, ?it/s]

Successfully scrapped 127 unique posts


In [4]:
# Save dataset
# The scraped posts go to '<thread>_thread.csv' in the working directory;
# the DataFrame index is not written.
file_name = f'./{thread}_thread.csv'
scrapped_thread.to_csv(path_or_buf=file_name, index=False)

In [5]:
# Import data
df_thread = pd.read_csv(file_name)

# Combine title and body into a single text column. Rows where either part is
# missing (NaN) produce NaN and are removed by dropna().
df_title_content = (
    pd.DataFrame({"title_content": df_thread.post_title + ' -- ' + df_thread.content})
    .dropna()
    .reset_index(drop=True)
)

# Data processing
# Remove rows with attachments: any row containing one of these literal markers
attachment_markers = ["png", "ampx200b\n\nhttpsredditcom", "&amp;#x200B"]
has_attachment = pd.Series(False, index=df_title_content.index)
for marker in attachment_markers:
    # regex=False: the markers are plain substrings, not regular expressions
    has_attachment |= df_title_content.title_content.str.contains(marker, regex=False)

df_title_content = df_title_content[~has_attachment].reset_index(drop=True)

# Preview up to 20 rows; capping n avoids a ValueError when fewer rows remain,
# and the fixed seed makes the preview reproducible across runs.
df_title_content.sample(n=min(20, len(df_title_content)), random_state=42)

Unnamed: 0,title_content
24,Delayed Employer Direct Deposit via ACH to Fia...
41,How to get ETH on the Ethereum chain from ETH ...
28,Received Exclusive Merchandise Welcome Pack af...
58,"Lounge access after may 31th 2024 -- Hi,\n\nth..."
31,Closest Thing To A Brokerage Statement From CD...
43,Help getting account back -- I’ve fallen on ha...
44,Why is the price like this? -- So when i took ...
52,"TUSD at 155% APY?? -- Hello all,\n\nI am check..."
11,"People who managed to create a good grid bot, ..."
12,What happened to moon🌛MOONNODE???? 0% p.a. All...


In [7]:
# Use sentiment model from hugging face: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
# Import libraries 
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
from datetime import datetime
import ssl

# WARNING (security): this replaces the ssl module's default HTTPS context
# factory with the unverified one, so HTTPS connections opened through the
# standard library afterwards (e.g. urllib.request.urlopen) are made without
# verifying the server certificate. It applies to the whole kernel session.
# NOTE(review): presumably a workaround for local certificate errors - prefer
# fixing the certificate store and removing this block if it is not needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not expose ssl._create_unverified_context;
    # leave the default (verifying) context in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# Task selects both the model checkpoint and the matching label mapping file
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping: a tab-separated file whose second column is the label name
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    mapping_rows = f.read().decode('utf-8').split("\n")
# Rows with fewer than two fields (e.g. a trailing empty line) are skipped
labels = [row[1] for row in csv.reader(mapping_rows, delimiter='\t') if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) writes into a local directory with the same name as
# the hub id, which from_pretrained(MODEL) will then pick up on later runs.
# Save the tokenizer next to the model so that directory is complete;
# otherwise reloading the tokenizer from it on a re-run can fail.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


In [8]:

# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and has
    more than one character becomes '@user'; a token that starts with 'http'
    becomes 'http'. All other tokens are kept, and the tokens are joined back
    with single spaces.
    """
    def _placeholder(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))


In [9]:
# Use model and defined functions to get sentiment of imported reddit data
import torch  # only needed here, to disable gradient tracking during inference

# Upper bound on tokens per input, special tokens included. Without truncation
# a long post can exceed what the model accepts and fail inside the forward pass.
# NOTE(review): 512 is the usual RoBERTa-base limit - confirm against model.config.
MAX_INPUT_TOKENS = 512

sentiment=[]
with torch.no_grad():  # inference only: no autograd graph is built
    for i in df_title_content.title_content:
        text = preprocess(i)
        encoded_input = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        )
        output = model(**encoded_input)
        # Logits for the single input in the batch; softmax is monotonic, so the
        # highest logit is also the highest-probability label.
        scores = output[0][0].numpy()
        sentiment.append(labels[int(np.argmax(scores))])

df_title_content['sentiment']=sentiment

# Save post and sentiment to file
df_title_content.to_csv(thread+'_clean_content_sentiment.csv')

In [11]:
# Check number of posts retrieved
# (row count of the cleaned frame, i.e. posts left after filtering)
print(f"Number of posts retrieved: {len(df_title_content.index)}")

Number of posts retrieved: 76


In [12]:
# Use openai API to summarise the dataset

# Import library
import openai

def summarize_corpus(corpus, max_tokens=500, engine='text-davinci-003'):
    """Ask the OpenAI completion API for a summary of `corpus`.

    Parameters
    ----------
    corpus : str
        Text to summarise; it is appended to a fixed instruction prompt.
    max_tokens : int, default 500
        Maximum number of tokens for the generated summary.
    engine : str, default 'text-davinci-003'
        Completion model to call.
        NOTE(review): confirm this model and the `openai.Completion` interface
        are still available for the installed openai package version.

    Returns
    -------
    str
        Text of the first returned choice, stripped of surrounding whitespace.

    Raises
    ------
    RuntimeError
        If no API key is configured.
    """
    import os

    # Credentials are never hard-coded: use a key already set on the module,
    # otherwise fall back to the OPENAI_API_KEY environment variable.
    if not getattr(openai, 'api_key', None):
        openai.api_key = os.environ.get('OPENAI_API_KEY')
    if not openai.api_key:
        raise RuntimeError(
            'OpenAI API key not configured: set the OPENAI_API_KEY environment '
            'variable or assign openai.api_key before calling summarize_corpus'
        )

    # Provide the prompt and settings for the API call
    prompt = 'Summarize the following text: ' + corpus

    # Call the OpenAI API to generate the summary
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.2,
        n=1,
        stop=None
    )

    # Extract the generated summary from the API response
    return response.choices[0].text.strip()


# Use nltk to gauge token count

# Import library
import nltk

def count_tokens(corpus):
    """Return the number of tokens `nltk.word_tokenize` produces for `corpus`.

    NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be
    installed in the environment - confirm before running on a fresh setup.
    """
    return len(nltk.word_tokenize(corpus))


In [25]:
# Apply defined functions
# For every sentiment label, summarise the posts carrying that label.

# Estimated prompt max token count, leaving 500 tokens for the output
PROMPT_TOKEN_BUDGET = 3000
# Number of posts kept when the full set is estimated to exceed the budget.
# NOTE(review): the reduced set is not re-checked against the budget.
MAX_POSTS_OVER_BUDGET = 30

print(df_title_content.sentiment.value_counts())
for k in labels:
    corpus = df_title_content[df_title_content.sentiment==str(k)].title_content

    if corpus.empty:
        # Nothing to summarise for this label; skip the API call entirely
        print("\nNumber of "+str(k)+" posts used:", 0)
        print("No posts with this sentiment to summarise.")
        continue

    corpus_json = corpus.to_json()
    if count_tokens(corpus_json) > PROMPT_TOKEN_BUDGET:
        corpus = corpus.iloc[:MAX_POSTS_OVER_BUDGET]
        corpus_json = corpus.to_json()

    summary = summarize_corpus(corpus_json)
    num_posts = corpus.shape[0]

    print("\nNumber of "+str(k)+" posts used:", num_posts)
    print(summary)

sentiment
negative    37
neutral     32
positive     7
Name: count, dtype: int64

Number of negative posts used: 30
This post is about people having issues with Crypto.com, such as not being able to pay friends, withdraw funds, transfer money, and access their accounts. People are also complaining about the bonuses, rewards, customer support, and fees associated with the platform. They are also asking questions about how to use the platform, such as how to withdraw coins to a connected defi wallet, how to unlock a card after entering the wrong pin, and if PayID payments have been blocked in Australia.

Number of neutral posts used: 32
This text contains a variety of questions and comments related to Crypto.com. Questions include: delisting $PEPE, BTC withdrawal fees, network importance, receiving 8% for Rose Gold holders, Priority Pass and guests, benefits of holding a Ruby Card, staking on the Cosmos system, Supercharger calculations, time to unstake ATOM and CRO in DeFi, Priority Pas