<a href="https://colab.research.google.com/github/edisonsiu/sentichain/blob/Reddit-API/SentiChain_Jack_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os
# Mount Google Drive under /content/drive so later cells can read files stored there.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
#Check directory to see if files needed are in the directory
!ls '//content/drive/My Drive/Colab Notebooks'

'Format Conversion.ipynb'				   __pycache__
 generative-ai-and-the-future-of-work-in-america-vf1.pdf   SentiChain_Jack.ipynb
 HW1_Jack_Xuhui_Zhang_XZ588.ipynb			   vector_store_cohere
 HW2_Jack_Xuhui_Zhang_XZ588.ipynb			   vector_store_openai
 key.py


In [3]:
%%capture
!pip install plotly
!pip install requests

# **2. Search Reddit by Keyword**

In [49]:
import requests
import pandas as pd

def search_reddit(query, limit, timeframe):
    """Query Reddit's public search endpoint and return the decoded JSON.

    The search terms are sent through ``params`` so that ``requests``
    URL-encodes them; characters such as ``&``, ``#`` or spaces in ``query``
    can therefore no longer corrupt the request URL.

    Args:
        query: Search keywords, sent as the ``q`` parameter.
        limit: Number of results to request, sent as the ``limit`` parameter.
            NOTE(review): Reddit's listing API documents a maximum of 100, so
            larger values are presumably clamped server-side -- confirm.
        timeframe: Time window, sent as the ``t`` parameter (e.g. ``'day'``).

    Returns:
        The parsed JSON body, or ``None`` when the request fails, times out,
        returns an HTTP error status, or the body is not valid JSON.
    """
    search_url = 'https://www.reddit.com/search.json'
    search_params = {'q': query, 'limit': limit, 't': timeframe, 'sort': 'top'}
    try:
        response = requests.get(
            search_url,
            params=search_params,
            headers={'User-agent': 'SentiChain_00001'},
            timeout=30,  # without a timeout a stalled connection blocks the cell forever
        )
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose
        # JSONDecodeError is not a RequestException subclass.
        print(f'An Error Occurred: {e}')
        return None

def get_results(r):
    """Convert a Reddit search response into a DataFrame of post fields.

    Posts flagged ``over_18`` are dropped. The returned frame always has the
    same columns, in the same order: a field missing from the response is
    filled with NaN instead of raising ``KeyError``, and the empty fallback
    frame carries the columns too, so downstream column selection keeps
    working.

    Args:
        r: Parsed JSON response (dict) from the search endpoint, or ``None``.

    Returns:
        A DataFrame with the columns title, selftext, ups, downs,
        upvote_ratio, num_comments, score, url and subreddit. It is empty
        when the response is missing, malformed, or has no eligible posts.
    """
    columns = ['title', 'selftext', 'ups', 'downs', 'upvote_ratio', 'num_comments', 'score', 'url', 'subreddit']

    if r and 'data' in r and 'children' in r['data']:
        posts_data = [post['data'] for post in r['data']['children'] if not post['data'].get('over_18', False)]
        if posts_data:
            # reindex (unlike df[[...]]) tolerates absent fields and returns a
            # new frame rather than a slice of the original.
            return pd.DataFrame(posts_data).reindex(columns=columns)
        print("No posts data found in response.")
    else:
        print("Response is empty or has an unexpected format.")
    return pd.DataFrame(columns=columns)


In [50]:
if __name__ == '__main__':

    # ---- User-configurable search settings ---------------------------------
    # query: keyword to search for, limit: number of posts requested,
    # timeframe: Reddit time window for the search.
    query, limit, timeframe = 'Dogecoin', 1000, 'day'
    # -------------------------------------------------------------------------

    reddit_data = search_reddit(query, limit, timeframe)
    if not reddit_data:
        print("No data received from Reddit.")
    else:
        # Keep one row per URL so the same link is not counted twice.
        reddit_df = get_results(reddit_data).drop_duplicates(subset=['url'])
        print(reddit_df)

                                                title  \
0       be true to yourselves shibes, act accordingly   
1                  I Love Ðogecoin and I love you all   
2         Last three months have been giving me hope.   
3         Ruh roh folks what’s this looking like……….👍   
4   People who just started investing in crypto ri...   
5   I'll keep reposting till we hit the Dollar. We...   
6   Everybody who likes when Ðogecoin price goes u...   
7   It doesn’t matter where you come from if you a...   
8                               Just bought the dip 💎   
9                     DOGE will have own satellite 🛰️   
10                                Did you DOGE today?   
11                     Keep hodling ladies and gents!   
12                             dip before the 15¢ run   
13                                  When do Lambo???    
14                            Absolutely STUNNED!!!!!   
15  now... a rebound in a few minutes... in a few ...   
16  All of the people holding Ð

(1) Define and calculate the engagement_score

In [6]:
def calculate_engagement_score(post, comment_weight=0.6, score_weight=0.4):
    """Combine a post's comment count and score into one engagement number.

    Args:
        post: Mapping-like object (dict or DataFrame row) exposing
            ``'num_comments'`` and ``'score'``.
        comment_weight: Multiplier applied to ``num_comments``. Defaults to
            0.6, the value that was previously hard-coded.
        score_weight: Multiplier applied to ``score``. Defaults to 0.4, the
            value that was previously hard-coded.

    Returns:
        ``comment_weight * num_comments + score_weight * score``.
    """
    num_comments = post['num_comments']
    score = post['score']
    return (comment_weight * num_comments) + (score_weight * score)

def process_posts(df):
    """Return a copy of ``df`` with engagement scores and normalized weights.

    The input frame is left untouched, so re-running the cell (or reusing the
    raw frame elsewhere) is not affected by columns added here.

    Args:
        df: DataFrame of posts with ``'num_comments'`` and ``'score'`` columns.

    Returns:
        A new DataFrame with two extra columns:
        ``engagement_score`` (from ``calculate_engagement_score``) and
        ``post_weight`` (each post's share of the total engagement).
        If the total engagement is 0, every post gets the same weight
        ``1 / len(df)`` instead of NaN from a 0/0 division. An empty input
        yields an empty frame that still carries both columns.
    """
    result = df.copy()

    if result.empty:
        # Row-wise apply on an empty frame does not yield a usable column;
        # add empty float columns explicitly so the schema stays stable.
        result['engagement_score'] = pd.Series(index=result.index, dtype=float)
        result['post_weight'] = pd.Series(index=result.index, dtype=float)
        return result

    # First, calculate the engagement score for each post
    result['engagement_score'] = result.apply(calculate_engagement_score, axis=1)

    # Calculate the sum of all engagement scores
    total_engagement_score = result['engagement_score'].sum()

    # Now, calculate the weight for each post
    if total_engagement_score == 0:
        # No post has any engagement: fall back to uniform weights.
        result['post_weight'] = 1.0 / len(result)
    else:
        result['post_weight'] = result['engagement_score'] / total_engagement_score

    return result

# Expects `reddit_df` (the DataFrame built from the Reddit search) to exist already.
processed_df = process_posts(reddit_df)
print(processed_df.loc[:, ['title', 'selftext', 'engagement_score', 'post_weight']])



                                                title  \
0                  I Love Ðogecoin and I love you all   
1   Thoughts on $PEEP (SOL) - the Solana $PEPE spi...   
2            Is there any downside to staking Solana?   
3   Found these extra drops in my wallet and want ...   
4           Is this a sign to buy Guac? It looks nice   
..                                                ...   
77  📈 Cardano breaks into top 10 DeFi protocols by...   
78  Celestia Price Prediction as Airdrop Token App...   
79                                     Solana network   
80                                              S.O.S   
82  Solana congested network? Here we go again and...   

                                             selftext  engagement_score  \
0                                                                  36.6   
1   Not asking for investing advice, but rather if...              36.0   
2   Hey guys, I’m relatively new to the crypto sce...              25.2   
3              

(2) Get the sentiment_score for each post


In [7]:
%%capture
!pip install  langchain     #Langchain
!pip install  "openai<=0.28.1"  #OpenAI pre-11/07


!pip install tiktoken       #tokenizer for Open AI models
!pip install streamlit      #GUI for apps

In [8]:
# Import the `key.py` file and get OpenAI's API key.
# The working directory is switched to the Drive folder first; presumably this is
# what lets `import key` find key.py (the folder listing earlier in the notebook
# shows key.py there). Keep the chdir ahead of the import.
os.chdir('/content/drive/My Drive/Colab Notebooks')
import key

import openai
# Read the key from key.py and publish it through the OPENAI_API_KEY
# environment variable.
apikey_openai = key.OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = apikey_openai

In [36]:
import langchain
import time

from langchain.agents import initialize_agent, load_tools
from langchain.agents import AgentType
from langchain.llms import OpenAI
from langchain.utilities import SerpAPIWrapper

In [51]:
import asyncio
from concurrent.futures import ThreadPoolExecutor

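# NOTE: other necessary imports and function definitions go here. In particular,
# sentiment_analysis() and truncate_text() are called below but are not defined
# in the cells shown; define or import them before running this cell.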

# Asynchronous function to process posts concurrently
async def process_posts_concurrently(posts, query):
    """Run ``sentiment_analysis`` for every post in a thread pool.

    Each post is passed through ``truncate_text`` (called synchronously, while
    the task list is built) and then handed to ``sentiment_analysis`` together
    with ``query`` on a ``ThreadPoolExecutor`` worker.

    Args:
        posts: Iterable of post texts.
        query: Second argument forwarded to ``sentiment_analysis`` for every post.

    Returns:
        A list with one ``sentiment_analysis`` result per post, in the same
        order as ``posts`` (an empty list when ``posts`` is empty).
    """
    # Inside a coroutine, get_running_loop() is the documented way to obtain
    # the loop; get_event_loop() is discouraged in this context.
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor() as executor:
        tasks = [
            loop.run_in_executor(executor, sentiment_analysis, truncate_text(post), query)
            for post in posts
        ]
        return await asyncio.gather(*tasks)

# Driver coroutine; relies on the notebook-level `processed_df` and `query`
async def main():
    """Score every post's selftext, time the run, and return the result.

    Reads the global ``processed_df`` and ``query``, stores the scores in a new
    ``sentiment_score`` column on ``processed_df``, prints the elapsed time, and
    returns the title/selftext/engagement/weight/sentiment columns.
    """
    started_at = time.perf_counter()

    # Score the selftext of every row in processed_df
    sentiment_scores = await process_posts_concurrently(
        processed_df['selftext'].tolist(),
        query,
    )

    elapsed_time = time.perf_counter() - started_at
    print(f"Sentiment analysis executed in {elapsed_time:.2f} seconds.")

    # Attach the scores to the shared frame
    processed_df['sentiment_score'] = sentiment_scores

    output_columns = ['title', 'selftext', 'engagement_score', 'post_weight', 'sentiment_score']
    return processed_df[output_columns]

# Run the main coroutine. Top-level `await` relies on the notebook (IPython/Colab)
# executing cells with await support; in a plain script use asyncio.run(main()).
new_df = await main()

# new_df now contains the updated DataFrame
print(new_df)


Sentiment analysis executed in 6.36 seconds.
                                                title  \
0                  I Love Ðogecoin and I love you all   
1   Thoughts on $PEEP (SOL) - the Solana $PEPE spi...   
2            Is there any downside to staking Solana?   
3   Found these extra drops in my wallet and want ...   
4           Is this a sign to buy Guac? It looks nice   
..                                                ...   
77  📈 Cardano breaks into top 10 DeFi protocols by...   
78  Celestia Price Prediction as Airdrop Token App...   
79                                     Solana network   
80                                              S.O.S   
82  Solana congested network? Here we go again and...   

                                             selftext  engagement_score  \
0                                                                  36.6   
1   Not asking for investing advice, but rather if...              36.0   
2   Hey guys, I’m relatively new to the crypt

(3) Calculate the weighted_sentiment_score

In [52]:
def calculate_weighted_sentiment_score(df):
    """Return the sum over posts of ``sentiment_score * post_weight``.

    Args:
        df: DataFrame with ``'sentiment_score'`` and ``'post_weight'`` columns.
            It is not modified.

    Returns:
        The weighted sentiment score as a single number.
    """
    per_post_contribution = df['sentiment_score'].mul(df['post_weight'])
    return per_post_contribution.sum()

# Call the function on new_df (the frame returned by main()) and print the
# weighted sentiment score rounded to two decimals
weighted_sentiment_score = calculate_weighted_sentiment_score(new_df)
print(f"The weighted sentiment score is: {weighted_sentiment_score:.2f}")



The weighted sentiment score is: 0.03
