In [1]:
import requests
import pandas as pd
import datetime
import config
import time
from datetime import datetime, timedelta
#from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from bs4 import BeautifulSoup
import re
import feedparser
from readability import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# main_script.py
import config

In [2]:
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [3]:
# Access the API key
api_key = config.API_KEY

global_api_rate_delay = .2  # All API methods are rate limited per IP at 5req/sec.

def make_url(api_key, filter=None, currencies=None, kind=None, region=None, page=None):
    """Handle of URL variables for API POST."""
    url = f'https://cryptopanic.com/api/v1/posts/?auth_token={api_key}'

    if currencies:
        if len(currencies.split(',')) <= 50:
            url += f"&currencies={currencies}"
        else:
            print("Warning: Max Currencies is 50")
            return

    if kind is not None and kind in ['news', 'media']:
        url += f"&kind={kind}"

    filters = ['rising', 'hot', 'bullish', 'bearish', 'important', 'saved', 'lol']
    if filter is not None and filter in filters:
        url += f"&filter={filter}"

    regions = ['en', 'de', 'es', 'fr', 'it', 'pt', 'ru']
    if region is not None and region in regions:
        url += f"&region={region}"

    if page is not None:
        url += f"&page={page}"

    return url


def get_page_json(url=None):
    """
    Get First Page.

    Returns Json.

    """
    time.sleep(global_api_rate_delay)
    if not url:
        url = "https://cryptopanic.com/api/v1/posts/?auth_token={}".format(config.API_KEY)
    page = requests.get(url)
    data = page.json()
    return data


def get_pages_list_json(lookback, url):
    """
    Get history of pages starting from page 1 to the lookback.

    Returns: List of Pages in Json format

    """
    pages_list_json = [get_page_json(url)]

    for i in range(lookback):
        pages_list_json.append(get_page_json(pages_list_json[i]["next"]))

    return pages_list_json


def get_df(data):
    """Return pandas DF."""
    # Ensure that data is a list of dictionaries
    if not all(isinstance(item, dict) for item in data):
        raise ValueError("Data must be a list of dictionaries")
    df = pd.DataFrame(data)
    try:
        df['created_at'] = pd.to_datetime(df['created_at'])
    except Exception as e:
        print(f"An error occurred: {e}")
    return df

def concatenate_pages(pages_list):
    """Concatenate Pages into one Dataframe."""
    # Ensure that pages_list is a list of lists of dictionaries
    if not all(isinstance(page, list) and all(isinstance(item, dict) for item in page) for page in pages_list):
        raise ValueError("Pages list must be a list of lists of dictionaries")
    frames = [get_df(page) for page in pages_list]
    return pd.concat(frames, ignore_index=True)

def get_last_posts(api_key, number_of_posts=200):
    pages_list = []
    page = 1
    total_results = 0
    while total_results < number_of_posts:
        # Conditionally set the filter based on a condition (e.g., 'hot' or 'important')
        if total_results % 2 == 0:
            filter = 'important'
        else:
            filter = 'hot'
        
        url = make_url(api_key, filter=filter, currencies='BTC', page=page)
        
        data = get_page_json(url)
        page_results = data['results']
        pages_list.append(page_results)
        total_results += len(page_results)
        page += 1
        if 'next' not in data or not data['next']:
            break  # No more pages to fetch
    flat_list = [item for sublist in pages_list for item in sublist]
    flat_list = flat_list[:number_of_posts]
    return [flat_list]

# Get the last 200 posts
pages_list = get_last_posts(api_key, number_of_posts=200)
df_last_200_posts = concatenate_pages(pages_list)
print(df_last_200_posts.head())

   kind            domain                                              votes  \
0  news        protos.com  {'negative': 0, 'positive': 8, 'important': 6,...   
1  news      zycrypto.com  {'negative': 8, 'positive': 1, 'important': 3,...   
2  news  cryptopotato.com  {'negative': 4, 'positive': 6, 'important': 3,...   
3  news      zycrypto.com  {'negative': 10, 'positive': 1, 'important': 9...   
4  news       theblock.co  {'negative': 1, 'positive': 4, 'important': 3,...   

                                              source  \
0  {'title': 'Protos.com', 'region': 'en', 'domai...   
1  {'title': 'ZyCrypto', 'region': 'en', 'domain'...   
2  {'title': 'Feed - Cryptopotato.Com', 'region':...   
3  {'title': 'ZyCrypto', 'region': 'en', 'domain'...   
4  {'title': 'The Block', 'region': 'en', 'domain...   

                                               title          published_at  \
0  Justin Trudeau infringed freedoms in truck dri...  2024-01-24T11:01:40Z   
1  Bitcoin Proponent Rips 

In [4]:
# Transform the DataFrame into a list of dictionaries
articles_list = []
for _, row in df_last_200_posts.iterrows():
    article_dict = {
        'title': row['title'],
        'link': row['url'],  # Assuming 'url' is the link to the full article
        'published': row['created_at'],  # 'published_at' could also be used
        'summary': row['slug']  # Assuming you want to use the title as a summary, adjust as needed
        # If you have a summary column you can replace 'title' with the actual summary column name
    }
    articles_list.append(article_dict)

# Now articles_list is in the same format as thedefiant_articles
# You can print to confirm
for article in articles_list[:5]:  # Just print the first 5 for brevity
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Published: {article['published']}")
    print(f"Summary: {article['summary']}")
    print('---')

Title: Justin Trudeau infringed freedoms in truck drivers’ bitcoin funding ban, judge rules
Link: https://cryptopanic.com/news/19199558/Justin-Trudeau-infringed-freedoms-in-truck-drivers-bitcoin-funding-ban-judge-rules
Published: 2024-01-24 11:01:40+00:00
Summary: Justin-Trudeau-infringed-freedoms-in-truck-drivers-bitcoin-funding-ban-judge-rules
---
Title: Bitcoin Proponent Rips Into ‘Centralized Garbage’ XRP, Foresees It Crashing To Virtually Zero Against BTC
Link: https://cryptopanic.com/news/19198010/Bitcoin-Proponent-Rips-Into-Centralized-Garbage-XRP-Foresees-It-Crashing-To-Virtually-Zero-Against-BTC
Published: 2024-01-23 19:56:17+00:00
Summary: Bitcoin-Proponent-Rips-Into-Centralized-Garbage-XRP-Foresees-It-Crashing-To-Virtually-Zero-Against-BTC
---
Title: Cardano (ADA) Outperforms Bitcoin and Ethereum in This Key Metric: Details
Link: https://cryptopanic.com/news/19196957/Cardano-ADA-Outperforms-Bitcoin-and-Ethereum-in-This-Key-Metric-Details
Published: 2024-01-23 13:25:16+00:00


In [5]:
from bs4 import BeautifulSoup
import requests

def get_article_content(url):
    response = requests.get(url)
    if response.ok:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the meta tag with the name attribute 'description'
        meta_description = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
        
        # Extract the content attribute from the meta tag
        description = meta_description['content'] if meta_description else ''
        
        return description
    else:
        return 'Content not found or request failed.'

#  `df_last_200_posts` of the DataFrame containing articles
articles_list = []
for _, row in df_last_200_posts.iterrows():
    article_url = row['url']
    article_description = get_article_content(article_url)  # Retrieve the description
    article_dict = {
        'title': row['title'],
        'link': article_url,
        'published': row['created_at'],
        'summary': row['slug'],  # Or replace with the column that contains the actual summary
        'description': article_description  # Adding the retrieved description
    }
    articles_list.append(article_dict)

# Print the articles to confirm
for article in articles_list[:5]:  # Just print the first 5 for brevity
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Published: {article['published']}")
    print(f"Summary: {article['summary']}")
    print(f"Description: {article['description']}")
    print('---') 

Title: Justin Trudeau infringed freedoms in truck drivers’ bitcoin funding ban, judge rules
Link: https://cryptopanic.com/news/19199558/Justin-Trudeau-infringed-freedoms-in-truck-drivers-bitcoin-funding-ban-judge-rules
Published: 2024-01-24 11:01:40+00:00
Summary: Justin-Trudeau-infringed-freedoms-in-truck-drivers-bitcoin-funding-ban-judge-rules
Description: Justin Trudeau wrongfully invoked the Emergencies Act when blocking truck drivers' Covid-19 protests and bitcoin funding in 2022.
---
Title: Bitcoin Proponent Rips Into ‘Centralized Garbage’ XRP, Foresees It Crashing To Virtually Zero Against BTC
Link: https://cryptopanic.com/news/19198010/Bitcoin-Proponent-Rips-Into-Centralized-Garbage-XRP-Foresees-It-Crashing-To-Virtually-Zero-Against-BTC
Published: 2024-01-23 19:56:17+00:00
Summary: Bitcoin-Proponent-Rips-Into-Centralized-Garbage-XRP-Foresees-It-Crashing-To-Virtually-Zero-Against-BTC
Description: Max Keiser, self-proclaimed Bitcoin maxi and American podcaster, has once again bla

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define the device (either 'cuda' or 'cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def analyze_sentiment(articles):
    sentiments = []
    for article in articles:
        # Assume that 'summary' is the key in the dictionary that contains the article text
        summary = article['description']

        # Encode and analyze the sentiment of the summary using your tokenizer and model
        inputs = tokenizer(summary, return_tensors='pt', truncation=True, padding=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Get the model's predictions
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        # The output has the format (loss, logits) when labels are not provided
        logits = outputs.logits
        predictions = torch.softmax(logits, dim=-1)

        # Get the highest probability sentiment
        _, predicted_class = torch.max(predictions, dim=-1)
        sentiment_label = model.config.id2label[predicted_class.item()]

        # Reattach the date to the sentiment
        dated_sentiment = f"{article['published']} - {sentiment_label}"
        sentiments.append(dated_sentiment)
    return sentiments

# Assuming 'thedefiant_articles' is a list of article dictionaries
sentiments = analyze_sentiment(articles_list)

# Print the sentiment analysis results
for sentiment in sentiments:
    print(sentiment)

2024-01-24 11:01:40+00:00 - negative
2024-01-23 19:56:17+00:00 - negative
2024-01-23 13:25:16+00:00 - neutral
2024-01-23 11:24:36+00:00 - neutral
2024-01-23 10:42:58+00:00 - neutral
2024-01-22 19:51:07+00:00 - negative
2024-01-22 19:31:00+00:00 - neutral
2024-01-21 13:10:07+00:00 - negative
2024-01-20 16:05:00+00:00 - negative
2024-01-19 11:31:05+00:00 - positive
2024-01-19 05:33:05+00:00 - negative
2024-01-18 18:43:05+00:00 - negative
2024-01-18 13:30:00+00:00 - positive
2024-01-18 12:53:22+00:00 - neutral
2024-01-18 10:48:05+00:00 - neutral
2024-01-18 08:08:39+00:00 - positive
2024-01-18 03:08:00+00:00 - positive
2024-01-17 23:11:14+00:00 - negative
2024-01-17 14:10:06+00:00 - negative
2024-01-17 10:07:13+00:00 - negative
2024-01-14 22:11:08+00:00 - negative
2024-01-14 08:18:00+00:00 - negative
2024-01-13 18:21:46+00:00 - negative
2024-01-13 09:20:41+00:00 - negative
2024-01-12 23:16:14+00:00 - neutral
2024-01-12 22:15:00+00:00 - negative
2024-01-12 19:35:00+00:00 - positive
2024-01-

In [7]:
from transformers import pipeline

# Initialize the sentiment analysis pipeline with the FinBERT model
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert")

# Define the create_output_array function
def create_output_array(summaries, sentiment_scores, urls):
    output = []
    for i, summary in enumerate(summaries):
        date, summary_text = summary.split(' - ', 1)

        # Sentiment analysis using the pipeline
        sentiment_score = sentiment_scores[i][0]

        # Tokenize the summary text for sentiment analysis
        input_ids = tokenizer.encode(summary_text, add_special_tokens=True, return_tensors="pt")

        # Make sure the model is in evaluation mode
        model.eval()

        # Perform a forward pass to get the logits
        with torch.no_grad():
            logits = model(input_ids).logits

        # Add the sentiment logits to the output
        output_this = [
            date,
            summary_text,
            sentiment_score['label'],
            sentiment_score['score'],
            urls[i],
            logits.tolist()[0]  # Add logits to the output
        ]
        output.append(output_this)
    return output

# Run sentiment analysis on descriptions using the pipeline
summaries = [f"{article['published']} - {article['description']}" for article in articles_list]
sentiment_scores = [sentiment_analyzer(article['description']) for article in articles_list]

# Extract URLs from the articles list
cleaned_urls = [article['link'] for article in articles_list]

# Combine the summaries, sentiment scores, and URLs into one list
final_output = create_output_array(summaries, sentiment_scores, cleaned_urls)

# Insert headers at the beginning of the final output
final_output.insert(0, ['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits'])

# Print or return the final output
for line in final_output:
    print(line)

['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits']
['2024-01-24 11:01:40+00:00', "Justin Trudeau wrongfully invoked the Emergencies Act when blocking truck drivers' Covid-19 protests and bitcoin funding in 2022.", 'negative', 0.9055980443954468, 'https://cryptopanic.com/news/19199558/Justin-Trudeau-infringed-freedoms-in-truck-drivers-bitcoin-funding-ban-judge-rules', [-1.5561264753341675, 2.369077205657959, -0.1018725037574768]]
['2024-01-23 19:56:17+00:00', 'Max Keiser, self-proclaimed Bitcoin maxi and American podcaster, has once again blasted Ripple-promoted token XRP, stirring community rage.', 'negative', 0.4572225511074066, 'https://cryptopanic.com/news/19198010/Bitcoin-Proponent-Rips-Into-Centralized-Garbage-XRP-Foresees-It-Crashing-To-Virtually-Zero-Against-BTC', [-0.7193091511726379, 0.5437267422676086, 0.4431598484516144]]
['2024-01-23 13:25:16+00:00', "Check out the latest achievement of Cardano's ADA.", 'neutral', 0.7966058254241943, 'https://cryptopanic.com/news/1

In [8]:
import csv
import datetime

# Get today's date in YYYYMMDD format
todays_date = datetime.datetime.now().strftime('%Y%m%d')

# Define the new CSV file path with today's date
new_csv_file_path = 'exports_by_date/assetsummaries_crypto_panic_bert_{}.csv'.format(todays_date)

# Save the final_output to the CSV file with the new name
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [9]:
import os
import pandas as pd

# Define the directory where the CSV files are located
directory = r'D:\Documents\GitHub\cryptopanic_API_Wrapper\exports_by_date'

# Define the output file name
output_filename = 'assetsummaries_crypto_panic_bert_all.csv'

# Combine the directory and output filename to create the full output path
output_path = os.path.join(directory, output_filename)

# Check if the output file exists and delete it if it does
if os.path.exists(output_path):
    os.remove(output_path)
    print(f'{output_filename} already exists and has been deleted.')

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        try:
            # Read each CSV file into a DataFrame with 'ISO-8859-1' encoding
            df = pd.read_csv(file_path, encoding='ISO-8859-1')
            
            # Drop the "Logits" columns if they exist
            if "Logits" in df.columns:
                df = df.drop(columns=["Logits"])
            
            # Append the DataFrame to the list
            dfs.append(df)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError occurred while reading {file_path}. Skipping this file.")

# Concatenate the list of DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Sort the combined DataFrame in descending order by date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df = combined_df.sort_values(by='Date', ascending=False)

# Drop all duplicate rows
combined_df.drop_duplicates(subset=['Date'], keep='first', inplace=True)

# Save the combined and deduplicated DataFrame to a new CSV file
combined_df.to_csv(output_path, index=False)

# Print a message to indicate the process is complete
print(f'Data has been combined, "Logits" columns dropped, sorted in descending order by date, all duplicates removed, and saved to {output_path}.')

assetsummaries_crypto_panic_bert_all.csv already exists and has been deleted.
Data has been combined, "Logits" columns dropped, sorted in descending order by date, all duplicates removed, and saved to D:\Documents\GitHub\cryptopanic_API_Wrapper\exports_by_date\assetsummaries_crypto_panic_bert_all.csv.


In [2]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    # Use the first available GPU, you can specify the GPU index if you have multiple GPUs
    device = torch.device("cuda:0")
else:
    # If CUDA is not available, use the CPU
    device = torch.device("cpu")

# Create a sample tensor and move it to the selected device
your_tensor = torch.randn(3, 3).to(device)

# Check if the tensor is on the GPU
if your_tensor.device.type == 'cuda':
    print("Tensor is on the GPU")
else:
    print("Tensor is on the CPU")


Tensor is on the CPU


In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


NameError: name 'tf' is not defined

In [3]:
import tensorflow as tf

# Check if GPU is available
if tf.test.gpu_device_name():
    print("GPU is available.")
else:
    print("GPU is not available.")

GPU is not available.


In [1]:
import tensorflow as tf

# Check TensorFlow version and GPU availability
print(tf.__version__)
print("GPU Available: ", tf.config.list_physical_devices('GPU'))

# Test a simple computation
a = tf.constant([[1, 2], [3, 4]])
b = tf.constant([[5, 6], [7, 8]])
c = tf.matmul(a, b)

print(c)

2.13.1
GPU Available:  []
tf.Tensor(
[[19 22]
 [43 50]], shape=(2, 2), dtype=int32)


In [3]:
import torch

# Check PyTorch version and GPU availability
print(torch.__version__)
print("GPU Available: ", torch.cuda.is_available())

# Test a simple computation
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[5, 6], [7, 8]])
c = torch.mm(a, b)

print(c)


2.1.2+cpu
GPU Available:  False
tensor([[19, 22],
        [43, 50]])


In [5]:
import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))

[]
