In [1]:
import requests
import pandas as pd
import datetime
import config
import time
from datetime import datetime, timedelta
#from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from bs4 import BeautifulSoup
import re
import feedparser
from readability import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# main_script.py
import config

In [2]:
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [3]:
# Access the API key
api_key = config.API_KEY

global_api_rate_delay = .2  # All API methods are rate limited per IP at 5req/sec.

def make_url(api_key, filter=None, currencies=None, kind=None, region=None, page=None):
    """Handle of URL variables for API POST."""
    url = f'https://cryptopanic.com/api/v1/posts/?auth_token={api_key}'

    if currencies:
        if len(currencies.split(',')) <= 50:
            url += f"&currencies={currencies}"
        else:
            print("Warning: Max Currencies is 50")
            return

    if kind is not None and kind in ['news', 'media']:
        url += f"&kind={kind}"

    filters = ['rising', 'hot', 'bullish', 'bearish', 'important', 'saved', 'lol']
    if filter is not None and filter in filters:
        url += f"&filter={filter}"

    regions = ['en', 'de', 'es', 'fr', 'it', 'pt', 'ru']
    if region is not None and region in regions:
        url += f"&region={region}"

    if page is not None:
        url += f"&page={page}"

    return url


def get_page_json(url=None):
    """
    Get First Page.

    Returns Json.

    """
    time.sleep(global_api_rate_delay)
    if not url:
        url = "https://cryptopanic.com/api/v1/posts/?auth_token={}".format(config.API_KEY)
    page = requests.get(url)
    data = page.json()
    return data


def get_pages_list_json(lookback, url):
    """
    Get history of pages starting from page 1 to the lookback.

    Returns: List of Pages in Json format

    """
    pages_list_json = [get_page_json(url)]

    for i in range(lookback):
        pages_list_json.append(get_page_json(pages_list_json[i]["next"]))

    return pages_list_json


def get_df(data):
    """Return pandas DF."""
    # Ensure that data is a list of dictionaries
    if not all(isinstance(item, dict) for item in data):
        raise ValueError("Data must be a list of dictionaries")
    df = pd.DataFrame(data)
    try:
        df['created_at'] = pd.to_datetime(df['created_at'])
    except Exception as e:
        print(f"An error occurred: {e}")
    return df

def concatenate_pages(pages_list):
    """Concatenate Pages into one Dataframe."""
    # Ensure that pages_list is a list of lists of dictionaries
    if not all(isinstance(page, list) and all(isinstance(item, dict) for item in page) for page in pages_list):
        raise ValueError("Pages list must be a list of lists of dictionaries")
    frames = [get_df(page) for page in pages_list]
    return pd.concat(frames, ignore_index=True)

def get_last_posts(api_key, number_of_posts=200):
    pages_list = []
    page = 1
    total_results = 0
    while total_results < number_of_posts:
        # Conditionally set the filter based on a condition (e.g., 'hot' or 'important')
        if total_results % 2 == 0:
            filter = 'important'
        else:
            filter = 'hot'
        
        url = make_url(api_key, filter=filter, currencies='BTC', page=page)
        
        data = get_page_json(url)
        page_results = data['results']
        pages_list.append(page_results)
        total_results += len(page_results)
        page += 1
        if 'next' not in data or not data['next']:
            break  # No more pages to fetch
    flat_list = [item for sublist in pages_list for item in sublist]
    flat_list = flat_list[:number_of_posts]
    return [flat_list]

# Get the last 200 posts
pages_list = get_last_posts(api_key, number_of_posts=200)
df_last_200_posts = concatenate_pages(pages_list)
print(df_last_200_posts.head())

   kind           domain                                              votes  \
0  news     zycrypto.com  {'negative': 1, 'positive': 6, 'important': 4,...   
1  news   bitcoinist.com  {'negative': 2, 'positive': 11, 'important': 8...   
2  news  coinpaprika.com  {'negative': 6, 'positive': 8, 'important': 5,...   
3  news      newsbtc.com  {'negative': 6, 'positive': 17, 'important': 5...   
4  news     zycrypto.com  {'negative': 1, 'positive': 31, 'important': 2...   

                                              source  \
0  {'title': 'ZyCrypto', 'region': 'en', 'domain'...   
1  {'title': 'Bitcoinist', 'region': 'en', 'domai...   
2  {'title': 'coinpaprika', 'region': 'en', 'doma...   
3  {'title': 'NewsBTC', 'region': 'en', 'domain':...   
4  {'title': 'ZyCrypto', 'region': 'en', 'domain'...   

                                               title          published_at  \
0  XRP, Ether, Solana, Cardano, Shiba Inu Brace F...  2023-12-07T19:00:42Z   
1  BREAKING: Spot Bitcoin ETF ‘M

In [4]:
# Transform the DataFrame into a list of dictionaries
articles_list = []
for _, row in df_last_200_posts.iterrows():
    article_dict = {
        'title': row['title'],
        'link': row['url'],  # Assuming 'url' is the link to the full article
        'published': row['created_at'],  # 'published_at' could also be used
        'summary': row['slug']  # Assuming you want to use the title as a summary, adjust as needed
        # If you have a summary column you can replace 'title' with the actual summary column name
    }
    articles_list.append(article_dict)

# Now articles_list is in the same format as thedefiant_articles
# You can print to confirm
for article in articles_list[:5]:  # Just print the first 5 for brevity
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Published: {article['published']}")
    print(f"Summary: {article['summary']}")
    print('---')

Title: XRP, Ether, Solana, Cardano, Shiba Inu Brace For Trillion-Dollar Storm As Spot Bitcoin ETF Finally Draws Near
Link: https://cryptopanic.com/news/19093586/XRP-Ether-Solana-Cardano-Shiba-Inu-Brace-For-Trillion-Dollar-Storm-As-Spot-Bitcoin-ETF-Finally-Draws-Near
Published: 2023-12-07 19:00:42+00:00
Summary: XRP-Ether-Solana-Cardano-Shiba-Inu-Brace-For-Trillion-Dollar-Storm-As-Spot-Bitcoin-ETF-Finally-Draws-Near
---
Title: BREAKING: Spot Bitcoin ETF ‘May Be Approved Soon’ – Reuters
Link: https://cryptopanic.com/news/19092796/BREAKING-Spot-Bitcoin-ETF-May-Be-Approved-Soon-Reuters
Published: 2023-12-07 14:08:55+00:00
Summary: BREAKING-Spot-Bitcoin-ETF-May-Be-Approved-Soon-Reuters
---
Title: Experts Find a Possible Kill Switch in BlackRock’s Updated ETF
Link: https://cryptopanic.com/news/19088746/Experts-Find-a-Possible-Kill-Switch-in-BlackRocks-Updated-ETF
Published: 2023-12-06 08:37:26+00:00
Summary: Experts-Find-a-Possible-Kill-Switch-in-BlackRocks-Updated-ETF
---
Title: ADA Price C

In [5]:
from bs4 import BeautifulSoup
import requests

def get_article_content(url):
    response = requests.get(url)
    if response.ok:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the meta tag with the name attribute 'description'
        meta_description = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
        
        # Extract the content attribute from the meta tag
        description = meta_description['content'] if meta_description else ''
        
        return description
    else:
        return 'Content not found or request failed.'

#  `df_last_200_posts` of the DataFrame containing articles
articles_list = []
for _, row in df_last_200_posts.iterrows():
    article_url = row['url']
    article_description = get_article_content(article_url)  # Retrieve the description
    article_dict = {
        'title': row['title'],
        'link': article_url,
        'published': row['created_at'],
        'summary': row['slug'],  # Or replace with the column that contains the actual summary
        'description': article_description  # Adding the retrieved description
    }
    articles_list.append(article_dict)

# Print the articles to confirm
for article in articles_list[:5]:  # Just print the first 5 for brevity
    print(f"Title: {article['title']}")
    print(f"Link: {article['link']}")
    print(f"Published: {article['published']}")
    print(f"Summary: {article['summary']}")
    print(f"Description: {article['description']}")
    print('---') 

Title: XRP, Ether, Solana, Cardano, Shiba Inu Brace For Trillion-Dollar Storm As Spot Bitcoin ETF Finally Draws Near
Link: https://cryptopanic.com/news/19093586/XRP-Ether-Solana-Cardano-Shiba-Inu-Brace-For-Trillion-Dollar-Storm-As-Spot-Bitcoin-ETF-Finally-Draws-Near
Published: 2023-12-07 19:00:42+00:00
Summary: XRP-Ether-Solana-Cardano-Shiba-Inu-Brace-For-Trillion-Dollar-Storm-As-Spot-Bitcoin-ETF-Finally-Draws-Near
Description: ...detailed the potential that a spot Bitcoin ETF can have on the Bitcoin, XRP, Ethereum, Solana, Cardano, Shiba Inu markets.
---
Title: BREAKING: Spot Bitcoin ETF ‘May Be Approved Soon’ – Reuters
Link: https://cryptopanic.com/news/19092796/BREAKING-Spot-Bitcoin-ETF-May-Be-Approved-Soon-Reuters
Published: 2023-12-07 14:08:55+00:00
Summary: BREAKING-Spot-Bitcoin-ETF-May-Be-Approved-Soon-Reuters
Description: According to Reuters, significant progress has been made in discussions between the US Securities and Exchange Commission (SEC) and asset managers over the po

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define the device (either 'cuda' or 'cpu')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

def analyze_sentiment(articles):
    sentiments = []
    for article in articles:
        # Assume that 'summary' is the key in the dictionary that contains the article text
        summary = article['description']

        # Encode and analyze the sentiment of the summary using your tokenizer and model
        inputs = tokenizer(summary, return_tensors='pt', truncation=True, padding=True)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Get the model's predictions
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        # The output has the format (loss, logits) when labels are not provided
        logits = outputs.logits
        predictions = torch.softmax(logits, dim=-1)

        # Get the highest probability sentiment
        _, predicted_class = torch.max(predictions, dim=-1)
        sentiment_label = model.config.id2label[predicted_class.item()]

        # Reattach the date to the sentiment
        dated_sentiment = f"{article['published']} - {sentiment_label}"
        sentiments.append(dated_sentiment)
    return sentiments

# Assuming 'thedefiant_articles' is a list of article dictionaries
sentiments = analyze_sentiment(articles_list)

# Print the sentiment analysis results
for sentiment in sentiments:
    print(sentiment)

2023-12-07 19:00:42+00:00 - neutral
2023-12-07 14:08:55+00:00 - positive
2023-12-06 08:37:26+00:00 - neutral
2023-12-06 06:28:14+00:00 - positive
2023-12-05 16:18:03+00:00 - positive
2023-12-05 07:36:58+00:00 - neutral
2023-12-05 03:51:16+00:00 - neutral
2023-12-04 22:20:04+00:00 - neutral
2023-12-04 19:18:45+00:00 - positive
2023-12-04 18:34:41+00:00 - neutral
2023-12-04 16:03:40+00:00 - positive
2023-12-04 14:50:05+00:00 - positive
2023-12-04 07:18:46+00:00 - positive
2023-12-03 22:39:58+00:00 - positive
2023-12-03 12:40:10+00:00 - neutral
2023-12-03 11:27:38+00:00 - positive
2023-12-03 09:24:15+00:00 - positive
2023-12-03 08:47:24+00:00 - negative
2023-12-02 21:55:46+00:00 - positive
2023-12-02 19:58:59+00:00 - positive
2023-12-02 15:11:03+00:00 - positive
2023-12-01 16:05:00+00:00 - negative
2023-12-01 14:31:18+00:00 - positive
2023-12-01 14:03:21+00:00 - neutral
2023-12-01 13:31:21+00:00 - neutral
2023-12-01 10:56:30+00:00 - positive
2023-12-01 10:15:16+00:00 - positive
2023-12-01

In [7]:
from transformers import pipeline

# Initialize the sentiment analysis pipeline with the FinBERT model
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert")

# Define the create_output_array function
def create_output_array(summaries, sentiment_scores, urls):
    output = []
    for i, summary in enumerate(summaries):
        date, summary_text = summary.split(' - ', 1)

        # Sentiment analysis using the pipeline
        sentiment_score = sentiment_scores[i][0]

        # Tokenize the summary text for sentiment analysis
        input_ids = tokenizer.encode(summary_text, add_special_tokens=True, return_tensors="pt")

        # Make sure the model is in evaluation mode
        model.eval()

        # Perform a forward pass to get the logits
        with torch.no_grad():
            logits = model(input_ids).logits

        # Add the sentiment logits to the output
        output_this = [
            date,
            summary_text,
            sentiment_score['label'],
            sentiment_score['score'],
            urls[i],
            logits.tolist()[0]  # Add logits to the output
        ]
        output.append(output_this)
    return output

# Run sentiment analysis on descriptions using the pipeline
summaries = [f"{article['published']} - {article['description']}" for article in articles_list]
sentiment_scores = [sentiment_analyzer(article['description']) for article in articles_list]

# Extract URLs from the articles list
cleaned_urls = [article['link'] for article in articles_list]

# Combine the summaries, sentiment scores, and URLs into one list
final_output = create_output_array(summaries, sentiment_scores, cleaned_urls)

# Insert headers at the beginning of the final output
final_output.insert(0, ['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits'])

# Print or return the final output as needed
for line in final_output:
    print(line)

['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits']
['2023-12-07 19:00:42+00:00', '...detailed the potential that a spot Bitcoin ETF can have on the Bitcoin, XRP, Ethereum, Solana, Cardano, Shiba Inu markets.', 'neutral', 0.5697973370552063, 'https://cryptopanic.com/news/19093586/XRP-Ether-Solana-Cardano-Shiba-Inu-Brace-For-Trillion-Dollar-Storm-As-Spot-Bitcoin-ETF-Finally-Draws-Near', [1.0695269107818604, -2.694089889526367, 1.3734859228134155]]
['2023-12-07 14:08:55+00:00', 'According to Reuters, significant progress has been made in discussions between the US Securities and Exchange Commission (SEC) and asset managers over the potential approval of Bitcoin exchange-traded funds (ETFs). “Discussions between the US securities regulator and asset managers hoping to...', 'positive', 0.9129231572151184, 'https://cryptopanic.com/news/19092796/BREAKING-Spot-Bitcoin-ETF-May-Be-Approved-Soon-Reuters', [1.8770713806152344, -2.518946886062622, -0.6111675500869751]]
['2023-12-06 08:37:2

In [9]:
import csv
import datetime

# Get today's date in YYYYMMDD format
todays_date = datetime.datetime.now().strftime('%Y%m%d')

# Define the new CSV file path with today's date
new_csv_file_path = 'exports_by_date/assetsummaries_crypto_panic_bert_{}.csv'.format(todays_date)

# Save the final_output to the CSV file with the new name
with open(new_csv_file_path, mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

UnicodeEncodeError: 'charmap' codec can't encode character '\u2713' in position 110: character maps to <undefined>

In [None]:
import csv
import datetime

# Get today's date in YYYYMMDD format
todays_date = datetime.datetime.now().strftime('%Y%m%d')

# Define the new CSV file path with today's date
new_csv_file_path = 'exports_by_date/assetsummaries_crypto_panic_bert_{}.csv'.format(todays_date)

# Save the final_output to the CSV file with the new name
with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [None]:
import os
import pandas as pd

# Define the directory where the CSV files are located
directory = r'D:\Documents\GitHub\cryptopanic_API_Wrapper\exports_by_date'

# Define the output file name
output_filename = 'assetsummaries_crypto_panic_bert_all.csv'

# Combine the directory and output filename to create the full output path
output_path = os.path.join(directory, output_filename)

# Check if the output file exists and delete it if it does
if os.path.exists(output_path):
    os.remove(output_path)
    print(f'{output_filename} already exists and has been deleted.')

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        try:
            # Read each CSV file into a DataFrame with 'ISO-8859-1' encoding
            df = pd.read_csv(file_path, encoding='ISO-8859-1')
            
            # Drop the "Logits" columns if they exist
            if "Logits" in df.columns:
                df = df.drop(columns=["Logits"])
            
            # Append the DataFrame to the list
            dfs.append(df)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError occurred while reading {file_path}. Skipping this file.")

# Concatenate the list of DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Sort the combined DataFrame in descending order by date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df = combined_df.sort_values(by='Date', ascending=False)

# Drop all duplicate rows
combined_df.drop_duplicates(subset=['Date'], keep='first', inplace=True)

# Save the combined and deduplicated DataFrame to a new CSV file
combined_df.to_csv(output_path, index=False)

# Print a message to indicate the process is complete
print(f'Data has been combined, "Logits" columns dropped, sorted in descending order by date, all duplicates removed, and saved to {output_path}.')
