In [1]:
import requests
import pandas as pd
import datetime
import config
import time
from datetime import datetime, timedelta
#from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
from bs4 import BeautifulSoup
import re
import feedparser
from readability import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# main_script.py
import config

In [2]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "ProsusAI/finbert"

# Load tokenizer and classifier from the same name so the two cannot drift
# apart when the checkpoint is changed in one place only.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [3]:
# The API token is read from the local `config` module rather than written here.
api_key = config.API_KEY

# Seconds to pause before each request: all API methods are rate limited
# per IP at 5 requests per second.
global_api_rate_delay = 0.2

def make_url(api_key, filter=None, currencies=None, kind=None, region=None, page=None):
    """Build the CryptoPanic posts URL with the requested query parameters.

    Query parameters are appended in the fixed order currencies, kind, filter,
    region, page. ``kind``, ``filter`` and ``region`` are only appended when
    they match one of the values accepted here; anything else is silently
    left out of the URL.

    Args:
        api_key: Token placed in the ``auth_token`` query parameter.
        filter: One of 'rising', 'hot', 'bullish', 'bearish', 'important',
            'saved', 'lol'.
        currencies: Comma-separated currency codes, at most 50 of them.
        kind: 'news' or 'media'.
        region: One of 'en', 'de', 'es', 'fr', 'it', 'pt', 'ru'.
        page: Page number; appended whenever it is not None.

    Returns:
        The URL string, or None (after printing a warning) when more than 50
        currencies were given.
    """
    allowed_kinds = ('news', 'media')
    allowed_filters = ('rising', 'hot', 'bullish', 'bearish', 'important', 'saved', 'lol')
    allowed_regions = ('en', 'de', 'es', 'fr', 'it', 'pt', 'ru')

    parts = [f'https://cryptopanic.com/api/v1/posts/?auth_token={api_key}']

    if currencies:
        if len(currencies.split(',')) > 50:
            print("Warning: Max Currencies is 50")
            return None
        parts.append(f"&currencies={currencies}")

    # Each whitelisted option is handled the same way: append only if valid.
    whitelisted = (
        ('kind', kind, allowed_kinds),
        ('filter', filter, allowed_filters),
        ('region', region, allowed_regions),
    )
    for name, value, allowed in whitelisted:
        if value is not None and value in allowed:
            parts.append(f"&{name}={value}")

    if page is not None:
        parts.append(f"&page={page}")

    return ''.join(parts)


def get_page_json(url=None, timeout=30):
    """Fetch one page from the API and return its decoded JSON body.

    Sleeps for ``global_api_rate_delay`` seconds before every request so that
    consecutive calls stay under the API rate limit.

    Args:
        url: Full request URL. When empty or None, the unfiltered posts
            endpoint is requested using ``config.API_KEY``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    time.sleep(global_api_rate_delay)
    if not url:
        url = "https://cryptopanic.com/api/v1/posts/?auth_token={}".format(config.API_KEY)
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later with an opaque KeyError
    # or JSON decoding error when the body is an error page.
    response.raise_for_status()
    return response.json()


def get_pages_list_json(lookback, url):
    """Fetch the page at ``url`` and follow up to ``lookback`` "next" links.

    Args:
        lookback: Maximum number of additional pages to fetch after the first.
        url: URL of the first page.

    Returns:
        List of page payloads (decoded JSON), first page first. It holds
        ``lookback + 1`` entries unless the API runs out of pages earlier.

    Raises:
        KeyError: If a fetched payload has no "next" entry at all.
    """
    pages_list_json = [get_page_json(url)]

    for _ in range(lookback):
        next_url = pages_list_json[-1]["next"]
        if not next_url:
            # get_page_json treats an empty URL as "fetch the default first
            # page", so following a missing link would silently append
            # unrelated, duplicate data. Stop at the end of the history.
            break
        pages_list_json.append(get_page_json(next_url))

    return pages_list_json


def get_df(data):
    """Turn one page of post records into a DataFrame with parsed timestamps.

    Args:
        data: Iterable of dicts, one per post.

    Returns:
        DataFrame built from ``data``. The ``created_at`` column is converted
        with ``pd.to_datetime``; if that conversion fails (for instance the
        column is absent) the error is printed and the frame is returned
        without the conversion.

    Raises:
        ValueError: If any element of ``data`` is not a dict.
    """
    for record in data:
        if not isinstance(record, dict):
            raise ValueError("Data must be a list of dictionaries")

    frame = pd.DataFrame(data)
    try:
        parsed = pd.to_datetime(frame['created_at'])
        frame['created_at'] = parsed
    except Exception as exc:
        # Best effort: report the problem but still hand back the frame.
        print(f"An error occurred: {exc}")
    return frame

def concatenate_pages(pages_list):
    """Combine several pages of post records into a single DataFrame.

    Args:
        pages_list: List of pages, each page being a list of dicts.

    Returns:
        One DataFrame holding the rows of every page, re-indexed from 0.

    Raises:
        ValueError: If any page is not a list or contains a non-dict item.
    """
    for page in pages_list:
        well_formed = isinstance(page, list) and all(isinstance(item, dict) for item in page)
        if not well_formed:
            raise ValueError("Pages list must be a list of lists of dictionaries")

    # One frame per page, stacked with a fresh running index.
    frames = list(map(get_df, pages_list))
    return pd.concat(frames, ignore_index=True)

def get_last_posts(api_key, number_of_posts=200, currencies='BTC'):
    """Collect up to ``number_of_posts`` posts for the given currencies.

    Pages are requested one after another (page 1, 2, ...) until enough posts
    were collected, a page comes back empty, or the response carries no
    "next" link.

    Args:
        api_key: Auth token forwarded to ``make_url``.
        number_of_posts: Maximum number of posts to return.
        currencies: Comma-separated currency codes forwarded to ``make_url``.

    Returns:
        A one-element list wrapping the flat list of post dicts, i.e. a
        "list of pages" containing a single page.

    Raises:
        KeyError: If a response has no ``results`` entry (for example when
            the API answered with an error payload).
    """
    collected = []
    page = 1
    while len(collected) < number_of_posts:
        # NOTE(review): the filter is picked from the parity of the running
        # post count, so it only switches to 'hot' after an odd-sized page.
        # Kept as-is; confirm this is the intended alternation.
        post_filter = 'important' if len(collected) % 2 == 0 else 'hot'

        url = make_url(api_key, filter=post_filter, currencies=currencies, page=page)
        data = get_page_json(url)
        if 'results' not in data:
            raise KeyError(f"'results' missing from API response for page {page}: {data}")

        page_results = data['results']
        if not page_results:
            # An empty page adds nothing; without this guard a dangling
            # "next" link would keep the loop requesting pages forever.
            break
        collected.extend(page_results)
        page += 1
        if not data.get('next'):
            break  # No more pages to fetch

    return [collected[:number_of_posts]]

# Fetch up to 200 posts and load them into a single DataFrame.
pages_list = get_last_posts(api_key, 200)
df_last_200_posts = concatenate_pages(pages_list)

# Preview the first five rows as a sanity check.
print(df_last_200_posts.head(5))

   kind               domain  \
0  news  feeds2.benzinga.com   
1  news        dailycoin.com   
2  news          finbold.com   
3  news          finbold.com   
4  news          finbold.com   

                                               votes  \
0  {'negative': 0, 'positive': 6, 'important': 3,...   
1  {'negative': 5, 'positive': 8, 'important': 6,...   
2  {'negative': 0, 'positive': 4, 'important': 3,...   
3  {'negative': 0, 'positive': 4, 'important': 3,...   
4  {'negative': 0, 'positive': 4, 'important': 3,...   

                                              source  \
0  {'title': 'Benzinga', 'region': 'en', 'domain'...   
1  {'title': 'Dailycoin', 'region': 'en', 'domain...   
2  {'title': 'Finbold', 'region': 'en', 'domain':...   
3  {'title': 'Finbold', 'region': 'en', 'domain':...   
4  {'title': 'Finbold', 'region': 'en', 'domain':...   

                                               title          published_at  \
0  Bitcoin Set To Break $40,000 Barrier, Says Cry...  2

In [4]:
# Reshape every DataFrame row into a plain article dictionary.
# 'link' comes from the 'url' column, 'published' from 'created_at' and
# 'summary' from 'slug'.
articles_list = [
    {
        'title': row['title'],
        'link': row['url'],
        'published': row['created_at'],
        'summary': row['slug'],
    }
    for _, row in df_last_200_posts.iterrows()
]

# Show the first five articles to confirm the structure.
for article in articles_list[:5]:
    print(
        f"Title: {article['title']}\n"
        f"Link: {article['link']}\n"
        f"Published: {article['published']}\n"
        f"Summary: {article['summary']}\n"
        "---"
    )

Title: Bitcoin Set To Break $40,000 Barrier, Says Crypto Expert: 'It Looks Like It Is Just A Rising Floor'
Link: https://cryptopanic.com/news/19080347/Bitcoin-Set-To-Break-40000-Barrier-Says-Crypto-Expert-It-Looks-Like-It-Is-Just-A-Rising-Floor
Published: 2023-12-02 15:11:03+00:00
Summary: Bitcoin-Set-To-Break-40000-Barrier-Says-Crypto-Expert-It-Looks-Like-It-Is-Just-A-Rising-Floor
---
Title: Cardano’s Charles Hoskinson Contests Bitcoin Maxi Narratives
Link: https://cryptopanic.com/news/19078628/Cardanos-Charles-Hoskinson-Contests-Bitcoin-Maxi-Narratives
Published: 2023-12-01 16:05:00+00:00
Summary: Cardanos-Charles-Hoskinson-Contests-Bitcoin-Maxi-Narratives
---
Title: Bitcoin to the moon? $40,000 around the corner as 85% of holders in profit
Link: https://cryptopanic.com/news/19078354/Bitcoin-to-the-moon-40000-around-the-corner-as-85-of-holders-in-profit
Published: 2023-12-01 14:31:18+00:00
Summary: Bitcoin-to-the-moon-40000-around-the-corner-as-85-of-holders-in-profit
---
Title: 3 cr

In [5]:
from bs4 import BeautifulSoup
import requests

def get_article_content(url, timeout=15):
    """Download a page and return the text of its description meta tag.

    ``<meta name="description">`` is looked up first, with
    ``<meta property="og:description">`` as the fallback.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so one slow
            site cannot stall the whole run.

    Returns:
        The tag's ``content`` value; '' when no such tag exists or the tag has
        no ``content`` attribute; or the sentinel string
        'Content not found or request failed.' when the download fails
        (connection error, timeout, or non-OK status).
    """
    failure_message = 'Content not found or request failed.'

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # A single unreachable site must not abort a loop over many articles;
        # report it the same way as a non-OK status.
        return failure_message
    if not response.ok:
        return failure_message

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    meta_description = (
        soup.find('meta', attrs={'name': 'description'})
        or soup.find('meta', attrs={'property': 'og:description'})
    )
    if meta_description is None:
        return ''
    # .get() avoids a KeyError for a meta tag that lacks a content attribute.
    return meta_description.get('content', '')

# Rebuild the article list from `df_last_200_posts`, this time adding the
# description fetched from each article's page.
articles_list = []
for _, post in df_last_200_posts.iterrows():
    link = post['url']
    articles_list.append({
        'title': post['title'],
        'link': link,
        'published': post['created_at'],
        'summary': post['slug'],
        # One HTTP request per article.
        'description': get_article_content(link),
    })

# Show the first five enriched articles to confirm the structure.
for article in articles_list[:5]:
    print(
        f"Title: {article['title']}\n"
        f"Link: {article['link']}\n"
        f"Published: {article['published']}\n"
        f"Summary: {article['summary']}\n"
        f"Description: {article['description']}\n"
        "---"
    )

Title: Bitcoin Set To Break $40,000 Barrier, Says Crypto Expert: 'It Looks Like It Is Just A Rising Floor'
Link: https://cryptopanic.com/news/19080347/Bitcoin-Set-To-Break-40000-Barrier-Says-Crypto-Expert-It-Looks-Like-It-Is-Just-A-Rising-Floor
Published: 2023-12-02 15:11:03+00:00
Summary: Bitcoin-Set-To-Break-40000-Barrier-Says-Crypto-Expert-It-Looks-Like-It-Is-Just-A-Rising-Floor
Description: Amid a recent surge of optimism within the cryptocurrency sector, pseudonymous crypto trader The Flow Horse has projected a significant rise in the value of Bitcoin (CRYPTO: BTC). read more
---
Title: Cardano’s Charles Hoskinson Contests Bitcoin Maxi Narratives
Link: https://cryptopanic.com/news/19078628/Cardanos-Charles-Hoskinson-Contests-Bitcoin-Maxi-Narratives
Published: 2023-12-01 16:05:00+00:00
Summary: Cardanos-Charles-Hoskinson-Contests-Bitcoin-Maxi-Narratives
Description: Charles Hoskinson questions Bitcoin maximalists’ dismissive attitude towards altcoins.
---
Title: Bitcoin to the moon

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

def analyze_sentiment(articles):
    """Classify the sentiment of each article's description.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        articles: Iterable of dicts with a 'description' entry (the text that
            is classified) and a 'published' entry (prefixed to the label).

    Returns:
        List of strings formatted as "<published> - <label>", one per article
        and in input order.
    """
    def classify(text):
        """Return the label with the highest probability for ``text``."""
        encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            result = model(
                encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
            )
        probabilities = torch.softmax(result.logits, dim=-1)
        best_index = torch.argmax(probabilities, dim=-1)
        return model.config.id2label[best_index.item()]

    dated_labels = []
    for article in articles:
        label = classify(article['description'])
        # Keep the publication date attached to the predicted label.
        dated_labels.append(f"{article['published']} - {label}")
    return dated_labels

# Classify every article gathered in `articles_list`.
sentiments = analyze_sentiment(articles_list)

# One "<published> - <label>" line per article.
for dated_label in sentiments:
    print(dated_label)

2023-12-02 15:11:03+00:00 - positive
2023-12-01 16:05:00+00:00 - negative
2023-12-01 14:31:18+00:00 - positive
2023-12-01 14:03:21+00:00 - neutral
2023-12-01 13:31:21+00:00 - neutral
2023-12-01 10:56:30+00:00 - positive
2023-12-01 10:15:16+00:00 - positive
2023-12-01 09:50:44+00:00 - positive
2023-12-01 09:33:11+00:00 - positive
2023-11-30 14:52:23+00:00 - positive
2023-11-30 14:11:12+00:00 - neutral
2023-11-30 12:58:24+00:00 - positive
2023-11-30 11:55:35+00:00 - positive
2023-11-30 11:08:46+00:00 - positive
2023-11-29 18:44:45+00:00 - positive
2023-11-29 18:22:08+00:00 - neutral
2023-11-29 16:20:46+00:00 - negative
2023-11-29 15:39:20+00:00 - neutral
2023-11-29 10:36:43+00:00 - negative
2023-11-28 23:45:24+00:00 - neutral
2023-11-28 16:51:08+00:00 - positive
2023-11-28 13:47:42+00:00 - positive
2023-11-28 09:38:29+00:00 - neutral
2023-11-28 09:31:51+00:00 - positive
2023-11-28 08:15:56+00:00 - negative
2023-11-28 07:34:43+00:00 - positive
2023-11-27 14:07:07+00:00 - positive
2023-11-

In [7]:
from transformers import pipeline

# Text-classification pipeline backed by the FinBERT checkpoint; it is used
# further down to obtain a label and score for each description.
sentiment_analyzer = pipeline(task="text-classification", model="ProsusAI/finbert")

def create_output_array(summaries, sentiment_scores, urls):
    """Assemble one output row per article.

    Uses the module-level ``tokenizer`` and ``model`` to compute raw logits
    for each summary text.

    Args:
        summaries: Strings of the form "<date> - <text>"; each is split on
            the first " - ".
        sentiment_scores: Per-article classification results; element
            ``[i][0]`` must be a dict with 'label' and 'score'.
        urls: Article URLs, aligned by position with ``summaries``.

    Returns:
        List of rows ``[date, text, label, score, url, logits]`` where
        ``logits`` is the list of raw model outputs for ``text``.
    """
    # Inference only: switch to evaluation mode once, not on every iteration.
    model.eval()
    # The model may have been moved to another device earlier in the
    # notebook; inputs must live on the same device for the forward pass.
    model_device = next(model.parameters()).device

    output = []
    for i, summary in enumerate(summaries):
        date, summary_text = summary.split(' - ', 1)

        # Label/score come from the precomputed pipeline result.
        sentiment_score = sentiment_scores[i][0]

        # Truncate like analyze_sentiment does, so an over-long text is cut
        # to the tokenizer's maximum length instead of failing in the model.
        input_ids = tokenizer.encode(
            summary_text,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
        ).to(model_device)

        # Forward pass without gradient tracking to get the logits.
        with torch.no_grad():
            logits = model(input_ids).logits

        output.append([
            date,
            summary_text,
            sentiment_score['label'],
            sentiment_score['score'],
            urls[i],
            logits.tolist()[0],
        ])
    return output

# Collect, per article, the "<published> - <description>" string, the
# pipeline's classification of the description, and the article link.
summaries = []
sentiment_scores = []
cleaned_urls = []
for article in articles_list:
    summaries.append(f"{article['published']} - {article['description']}")
    sentiment_scores.append(sentiment_analyzer(article['description']))
    cleaned_urls.append(article['link'])

# Header row first, followed by one row per article.
final_output = [['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits']]
final_output += create_output_array(summaries, sentiment_scores, cleaned_urls)

# Print every row, header included.
for output_row in final_output:
    print(output_row)

['Date', 'Summary', 'Label', 'Confidence', 'URL', 'Logits']
['2023-12-02 15:11:03+00:00', 'Amid a recent surge of optimism within the cryptocurrency sector, pseudonymous crypto trader The Flow Horse has projected a significant rise in the value of Bitcoin (CRYPTO: BTC). read more', 'positive', 0.7253322601318359, 'https://cryptopanic.com/news/19080347/Bitcoin-Set-To-Break-40000-Barrier-Says-Crypto-Expert-It-Looks-Like-It-Is-Just-A-Rising-Floor', [1.3817099332809448, -2.497579574584961, 0.35452404618263245]]
['2023-12-01 16:05:00+00:00', 'Charles Hoskinson questions Bitcoin maximalists’ dismissive attitude towards altcoins.', 'negative', 0.8718141317367554, 'https://cryptopanic.com/news/19078628/Cardanos-Charles-Hoskinson-Contests-Bitcoin-Maxi-Narratives', [-1.545287847518921, 2.2369980812072754, 0.15164272487163544]]
['2023-12-01 14:31:18+00:00', 'After Bitcoin (BTC) crossed the massive psychological level at $38,000 and continued to advance, optimism has returned to the cryptocurrency

In [8]:
import csv
import os
from datetime import datetime

# Today's date in YYYYMMDD format, used to name the export.
# `datetime` is imported as the class, matching the notebook's first cell, so
# running this cell does not rebind the name to the module.
todays_date = datetime.now().strftime('%Y%m%d')

# One CSV per day inside the exports folder (relative to the working directory).
new_csv_file_path = 'exports_by_date/assetsummaries_crypto_panic_bert_{}.csv'.format(todays_date)

# Create the folder on first use; open() would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(new_csv_file_path), exist_ok=True)

# Save final_output (header row plus one row per article) to the CSV file.
# NOTE(review): no encoding is passed, so the platform default is used;
# confirm it matches the encoding the combining cell reads with.
with open(new_csv_file_path, mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)

In [9]:
import os
import pandas as pd

# Directory where the per-day CSV exports are located.
# NOTE(review): machine-specific absolute path, while the export cell writes
# to the relative folder 'exports_by_date'; the two only agree when the
# notebook runs from this repository on this machine.
directory = r'D:\Documents\GitHub\cryptopanic_API_Wrapper\exports_by_date'

# Name and full path of the combined output file.
output_filename = 'assetsummaries_crypto_panic_bert_all.csv'
output_path = os.path.join(directory, output_filename)

# The combined file lives next to its inputs and also ends in .csv, so the
# previous one is removed first; otherwise the loop below would read it back in.
if os.path.exists(output_path):
    os.remove(output_path)
    print(f'{output_filename} already exists and has been deleted.')

# One DataFrame per export file.
dfs = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        try:
            # Read each CSV file into a DataFrame with 'ISO-8859-1' encoding
            df = pd.read_csv(file_path, encoding='ISO-8859-1')

            # Drop the "Logits" column if it exists
            if "Logits" in df.columns:
                df = df.drop(columns=["Logits"])

            dfs.append(df)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError occurred while reading {file_path}. Skipping this file.")

if not dfs:
    # pd.concat raises on an empty list; report the situation instead.
    print(f'No CSV files found in {directory}; nothing to combine.')
else:
    # Concatenate the list of DataFrames into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    # Sort the combined DataFrame in descending order by date
    combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    combined_df = combined_df.sort_values(by='Date', ascending=False)

    # Deduplicate on timestamp AND URL: the same article exported on several
    # days collapses to one row, while distinct articles that merely share a
    # timestamp are kept (deduplicating on 'Date' alone would drop them).
    dedup_columns = [column for column in ('Date', 'URL') if column in combined_df.columns]
    combined_df = combined_df.drop_duplicates(subset=dedup_columns, keep='first')

    # Save the combined and deduplicated DataFrame to a new CSV file
    combined_df.to_csv(output_path, index=False)

    # Print a message to indicate the process is complete
    print(f'Data has been combined, "Logits" columns dropped, sorted in descending order by date, all duplicates removed, and saved to {output_path}.')


assetsummaries_crypto_panic_bert_all.csv already exists and has been deleted.
Data has been combined, "Logits" columns dropped, sorted in descending order by date, all duplicates removed, and saved to D:\Documents\GitHub\cryptopanic_API_Wrapper\exports_by_date\assetsummaries_crypto_panic_bert_all.csv.
