In [1]:
# Cell 1: Import necessary libraries
import ast
import json
import os
from getpass import getpass

import boto3
import numpy as np
import pandas as pd
import requests
from opensearchpy import OpenSearch

# Show the directory the kernel runs in; the dataset is opened with a relative path
print(f"Current working directory: {os.getcwd()}")



Current working directory: c:\Users\Academy2024\Desktop\fmaric\PROMPTENGINEERING\DAY3


In [2]:
# Cell 2: Load dataset
# The CSV is referenced by a relative path and read into the working DataFrame.
dataset_path = "cryptonews.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Cell 3: Extract and Combine Text
# Keep only the first 100 rows and work on an explicit copy, so the new column is
# added to an independent frame rather than to a slice of the loaded data.
df = df.head(100).copy()
# A missing title or text would turn the whole concatenation into NaN; treat the
# missing part as an empty string so 'content' is always a string.
df['content'] = df['title'].fillna("") + " " + df['text'].fillna("")

In [4]:
# Cell 4: Extract Sentiment from CSV
def extract_sentiment(sentiment_str):
    """Return the ``'class'`` entry of a serialized sentiment dictionary.

    Args:
        sentiment_str: Either a string holding a dictionary (Python-literal
            style with single quotes, or JSON), or an already-parsed dict.

    Returns:
        The value stored under the ``'class'`` key.

    Raises:
        json.JSONDecodeError: If the string can be parsed neither as a Python
            literal nor as JSON.
        KeyError: If the parsed dictionary has no ``'class'`` key.
    """
    # Already parsed: nothing to decode.
    if isinstance(sentiment_str, dict):
        return sentiment_str['class']

    try:
        # Parses Python-literal dictionaries directly, including values that
        # contain apostrophes, which a blind quote replacement would corrupt.
        sentiment_dict = ast.literal_eval(sentiment_str)
    except (ValueError, SyntaxError):
        # Fall back to the JSON route for inputs that are not Python literals
        # (e.g. JSON using true/false/null).
        sentiment_dict = json.loads(sentiment_str.replace("'", "\""))
    return sentiment_dict['class']

# Replace each raw sentiment value with the class label returned by extract_sentiment
df['sentiment'] = df['sentiment'].map(extract_sentiment)

In [5]:
# Cell 5: Embed Text Using Amazon Titan
# Bedrock runtime client used by embed_text (no region or credentials are passed here)
client = boto3.client(service_name='bedrock-runtime')
model_id = "amazon.titan-embed-text-v1"
# The same media type is sent as both the Accept and the Content-Type value
accept = content_type = "application/json"
# Vector length that embed_text checks for and that the index mapping declares
expected_dimension = 1536

def embed_text(text):
    """Embed ``text`` with the Bedrock model identified by ``model_id``.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array built from the ``'embedding'`` field of the model
        response, or ``None`` when ``text`` is not a non-blank string or the
        returned vector does not have ``expected_dimension`` entries.
    """
    # Missing values (None, or NaN coming from pandas) and blank strings carry
    # nothing to embed; report them and skip the request instead of sending
    # them to the model.
    if not isinstance(text, str) or not text.strip():
        print(f"Skipping embedding for empty or non-text input: {text!r}")
        return None

    body = json.dumps({"inputText": text})
    response = client.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=content_type
    )
    # The response body is a stream; read it fully before decoding the JSON.
    response_body = json.loads(response.get('body').read())
    embedding = np.array(response_body['embedding'])
    if embedding.shape[0] != expected_dimension:
        print(f"Invalid embedding dimension: {embedding.shape[0]} for text: {text}")
        return None
    return embedding

# One embedding request per row; rows that embed_text rejects end up holding None
df['embedding'] = df['content'].map(embed_text)

In [6]:
# Cell 6: Store Vectors in OpenSearch
# Connection settings are taken from the environment so that no secret lives in
# the notebook. The endpoint and user name fall back to the values used so far;
# the password has no default and is prompted for when OPENSEARCH_PASSWORD is unset.
# NOTE(review): the password that used to be hardcoded here has been exposed in
# the notebook and should be rotated.
opensearch_url = os.environ.get(
    "OPENSEARCH_URL",
    "https://search-academy-02-sjb2kmrb4hzureuudlz6y5ukr4.eu-central-1.es.amazonaws.com",
)
username = os.environ.get("OPENSEARCH_USERNAME", "academy-opensearch")
password = os.environ.get("OPENSEARCH_PASSWORD") or getpass("OpenSearch password: ")

opensearch = OpenSearch(
    hosts=[opensearch_url],
    http_auth=(username, password)
)

# Index definition: k-NN enabled, one knn_vector field sized by expected_dimension,
# plus text/keyword fields for the article metadata
index_name = "cryptonews"
index_settings = {"index": {"knn": True}}
index_properties = {
    "content": {"type": "text"},
    "embedding": {"type": "knn_vector", "dimension": expected_dimension},
    "sentiment": {"type": "keyword"},
    "source": {"type": "keyword"},
    "subject": {"type": "keyword"},
    "url": {"type": "keyword"},
    "title": {"type": "text"},
    "text": {"type": "text"},  # Ensure text is indexed
}
index_body = {
    "settings": index_settings,
    "mappings": {"properties": index_properties},
}

# Start from a clean slate: drop the index if a previous run left one behind
index_already_present = opensearch.indices.exists(index=index_name)
if index_already_present:
    opensearch.indices.delete(index=index_name)

# Create the index with the mapping above; the response is the cell's output
opensearch.indices.create(index=index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'cryptonews'}

In [7]:
# Cell 7: Index the documents
# Index every row that carries a valid embedding; other rows are reported and skipped.
for _, row in df.iterrows():
    embedding = row['embedding']
    # Check the type before the length: a missing embedding is not guaranteed to
    # be None, and len() on a non-sequence value would raise.
    if not isinstance(embedding, np.ndarray) or len(embedding) != expected_dimension:
        print(f"Skipping document with null or invalid embedding: {row['content']}")
        continue

    document = {
        "content": row['content'],
        "embedding": embedding.tolist(),  # plain list so the vector is JSON-serializable
        "sentiment": row['sentiment'],
        "source": row['source'],
        "subject": row['subject'],
        "url": row['url'],
        "title": row['title'],  # Include title in the document
        "text": row['text']  # Include text in the document
    }
    opensearch.index(index=index_name, body=document)
    print(f"Indexed document: {row['content']}")

# Indexed documents only become visible to search after a refresh; force one so
# a query run right after this cell sees them. The trailing semicolon keeps the
# response out of the cell output.
opensearch.indices.refresh(index=index_name);

Indexed document: Grayscale CEO Calls for Simultaneous Approval of Spot Products to Level the Field Grayscale CEO Michael Sonnenshein believes the SEC needs to approve spot Bitcoin exchange-traded funds (ETFs) simultaneously.
Indexed document: Indian Government is Actively Collaborating With Crypto Industry: Liminal Custody’s Country Head In an exclusive interview with CryptoNews, Manhar Garegrat, the Country Head for India and Global Partnerships of Liminal Custody, shares insights into crypto industry's collaborations with India government.
Indexed document: Judge Approves Settlement: Binance to Pay $1.5 Billion to CFTC, CZ to Pay $150 Million Fine According to the Federal Court ruling on December 18, former Binance CEO Changpeng 'CZ' Zhao has been ordered to pay $150 million, while Binance will pay $2.7 billion to conclude the CFTC enforcement action.
Indexed document: Why a gold rush for inscriptions has broken half a dozen blockchains Some suggest EVM inscriptions are the latest w

In [8]:
# Cell 8: Query Vector Store
query_text = "What are the latest news on bitcoin prices?"
query_embedding = embed_text(query_text)

if query_embedding is None:
    # embed_text signals failure with None, which has no .shape to report;
    # handle it separately instead of dereferencing it in the error message.
    print("Query embedding could not be generated.")
elif query_embedding.shape[0] != expected_dimension:
    print(f"Query embedding has invalid dimension: {query_embedding.shape[0]}")
else:
    print(f"Query embedding dimension: {query_embedding.shape[0]}")
    # k-NN search on the 'embedding' field (k=5) combined, in the same bool
    # query, with a term filter that keeps only positive-sentiment documents.
    results = opensearch.search(
        index=index_name,
        body={
            "query": {
                "bool": {
                    "must": {
                        "knn": {
                            "embedding": {
                                "vector": query_embedding.tolist(),
                                "k": 5
                            }
                        }
                    },
                    "filter": {
                        "term": {"sentiment": "positive"}
                    }
                }
            }
        }
    )

    hits = results['hits']['hits']
    if hits:
        for result in hits:
            source = result['_source']
            print(f"Title: {source['title']}")
            print(f"Text: {source['text']}")
            print(f"URL: {source['url']}")
            print(f"Sentiment: {source['sentiment']}")
            print(f"Source: {source['source']}")
            print(f"Subject: {source['subject']}")
            print("\n")
    else:
        print("No results found.")

Query embedding dimension: 1536
Title: Bitcoin Price Prediction: VanEck CEO Optimism, Cathie Wood’s 2030 Forecast & Ripple Outlook
Text: In the ever-shifting world of cryptocurrency, Bitcoin's recent price movement paints a complex picture, trading at $41,079 with a 2.01% decrease on Monday. Amidst this volatility, notable voices in the industry weigh in, offering diverse outlooks. VanEck's CEO stands firm in the belief that Bitcoin will retain its status as the premier digital store of value. In a more futuristic projection, Cathie Wood speculates an astronomical rise for Bitcoin, predicting a 3,356% increase by 2030.
URL: https://cryptonews.comhttps://cryptonews.com/news/bitcoin-price-prediction-vaneck-ceo-optimism-cathie-woods-2030-forecast-ripple-outlook.htm
Sentiment: positive
Source: CryptoNews
Subject: bitcoin


Title: Bitcoin Price Prediction as VanEck’s CEO Says BTC Will Hit All-Time High Within 12 Months – Time to Buy?
Text: Bitcoin (BTC), the world's most valuable cryptocurr