In [1]:
import pandas as pd
import os
import json
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
# Load the crypto-news articles (title, text, source, date and per-aspect
# `*_absa` score columns) from parquet.
df = pd.read_parquet('data/3b.cryptonews_absa.parquet')
# Sanity check: (rows, columns) of the loaded frame.
print(df.shape)
# Preview the first two rows.
df.head(2)

(155376, 10)


Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0


In [None]:
# os.environ['deepseek_api_key'] = 'YOUR_KEY'
# The DeepSeek key is read from the environment so it is never stored in the notebook.
cryptonews_api_key = os.getenv('deepseek_api_key')
if not cryptonews_api_key:
  # Fail fast: with api_key=None the OpenAI client falls back to the
  # OPENAI_API_KEY environment variable, which would send an unrelated
  # credential to the DeepSeek endpoint (or fail later with a vaguer error).
  raise RuntimeError("Environment variable 'deepseek_api_key' is not set.")
# DeepSeek exposes an OpenAI-compatible API, so the OpenAI client is reused
# with a different base URL.
client = OpenAI(api_key=cryptonews_api_key, base_url="https://api.deepseek.com")

# System message sent with every NER request. It names the four entity
# categories and shows the expected reply shape. The example is kept as
# strictly valid JSON (no trailing comma) so the model is not shown a
# malformed template.
system_prompt = """
You are a cryptocurrency news expert.
Your task is to perform Named-Entity Recognition (NER) on text related to the cryptocurrency industry.
Specifically, identify and extract the following entities:

1. **People**: Famous individuals, founders, CEOs, and influential figures in the crypto space.
2. **Organizations**: Top companies, exchanges, blockchain projects, and institutions in the cryptocurrency industry.
3. **Cryptocurrencies**: Names of cryptocurrencies, tokens, and digital assets.
4. **Events**: Major events, conferences, or milestones in the crypto world.

Provide the extracted entities in JSON format, ensure accuracy and relevance to the cryptocurrency domain.
Example JSON output:
{
  "People": ["Elon Musk", "Changpeng Zhao"],
  "Organizations": ["Microstrategy", "Meta"],
  "Cryptocurrencies": ["Bitcoin"],
  "Events": ["Bitcoin Halving"]
}
"""

def analyze_aspects(example):
  """Run named-entity recognition on one news item via the chat API.

  Args:
    example: Record supporting ``example['title']`` and ``example['text']``
      (e.g. a dict or a DataFrame row).

  Returns:
    dict: ``{"ner": <parsed JSON reply>}`` on success, or ``{"ner": None}``
    if anything fails (missing field, API error, empty or non-JSON reply).
    Failures are printed rather than raised so one bad record does not
    abort a batch.
  """
  try:
    user_prompt = f"Title: {example['title']}\nText: {example['text']}"
    messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt}
    ]
    response = client.chat.completions.create(
      model="deepseek-chat",
      messages=messages,
      response_format={'type': 'json_object'}
    )
    content = response.choices[0].message.content
    if not content:
      # An empty reply would otherwise surface as an opaque json.loads error.
      raise ValueError("model returned empty content")
    return {"ner": json.loads(content)}
  except Exception as e:
    # Look the title up defensively: if the failure was a missing/invalid
    # record, indexing it again here would raise a second, uncaught error.
    try:
      title = example['title']
    except Exception:
      title = '<unknown title>'
    print(f"Error processing example: {title}\n{e}")
    return {"ner": None}

def analyze_aspects_parallel(df, max_workers=64, output_path="data/4.deepseek_ner_output.json"):
  """Run `analyze_aspects` on every row of `df` concurrently and log results.

  One JSON object per line (``{"index": <row label>, "ner": <result or null>}``)
  is appended to `output_path`. Lines are written in completion order, not
  input order. Because the file is opened in append mode, calling this again
  on the same rows adds further lines for them rather than replacing earlier
  ones.

  Args:
    df: DataFrame whose rows are passed to `analyze_aspects`; the row label
      from `df.iterrows()` is stored as "index".
    max_workers: Size of the thread pool.
    output_path: JSON-lines file to append to.

  Returns:
    None. Results are only written to `output_path`.
  """
  with open(output_path, "a", encoding="utf-8") as file, ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit every row up front and remember which row label each future belongs to.
    future_to_index = {
      executor.submit(analyze_aspects, row): idx
      for idx, row in df.iterrows()
    }
    for future in tqdm(as_completed(future_to_index), total=len(future_to_index)):
      idx = future_to_index[future]
      try:
        result = future.result()
        file.write(json.dumps({"index": idx, "ner": result["ner"]}) + "\n")
      except Exception as e:
        print(f"Error processing result: {e}")
        # Still record the row so every submitted index appears in the output.
        file.write(json.dumps({"index": idx, "ner": None}) + "\n")
      # Flush per record so completed work survives an interrupted kernel.
      file.flush()

# Run NER over every row of df with the default thread-pool size; results are
# appended to data/4.deepseek_ner_output.json.
analyze_aspects_parallel(df)

---------------------------------------------

In [3]:
# Read the JSON-lines NER log and key it by the row label stored in "index".
df_output = (
  pd.read_json("data/4.deepseek_ner_output.json", lines=True)
  .set_index('index')
)
df_output.head()

Unnamed: 0_level_0,ner
index,Unnamed: 1_level_1
23,"{'People': [], 'Organizations': [], 'Cryptocur..."
34,"{'People': ['Pentoshi'], 'Organizations': ['Th..."
56,"{'People': ['Sally Ho'], 'Organizations': [], ..."
44,"{'People': [], 'Organizations': [], 'Cryptocur..."
66,"{'People': [], 'Organizations': [], 'Cryptocur..."


In [4]:
# Attach the `ner` column by matching df's row labels against df_output's
# "index" level. The default inner join keeps only rows that have an NER record.
# NOTE: this rebinds `df`, so re-running the cell merges into the already
# merged frame.
df = df.merge(df_output, left_index=True, right_on="index")
df.head()

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,ner
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,"{'People': [], 'Organizations': [], 'Cryptocur..."
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,"{'People': ['Krüger'], 'Organizations': ['Cent..."
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,"{'People': ['Tyrone Ross'], 'Organizations': [..."
3,Bitcoin hits all-time high against gold as hav...,"BTC has hit another milestone, this time again...",Cointelegraph,2021-01-01 18:52:00+00:00,1.0,0.0,0.0,0.0,0.0,0.0,"{'People': [], 'Organizations': [], 'Cryptocur..."
4,"The Last Time This Indicator Flashed, Bitcoin ...",Bitcoin has been facing some turbulence as of ...,Bitcoinist,2021-01-01 18:00:00+00:00,0.3,0.0,0.0,0.0,0.0,0.0,"{'People': [], 'Organizations': [], 'Cryptocur..."


In [5]:
# Save the merged frame (articles plus the `ner` column) to parquet.
df.to_parquet("data/4b.cryptonews_ner.parquet")