In [17]:
import pandas as pd
import os
import json
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [18]:
# Load the crypto news table; the *_absa aspect columns are already present in this file.
absa_parquet = 'data/3b.cryptonews_absa.parquet'
df = pd.read_parquet(absa_parquet)
print(df.shape)
df.head(2)

(155376, 10)


Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0


In [None]:
# os.environ['deepseek_api_key'] = 'YOUR_KEY'
cryptonews_api_key = os.getenv('deepseek_api_key')
if not cryptonews_api_key:
  # Fail fast: when api_key is None the OpenAI client falls back to the
  # OPENAI_API_KEY environment variable, which would send an unrelated
  # credential to the DeepSeek endpoint (or fail later with a vaguer error).
  raise RuntimeError("Environment variable 'deepseek_api_key' is not set.")
# The OpenAI client is reused here, pointed at the DeepSeek base URL.
client = OpenAI(api_key=cryptonews_api_key, base_url="https://api.deepseek.com")

# System prompt for the NER request. The embedded example must itself be valid
# JSON (no trailing comma), since it is the format the model is asked to imitate.
system_prompt = """
You are a cryptocurrency news expert.
Your task is to perform Named-Entity Recognition (NER) on text related to the cryptocurrency industry.
Specifically, identify and extract the following entities:

1. **People**: Famous individuals, founders, CEOs, and influential figures in the crypto space.
2. **Organizations**: Top companies, exchanges, blockchain projects, and institutions in the cryptocurrency industry.
3. **Cryptocurrencies**: Names of cryptocurrencies, tokens, and digital assets.
4. **Events**: Major events, conferences, or milestones in the crypto world.

Provide the extracted entities in JSON format, ensure accuracy and relevance to the cryptocurrency domain.
Example JSON output:
{
  "People": ["Elon Musk", "Changpeng Zhao"],
  "Organizations": ["Microstrategy", "Meta"],
  "Cryptocurrencies": ["Bitcoin"],
  "Events": ["Bitcoin Halving"]
}
"""

def analyze_aspects(example):
  """Run the NER chat request for a single article.

  Args:
    example: Mapping-like row that provides 'title' and 'text'.

  Returns:
    {"ner": <parsed JSON reply>} on success, or {"ner": None} if anything
    raised while building the request, calling the API, or parsing the reply
    (the error is printed together with the article title).
  """
  try:
    article = f"Title: {example['title']}\nText: {example['text']}"
    completion = client.chat.completions.create(
      model="deepseek-chat",
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": article},
      ],
      response_format={'type': 'json_object'},
    )
    raw_reply = completion.choices[0].message.content
    parsed = json.loads(raw_reply)
  except Exception as e:
    print(f"Error processing example: {example['title']}\n{e}")
    return {"ner": None}
  return {"ner": parsed}

def analyze_aspects_parallel(df, max_workers=64, output_path="data/4.deepseek_ner_output.json"):
  """Run analyze_aspects over every row of df in a thread pool and log results.

  One JSON line of the form {"index": <row label>, "ner": <result or null>} is
  appended to output_path per row, in completion order (not row order).

  Args:
    df: Object whose iterrows() yields (index label, row) pairs.
    max_workers: Size of the thread pool.
    output_path: JSON-lines file to append to; its parent directory is
      created if missing.

  Returns:
    None. Results are only written to output_path.
  """
  out_dir = os.path.dirname(output_path)
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  with open(output_path, "a", encoding="utf-8") as file, ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_index = {
      executor.submit(analyze_aspects, row): idx
      for idx, row in df.iterrows()
    }
    for future in tqdm(as_completed(future_to_index), total=len(future_to_index)):
      idx = future_to_index[future]
      try:
        result = future.result()
        record = json.dumps({"index": idx, "ner": result["ner"]})
      except Exception as e:
        # Keep a placeholder line so every submitted row is accounted for.
        print(f"Error processing result: {e}")
        record = json.dumps({"index": idx, "ner": None})
      file.write(record + "\n")
      # Flush per record so finished rows reach the file even if the run is interrupted.
      file.flush()

# Run NER over every row of df. analyze_aspects_parallel opens
# data/4.deepseek_ner_output.json in append mode, so re-running this cell
# adds a second set of lines for the same rows.
analyze_aspects_parallel(df)

---------------------------------------------

In [19]:
# Load the JSON-lines NER results (one {"index", "ner"} record per line).
# The writer appends to this file, so a re-run leaves several records for the
# same article; keep only the last one written so the later merge on 'index'
# cannot multiply rows.
df_output = (
  pd.read_json("data/4.deepseek_ner_output.json", lines=True)
  .drop_duplicates(subset='index', keep='last')
  .set_index('index')
)
df_output.head()

Unnamed: 0_level_0,ner
index,Unnamed: 1_level_1
23,"{'People': [], 'Organizations': [], 'Cryptocur..."
34,"{'People': ['Pentoshi'], 'Organizations': ['Th..."
56,"{'People': ['Sally Ho'], 'Organizations': [], ..."
44,"{'People': [], 'Organizations': [], 'Cryptocur..."
66,"{'People': [], 'Organizations': [], 'Cryptocur..."


In [20]:
# Attach each article's NER result: df's row labels are matched against
# df_output's 'index' (default inner join, so unmatched articles are dropped).
df = df.merge(df_output, left_index=True, right_on="index")
df.head(3)

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,ner
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,"{'People': [], 'Organizations': [], 'Cryptocur..."
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,"{'People': ['Krüger'], 'Organizations': ['Cent..."
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,"{'People': ['Tyrone Ross'], 'Organizations': [..."


In [22]:
# Spot-check the parsed NER result for the row labelled 25.
df.loc[25, 'ner']

{'People': ['Michael Saylor'],
 'Organizations': ['MicroStrategy', 'Funky Crypto Podcast', 'The Daily Hodl'],
 'Cryptocurrencies': ['Bitcoin', 'BTC'],
 'Events': []}

In [23]:
entity_types = ['People', 'Organizations', 'Cryptocurrencies', 'Events']


def _entity_list(ner, entity):
  """Return the value stored under `entity` in one NER result, or [] if unavailable."""
  # Failed requests were logged with a null 'ner'. Depending on how it was
  # loaded that may be None or NaN, and NaN is truthy, so check for a dict
  # rather than relying on truthiness.
  if not isinstance(ner, dict):
    return []
  # A key that is present but null would otherwise come through as None.
  return ner.get(entity) or []


# Expand the 'ner' dicts into one list-valued column per entity type.
for entity in entity_types:
  df[entity] = df['ner'].apply(lambda ner: _entity_list(ner, entity))

df.drop(columns=['ner'], inplace=True)

In [25]:
# Preview the frame now that 'ner' has been replaced by the four entity columns.
df.head(3)

Unnamed: 0_level_0,title,text,source_name,date,tanalysis_absa,economy_absa,regulation_absa,technology_absa,adoption_absa,cybersecurity_absa,People,Organizations,Cryptocurrencies,Events
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2021 Bitcoin Price Predictions: Is The Massive...,As the bitcoin price hovers under the psycholo...,Forbes,2021-01-02 00:20:00+00:00,0.5,0.0,0.0,0.0,0.0,0.0,[],[],[Bitcoin],[]
1,Will Central Banks Hold Bitcoin in 2021?,Central banks (CB) will hold bitcoin sooner or...,BeInCrypto,2021-01-01 20:31:35+00:00,0.0,0.5,0.0,0.0,0.7,0.0,[Krüger],[Central banks],[Bitcoin],[]
2,"Bitcoin Is Digital Social Justice, feat. Tyron...",The podcaster and CEO of Onramp Invest discuss...,Coindesk,2021-01-01 19:15:02+00:00,0.0,0.3,0.0,0.5,0.7,0.0,[Tyrone Ross],[Onramp Invest],[Bitcoin],[]


In [26]:
# Persist the frame, including the per-entity list columns, as parquet.
df.to_parquet("data/4b.cryptonews_ner.parquet")

In [None]:
##################   Analysis   ##################

In [28]:
# Mention counts per person: flatten the per-article lists, then tally.
people_mentions = df['People'].explode()
people_freq = people_mentions.value_counts()
people_freq.head(20)

People
Michael Saylor          2467
Elon Musk               1670
Donald Trump            1615
Peter Schiff             969
Sally Ho                 940
Cathie Wood              928
Jack Dorsey              879
Nayib Bukele             861
Robert Kiyosaki          815
Satoshi Nakamoto         790
Arthur Hayes             594
Peter Brandt             553
Mike Novogratz           543
Jerome Powell            511
Gary Gensler             499
Anthony Scaramucci       455
Trump                    452
Michaël van de Poppe     377
Mike McGlone             371
Benjamin Cowen           306
Name: count, dtype: int64

In [29]:
# Mention counts per organization: flatten the per-article lists, then tally.
# NOTE(review): the result is stored under `people_freq`, the same name the
# People cell uses, so this cell overwrites the people counts.
org_mentions = df['Organizations'].explode()
people_freq = org_mentions.value_counts()
people_freq.head(20)

Organizations
SEC                       4413
MicroStrategy             3785
BlackRock                 3301
Binance                   2194
Coinbase                  2174
Grayscale                 2114
The Daily Hodl            1823
Federal Reserve           1630
Tesla                     1622
CNBC                      1577
CryptoQuant               1432
Fed                       1403
CryptoSlate               1356
FTX                       1330
Glassnode                 1253
Bloomberg                 1211
El Salvador               1171
The Block                 1119
BeInCrypto                1068
The Currency Analytics     986
Name: count, dtype: int64

In [30]:
# Mention counts per cryptocurrency: flatten the per-article lists, then tally.
# NOTE(review): the result is stored under `people_freq` although it holds
# cryptocurrency counts.
crypto_mentions = df['Cryptocurrencies'].explode()
people_freq = crypto_mentions.value_counts()
people_freq.head(20)

Cryptocurrencies
Bitcoin      145001
BTC           51248
Ethereum      13063
ETH            5172
XRP            3227
bitcoin        2759
Dogecoin       2443
Solana         1763
Ether          1455
SOL            1392
DOGE           1291
ADA            1215
BNB            1214
Cardano         882
BTC/USD         859
Shiba Inu       628
SHIB            566
DOT             546
USDT            543
AVAX            422
Name: count, dtype: int64

In [31]:
# Mention counts per event: flatten the per-article lists, then tally.
# NOTE(review): the result is stored under `people_freq` although it holds
# event counts.
event_mentions = df['Events'].explode()
people_freq = event_mentions.value_counts()
people_freq.head(20)

Events
Bitcoin Halving                                        2980
Bitcoin halving                                         414
Bitcoin ETF approval                                    386
Bitcoin ETF Approval                                    325
Bitcoin ETF                                             316
2021 Berkshire Hathaway Annual Shareholders Meeting     264
FOMC meeting                                            193
crypto winter                                           177
Spot Bitcoin ETF Approval                               173
spot Bitcoin ETF approval                               164
Bitcoin ETFs                                            158
FTX collapse                                            154
Uptober                                                 109
U.S. presidential election                               95
Spot Bitcoin ETF approval                                92
The Merge                                                91
US election                      