In [44]:
import os
import pickle
import time
from datetime import datetime, date, timedelta

import pandas as pd
import requests
from pandas import DataFrame as df

# News endpoint on min-api.cryptocompare.com; the language filter (EN) is fixed in the URL.
NEWS_URL = "https://min-api.cryptocompare.com/data/v2/news/?lang=EN"
# Endpoint queried by get_categories() for the list of news categories.
NEWS_CATEGORY_URL = "https://min-api.cryptocompare.com/data/news/categories"
# NOTE(review): not referenced in any visible cell of this notebook — confirm it is still needed.
LIMIT_QUERY = 2000

In [23]:
def news_request(category, lTs=None, timeout=30):
    """Fetch one page of news articles from the news endpoint (NEWS_URL).

    Args:
        category: Value sent as the ``categories`` query parameter,
            e.g. "BTC", "ETH", "BCH", ... ``None`` omits the filter because
            ``requests`` drops parameters whose value is ``None``.
        lTs: Unix timestamp in seconds, sent as the ``lTs`` query parameter.
            Defaults to the current time, evaluated on every call.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``"Data"`` entry of the decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if lTs is None:
        # A `datetime.now()` default in the signature is evaluated only once,
        # when the defining cell runs, and would go stale in a long session.
        lTs = int(datetime.now().timestamp())
    params = {
        "categories": category,
        "lTs": lTs,
    }
    response = requests.get(NEWS_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()["Data"]

In [24]:
def date_to_timestamp(date_str):
    """Return the last whole second of the day named by ``date_str``.

    ``date_str`` is parsed with the "%Y-%m-%d" format. Despite the function
    name, the result is a naive ``datetime`` (23:59:59 on that day), not a
    numeric timestamp.
    """
    midnight = datetime.strptime(date_str, "%Y-%m-%d")
    return midnight + timedelta(hours=23, minutes=59, seconds=59)

In [25]:
def is_same_date(a, b):
    """Tell whether ``a`` and ``b`` fall on the same calendar day.

    Only the ``year``, ``month`` and ``day`` attributes are compared, so a
    ``date`` can be compared with a ``datetime``.
    """
    if a.year != b.year:
        return False
    if a.month != b.month:
        return False
    return a.day == b.day

In [26]:
def get_news_data(category, news_per_date, start_date, end_date):
    """Collect news articles while walking backwards from end_date to start_date.

    Args:
        category: Passed through to ``news_request`` as the category filter.
        news_per_date: Accepted for interface compatibility but not used by
            this function.
            NOTE(review): the name suggests a per-day cap that was never
            implemented — confirm the intended behavior.
        start_date: Lower bound, "%Y-%m-%d" string (midnight of that day).
        end_date: Upper bound, "%Y-%m-%d" string (23:59:59 of that day).

    Returns:
        A list of the article dicts returned by ``news_request``, in the
        order they were consumed.

    NOTE(review): when a response page is exhausted the cursor jumps back a
    full day before the next request. Presumably ``lTs`` means "articles
    published before this time"; if so, articles between the new cursor and
    the last consumed article are never fetched — confirm this is intended.
    """
    result = []

    last_time_from = date_to_timestamp(end_date)
    # Parsed once: the lower bound does not change while paging.
    start_time = datetime.strptime(start_date, "%Y-%m-%d")

    while last_time_from >= start_time:
        data = news_request(category, int(datetime.timestamp(last_time_from)))
        i = 0
        while last_time_from >= start_time:
            # Consume every article of the current page that was published
            # on the cursor's calendar day.
            while i < len(data) and is_same_date(date.fromtimestamp(data[i]['published_on']), last_time_from):
                result.append(data[i])
                i += 1
            if i >= len(data):
                # Page exhausted: move the cursor back one day and request again.
                last_time_from -= timedelta(days=1)
                break
            else:
                # The next article belongs to another day: move the cursor
                # to its publication time and keep consuming this page.
                last_time_from = datetime.fromtimestamp(data[i]['published_on'])

    return result

In [27]:
def to_dataframe(list_data):
    """Turn a list of article dicts into a DataFrame with a fixed column set.

    A ``date`` column (local calendar date derived from ``published_on``) is
    added to every row, and only the columns listed below are kept.

    Args:
        list_data: List of dicts; each must contain ``published_on`` as a
            Unix timestamp in seconds.

    Returns:
        A DataFrame with the selected columns, or an empty DataFrame when
        the input is empty or lacks any of the expected columns.
    """
    columns = ['date', 'id', 'published_on', 'title', 'body', 'url', 'imageurl', 'tags', 'categories']
    rows = [
        {**item, 'date': date.fromtimestamp(item['published_on'])}
        for item in list_data
    ]
    try:
        return df(rows)[columns]
    except KeyError:
        # Only the "expected column is missing" case (which includes empty
        # input) maps to an empty frame; any other error now propagates
        # instead of being silently swallowed.
        return df({})

In [28]:
def get_news_data_df(category, news_per_date, start_date, end_date):
    """Fetch news with ``get_news_data`` and return it as a DataFrame.

    All arguments are forwarded unchanged to ``get_news_data``; the
    resulting list is converted with ``to_dataframe``.
    """
    return to_dataframe(
        get_news_data(category, news_per_date, start_date, end_date)
    )

In [29]:
# # Create a dataframe for the data
# dataframe = get_news_data_df(None, 2, "2016-10-26", "2017-02-02")
# dataframe

In [30]:
def save_df(dataframe, category, year_str, output_dir="./data/news/"):
    """Write ``dataframe`` to ``<output_dir>/news_<year_str>.csv``.

    Args:
        dataframe: DataFrame to save (written with its index, as before).
        category: Unused; kept so existing callers keep working.
        year_str: Text inserted into the file name.
        output_dir: Target directory. It is created if it does not exist.
            An empty string means the current working directory.
    """
    name = "news_{}.csv".format(year_str)
    if output_dir:
        # to_csv does not create missing directories, so a fresh checkout
        # without ./data/news would otherwise fail here.
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join avoids the doubled separator that string concatenation
    # produced when output_dir already ended with "/".
    dataframe.to_csv(os.path.join(output_dir, name))

In [31]:
def get_and_save_news_data(category, news_per_date=10,
                           start_date=None, end_date=None,
                           output_dir="./data"):
    """Download news for a date range and save it as a CSV file.

    Args:
        category: Category filter forwarded to ``get_news_data_df``.
        news_per_date: Forwarded to ``get_news_data_df``.
        start_date: First day, "%Y-%m-%d" string. Defaults to today,
            evaluated on every call.
        end_date: Last day, "%Y-%m-%d" string. Defaults to today,
            evaluated on every call.
        output_dir: Directory handed to ``save_df``.

    The output file is named after the first four characters of
    ``start_date`` (the year), so a range spanning several years still ends
    up in a single file named after the starting year.
    """
    # Defaults written as `datetime.now()` in the signature are frozen when
    # the defining cell runs; resolve "today" at call time instead.
    today = datetime.now().date().strftime("%Y-%m-%d")
    if start_date is None:
        start_date = today
    if end_date is None:
        end_date = today

    dataframe = get_news_data_df(category, news_per_date, start_date, end_date)
    save_df(dataframe, category, start_date[:4], output_dir)
    print("done")

In [32]:
'''
currency, days, news_per_date, end_date
'''
# get_and_save_news_data("BTC", 10, 1, "2022-03-09")

'\ncurrency, days, news_per_date, end_date\n'

In [33]:
def get_categories(timeout=30):
    """Return the news category names listed by NEWS_CATEGORY_URL.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``categoryName`` value of every entry in the
        decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(NEWS_CATEGORY_URL, timeout=timeout)
    response.raise_for_status()
    return [entry['categoryName'] for entry in response.json()]

In [34]:
# Fetch the category names, then download and save one CSV per calendar
# year, newest year first (2022 down to 2013).
categories = get_categories()
# for category in categories:
#     get_and_save_news_data(category, 10, 1)
#     time.sleep(0.1)
for year in range(2022, 2012, -1):
    get_and_save_news_data(
        categories,
        None,
        f"{year}-1-1",
        f"{year}-12-31",
        output_dir="./data/news",
    )

done
done
done
done
done
done
done
done
done
done


In [5]:
# Fetch the category list again (same call as in the download cell above).
categories = get_categories()

In [6]:
# Display the fetched category names.
categories

['BTC',
 'BCH',
 'ETH',
 'LTC',
 'XMR',
 'ZEC',
 'ETC',
 'XRP',
 'TRX',
 'ADA',
 'DASH',
 'XTZ',
 'USDT',
 'Mining',
 'Exchange',
 'Market',
 'Asia',
 'ICO',
 'Regulation',
 'Blockchain',
 'Trading',
 'Technology',
 'Wallet',
 'Altcoin',
 'Fiat',
 'Business',
 'Commodity',
 'Sponsored']

In [7]:
# Number of categories returned by get_categories().
len(categories)

28

# Basic Statistics

In [42]:
# Zero-filled count table: one row per year 2013-2022, one column for the
# total ('all') followed by one column per news category.
basic_df = df(0, index=range(2013, 2023), columns=['all', *categories])

In [62]:
def count_number(year):
    """Fill the ``basic_df`` row for ``year`` with article counts.

    Reads ``./data/news/news_<year>.csv`` and writes into the module-level
    ``basic_df``: the total row count goes to the ``'all'`` column, and for
    every name in the module-level ``categories`` list the number of rows
    whose ``categories`` text contains that name goes to its column.

    A CSV without a ``categories`` column leaves the per-category counts
    untouched. Category names that have no column in ``basic_df`` are
    skipped.

    NOTE(review): matching is by plain substring, so a category name that is
    a substring of another would be counted for both — confirm the names
    and the separator used in the ``categories`` field.
    """
    input_df = pd.read_csv("./data/news/news_{}.csv".format(year))

    # .loc writes into basic_df itself; the chained form
    # basic_df[col][year] = ... assigns to a temporary object and is a
    # silent no-op when pandas Copy-on-Write is enabled.
    basic_df.loc[year, 'all'] = len(input_df)

    if 'categories' not in input_df.columns:
        return

    # Missing values count as "no category" instead of breaking the mask.
    category_text = input_df['categories'].fillna('').astype(str)
    for category in categories:
        if category not in basic_df.columns:
            continue
        matches = category_text.str.contains(category, regex=False)
        basic_df.loc[year, category] = int(matches.sum())

# Fill basic_df for every year that has a row in it (2013 through 2022).
for year in range(2013, 2023):
    count_number(year)

In [63]:
# Display the per-year, per-category article counts.
basic_df

Unnamed: 0,all,BTC,BCH,ETH,LTC,XMR,ZEC,ETC,XRP,TRX,...,Regulation,Blockchain,Trading,Technology,Wallet,Altcoin,Fiat,Business,Commodity,Sponsored
2013,28,24,0,1,0,0,0,0,0,0,...,3,0,3,1,0,0,3,1,0,1
2014,1,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016,20,6,0,0,2,0,0,4,2,0,...,0,8,10,16,0,2,0,0,0,2
2017,5002,2720,0,729,181,84,30,17,123,3,...,585,1467,1050,691,70,275,570,691,33,891
2018,18238,8758,0,2346,588,288,176,76,968,99,...,2542,5367,4515,2247,301,1387,1639,2480,120,2777
2019,18250,9399,492,3213,885,228,117,139,1711,308,...,2104,4035,5818,1921,250,2293,1660,2558,95,691
2020,18300,9507,579,2918,456,207,117,138,1207,79,...,1883,2854,5900,1942,150,2004,1727,3252,194,482
2021,18250,9095,285,3213,367,101,51,111,895,73,...,2466,1998,6372,1172,68,2538,1582,3658,88,812
2022,4700,2008,10,940,33,15,10,21,156,8,...,614,825,1610,307,35,680,377,836,26,256
