In [1]:
import os
import requests, time
from datetime import datetime, date, timedelta
from pandas import DataFrame as df

# Endpoint queried by news_request; the `lang=EN` filter is baked into the URL.
NEWS_URL = "https://min-api.cryptocompare.com/data/v2/news/?lang=EN"
# Endpoint queried by get_categories.
NEWS_CATEGORY_URL = "https://min-api.cryptocompare.com/data/news/categories"
# NOTE(review): not referenced by any cell visible in this notebook - confirm
# whether it is still needed.
LIMIT_QUERY = 2000

In [25]:
def news_request(category, lTs = None):
    '''
    Fetch one page of news from NEWS_URL.

    category : a category name such as "BTC", "ETH", "BCH", ...; an iterable
               of such names; or None, in which case requests leaves the
               `categories` parameter out of the query string.
    lTs      : integer Unix timestamp sent as the `lTs` query parameter.
               When omitted, the current time is used, resolved on every call.

    Returns whatever the JSON response stores under its "Data" key.
    Raises KeyError if the response has no "Data" key.
    '''
    if lTs is None:
        # A default written in the signature is evaluated only once, when the
        # defining cell runs, so "now" has to be resolved here instead.
        lTs = int(datetime.now().timestamp())

    if category is not None and not isinstance(category, str):
        # requests serialises a list value as repeated `categories=` keys.
        # NOTE(review): the endpoint is understood to expect one
        # comma-separated value - confirm against the API documentation.
        category = ",".join(category)

    params = {
        "categories" : category,
        "lTs" : lTs
    }
    req = requests.get(NEWS_URL, params = params).json()
    return req["Data"]

In [3]:
def date_to_timestamp(date_str):
    '''
    Return the last whole second (23:59:59) of the day given as "%Y-%m-%d".

    Despite the name, the result is a naive datetime object, not a number.
    '''
    midnight = datetime.strptime(date_str, "%Y-%m-%d")
    one_day = timedelta(days = 1)
    one_second = timedelta(seconds = 1)
    return midnight + one_day - one_second

In [45]:
def is_same_date(a, b):
    '''
    Tell whether `a` and `b` fall on the same calendar day.

    Only the `year`, `month` and `day` attributes are compared, so a date and
    a datetime can be mixed freely.
    '''
    if a.year != b.year:
        return False
    if a.month != b.month:
        return False
    return a.day == b.day

In [75]:
def _end_of_previous_day(moment):
    '''Return 23:59:59 of the calendar day before the datetime `moment`.'''
    midnight = moment.replace(hour = 0, minute = 0, second = 0, microsecond = 0)
    return midnight - timedelta(seconds = 1)


def get_news_data(category, news_per_date, start_date, end_date):
    '''
    Collect at most `news_per_date` articles for every local calendar day
    between `start_date` and `end_date` (both "%Y-%m-%d", both inclusive).

    category      : forwarded unchanged to news_request.
    news_per_date : maximum number of articles kept per day.
    start_date    : first day of the range.
    end_date      : last day of the range; paging starts at its 23:59:59.

    Returns a list of the article dicts delivered by news_request, in the
    order they were received.

    Assumes each batch is sorted newest first and that every article carries
    `id` and `published_on` keys - TODO confirm against the API.
    '''
    # Parse the lower bound once instead of on every loop test.
    range_start = datetime.strptime(start_date, "%Y-%m-%d")
    cursor = date_to_timestamp(end_date)

    result = []
    seen_ids = set()      # guards against re-reading articles at a page edge
    current_day = None    # calendar day currently being filled
    kept_today = 0        # articles of current_day seen so far

    while cursor >= range_start:
        batch = news_request(category, int(datetime.timestamp(cursor)))
        fresh = [article for article in batch if article['id'] not in seen_ids]

        if not fresh:
            # Nothing new at this position: move on to the previous day. This
            # also guarantees progress when a page only repeats seen articles.
            cursor = _end_of_previous_day(cursor)
            continue

        for article in fresh:
            published = datetime.fromtimestamp(article['published_on'])
            if published < range_start:
                # Everything from here on is older than the requested range.
                return result
            seen_ids.add(article['id'])
            if published.date() != current_day:
                current_day = published.date()
                kept_today = 0
            if kept_today < news_per_date:
                result.append(article)
            kept_today += 1

        oldest = datetime.fromtimestamp(fresh[-1]['published_on'])
        if kept_today >= news_per_date:
            # Quota for the oldest day is full: resume at the very end of the
            # day before it, so that day's newest articles are not skipped.
            next_cursor = _end_of_previous_day(oldest)
        else:
            # The page ended in the middle of a day that still needs
            # articles: keep paging inside that day, carrying the count over.
            next_cursor = oldest
        # Never move forwards, whatever the service returned.
        cursor = min(cursor, next_cursor)

    return result

In [81]:
def to_dataframe(list_data):
    '''
    Turn a list of article dicts into a DataFrame with a fixed column order.

    A `date` entry, derived from `published_on` via date.fromtimestamp, is
    added to every record before the frame is built.

    Returns the frame restricted to the selected columns, or an empty
    DataFrame when any of them is missing (which includes empty input).
    Raises KeyError if a record has no `published_on` key.
    '''
    columns = ['date', 'id', 'published_on', 'title', 'body', 'url', 'imageurl', 'tags', 'categories']
    add_date = [{**data, 'date': date.fromtimestamp(data['published_on'])} for data in list_data]
    try:
        return df(add_date)[columns]
    except KeyError:
        # Selecting a column that does not exist raises KeyError. Catch only
        # that, so interrupts and real bugs are no longer swallowed.
        return df({})

In [82]:
def get_news_data_df(category, news_per_date, start_date, end_date):
    '''
    Fetch news with get_news_data and hand the result to to_dataframe.

    All four arguments are passed through to get_news_data unchanged.
    '''
    return to_dataframe(
        get_news_data(category, news_per_date, start_date, end_date)
    )

In [86]:
# Smoke test: fetch up to 2 articles per day for 2016-10-26 .. 2017-02-02
# with no category filter (category=None) and display the resulting frame.
dataframe = get_news_data_df(None, 2, "2016-10-26", "2017-02-02")
dataframe

Unnamed: 0,date,id,published_on,title,body,url,imageurl,tags,categories
0,2017-01-30,69,1485734400,WAVES Weekly No. 28,A transcript of the recent Core Radio intervie...,http://coremedia.info/waves-news/item/396-wave...,https://images.cryptocompare.com/news/default/...,WAVES News,Regulation|Blockchain|Technology|Business
1,2017-01-30,14826,1485734400,WAVES Weekly No. 28,A transcript of the recent Core Radio intervie...,http://www.cryptocoremedia.com/old/waves-news/...,https://images.cryptocompare.com/news/coremedi...,WAVES News,Regulation|Blockchain|Technology|Business
2,2017-01-27,14827,1485475200,CORE Radio Interview Transcript with Sasha Iva...,Lootz: “Today we have Sasha. Hey Sasha.”Sasha:...,http://www.cryptocoremedia.com/old/waves-news/...,https://images.cryptocompare.com/news/coremedi...,WAVES News,Trading
3,2017-01-27,70,1485475200,CORE Radio Interview Transcript with Sasha Iva...,Lootz: “Today we have Sasha. Hey Sasha.”Sasha:...,http://coremedia.info/waves-news/item/393-core...,https://images.cryptocompare.com/news/default/...,WAVES News,Trading
4,2017-01-07,137470,1483790400,Novoletne zaobljube,Začetek leta je tu in z njim tradicionalne nov...,https://dieta.si/diete/novoletne-zaobljube/,https://images.cryptocompare.com/news/default/...,Diete,Other
5,2016-12-11,137471,1481469652,Crossfit vadba,Zakaj gre pri Crossfit vadbi? Crossfit vadba i...,https://dieta.si/sport/crossfit-vadba/,https://images.cryptocompare.com/news/default/...,Šport|crossfit|vadba,Other
6,2016-11-23,137472,1479903710,Med zdravjem in boleznijo,Med zdravjem in boleznijo – prehrana pri kroni...,https://dieta.si/nekategorizirano/med-zdravjem...,https://images.cryptocompare.com/news/default/...,Nerazvrščeno|bolezen|dieta|prehrana|zdravje,Other
7,2016-11-18,137473,1479455270,Prebava in naše prehranske navade,Prebava ne more slediti našim navadam Za resni...,https://dieta.si/hujsanje/prebava-nase-prehran...,https://images.cryptocompare.com/news/default/...,Diete|Hujšanje|dieta|hujšanje|obrok|prebava|pr...,Other
8,2016-11-15,137474,1479199980,Dieta in čustva,Čustva močno vplivajo na dieto Danes se bomo p...,https://dieta.si/hujsanje/dieta-in-custva/,https://images.cryptocompare.com/news/default/...,Hujšanje|čustva|dieta|telovadba,Other


In [87]:
def save_df(dataframe, category, start_date_str, end_date_str, output_dir = "./data/news/"):
    '''
    Write `dataframe` as CSV to "<output_dir>/news_<label>_<start>_<end>.csv".

    dataframe      : object exposing `to_csv` (a pandas DataFrame).
    category       : a non-empty string is used as the file label; anything
                     else (None, a list of categories, "") yields "all".
    start_date_str : text placed in the <start> slot of the file name.
    end_date_str   : text placed in the <end> slot of the file name.
    output_dir     : target directory; it is created if it does not exist.
    '''
    # Use the category in the name so that per-category runs no longer
    # overwrite one shared "news_all_..." file.
    label = category if isinstance(category, str) and category else 'all'
    name = "news_{}_{}_{}.csv".format(label, start_date_str, end_date_str)
    if output_dir:
        os.makedirs(output_dir, exist_ok = True)
    dataframe.to_csv(os.path.join(output_dir, name))

In [88]:
def get_and_save_news_data(category, news_per_date = 10,
        start_date = None,
        end_date = None,
        output_dir = "../raw_data"):
    '''
    Download news with get_news_data_df and store it with save_df.

    category      : forwarded to get_news_data_df and save_df.
    news_per_date : maximum number of articles kept per day.
    start_date    : first day, "%Y-%m-%d"; defaults to today.
    end_date      : last day, "%Y-%m-%d"; defaults to today.
    output_dir    : directory handed to save_df.

    Prints "done" once the file has been written.
    '''
    # Resolve "today" per call: a default written in the signature is frozen
    # at the moment the defining cell runs, which goes stale in a long-lived
    # kernel.
    today = datetime.now().date().strftime("%Y-%m-%d")
    if start_date is None:
        start_date = today
    if end_date is None:
        end_date = today

    dataframe = get_news_data_df(category, news_per_date, start_date, end_date)
    save_df(dataframe, category, start_date, end_date, output_dir)
    print("done")

In [89]:
# NOTE(review): the reminder string and the commented-out call below describe
# the arguments as (currency, days, news_per_date, end_date). That does not
# match get_and_save_news_data(category, news_per_date, start_date, end_date,
# output_dir): the example would pass 1 as start_date.
'''
currency, days, news_per_date, end_date
'''
# get_and_save_news_data("BTC", 10, 1, "2022-03-09")

'\ncurrency, days, news_per_date, end_date\n'

In [90]:
def get_categories():
    '''
    Query NEWS_CATEGORY_URL and return the `categoryName` of every entry in
    the JSON response, in response order.
    '''
    payload = requests.get(NEWS_CATEGORY_URL).json()
    names = []
    for entry in payload:
        names.append(entry['categoryName'])
    return names

In [92]:
# Download the news one calendar year at a time, newest year first
# (2022 down to 2013), saving each year under ../cleaned_data/news.
categories = get_categories()
# for category in categories:
#     get_and_save_news_data(category, 10, 1)
#     time.sleep(0.1)
# The whole list of category names is passed as the `category` argument, with
# up to 10 articles kept per day.
for year in range(2022, 2012, -1):
    get_and_save_news_data(categories, 10, \
        "{}-1-1".format(year), "{}-12-31".format(year),
        output_dir = "../cleaned_data/news")

done
done
done
done


In [30]:
# Fetch the category names from the API.
# NOTE(review): this cell's execution count (30) is lower than that of the
# cells above it, so the notebook was not run top to bottom.
categories = get_categories()

In [31]:
# Display the fetched category names.
categories

['BTC',
 'BCH',
 'ETH',
 'LTC',
 'XMR',
 'ZEC',
 'ETC',
 'XRP',
 'TRX',
 'ADA',
 'DASH',
 'XTZ',
 'USDT',
 'Mining',
 'Exchange',
 'Market',
 'Asia',
 'ICO',
 'Regulation',
 'Blockchain',
 'Trading',
 'Technology',
 'Wallet',
 'Altcoin',
 'Fiat',
 'Business',
 'Commodity',
 'Sponsored']