In [148]:
"""Алгоритм парсит название папок в входной папке """

from pathlib import Path
import pandas as pd

MODEL_DIR = str(Path('../data/models/'))

from os import listdir
from os.path import isdir, join

model_dirs = [dir_ for dir_ in listdir(MODEL_DIR) if isdir(join(MODEL_DIR, dir_)) and dir_.startswith('model')]

In [149]:
model_dirs

['model_1_60_7', 'model_1_90_15', 'model_1_30_4']

In [150]:
from collections import defaultdict
import re
from envparse import env
import ssl
import pymongo as pym

In [152]:
import yfinance as yf
import pandas as pd
from gdeltdoc import Filters, GdeltDoc
from datetime import timedelta
from pathlib import Path
import time


def get_shift_percentage(numerator: pd.Series,
                         denominator: pd.Series) -> pd.Series:
    """
    На сколько процентов изменилась переменная numerator
    относительно текущего denominator
    """
    return numerator / denominator * 100


def get_shift_digit(s: pd.Series) -> pd.Series:
    """
    Возвращает колонку с изменениями значений
    Для вызова этой функции необходимо очистить колонки Close от NaN
    """
    assert not s.isna().any(), "Необходимо очистить колонку от NaN"

    return (s.shift(-1) - s).shift(1)


def get_data_yfinance(quotation: str,
                      start_date: str,
                      end_date: str,
                      interval='1d') -> pd.DataFrame:
    """
    Взять данные с yahoo finance

    params:
        quotation: название котировки, данные для которой хотим получить
        start_date, end_date - интервал, формат "год-месяц-день"
        interval - периодичность, формат "(номер)(первая буква слова (d, m, y))"
    returns:
        DataFrame формата "Тикет, Время, 6 видов цен"
    """

    df_res = yf.download(tickers=quotation,
                         start=start_date,
                         end=end_date,
                         interval=interval)
    df_res.loc[:, 'Ticker'] = quotation
    df_res = df_res.groupby(pd.Grouper(level="Date",
                                       freq=interval.upper())).mean()
    # Приводим время к одному виду для слияния
    df_res.index = pd.to_datetime(df_res.index).tz_localize('Etc/UCT')
    return df_res


def get_data_gdelt(quotation: str,
                   keywords: list,
                   start_date: str,
                   end_date: str,
                   interval="1d",
                   num_records=250,
                   repeats=3) -> pd.DataFrame:
    """
    Взять данные с gdelt

    params:
        quotation - имя ценной бумаги
        keywords - из графа знаний по ключевому слову
        start_date, end_date - интервал, формат "год-месяц-день"
        (опционально) interval - периодичность, формат "(номер)(первая буква слова (d, m, y))"
        (не реализована) (опционально) num_records - сколько максимум записей взять за промежуток
        (не реализовано) (опционально) repeats - сколько раз должно повториться ключевое слово в статье
    returns:
        DataFrame формата "Datetime (индекс), Ticker,
        [Average_Tone, Article_Count, Volume_Intensity]_[std, mean, sum, min, max]
    """

    # Колонки в полученных DataFrames
    col_names = ['Average_Tone', 'Article_Count', 'Volume_Intensity']

    # Что будем искать для данных ключевых слов
    # Тон статей, их количество и их кол-во в отношении ко всем остальным на gdelt
    match_list = ["timelinetone", "timelinevolraw", "timelinevol"]
    match_dict = dict(zip(match_list, col_names))

    df_res = None
    for keyword in keywords:
        try:
            gd_filter = Filters(start_date=start_date,
                                end_date=end_date,
                                keyword=keyword)

            for feature_name in match_list:
                gd = GdeltDoc()
                timeline_data = gd.timeline_search(feature_name, gd_filter)
                time.sleep(5)
                timeline_data = timeline_data.fillna(0)
                timeline_data = timeline_data.groupby(
                    pd.Grouper(key="datetime", freq=interval.upper()))

                if feature_name in ['timelinetone']:
                    timeline_data = timeline_data.mean()
                else:
                    timeline_data = timeline_data.sum()

                if df_res is None:
                    # Выровняем индексы, чтобы при копировании не выдавалось NaN
                    df_res = pd.DataFrame(index=timeline_data.index)

                col_name = match_dict[feature_name]
                df_res[f"{keyword}_{feature_name}_{col_name}"] = timeline_data[
                    col_name.replace('_', ' ')].values
        except Exception as e:
            print(f'invalid keyword: {keyword}')

    return df_res


def set_statistic_columns(df_dub: pd.DataFrame) -> pd.DataFrame:
    """После использования этой функции желательно удалить df_dub"""

    # Колонки в полученных DataFrames
    col_names = ['Average_Tone', 'Article_Count', 'Volume_Intensity']

    df_res = pd.DataFrame(index=df_dub.index)

    # Нужно создать колонки со средним, средним отклонением, минимумом и максимумом для каждой фичи
    # Сначала сформируем список датафреймов, которые нам нужно достать для каждой колонки
    for pattern in col_names:
        pattern_list = list()
        for col in df_dub.columns:
            if pattern in col:
                pattern_list.append(col)

        # Теперь для pattern у нас есть список
        df_res[f'{pattern}_min'] = df_dub[pattern_list].min(axis=1,
                                                            skipna=True)
        df_res[f'{pattern}_max'] = df_dub[pattern_list].max(axis=1,
                                                            skipna=True)
        df_res[f'{pattern}_mean'] = df_dub[pattern_list].mean(axis=1,
                                                              skipna=True)
        df_res[f'{pattern}_std'] = df_dub[pattern_list].std(axis=1,
                                                            skipna=True)
        df_res[f'{pattern}_sum'] = df_dub[pattern_list].sum(axis=1,
                                                            skipna=True)

    return df_res


def set_column_ticker(df: pd.DataFrame, quotation: str) -> None:
    # Добавим название ценной бумаги в таблицу
    df.loc[:, 'Ticker'] = quotation


def get_nan_chain(nan_series: pd.Series) -> list:
    """Берет цепочки NaN из фрейма"""
    res = []
    day = timedelta(days=1)
    along = list(nan_series[nan_series].index)

    prev_day = along[0]
    chain = [prev_day]
    for cur_day in along[1:]:
        if cur_day - prev_day == day:
            chain.append(cur_day)
        else:
            res.append(chain)
            chain = [cur_day]

        prev_day = cur_day

    return res


def set_cumulative_effect(df: pd.DataFrame, chains: list) -> None:
    """Меняет переданный датафрейм, не убирая NaN"""
    day = timedelta(days=1)

    for chain in chains:
        buffer = None
        for date in chain:
            if buffer is None:
                buffer = df.loc[date]
            else:
                buffer += df.loc[date]

        cumulative_day = chain[-1] + day
        # Берем все дни из цепочки, в строке будет среднее арифметическое этих дней
        df.loc[cumulative_day] = (df.loc[cumulative_day] +
                                  buffer) / (len(chain) + 1)


def get_dataframe_v2(**kwargs) -> pd.DataFrame:
    """
    Получить полный датафрейм с кумулятивностью и дополнительными фичами
    Пример использования: d = get_dataframe(quotation='NVDA',
                                            keywords=['nvidia', 'geforce', 'geforce rtx', 'geForce now',
                                            'nvidia rtx', 'nvidia shield', 'nvidia dgx'],
                                            start_date="2020-01-01",
                                            end_date="2020-12-31")

    params:
        quotation - имя ценной бумаги
        keywords - из графа знаний по ключевому слову
        start_date, end_date - интервал, формат "год-месяц-день"
        (опционально) interval - периодичность, формат "(номер)(первая буква слова (d, m, y))"
        (не реализована) (опционально) num_records - сколько максимум записей взять за промежуток
        (не реализовано) (опционально) repeats - сколько раз должно повториться ключевое слово в статье
    returns:
        DataFrame формата "Datetime (индекс), Ticker,
        [Average_Tone, Article_Count, Volume_Intensity]_[std, mean, sum, min, max], - из новостей
        Open, High, Low, Close, Adj Close, Volume - из финансов
    """

    gdelt_data = get_data_gdelt(**kwargs)
    yfinance_data = get_data_yfinance(
        quotation=kwargs['quotation'],
        start_date=kwargs['start_date'],
        end_date=kwargs['end_date'],
        interval="1d" if not kwargs.get('interval') else kwargs['interval'])

    row_is_nan = yfinance_data['Close'].isna()
    yfinance_data.dropna(inplace=True)
    set_cumulative_effect(gdelt_data, get_nan_chain(row_is_nan))
    gdelt_data.drop(row_is_nan[row_is_nan].index, inplace=True)
    gdelt_data = set_statistic_columns(gdelt_data)

    set_column_ticker(gdelt_data, quotation=kwargs['quotation'])
    set_column_ticker(yfinance_data, quotation=kwargs['quotation'])
    yfinance_data['Price Change'] = get_shift_digit(
        yfinance_data['Close'].dropna())
    yfinance_data['Percentage Change'] = get_shift_percentage(
        yfinance_data['Price Change'], yfinance_data['Close'])
    gdelt_data = gdelt_data.iloc[1:-1].reset_index()
    yfinance_data = yfinance_data.reset_index()

    return gdelt_data, yfinance_data

In [153]:
d, y = get_dataframe_v2(quotation='NVDA',
                                            keywords=['nvidia', 'geforce', 'geforce rtx', 'geForce now',
                                            'nvidia rtx', 'nvidia shield', 'nvidia dgx'],
                                            start_date="2020-01-01",
                                            end_date="2020-12-31")

KeyboardInterrupt: 

In [None]:
d

In [3]:
MODEL_DIR + '/' + 'model_1_120_30' + '/test_data.csv'

'models/model_1_120_30/test_data.csv'

In [487]:
frames['model_1_120_30'][frames['model_1_120_30']['Ticker'] == 'MMM']

Unnamed: 0,index,Ticker,Adj Close_percent_pr-120,Adj Close_percent_pr-119,Adj Close_percent_pr-118,Adj Close_percent_pr-117,Adj Close_percent_pr-116,Adj Close_percent_pr-115,Adj Close_percent_pr-114,Adj Close_percent_pr-113,...,Average_Tone_mean_pr-9,Average_Tone_mean_pr-8,Average_Tone_mean_pr-7,Average_Tone_mean_pr-6,Average_Tone_mean_pr-5,Average_Tone_mean_pr-4,Average_Tone_mean_pr-3,Average_Tone_mean_pr-2,Average_Tone_mean_pr-1,target


In [154]:
def get_frames_for_company(company: str, model_dirs, number_of_samples) -> dict:
    #rand_tickers = [i for i in range(10)]
    df_test_10 = {}
    for model in model_dirs:
        df_test = pd.read_csv(MODEL_DIR + '/' + model + '/test_data.csv', nrows=100)
        df_test[model] = df_test[df_test['Ticker'] == company].head(number_of_samples)
    return df_test

In [None]:
df_test = pd.read_csv(MODEL_DIR + '/' + 'model_1_120_30' + '/test_data.csv', nrows=100)
# df_test['model_1_120_30'] = df_test[df_test['Ticker'] == 'ABT'].head(10)
# df_test['model_1_120_30']
df_test[df_test['Ticker'] == 'ABT'].head(10)

In [21]:
test = {}
test['MMM'] = get_frames_for_company('MMM', model_dirs, 10)
#for company in tickets:
    #test[company] = get_frames_for_company(company, model_dirs, 10)

ValueError: Wrong number of items passed 243, placement implies 1

In [451]:
days_back

{'model_1_120_30': '120',
 'model_2_120_30': '120',
 'model_3_120_30': '120',
 'model_4_120_30': '120',
 'model_5_120_30': '120'}

In [452]:
frames = get_frames_for_company('ABT', days_back)

In [461]:
frames['model_1_120_30']

Unnamed: 0,index,Ticker,Adj Close_percent_pr-120,Adj Close_percent_pr-119,Adj Close_percent_pr-118,Adj Close_percent_pr-117,Adj Close_percent_pr-116,Adj Close_percent_pr-115,Adj Close_percent_pr-114,Adj Close_percent_pr-113,...,Average_Tone_mean_pr-9,Average_Tone_mean_pr-8,Average_Tone_mean_pr-7,Average_Tone_mean_pr-6,Average_Tone_mean_pr-5,Average_Tone_mean_pr-4,Average_Tone_mean_pr-3,Average_Tone_mean_pr-2,Average_Tone_mean_pr-1,target
2,2017-01-31 00:00:00+00:00,ABT,0.0,0.820623,-0.857629,0.863807,0.758402,-0.803525,-1.499063,1.552487,...,0.23,0.4569,0.2107,0.37078,0.43578,0.88942,0.77116,0.6399,0.28486,1
5,2017-02-01 00:00:00+00:00,ABT,-1.662297,-0.069771,-1.297582,2.720423,0.031371,0.531313,1.139234,-0.079083,...,-0.3867,0.2098,0.0,0.31314,-0.43282,0.90852,0.73446,-0.2539,0.79866,1
8,2017-02-02 00:00:00+00:00,ABT,0.711743,-1.309058,3.593353,1.350002,-0.218049,-4.158486,0.052169,1.8037,...,-0.40585,-0.04736,0.2286,0.33072,0.486,0.47936,0.7093,0.45105,-0.0091,1
11,2017-02-03 00:00:00+00:00,ABT,-0.435163,0.230033,-1.734627,0.134066,-3.610268,-0.864221,0.921272,2.136543,...,-0.472375,0.19392,1.4448,0.28566,-0.22208,0.85198,0.70956,1.239925,0.49034,1
14,2017-02-07 00:00:00+00:00,ABT,-2.0979,2.065686,3.883499,-0.876349,0.228984,0.988592,-0.292812,-0.178575,...,0.004875,0.3383,1.537,0.45206,-0.34248,0.36762,0.31604,0.11575,0.0,1
17,2017-02-08 00:00:00+00:00,ABT,-3.928568,0.599664,-1.614267,0.515727,1.158618,2.936723,1.01053,-0.460015,...,-0.54095,-0.53,-1.0104,0.4293,0.23568,0.82922,0.31474,-0.62795,0.08196,1
20,2017-02-09 00:00:00+00:00,ABT,-1.672865,0.183418,1.208966,-0.342282,-0.323311,-0.657779,-0.068964,0.333765,...,-0.623075,-0.16712,-0.3559,0.32218,0.36634,0.6484,0.2727,0.894775,0.65572,1
23,2017-02-10 00:00:00+00:00,ABT,0.611002,0.566457,0.167935,-0.883209,-0.454098,-0.935815,-0.388103,-0.562951,...,0.008875,-0.3202,0.6155,0.41174,0.36232,1.32636,0.09094,-0.78365,0.55556,1
26,2017-02-14 00:00:00+00:00,ABT,-1.113357,-1.29898,-1.676457,0.148495,-0.374706,-0.329741,-0.372284,0.283074,...,0.173225,-0.2801,-0.9485,0.7346,0.0661,1.73326,0.04716,-1.98915,0.57692,1
29,2017-02-15 00:00:00+00:00,ABT,-0.204713,-0.128111,-2.728046,-0.074378,-0.623362,0.654301,1.445746,1.479367,...,-0.6971,0.44174,0.5026,0.29106,0.507,1.15724,0.486,-1.922875,0.2765,1


In [155]:
"""Группируем папки  по days_back, days_forward, сохраняем название папок"""

from collections import defaultdict
import re

# pattern = re.compile('model_(\d+)_(\d+)_(\d)')
# d = defaultdict(list)
# for item in model_dirs:
#     i, db, df = pattern.match(item).groups()
#     d[(db, df)] += [item]

db = {model : model.split('_')[2] for model in model_dirs}
df = {model : model.split('_')[3] for model in model_dirs}

In [157]:
from catboost import CatBoostClassifier, Pool

def predict_for_model(path_to_model, test, cat_features):
    
    model = CatBoostClassifier(Pool(test, cat_features=cat_features))
    model.load_model(path_to_model, format='cbm')
    
    prediction = model.predict(test)
    
    return prediction

In [411]:
from gdeltdoc import GdeltDoc, Filters, near, repeat
from datetime import datetime, timedelta

def set_filters(from_date, days_back, keywords):
    
    delta = timedelta(days=int(days_back))
    date = datetime.strptime(from_date, "%Y-%m-%d")
    end_date = date + delta
    end_date = end_date.strftime("%Y-%m-%d")
    
    f = Filters(
        start_date = from_date,
        end_date = end_date,
        num_records = 250,
        keyword = keywords,
        domain = "finance.yahoo.com",
        country = ["UK", "US"],
        theme = "ECON_STOCKMARKET",
#         near = near(3, keywords, "company"),
        #repeat = repeat(5, "planet")
    )
    
    return f

In [169]:
def connect_to_mongodb():
    env.read_envfile()
    url = env("URL")
    ssl_ca_certs = str(Path('../additional/YandexInternalRootCA.crt'))
    con = pym.MongoClient(url,
                           ssl_ca_certs=ssl_ca_certs,
                           ssl_cert_reqs=ssl.CERT_REQUIRED)
    return con.get_database('stock-news-backend')

In [170]:
gd = GdeltDoc()
f = set_filters('2019-01-01', 30, ['Microsoft', 'Bill Gates'])

In [171]:
# Get a timeline of the number of articles matching the filters
timeline = gd.timeline_search("timelinetone", f)
timeline['datetime']

0    2018-12-02 00:00:00+00:00
1    2018-12-03 00:00:00+00:00
2    2018-12-04 00:00:00+00:00
3    2018-12-05 00:00:00+00:00
4    2018-12-06 00:00:00+00:00
5    2018-12-07 00:00:00+00:00
6    2018-12-08 00:00:00+00:00
7    2018-12-09 00:00:00+00:00
8    2018-12-10 00:00:00+00:00
9    2018-12-11 00:00:00+00:00
10   2018-12-12 00:00:00+00:00
11   2018-12-13 00:00:00+00:00
12   2018-12-14 00:00:00+00:00
13   2018-12-15 00:00:00+00:00
14   2018-12-16 00:00:00+00:00
15   2018-12-17 00:00:00+00:00
16   2018-12-18 00:00:00+00:00
17   2018-12-19 00:00:00+00:00
18   2018-12-20 00:00:00+00:00
19   2018-12-21 00:00:00+00:00
20   2018-12-22 00:00:00+00:00
21   2018-12-23 00:00:00+00:00
22   2018-12-24 00:00:00+00:00
23   2018-12-25 00:00:00+00:00
24   2018-12-26 00:00:00+00:00
25   2018-12-27 00:00:00+00:00
26   2018-12-28 00:00:00+00:00
27   2018-12-29 00:00:00+00:00
28   2018-12-30 00:00:00+00:00
29   2018-12-31 00:00:00+00:00
30   2019-01-01 00:00:00+00:00
Name: datetime, dtype: datetime64[ns, U

In [200]:
def get_important_days(timeline, number_of_days):
    
    indexes = timeline['Volume Intensity'].values.argsort()[-number_of_days:][::-1]
    result = list(map(str, timeline['datetime'][indexes]))
    result = [result[i].split(' ')[0] for i in range(len(result))]
    
    return result

In [173]:
import random

def get_important_news(articles, important_day, keywords, number_of_news):
    
    articles = articles[articles['language'] == 'English']
    result = []
    for _ in range(number_of_news):
        index = random.randint(0, len(list(articles['title']))-1)
        result.append(list(articles['title'])[index])
    return result

In [199]:
important_days = get_important_days(timeline, 3)

KeyError: 'Volume Intensity'

In [175]:
important_news = get_important_news(important_days[0], ['Microsoft company', 'Bill Gates'], 5)
important_news

NameError: name 'important_days' is not defined

In [176]:
db = connect_to_mongodb()
keywords = pd.DataFrame.from_dict(db['KEYWORDS'].find())
list(keywords['Keywords'])[0]

['3 Mmm Company Limited',
 '3M Japan Ltd',
 'MMM Healthcare, LLC',
 'MMM Münchener Medizin Mechanik GmbH',
 'MMM Group Limited',
 'Morgan–McClure Motorsports',
 'Zya, Inc.']

In [177]:
model_dirs

['model_1_60_7', 'model_1_90_15', 'model_1_30_4']

In [184]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [189]:
def clean_column(column):
    s = "".join(e for e in column if e.isalnum() or e == " ")
    return s.strip().strip("_").replace('_', ' ').lower()

def clean_columns(columns):
    res = []
    for column in columns:
        res += [clean_column(column)]
        
    return res

In [225]:
def process_news(news):
    res = []
    for k, vs in news.items():
        for v in vs:
            res += [(k, v)]
    return res

In [191]:
clean_columns(['BIO COMPANY GmbH', 'Nissan', 'Hewlett-Packard', 'Facebook', 'Takeda Pharmaceutical Company', 'Cargill', 'Ericsson', 'Netflix', 'Namco', 'Donald Trump'])

['bio company gmbh',
 'nissan',
 'hewlettpackard',
 'facebook',
 'takeda pharmaceutical company',
 'cargill',
 'ericsson',
 'netflix',
 'namco',
 'donald trump']

In [269]:
import random

In [485]:
def level_generator(model_dirs):

    pattern = re.compile('model_(\d+)_(\d+)_(\d+)')
    d = defaultdict(list)
    for item in model_dirs:
        i, db, df = pattern.match(item).groups()
        d[(db, df)] += [item]

    db = connect_to_mongodb()
    keywords = pd.DataFrame.from_dict(db['KEYWORDS'].find())
    #tickets = list(keywords['Ticker'])[0:10]
    #keywords = list(keywords['Keywords'])[0:10]

    MODEL_DIR = str(Path('../data/models/'))
    
    res = []
    for key in d.keys():
        db_model = db[f"MODEL_{key[0]}_{key[1]}"]
        db_model.drop()
        
        counter = 0
        for model in d[key]:
            cat_features = Path(MODEL_DIR + '/' + model + '/cat_features.txt').read_text().split()
            test_df = pd.read_csv(MODEL_DIR + '/' + model + '/test_data.csv',
                               nrows=None)
            
            for i, idx in enumerate(random.sample(range(0, len(test_df.Ticker.unique())), len(test_df.Ticker.unique()))):
                try:
                    if counter == 20:
                        break

                    t = test_df[test_df.Ticker == test_df.Ticker.unique()[idx]]
                    print('Ticker:', test_df.Ticker.unique()[idx])
                    idx_ = random.randint(a=0, b=t.shape[0])
                    test = t.iloc[idx_].to_frame().T

                    date = test['datetime.1'].values[0]
                    days_back = key[0]
                    target = test['target'].values[0]
                    test = test.drop(['datetime.1', 'target'], axis=1)
                    delta = timedelta(days=int(days_back))
                    date_datetime = datetime.strptime(date, "%Y-%m-%d")
                    start_date = date_datetime - delta
                    start_date = start_date.strftime("%Y-%m-%d")

                    model_predict = predict_for_model(MODEL_DIR + '/' + model + '/model.cbm', test, cat_features)
                    keywords_ = list(keywords['Keywords'][keywords['Ticker']==test['Ticker'].values[0]])[0]

                    keywords_ = clean_columns(keywords_)[:4]


                    yfinance_df = get_data_yfinance(quotation=test['Ticker'].values[0],
                                                       start_date=start_date,
                                                       end_date=date)

                    
                    f = set_filters(start_date, days_back, keywords_)
                    gd = GdeltDoc()
                    articles = gd.article_search(f)

                    volumes = gd.timeline_search("timelinevol", f)
                    tones = gd.timeline_search("timelinetone", f)
                    important_days = get_important_days(volumes, 3)
                    news = {}
                    for day in important_days:
                        news[day] = get_important_news(articles, day, keywords_, number_of_news=2)
            #return prices, tones, volumes, news, date, days_back, model_predict, target  news, days_back, model_predict, target

        #             return prices, tones, volumes, news, date, days_back, model_predict, target
        
                    tones['prices'] = pd.Series(yfinance_df['Close'].values)
                    tones = tones.fillna(method="ffill")
                
        
                    d_ = {}
                    d_['prices'] = list(map(float, tones.prices.values))
                    d_['Ticker'] = test_df.Ticker.unique()[idx]
                    d_['tones'] = list(map(float, tones['Average Tone'].values))
                    d_['volumes'] = list(map(float, 100*volumes['Volume Intensity'].values))
                    d_['news'] = process_news(news)
                    d_['date'] = str(date)
                    d_['model_predict'] = int(model_predict[0])
                    d_['days_back'] = int(days_back)
                    d_['target'] = int(target)
                    d_['level_id'] = counter

                    db_model.insert_many([d_])
                    counter += 1
                except Exception as e:
                    print(e)
    #             df = pd.DataFrame()
    #             df['tones'] = list(tones['Average Tone'])
    #             df['volumes'] = list(volumes['Volume Intensity'])
    #             df['date'] = date
    #             model_predict[0]
    #             process_news(news)
                

In [486]:
model_dirs = [dir_ for dir_ in listdir(MODEL_DIR) if isdir(join(MODEL_DIR, dir_)) and dir_.startswith('model')]

In [487]:
model_dirs

['model_1_60_7', 'model_1_90_15', 'model_1_30_4']

In [488]:
level_generator(model_dirs)

Ticker: SJM
[*********************100%***********************]  1 of 1 completed
Ticker: HBAN
[*********************100%***********************]  1 of 1 completed
https://api.gdeltproject.org/api/v2/doc/doc?query=("halo business angel network")domain:finance.yahoo.com(sourcecountry:UK OR sourcecountry:US)theme:ECON_STOCKMARKET&startdatetime=20190907000000&enddatetime=20191106000000&maxrecords=250&mode=artlist&format=json
("halo business angel network")domain:finance.yahoo.com(sourcecountry:UK OR sourcecountry:US)theme:ECON_STOCKMARKET&startdatetime=20190907000000&enddatetime=20191106000000&maxrecords=250
b"Parentheses may only be used around OR'd statements.\n"
Expecting value: line 1 column 1 (char 0)
Ticker: CMS
[*********************100%***********************]  1 of 1 completed
Ticker: CAH
[*********************100%***********************]  1 of 1 completed
Ticker: APH
[*********************100%***********************]  1 of 1 completed
Ticker: HLT
[*********************100%*******

[*********************100%***********************]  1 of 1 completed
Ticker: FTV
[*********************100%***********************]  1 of 1 completed
Ticker: CAH
[*********************100%***********************]  1 of 1 completed
'articles'
Ticker: SJM
[*********************100%***********************]  1 of 1 completed
Ticker: HAS
[*********************100%***********************]  1 of 1 completed
Ticker: DE
[*********************100%***********************]  1 of 1 completed
Ticker: FTV
[*********************100%***********************]  1 of 1 completed
Ticker: FRC
[*********************100%***********************]  1 of 1 completed
Ticker: KMB
[*********************100%***********************]  1 of 1 completed
https://api.gdeltproject.org/api/v2/doc/doc?query=("kmb design company limited" OR "kmb utilities company" OR "kmb international company limited" OR "zao banca intesa closed jointstock company prior to merger with kmbbank")domain:finance.yahoo.com(sourcecountry:UK OR sourc

In [69]:
test = pd.read_csv(MODEL_DIR + '/' + model + '/test_data.csv',
                               nrows=1)
test['Ticker'][0]

'ACN'

In [86]:
list(keywords['Keywords'][keywords['Ticker']=='ACN'])[0]

['Michael Hill Jeweller',
 'Dymocks Booksellers',
 'Alliance Airlines',
 'NBN Co',
 'Muzak',
 "Foster's Group",
 'Jewelry Television',
 'Qube Holdings']

In [14]:
# Search for articles matching the filters
articles = gd.article_search(f)

In [53]:
articles

Unnamed: 0,domain,language,seendate,socialimage,sourcecountry,title,url,url_mobile
0,timesnownews.com,English,20190921T130000Z,https://imgk.timesnownews.com/story/1569068177...,India,Surprising ! Bill Gates net worth : When he re...,https://www.timesnownews.com/business-economy/...,https://www.timesnownews.com/amp/business-econ...
4,netflix.com,English,20190921T153000Z,https://occ-0-2794-2219.1.nflxso.net/dnm/api/v...,United States,Inside Bill Brain : Decoding Bill Gates | Netf...,https://www.netflix.com/title/80184771?s=i&trk...,
5,indianexpress.com,English,20190921T134500Z,https://images.indianexpress.com/2019/09/bill-...,India,The simple strategy fueling the rise of Bill G...,https://indianexpress.com/article/world/the-si...,https://indianexpress.com/article/world/the-si...
11,nairaland.com,English,20190921T111500Z,,Nigeria,Bill Gates Call For More Funding For Primary H...,https://www.nairaland.com/5429149/bill-gates-c...,
12,decider.com,English,20190921T013000Z,https://nypdecider.files.wordpress.com/2019/09...,United States,Inside Bill Brain : Decoding Bill Gate Netfl...,https://decider.com/2019/09/20/inside-bills-br...,https://decider.com/2019/09/20/inside-bills-br...
13,boingboing.net,English,20190921T161500Z,,United States,brandeis revival / Boing Boing,https://boingboing.net/tag/brandeis-revival,
15,comicbook.com,English,20190921T000000Z,https://media.comicbook.com/2019/08/netflix-co...,United States,Everything Releasing On Netflix This Weekend,https://comicbook.com/movies/2019/09/20/everyt...,https://comicbook.com/movies/amp/2019/09/20/ev...
19,her.ie,English,20190921T090000Z,,Ireland,5 new additions to Netflix which are worth che...,https://www.her.ie/entertainment/5-new-additio...,https://www.her.ie/amp/entertainment/5-new-add...
21,popculture.com,English,20190921T143000Z,,United States,Everything Coming to Netflix This Weekend,https://popculture.com/streaming/2019/09/20/ev...,https://popculture.com/streaming/amp/2019/09/2...
23,indiatoday.in,English,20190921T144500Z,https://akm-img-a-in.tosshub.com/indiatoday/im...,India,I took 15 Economics classes over 4 years in ...,https://www.indiatoday.in/education-today/news...,https://www.indiatoday.in/amp/education-today/...


In [21]:
articles = articles[articles['language'] == 'English']

In [54]:
list(articles['title'])[3]

'Bill Gates Call For More Funding For Primary Health Care In Nigeria - Health'

In [16]:
articles[['url', 'title']]['title'].values

array(['Surprising ! Bill Gates net worth : When he resigned as Microsoft CEO',
       'Inside Bill Brain : Decoding Bill Gates | Netflix Official Site',
       'The simple strategy fueling the rise of Bill Gates fortune | World News , The Indian Express',
       'Bill Gates Call For More Funding For Primary Health Care In Nigeria - Health',
       '  Inside Bill Brain : Decoding Bill Gate Netflix Review : Stream It Or Skip It ? ',
       'brandeis revival / Boing Boing',
       'Everything Releasing On Netflix This Weekend',
       '5 new additions to Netflix which are worth checking out this weekend',
       'Everything Coming to Netflix This Weekend',
       '  I took 15 Economics classes over 4 years in my BA education : Wipro Chairman Rishad Premji',
       'Why transitioning to only renewable energy will be difficult for the U . S . ',
       'Bill Gates : if we break up Big Tech , well just have more bad companies',
       'Lake Worth Beach wants to harness the Gulf Stream power

In [None]:
def get_importtant_news(important_days,)