In [84]:
import numpy as np
import pandas as pd
import locale
import json
import re
import pymorphy2
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [64]:
# Switch the LC_TIME category to the Russian locale. Presumably this is what
# lets the '%B' month-name formats used later in the notebook match Russian
# month names; the call fails if 'ru_RU.UTF-8' is not installed on the machine.
locale.setlocale(category=locale.LC_TIME, locale='ru_RU.UTF-8')

'ru_RU.UTF-8'

**First we bring the different datasets into a common shape (same column names, same date format, etc.):**

In [65]:
# Load the news dataset (JSON Lines: one record per line).
news_data = pd.read_json('/Users/alexanderknyshov/Desktop/LLM/Data/datasets/news_data_new.json', lines=True)

# Convert 'publishedAt' to timestamps. Every value is still parsed on its own
# (day-first, unparseable values become NaT), but the whole column is replaced
# in one assignment instead of being written back cell by cell through .loc.
# That avoids the label-based index loop and lets pandas give the column a
# proper datetime dtype when the parsed values allow it.
news_data['publishedAt'] = news_data['publishedAt'].map(
    lambda raw_value: pd.to_datetime(raw_value, dayfirst=True, errors='coerce')
)

In [66]:
# Load the two recommendation sources that are merged further down.
rec1 = pd.read_json('/Users/alexanderknyshov/Desktop/LLM/Data/datasets/invest_recommendations.json')

# In the second source the 'titles' column is renamed to 'title' and the
# 'hrefs' column is dropped (presumably to match the layout of rec1).
rec2 = (
    pd.read_json('/Users/alexanderknyshov/Desktop/LLM/Data/datasets/recommendations.json')
    .rename(columns={'titles': 'title'})
    .drop(columns=['hrefs'])
)

In [67]:
# Dates to datetime format in rec1 and rec2.
# Both conversions use an explicit format, so parsing the whole column at once
# gives the same values as the former element-by-element loops, without the
# per-row .loc writes and without reusing a scratch `date` list.

# rec1 carries date and time ('%d %B %Y, %H:%M'); a value that does not match
# raises, as before.
rec1['publishedAt'] = pd.to_datetime(rec1['publishedAt'], format='%d %B %Y, %H:%M')

# rec2 carries only a date ('%d %B %Y'); values that do not match become NaT.
rec2['publishedAt'] = pd.to_datetime(rec2['publishedAt'], format='%d %B %Y', errors='coerce')

# Merging rec1 and rec2 to get final dataset (rows renumbered from 0)
recommendations = pd.concat([rec1, rec2], ignore_index=True)

**Now we link each recommendation to the news that may have "caused" it, i.e. the news published during the 7 days before the recommendation:**

In [68]:
# For every recommendation, gather the descriptions of the news published in
# the 7 days before it: the window is [date - 7 days, date).
news_window = pd.Timedelta(days=7)

rec_news = []
for date in recommendations['publishedAt']:
    start_date = date - news_window
    in_window = (news_data['publishedAt'] >= start_date) & (news_data['publishedAt'] < date)
    # Skip missing descriptions and join with a space. Summing the strings
    # glued neighbouring descriptions together without a separator, and an
    # empty window produced the text '0' instead of an empty string.
    descriptions = news_data.loc[in_window, 'description'].dropna().astype(str)
    rec_news.append(' '.join(descriptions))

recommendations['reason news'] = rec_news

**Here we clean the text (HTML markup, URLs and e-mail addresses, special characters, extra whitespace):**

In [69]:
def remove_html(text):
    """Return the text content of `text` with the HTML markup removed.

    The input is parsed with BeautifulSoup's 'html.parser' backend and the
    result of ``get_text()`` is returned.
    """
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Strip HTML from the recommendation text and from the collected news text;
# the results go into new 'cleaned_*' columns, the raw columns stay untouched.
for cleaned_col, raw_col in (('cleaned_description', 'description'),
                             ('cleaned_news', 'reason news')):
    recommendations[cleaned_col] = recommendations[raw_col].apply(remove_html)

In [71]:
def remove_urls_emails(text):
    """Delete URLs, e-mail addresses and @-mentions from `text`.

    Three substitutions run in order; each match is replaced by an empty
    string, so the whitespace around a removed token is left in place:

    1. runs of non-space characters starting with 'http' or 'www';
    2. tokens of the form ``something@something``;
    3. any remaining ``@word`` handles.
    """
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    without_emails = re.sub(r'\S+@\S+', '', without_urls)
    without_mentions = re.sub(r'@\w+', '', without_emails)
    return without_mentions

# Remove URLs, e-mail addresses and @-mentions from both cleaned text columns.
for column in ('cleaned_description', 'cleaned_news'):
    recommendations[column] = recommendations[column].apply(remove_urls_emails)

In [74]:
def remove_special_chars(text):
    """Keep only Cyrillic letters, Latin letters and whitespace in `text`.

    Every other character (digits, punctuation, symbols) is deleted without a
    replacement, so characters on either side of it become adjacent.
    """
    disallowed = re.compile(r'[^а-яА-ЯёЁa-zA-Z\s]')
    return disallowed.sub('', text)

# Drop everything except letters and whitespace from both cleaned text columns.
for column in ('cleaned_description', 'cleaned_news'):
    recommendations[column] = recommendations[column].apply(remove_special_chars)

In [75]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace in `text` into a single space.

    Leading and trailing whitespace is removed as well, because the text is
    split on whitespace and the resulting tokens are re-joined.
    """
    tokens = text.split()
    return ' '.join(tokens)

# Normalise whitespace in both cleaned text columns.
for column in ('cleaned_description', 'cleaned_news'):
    recommendations[column] = recommendations[column].apply(remove_extra_spaces)

**Text normalization**:

In [78]:
# Lower-case both cleaned text columns.
for column in ('cleaned_news', 'cleaned_description'):
    recommendations[column] = recommendations[column].str.lower()

In [79]:
# NLTK's Russian stop-word list, stored as a set for the membership test below.
stop_words = set(stopwords.words('russian'))

def remove_stopwords(text):
    """Remove the tokens of `text` that appear in `stop_words`.

    `text` is split on whitespace; the tokens that survive are returned joined
    by single spaces. The comparison is exact, so the caller is expected to
    have lower-cased the text already if the stop-word list is lower-case.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Filter stop words out of both cleaned text columns.
for column in ('cleaned_news', 'cleaned_description'):
    recommendations[column] = recommendations[column].apply(remove_stopwords)

In [85]:
# WordNet only covers English, so on its own it hands Russian tokens back
# unchanged and the lemmatization step was a no-op for this corpus.
# Tokens containing non-ASCII (i.e. Cyrillic) characters are therefore
# lemmatized with pymorphy2; ASCII-only tokens still go through WordNet.
lemmatizer = WordNetLemmatizer()
morph_analyzer = pymorphy2.MorphAnalyzer()

# token -> lemma; the same words recur constantly across the news windows,
# so each distinct token is analysed only once.
_lemma_cache = {}


def _lemmatize_word(word):
    """Return the lemma of a single token, memoised in `_lemma_cache`."""
    lemma = _lemma_cache.get(word)
    if lemma is None:
        if word.isascii():
            lemma = lemmatizer.lemmatize(word)
        else:
            # parse() returns the candidate analyses with the most probable
            # one first; its normal_form is the dictionary form of the word.
            lemma = morph_analyzer.parse(word)[0].normal_form
        _lemma_cache[word] = lemma
    return lemma


def lemmatize_text(text):
    """Split `text` on whitespace and return the list of token lemmas.

    Args:
        text: a string of whitespace-separated tokens.

    Returns:
        A list with one lemma per token, in the original order. Note that a
        list is returned, not a re-joined string.
    """
    return [_lemmatize_word(word) for word in text.split()]

# Lemmatize both cleaned text columns; each cell becomes a list of tokens.
for column in ('cleaned_news', 'cleaned_description'):
    recommendations[column] = recommendations[column].apply(lemmatize_text)

**Write the training set to a JSON file**:

In [91]:
# Training set: two parallel lists, one entry per recommendation row.
# 'news' holds the processed news text, 'recommendations' the processed
# recommendation text.
train_set = {
    'news': list(recommendations['cleaned_news']),
    'recommendations': list(recommendations['cleaned_description']),
}

In [92]:
# ensure_ascii=False writes the Cyrillic text as-is rather than as \uXXXX
# escapes, so the file must be opened with an explicit UTF-8 encoding.
# Without it the platform default is used, which can fail or produce a
# mis-encoded file on systems where that default is not UTF-8.
with open('/Users/alexanderknyshov/Desktop/LLM/Data/datasets/train_set.json', 'w', encoding='utf-8') as json_file:
    json.dump(train_set, json_file, ensure_ascii=False)