<a href="https://colab.research.google.com/github/efearase/RL_with_sentiment/blob/main/pulse_parsin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -r requirements.txt





In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from httpx import ReadTimeout
from tpulse import TinkoffPulse
from httpx import HTTPStatusError
import time
import os
import requests
from IPython.display import clear_output
from pprint import pp
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel

In [8]:
names = pd.read_csv('Export_ru_securities-list_20240129.csv', encoding='windows-1251', delimiter=';', quotechar='"')

In [9]:
codes = names[((names['LIST_SECTION']=='Первый уровень')) & (names['SUPERTYPE']=='Акции')]['TRADE_CODE'].dropna()

In [5]:
date = pd.to_datetime(time.time(), unit='s')
date

Timestamp('2024-03-20 13:35:53.905000960')

In [11]:
def datareader_optimized(ticker):
    cursor = 999999999
    t_0 = time.time()
    KEYS = ["id", "nickname", "content", "inserted"]
    date = pd.to_datetime(time.time(), unit='s').tz_localize('UTC')
    fin_date = pd.to_datetime('2022-03-29').tz_localize('UTC')
    pulse = TinkoffPulse()  # Assuming this is defined elsewhere

    # Initialize an empty DataFrame with specified types if possible
    base_df = pd.DataFrame(columns=["id", "nickname", "text", "inserted"])

    while date >= fin_date:
        try:
            if time.time() - t_0 > 1:
                print(f"{date}, {ticker}")
                clear_output(wait=True)
                t_0 = time.time()
            response = pulse.get_posts_by_ticker(ticker, cursor)
            cursor = response["nextCursor"]
            posts = response["items"]

            chunk_data = []
            for post in posts:
                data = {key: post[key] for key in KEYS if key in post}
                data['text'] = data['content'].get('text', None) if isinstance(data.get('content'), dict) else None
                data['inserted'] = pd.to_datetime(data['inserted']).strftime('%Y-%m-%d')
                del data['content']
                chunk_data.append(data)
                date = pd.to_datetime(post['inserted'])
            if chunk_data:
                chunk_df = pd.DataFrame(chunk_data)
                base_df = pd.concat([base_df, chunk_df], ignore_index=True)

        except (HTTPStatusError, TimeoutError, KeyError, ReadTimeout, NameError, TypeError) as e:
            print(f"Encountered an error: {e}. Continuing...")
            date = pd.to_datetime(date.timestamp() - 24 * 60 * 60, unit = 's').tz_localize('UTC')
            pass

        time.sleep(0.25)

    base_df.to_csv(f"data/text_{ticker}.csv", index=False)
    users = base_df['nickname'].unique()

    return users

In [13]:
users_set = set()

In [11]:
def check(name):
    for file in os.listdir('data'):
        if name in file:
            return 1
    return 0

In [None]:
  # Use a set to automatically handle unique entries
for code in codes:
    if not check(code):  # Assuming check is a function defined elsewhere
        new_users = datareader_optimized(code)  # Collect new users from the optimized function
        users_set.update(new_users)  # Efficiently add new users, automatically avoiding duplicates

users = list(users_set)

In [14]:
pulse = TinkoffPulse()

In [16]:
users_set = set()
for file in os.listdir('text_data/'):
    df = pd.read_csv(f"text_data/{file}")
    new_users = df['nickname'].unique()
    users_set.update(new_users)

In [17]:
users_set

{'Melhior94',
 'Trade_15let',
 'blaaawka',
 'Traktorist_III_klassa',
 'rakolov',
 'Nabotov',
 'Alim82',
 'Denis__Morozov',
 'Ovepfunkep',
 'liolia604',
 'Ann_Blacksea',
 'rkfljbcrfntkm',
 'gidravlic',
 'DmitriyPuls',
 'Machinka',
 'Emvashdepo',
 'cetronc',
 'GetBackBrentTo100',
 'Kohafe',
 'Ksycha777',
 'vseema',
 'leki7',
 'SergeyTkache',
 'SRM6',
 'Lanc',
 'Evg.krs',
 'Kaupibarya',
 'booooooo',
 'Diamond19',
 'MoyPerviyMillion',
 'Travel_and_Live',
 'ErrareHumanumEst',
 'CXG22',
 'PiratFX',
 'energizzzzer',
 'ALSANCHO',
 'serjmat',
 'korsean',
 'my_name_andrew',
 'knsv',
 'ProstoyTrader',
 'Igor_46_',
 'sweetlhare',
 'alex1823',
 'Project_A',
 'Pohmelye.AUF',
 'Skroogee',
 'skurd',
 'RauiPauraui',
 'tsvet',
 '14_Invest',
 'Dr.Waclaw',
 'boss_babloss77',
 'FinchZar',
 'Itormi',
 'Zhukovka33',
 'NachalnikUpravleniya',
 'Basilio_n',
 'SSV1',
 '_No_Limit_',
 'ytko1988',
 'zeelT',
 'Vargvard',
 'Caty',
 'Mawlikiev1981',
 'mariya.456',
 'iksi_neizbezni_2',
 'Reiter88',
 'qcorntts',
 'Bryce

In [23]:
retries = 5  # Maximum number of retries
backoff_factor = 1  # Initial backoff duration in seconds
followers = []
yields = []

for user in users_set:
    print(f"progress: {round(len(followers) / len(users_set) * 100, 2)}%")
    clear_output(wait=True)
    attempt = 0
    while attempt < retries:
        try:
            user_info = pulse.get_user_info(user)
            if user_info is not None:
                if 'statistics' in user_info.keys():
                    followers.append(user_info['followersCount'])
                    yields.append(user_info['statistics']['yearRelativeYield'])
                else:
                    followers.append(0)
                    yields.append(0)
            else:
                followers.append(0)
                yields.append(0)
            break  # If the request was successful, exit the loop
        except (ReadTimeout, ValueError, TimeoutError, NameError, KeyError, HTTPStatusError) as e:
            if isinstance(e, HTTPStatusError) and e.response.status_code == 429:
                # Calculate wait time with exponential backoff
                wait_time = backoff_factor * (2 ** attempt)
                print(f"Rate limit exceeded, retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                attempt += 1
            else:
                # For other errors, append 0 and move to the next user
                followers.append(0)
                yields.append(0)
                break  # Exit the loop for non-429 errors
        if attempt == retries:
            # If maximum retries reached without success, append 0s
            followers.append(0)
            yields.append(0)


progress: 100.0%


In [24]:
users_info = pd.DataFrame({'users': list(users_set), 'yields': yields, 'followers': followers})

users_info.to_csv("users_info.csv")

In [28]:
len(followers), len(yields), len(users_set)

(1973, 1973, 55509)

In [None]:
users_info.to_csv("users_info.csv")