# Импорт модулей и библиотек

In [53]:
import pandas as pd
import numpy as np
import pickle
import datetime as d
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [54]:
def extraction_data(path_new_data: str, all_years_df = pd.DataFrame([])):

    # load data 2015-2021 years
    for year in range(2015, 2022):
        with open(f"Data/df_{year}_cleaned.pickle", "rb") as file:
            df_years = pickle.load(file)
        all_years_df = all_years_df.append(df_years, ignore_index=True)

    all_years_df.rename(columns={"deals": "amount"}, inplace=True)
    all_years_df.drop("date", axis=1, inplace=True)
    
    # load data 2022 years
    df_2022 = pd.read_csv(f"{path_new_data}", sep=";")

    # Some transforms for data 
    df_2022["datetime"] = pd.to_datetime(df_2022["timestamp"])
    df_2022["cf"] = -df_2022["amount"]*df_2022["price"]
    df_2022.rename(columns={"id": "user"}, inplace=True)

    df_2022 = df_2022.reindex(columns=["datetime", "ticker", "amount", "price", "user", "cf"])

    df_result = pd.concat([df_2022, all_years_df]).reset_index(drop = True)
    df_result["volume"] = abs(df_result["cf"])
    
    tick2ind = {ticker:i for i, ticker in enumerate(df_result.ticker.unique())}
    ind2tick = {val:key for key, val in tick2ind.items()}
    df_result["ticker_id"] = df_result["ticker"].apply(lambda x: tick2ind[x])

    user2ind = {user:i for i, user in enumerate(df_result.user.unique())}
    ind2user = {val:key for key, val in user2ind.items()}
    df_result["user_id"] = df_result["user"].apply(lambda x: user2ind[x])
    df_result.groupby(["ticker_id", "user_id", "datetime"], as_index=False)["volume"].sum()
    df_result["year"] = df_result["datetime"].apply(lambda row: row.year)
    
    df_result.sort_values("datetime", inplace = True)
    return df_result.reset_index(drop = True)

In [55]:
df_result = extraction_data("stock_market_trades.csv")

In [105]:
df_result.head(5)

Unnamed: 0,datetime,ticker,amount,price,user,cf,volume,ticker_id,user_id,year
0,2015-09-16 10:00:00,LKOH,5,2485.1,1_48198,-12425.5,12425.5,4,8851,2015
1,2015-09-16 10:00:00,SBER,100,75.12,1_48199,-7512.0,7512.0,23,8951,2015
2,2015-09-16 10:00:00,SBER,10,75.25,1_49493,-752.5,752.5,23,3355,2015
3,2015-09-16 10:00:00,LKOH,38,2485.1,1_48198,-94433.8,94433.8,4,8851,2015
4,2015-09-16 10:00:00,SBER,-200,75.23,1_48670,15046.0,15046.0,23,7243,2015


## train-test-split

### Создаем словарь (Дата сделки: сделка)

In [59]:
def date_deals(df, data):    
    
    """
    Функция получает на вход столбец датафрейма с датами сделок
    
    На выходе: словарь с ключами - Дата сделки, а значения - сами сделки
    """
    
    date_deal = {}
    dates=set(data.values)
    
    for d in dates:
        deals = df[(data==d)]
        if len(deals)==1:
            continue
        else:
            date_deal[d] = deals    
            
    return date_deal

# Train_test_split

In [60]:
def train_test_split(df, train_size=0.7, test_size=0.3):
    
    train = pd.DataFrame()
    test = pd.DataFrame()
    dict_date = date_deals(df, df["year"])
    
    for key in dict_date.keys():
        deals_count = len(dict_date[key])
        to_train = round(deals_count*train_size)
        to_test = round(deals_count*test_size)
            
        train = pd.concat([train, dict_date[key][:to_train]])
        test = pd.concat([test, dict_date[key][-to_test:]])
    return (train, test)

In [61]:
%%time
train, test = train_test_split(df_result, train_size=0.7, test_size=0.3)

Wall time: 2.47 s


In [62]:
test.head()

Unnamed: 0,datetime,ticker,amount,price,user,cf,volume,ticker_id,user_id,year
2273137,2016-11-29 11:04:00,SBER,10,160.0,1_95110,-1600.0,1600.0,23,12969,2016
2273138,2016-11-29 11:04:00,GAZP,50,149.46,1_97230,-7473.0,7473.0,15,14006,2016
2273139,2016-11-29 11:04:00,SBER,10,160.0,1_95110,-1600.0,1600.0,23,12969,2016
2273140,2016-11-29 11:04:00,GAZP,20,149.46,1_97230,-2989.2,2989.2,15,14006,2016
2273141,2016-11-29 11:04:00,GAZP,10,149.46,1_97230,-1494.6,1494.6,15,14006,2016


In [63]:
train.head()

Unnamed: 0,datetime,ticker,amount,price,user,cf,volume,ticker_id,user_id,year
1424494,2016-09-16 10:00:00,ALRS,-700,84.83,1_83251,59381.0,59381.0,18,14426,2016
1424495,2016-09-16 10:00:00,VTBR,-10000,0.07508,1_82717,750.8,750.8,20,11582,2016
1424496,2016-09-16 10:00:00,SBER,50,149.15,1_83181,-7457.5,7457.5,23,11701,2016
1424497,2016-09-16 10:00:00,RTKM,600,81.35,1_84208,-48810.0,48810.0,36,12568,2016
1424498,2016-09-16 10:00:00,RTKM,300,81.35,1_84208,-24405.0,24405.0,36,12568,2016


# Сбор матриц

In [40]:
def get_csr_matrix(df, user_col, ticker_col, add_col, csr_method):
    n_rows = df[user_col].nunique()
    n_cols = df[ticker_col].nunique()

    row = df[user_col] #user
    col = df[ticker_col] #ticker
    data = df[add_col].astype(float) #deals
    
    return csr_method((data, (row, col)))

In [68]:
csr_train = get_csr_matrix(train, "user_id", "ticker_id", "volume", csr_matrix)
csr_test = get_csr_matrix(test, "user_id", "ticker_id", "volume", csr_matrix)

# Обучение

In [98]:
#!pip install implicit
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from implicit.als import AlternatingLeastSquares
from implicit.lmf import LogisticMatrixFactorization
from implicit.bpr import BayesianPersonalizedRanking

In [70]:
tick2ind = {ticker:i for i, ticker in enumerate(df_result.ticker.unique())}
ind2tick = {val:key for key, val in tick2ind.items()}
user2ind = {user:i for i, user in enumerate(df_result.user.unique())}
ind2user = {val:key for key, val in user2ind.items()}

Параметр K влияет на максимальную выдачу топа, поэтому N для таких моделей желательно указывать меньше K

In [73]:
def recommend_tick(user, model, train_matrix, N):
    user_id = user2ind[user]

    recs = model.recommend(user_id, train_matrix[user_id], N=N, filter_already_liked_items=True)
    
    res = [ind2tick[ticker_ids] for ticker_ids in recs[0]]
    return res

In [74]:
def find_similar(user, model, train_matrix, N):
    user_id = user2ind[user]

    recs = model.recommend(user_id, train_matrix[user_id], N=N, filter_already_liked_items=True)
    
    res = recs[1]
    return res

In [93]:
def get_recommenders(model, csr_matrix_train, csr_matrix_test, df_test, 
                     k=20, factors=40, regularization=0.1, iterations=20,
                     top_N = 10):
    
    if model == CosineRecommender or model == BM25Recommender or model == TFIDFRecommender or :
        fitting_model = model(K=k)
    elif model == AlternatingLeastSquares or model == LogisticMatrixFactorization or model == BayesianPersonalizedRanking:
        fitting_model = model(factors=factors, regularization=regularization, iterations=iterations)
    
    fitting_model.fit(csr_matrix_train)

    df_reccomeds = pd.DataFrame({'user': df_test['user'].unique()})
    
    df_reccomeds['ticker'] = df_reccomeds['user'].apply(lambda x: 
                                                            recommend_tick(x, 
                                                                           fitting_model, 
                                                                           csr_matrix_train, 
                                                                           top_N))
    df_reccomeds['similar'] = df_reccomeds['user'].apply(lambda x: find_similar(x, 
                                                                                fitting_model, 
                                                                                csr_matrix_train, 
                                                                                top_N))
    
    return df_reccomeds

In [100]:
models = {"cosine": CosineRecommender,
          "bm25": BM25Recommender, 
          "tfidf": TFIDFRecommender, 
          "als": AlternatingLeastSquares,
          "lmf": LogisticMatrixFactorization,
          "bpr": BayesianPersonalizedRanking}

In [84]:
df_reccom = get_recommenders(models["cosine"], csr_train, csr_test, test)

  0%|          | 0/161 [00:00<?, ?it/s]

In [82]:
df_reccom

Unnamed: 0,user,ticker,similar
0,1_95110,"[OGKB, LSRG, PLZL, HYDR, PHOR, GAZP, TRMK, MRK...","[80610.5551978891, 69833.6842855536, 66785.783..."
1,1_97230,"[SIBN, AFKS, SBERP, SBCS, RU000A103C46, FIVE, ...","[50734.760950786374, 27781.253792147923, 25735..."
2,1_98823,"[LSRG, TRNFP, RASP, NVTK, SIBN, GAZP, PHOR, NL...","[7043.846775400336, 5624.598337292298, 4331.24..."
3,1_88898,"[VTBR, GAZP, LSRG, SBERP, PLZL, MSNG, MTLRP, T...","[3219690.467156529, 2564815.6398403235, 254351..."
4,1_97360,"[OGKB, SBERP, LSRG, VTBR, LNTA, PLZL, GAZP, RA...","[2865.6347198591807, 2855.6710996774814, 2338...."
...,...,...,...
30743,1_62317,"[PHOR, PLZL, RASP, TRNFP, LSRG, SBERP, GAZP, N...","[759714.234366598, 730090.0655581742, 638938.6..."
30744,1_59833,"[HYDR, VTBR, GAZP, RSTI, LSRG, MTLRP, MSNG, SB...","[2160169.3176838127, 2014152.9872685012, 13311..."
30745,1_48758,"[PIKK, SPBE, LSRG, LNTA, TRNFP, OGKB, RASP, SI...","[34288.446002821176, 33436.1767391402, 32747.8..."
30746,1_59684,"[MGNT, GAZP, VTBR, OGKB, PLZL, LSRG, RSTI, MTL...","[5420.354458979919, 4245.435030422089, 4198.58..."


In [89]:
df_reccom = get_recommenders(models["als"], csr_train, csr_test, test)

  0%|          | 0/20 [00:00<?, ?it/s]

In [90]:
df_reccom 

Unnamed: 0,user,ticker,similar
0,1_95110,"[FXRU, OGKB, AFKS, FIXP, RTKM, SPBC, ALRS, RST...","[1.0917088, 1.0667922, 1.0626063, 1.032947, 1...."
1,1_97230,"[AFKS, MAGN, VKCO, NVTK, HYDR, RSTI, FXRU, LNT...","[0.940866, 0.9121953, 0.89962715, 0.89571303, ..."
2,1_98823,"[AFKS, FXRU, LNTA, FIXP, RU000A103C46, OGKB, M...","[1.0703063, 1.0317894, 1.0099193, 1.0075343, 1..."
3,1_88898,"[FIXP, RSTI, SPBC, SNGS, FXIP, SBERP, MSNG, VK...","[1.0118275, 1.0114672, 1.010182, 1.0096433, 1...."
4,1_97360,"[FXRU, RTKM, INGR, OGKB, MAGN, SNGS, AFKS, AKM...","[0.7908582, 0.758466, 0.7529734, 0.7396774, 0...."
...,...,...,...
30743,1_62317,"[AFKS, FXRU, MAGN, RTKM, SUGB, FIXP, NVTK, GAZ...","[1.1376191, 1.1046456, 1.0560594, 1.0449603, 1..."
30744,1_59833,"[FXRU, AFKS, MTLRP, SNGS, RTKM, MSNG, MTLR, CH...","[0.97107136, 0.94731086, 0.9318914, 0.9224819,..."
30745,1_48758,"[FXRU, AFKS, VKCO, NVTK, SBERP, OGKB, MAGN, IN...","[1.0533332, 1.0400494, 1.0256319, 1.0238245, 1..."
30746,1_59684,"[AFKS, ALRS, VKCO, FXRU, SBERP, MAGN, TATNP, P...","[1.0514574, 1.020128, 1.0013571, 0.9972016, 0...."


In [101]:
df_reccom = get_recommenders(models["lmf"], csr_train, csr_test, test)

  0%|          | 0/20 [00:00<?, ?it/s]

In [102]:
df_reccom 

Unnamed: 0,user,ticker,similar
0,1_95110,"[TATN, LKOH, CHMF, FEES, CBOM, AQUA, ORUP, RU0...","[28.06086, 23.65461, 23.101477, 22.574347, 21...."
1,1_97230,"[MGNT, SBCS, YNDX, SBRB, SBER, RU000A0ZYEE5, E...","[17.900822, 17.606106, 17.53485, 17.245491, 16..."
2,1_98823,"[PHOR, AKRN, MRKP, TGKA, RCMX, RNFT, FIXP, HYD...","[27.147085, 25.182655, 23.889774, 23.636723, 2..."
3,1_88898,"[SU26212RMFS9, TATN, TCSG, LNTA, ETLN, POLY, F...","[32.40721, 27.503094, 27.418756, 26.377956, 26..."
4,1_97360,"[RU000A0ZYU05, RU000A0JXE06, SFTL, TGKA, ESGR,...","[32.145336, 30.01079, 27.195896, 26.267992, 23..."
...,...,...,...
30743,1_62317,"[LNZL, RU000A103KG4, FIVE, NMTP, NLMK, TRNFP, ...","[25.096924, 24.347256, 24.321465, 24.286163, 2..."
30744,1_59833,"[BCSB, SPBC, MSNG, UPRO, YNDX, HYDR, PHOR, LNT...","[36.167725, 33.123047, 32.756016, 32.326645, 3..."
30745,1_48758,"[AFKS, AMEZ, INFL, RU000A0JXE06, RU000A0JXFS8,...","[28.592365, 24.570608, 23.832323, 22.887472, 2..."
30746,1_59684,"[SMLT, SFTL, TGKA, ENRU, RU000A103C46, SBCS, O...","[28.492113, 27.947533, 26.331001, 24.432625, 2..."
