# Импорт модулей и библиотек

In [21]:
import pandas as pd
import numpy as np
import pickle
import datetime as d
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


import warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [58]:
def extraction_data(path_new_data: str, all_years_df = pd.DataFrame([])):

    # load data 2015-2021 years
    for year in range(2015, 2022):
        with open(f"Data/df_{year}_cleaned.pickle", "rb") as file:
            df_years = pickle.load(file)
        all_years_df = all_years_df.append(df_years, ignore_index=True)

    all_years_df.rename(columns={"deals": "amount"}, inplace=True)
    all_years_df.drop("date", axis=1, inplace=True)
    
    # load data 2022 years
    df_2022 = pd.read_csv(f"{path_new_data}", sep=";")

    # Some transforms for data 
    df_2022["datetime"] = pd.to_datetime(df_2022["timestamp"])
    df_2022["cf"] = -df_2022["amount"]*df_2022["price"]
    df_2022.rename(columns={"id": "user"}, inplace=True)

    df_2022 = df_2022.reindex(columns=["datetime", "ticker", "amount", "price", "user", "cf"])

    df_result = pd.concat([df_2022, all_years_df]).reset_index(drop = True)
    df_result["volume"] = abs(df_result["cf"])
    
    tick2ind = {ticker:i for i, ticker in enumerate(df_result.ticker.unique())}
    ind2tick = {val:key for key, val in tick2ind.items()}
    df_result["ticker_id"] = df_result["ticker"].apply(lambda x: tick2ind[x])

    user2ind = {user:i for i, user in enumerate(df_result.user.unique())}
    ind2user = {val:key for key, val in user2ind.items()}
    df_result["user_id"] = df_result["user"].apply(lambda x: user2ind[x])
    df_result.groupby(["ticker_id", "user_id", "datetime"], as_index=False)["volume"].sum()
    df_result["year"] = df_result["datetime"].apply(lambda row: row.year)

    return df_result.sort_values("datetime")

In [59]:
df_result = extraction_data("stock_market_trades.csv")

In [92]:
df_result.head(3)

Unnamed: 0,datetime,ticker,amount,price,user,cf,volume,ticker_id,user_id,year
2101107,2015-09-16 10:00:00,LKOH,5,2485.1,1_48198,-12425.5,12425.5,4,8851,2015
2118472,2015-09-16 10:00:00,SBER,100,75.12,1_48199,-7512.0,7512.0,23,8951,2015
1020902,2015-09-16 10:00:00,SBER,10,75.25,1_49493,-752.5,752.5,23,3355,2015


---------------------------------------------------------------------------------------------------------------------------

## train-test-split

### Создаем словарь (Дата сделки: сделка)

In [93]:
def date_deals(df, data):    
    
    """
    Функция получает на вход столбец датафрейма с датами сделок
    
    На выходе: словарь с ключами - Дата сделки, а значения - сами сделки
    """
    
    date_deal = {}
    dates=set(data.values)
    
    for d in dates:
        deals = df[(data==d)]
        if len(deals)==1:
            continue
        else:
            date_deal[d] = deals    
            
    return date_deal

# Train_test_split

In [94]:
def train_test_split(df, train_size=0.7, test_size=0.3):
    
    train = pd.DataFrame()
    test = pd.DataFrame()
    dict_date = date_deals(df, df["year"])
    
    for key in dict_date.keys():
        deals_count = len(dict_date[key])
        to_train = round(deals_count*train_size)
        to_test = round(deals_count*test_size)
            
        train = pd.concat([train, dict_date[key][:to_train]])
        test = pd.concat([test, dict_date[key][-to_test:]])
    return (train, test)

In [95]:
%%time
train, test = train_test_split(df_result, train_size=0.7, test_size=0.3)

CPU times: total: 3.66 s
Wall time: 3.46 s


In [96]:
train

Unnamed: 0,datetime,ticker,amount,price,user,cf,volume,ticker_id,user_id,year
3191991,2016-09-16 10:00:00,ALRS,-700,84.83000,1_83251,59381.0,59381.0,18,14426,2016
2622156,2016-09-16 10:00:00,VTBR,-10000,0.07508,1_82717,750.8,750.8,20,11582,2016
2645894,2016-09-16 10:00:00,SBER,50,149.15000,1_83181,-7457.5,7457.5,23,11701,2016
2815987,2016-09-16 10:00:00,RTKM,600,81.35000,1_84208,-48810.0,48810.0,36,12568,2016
2815986,2016-09-16 10:00:00,RTKM,300,81.35000,1_84208,-24405.0,24405.0,36,12568,2016
...,...,...,...,...,...,...,...,...,...,...
780330,2015-11-24 10:11:00,SBER,440,110.29000,1_57385,-48527.6,48527.6,23,2293,2015
1895554,2015-11-24 10:11:00,SBERP,300,79.90000,1_63385,-23970.0,23970.0,74,7835,2015
2015936,2015-11-24 10:11:00,GAZP,500,152.61000,1_52732,-76305.0,76305.0,15,8358,2015
1157299,2015-11-24 10:11:00,ROSN,10,275.50000,1_51444,-2755.0,2755.0,11,3985,2015


# Сбор матриц

In [14]:
def get_csr_matrix(df, user_col, ticker_col, add_col, csr_method):
    n_rows = df[user_col].nunique()
    n_cols = df[ticker_col].nunique()

    row = df[user_col] #user
    col = df[ticker_col] #ticker
    data = df[add_col].astype(float) #deals
    
    return csr_method((data, (row, col)))

In [30]:
csr_train = get_csr_matrix(train, "user_id", "ticker_id", "volume", csr_matrix)
csr_test = get_csr_matrix(test, "user_id", "ticker_id", "volume", csr_matrix)

# Обучение

In [105]:
from implicit.nearest_neighbours import CosineRecommender
from implicit.als import AlternatingLeastSquares

In [120]:
tick2ind = {ticker:i for i, ticker in enumerate(df_result.ticker.unique())}
ind2tick = {val:key for key, val in tick2ind.items()}
user2ind = {user:i for i, user in enumerate(df_result.user.unique())}
ind2user = {val:key for key, val in user2ind.items()}

Параметр K влияет на максимальную выдачу топа, поэтому N для таких моделей желательно указывать меньше K

In [121]:
def recommend_tick(user, model, train_matrix, N):
    user_id = user2ind[user]

    recs = model.recommend(user_id, train_matrix[user_id], N=N, filter_already_liked_items=True)
    
    res = [ind2tick[ticker_ids] for ticker_ids in recs[0]]
    return res

In [122]:
def find_similar(user, model, train_matrix, N):
    user_id = user2ind[user]

    recs = model.recommend(user_id, train_matrix[user_id], N=N, filter_already_liked_items=True)
    
    res = recs[1]
    return res

In [130]:
def get_recommenders(model, csr_matrix_train, csr_matrix_test, df_test, k=20, top_N = 10):
    
    fitting_model = model(K=k)
    fitting_model.fit(csr_matrix_train)

    df_reccomeds = pd.DataFrame({'user': df_test['user'].unique()})
    
    df_reccomeds['ticker'] = df_reccomeds['user'].apply(lambda x: 
                                                            recommend_tick(x, 
                                                                           fitting_model, 
                                                                           csr_matrix_train, 
                                                                           top_N))
    df_reccomeds['similar'] = df_reccomeds['user'].apply(lambda x: find_similar(x, 
                                                                                fitting_model, 
                                                                                csr_matrix_train, 
                                                                                top_N))
    
    return df_reccomeds

In [131]:
df_reccom = get_recommenders(CosineRecommender, csr_train, csr_test, test)

  0%|          | 0/161 [00:00<?, ?it/s]

In [132]:
df_reccom

Unnamed: 0,user,ticker,similar
0,1_95110,"[OGKB, LSRG, PLZL, HYDR, PHOR, GAZP, TRMK, MRK...","[80610.5551978891, 69833.6842855536, 66785.783..."
1,1_97230,"[SIBN, AFKS, SBERP, SBCS, RU000A103C46, FIVE, ...","[50734.760950786374, 27781.253792147923, 25735..."
2,1_98823,"[LSRG, TRNFP, RASP, NVTK, SIBN, GAZP, PHOR, NL...","[7043.846775400336, 5624.598337292298, 4331.24..."
3,1_88898,"[VTBR, GAZP, LSRG, SBERP, PLZL, MSNG, MTLRP, T...","[3219690.467156529, 2564815.6398403235, 254351..."
4,1_97360,"[OGKB, SBERP, LSRG, VTBR, LNTA, PLZL, GAZP, RA...","[2865.6347198591807, 2855.6710996774814, 2338...."
...,...,...,...
30743,1_62317,"[PHOR, PLZL, RASP, TRNFP, LSRG, SBERP, GAZP, N...","[759714.234366598, 730090.0655581742, 638938.6..."
30744,1_59833,"[HYDR, VTBR, GAZP, RSTI, LSRG, MTLRP, MSNG, SB...","[2160169.3176838127, 2014152.9872685012, 13311..."
30745,1_48758,"[PIKK, SPBE, LSRG, LNTA, TRNFP, OGKB, RASP, SI...","[34288.446002821176, 33436.1767391402, 32747.8..."
30746,1_59684,"[MGNT, GAZP, VTBR, OGKB, PLZL, LSRG, RSTI, MTL...","[5420.354458979919, 4245.435030422089, 4198.58..."


In [112]:
top_N = 10
df_recs_cossim = pd.DataFrame({'user': test['user'].unique()})

df_recs_cossim['ticker'] = df_recs_cossim['user'].apply(lambda x: 
                                                        recommend_tick(x, cosine_model, csr_train, top_N))

df_recs_cossim['similar'] = df_recs_cossim['user'].apply(lambda x: find_similar(x, cosine_model, csr_train, top_N))
df_recs_cossim.head()

Unnamed: 0,user,ticker,similar
0,1_95110,"[OGKB, LSRG, PLZL, HYDR, PHOR, GAZP, TRMK, MRK...","[80610.5551978891, 69833.6842855536, 66785.783..."
1,1_97230,"[SIBN, AFKS, SBERP, SBCS, RU000A103C46, FIVE, ...","[50734.760950786374, 27781.253792147923, 25735..."
2,1_98823,"[LSRG, TRNFP, RASP, NVTK, SIBN, GAZP, PHOR, NL...","[7043.846775400336, 5624.598337292298, 4331.24..."
3,1_88898,"[VTBR, GAZP, LSRG, SBERP, PLZL, MSNG, MTLRP, T...","[3219690.467156529, 2564815.6398403235, 254351..."
4,1_97360,"[OGKB, SBERP, LSRG, VTBR, LNTA, PLZL, GAZP, RA...","[2865.6347198591807, 2855.6710996774814, 2338...."
