In [8]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Load the book-copy ("samples") table from a tab-separated file.
df_samples = pd.read_csv('data/books_samples_.csv', sep='\t')

In [3]:
# Preview the first ten rows of the raw samples table.
df_samples.head(10)

Unnamed: 0,Идентификатор экземпляра,ИД Каталожной записи,Инвентарный номер,Штрих-код,Раздел знаний,Идентификатор сиглы
0,12488544,1,09:0000120386,980007854120,84(2Рос=Рус)6,314.0
1,12488546,1,09:0000152427,980010247056,84(2Рос=Рус)6,320.0
2,12488542,1,09:0000240622,980000737154,84(2Рос=Рус)6,306.0
3,13671389,1,09:0000278133,980005475303,84(2Рос=Рус)6,308.0
4,7625486,1,05:0000257728,580001931461,84(2Рос=Рус)6,293.0
5,7625475,1,05:0000002639,580000155080,84(2Рос=Рус)6,272.0
6,1375296,1,10:0000024099,1081000042350,84(2Рос)6,51.0
7,11840575,1,11:0000049164,1100000491644,84(2Рос)6,196.0
8,11840570,1,11:0000049159,1100000491590,84(2Рос)6,191.0
9,11840578,1,11:0000049168,1100000491682,84(2Рос)6,197.0


In [4]:
# Replace the original headers with short English names, then keep only the
# identifiers that are used further down (sample, catalogue record, inventory number).
df_samples.columns = [
    'sample_id',
    'catalog_id',
    'inventory_num',
    'barcode',
    'section_knowledge',
    'sigla_id',
]
df_samples = df_samples.drop(columns=['barcode', 'section_knowledge', 'sigla_id'])

In [5]:
# Preview the samples table after renaming and dropping columns.
df_samples.head()

Unnamed: 0,sample_id,catalog_id,inventory_num
0,12488544,1,09:0000120386
1,12488546,1,09:0000152427
2,12488542,1,09:0000240622
3,13671389,1,09:0000278133
4,7625486,1,05:0000257728


In [6]:
# Load the loan ("give") records from a tab-separated file; dtype='object'
# reads every column with object dtype instead of inferring numeric types.
df_give = pd.read_csv('data/books_give_.csv', sep='\t', dtype='object')

In [7]:
# Assign short English column names, then discard the barcode and the
# give/return dates, which are not used in the rest of the notebook.
df_give.columns = [
    'give_id',
    'reader_id',
    'inventory_num',
    'barcode',
    'date_give',
    'date_return',
    'state_id',
]
df_give = df_give.drop(columns=['barcode', 'date_give', 'date_return'])

In [8]:
# Preview the loans table after renaming and dropping columns.
df_give.head()

Unnamed: 0,give_id,reader_id,inventory_num,state_id
0,1.0,179.0,03:0000005767,6545.0
1,2.0,179.0,03:0000005761,6548.0
2,3.0,179.0,03:0000005767,6545.0
3,4.0,179.0,03:0000005767,6545.0
4,5.0,179.0,03:0000005767,6548.0


In [9]:
# Load the catalogue records from a tab-separated file.
df_catalog = pd.read_csv('data/books_catalog_.csv', sep='\t')

In [10]:
# Remove the catalogue fields that are not used in the rest of the notebook.
df_catalog = df_catalog.drop(
    columns=[
        'p260a',
        'p260b',
        'p084a',
        'p521a',
    ]
)

In [11]:
# Show five randomly chosen catalogue rows (no random_state is passed, so the
# selection differs between runs).
df_catalog.sample(5)

Unnamed: 0,doc_id,p100a,p245a,p260c,p490a,p650a
1078111,1417733,Чехов Антон Павлович,Полное собрание сочинений и писем. Сочинения,1946,,Художественная литература
727589,941369,Зимняя Ирина Алексеевна,Педагогическая психология,2009,Новая университетская библиотека,Психология труда и профессиональной деятельности
1164633,1547307,Соловьев Сергей Михайлович,История России с древнейших времен,1966,,
1362104,1774629,Фрай Макс,Болтливый мертвец,2019,Лабиринты Ехо,Художественная литература ; Российская проза
1024053,1320509,,Шедевры русской живописи,2009,,Живопись


In [12]:
# Join the four descriptive text columns into one space-separated string per
# record (missing values count as empty strings), turn tabs into spaces,
# trim the ends and lower-case the result. The source columns are then removed.
df_catalog['s'] = (
    df_catalog['p100a'].fillna('')
    .str.cat(
        [df_catalog[col].fillna('') for col in ('p245a', 'p490a', 'p650a')],
        sep=' ',
    )
    .str.replace('\t', ' ')
    .str.strip()
    .str.lower()
)
df_catalog = df_catalog.drop(columns=['p100a', 'p245a', 'p490a', 'p650a'])

In [13]:
import re

In [14]:
def calc_book_age(x, current_year=2020):
    """Return a book's age in years from its raw publication-date value.

    The first run of four consecutive digits in ``x`` is taken as the
    publication year; everything around it is ignored. Taking the first
    four-digit run (instead of gluing every digit together) keeps values
    such as ``'1998-2000'`` or ``'2-е изд. 1999'`` from turning into
    nonsensical years like 19982000 or 21999.

    Parameters
    ----------
    x : str or number
        Raw publication-date value. Non-string input is converted with
        ``str()`` before searching.
    current_year : int, default 2020
        Year the age is measured against.

    Returns
    -------
    int or float
        ``current_year - year``, or ``np.nan`` when ``x`` contains no
        four-digit run.
    """
    year_match = re.search(r'\d{4}', str(x))
    if year_match is None:
        return np.nan
    return current_year - int(year_match.group())

In [15]:
# Replace the raw publication-date values with the book age in years.
# na_action='ignore' passes missing values through untouched, so no mask has to
# be built twice, and the explicit cast guarantees a numeric column (rather than
# a mixed object column) for the median computed afterwards.
df_catalog['p260c'] = (
    df_catalog['p260c']
    .map(calc_book_age, na_action='ignore')
    .astype('float64')
)

In [16]:
# Books without a usable age get the median age of the catalogue.
df_catalog['p260c'] = df_catalog['p260c'].pipe(
    lambda ages: ages.fillna(ages.median())
)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# TF-IDF vectorizer capped at 1000 features, built over unigrams and bigrams.
tf_idf = TfidfVectorizer(max_features=1000, ngram_range=(1,2))

In [19]:
%%time
# Fit the vectorizer on the combined catalogue text and build the TF-IDF matrix.
tf_emb = tf_idf.fit_transform(df_catalog['s'])

Wall time: 36.3 s


In [330]:
# Display the repr of the TF-IDF matrix.
tf_emb

<1370457x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 8708180 stored elements in Compressed Sparse Row format>

In [213]:
# Load the readers table from a tab-separated file.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index, so it
# probably has no effect here; the birth-date column is converted explicitly
# in a later cell. Confirm and consider dropping the argument.
df_readers = pd.read_csv('data/books_readers_.csv', sep='\t', parse_dates=True)

In [214]:
# Give the two reader columns short English names.
df_readers = df_readers.set_axis(['reader_id', 'reader_birth'], axis='columns')

In [215]:
# Parse birth dates written as YYYY-MM-DD; anything unparseable becomes NaT
# and is then replaced with the mean of the valid birth dates.
df_readers['reader_birth'] = (
    pd.to_datetime(
        df_readers['reader_birth'],
        format='%Y-%m-%d',
        errors='coerce',
    )
    .pipe(lambda births: births.fillna(births.mean()))
)

In [216]:
# Number of whole days between each reader's birth date and 2020-11-01.
df_readers['age'] = (
    pd.Timestamp('2020-11-01') - df_readers['reader_birth']
).dt.days

In [217]:
# Convert the elapsed days into whole years. A calendar year averages 365.25
# days; dividing by 360 overstated every age by roughly 1.4% (about one year
# for a 70-year-old reader). The result is a float column of whole numbers.
# Note: this cell rescales 'age' in place, so re-running it on its own divides
# a second time; re-run the cell that computes the day count first.
df_readers['age'] = df_readers['age'] // 365.25

In [218]:
# The birth date is no longer needed once the age has been derived.
df_readers = df_readers.drop(columns='reader_birth')

In [26]:
# Display the loans frame.
df_give

Unnamed: 0,give_id,reader_id,inventory_num,state_id
0,1.0,179.0,03:0000005767,6545.0
1,2.0,179.0,03:0000005761,6548.0
2,3.0,179.0,03:0000005767,6545.0
3,4.0,179.0,03:0000005767,6545.0
4,5.0,179.0,03:0000005767,6548.0
...,...,...,...,...
10421155,11423388.0,566211.0,09:0000922563,6544.0
10421156,11423389.0,110298.0,08:0000227792,6544.0
10421157,11423390.0,110298.0,Б08:00001928,6544.0
10421158,11423391.0,110298.0,Б08:00027425,6544.0


In [27]:
# Start the working frame from the loans table: replace the raw state_id column
# with one indicator column per state value (column names are prefixed 'state__').
df = df_give.drop(columns='state_id').join(
    pd.get_dummies(df_give['state_id'], prefix='state_')
)

In [28]:
# Preview the working frame after one-hot encoding the state.
df.head()

Unnamed: 0,give_id,reader_id,inventory_num,state__6544.0,state__6545.0,state__6546.0,state__6547.0,state__6548.0,state__6549.0,state__6659.0
0,1.0,179.0,03:0000005767,0,1,0,0,0,0,0
1,2.0,179.0,03:0000005761,0,0,0,0,1,0,0
2,3.0,179.0,03:0000005767,0,1,0,0,0,0,0
3,4.0,179.0,03:0000005767,0,1,0,0,0,0,0
4,5.0,179.0,03:0000005767,0,0,0,0,1,0,0


In [29]:
%%time
# reader_id was read as text such as '179.0'; go through float to reach int64 so
# that it matches the key in df_readers, then attach each reader's age.
df['reader_id'] = df['reader_id'].astype(float).astype('int64')
df = df.merge(df_readers, how='left', on='reader_id')

Wall time: 4.52 s


In [30]:
# Preview the working frame after attaching the reader age.
df.head()

Unnamed: 0,give_id,reader_id,inventory_num,state__6544.0,state__6545.0,state__6546.0,state__6547.0,state__6548.0,state__6549.0,state__6659.0,age
0,1.0,179,03:0000005767,0,1,0,0,0,0,0,31.0
1,2.0,179,03:0000005761,0,0,0,0,1,0,0,31.0
2,3.0,179,03:0000005767,0,1,0,0,0,0,0,31.0
3,4.0,179,03:0000005767,0,1,0,0,0,0,0,31.0
4,5.0,179,03:0000005767,0,0,0,0,1,0,0,31.0


In [31]:
%%time
# Attach the sample and catalogue identifiers to each loan via the inventory
# number; loans without a matching sample keep NaN in the new columns.
df = df.merge(df_samples, how='left', on='inventory_num')

Wall time: 19.7 s


In [32]:
# Preview the working frame after attaching the sample and catalogue identifiers.
df.head()

Unnamed: 0,give_id,reader_id,inventory_num,state__6544.0,state__6545.0,state__6546.0,state__6547.0,state__6548.0,state__6549.0,state__6659.0,age,sample_id,catalog_id
0,1.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0
1,2.0,179,03:0000005761,0,0,0,0,1,0,0,31.0,30236.0,5185.0
2,3.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0
3,4.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0
4,5.0,179,03:0000005767,0,0,0,0,1,0,0,31.0,30266.0,5190.0


In [33]:
%%time
# Cast the catalogue key to float64 so it is compared as the same type as
# catalog_id in the working frame, then pull in the catalogue columns for every loan.
df_catalog = df_catalog.astype({'doc_id': 'float64'})
df = df.merge(
    df_catalog,
    how='left',
    left_on='catalog_id',
    right_on='doc_id',
)

Wall time: 7.3 s


In [34]:
# doc_id duplicates catalog_id after the merge, so it can go.
df = df.drop(columns='doc_id')

In [323]:
# Preview the working frame with the catalogue columns attached.
df.head()

Unnamed: 0,give_id,reader_id,inventory_num,state__6544.0,state__6545.0,state__6546.0,state__6547.0,state__6548.0,state__6549.0,state__6659.0,age,sample_id,catalog_id,p260c,s
0,1.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0,20.0,семенова мария васильевна год людоеда художес...
1,2.0,179,03:0000005761,0,0,0,0,1,0,0,31.0,30236.0,5185.0,21.0,сказки о солдате библиотека русской сказки худ...
2,3.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0,20.0,семенова мария васильевна год людоеда художес...
3,4.0,179,03:0000005767,0,1,0,0,0,0,0,31.0,30266.0,5190.0,20.0,семенова мария васильевна год людоеда художес...
4,5.0,179,03:0000005767,0,0,0,0,1,0,0,31.0,30266.0,5190.0,20.0,семенова мария васильевна год людоеда художес...


In [36]:
# Keep only the loans that were matched to a catalogue record. The explicit
# .copy() makes the result an independent frame, so the column assignments
# performed on it afterwards do not trigger SettingWithCopyWarning.
df = df[df['catalog_id'].notna()].copy()

In [324]:
# Print the name of every column that still contains missing values.
# isna().any() answers the question directly, without materialising a filtered
# copy of the whole frame for each column just to count its rows.
for c in df.columns:
    if df[c].isna().any():
        print(c)

In [38]:
# Fill the remaining gaps column by column: numeric columns get their median,
# the text column gets an empty string.
df = df.fillna(
    {
        'age': df['age'].median(),
        'p260c': df['p260c'].median(),
        's': '',
    }
)

In [322]:
%%time
# Write the assembled table to disk as a tab-separated file without the index.
df.to_csv(
    '_data_books_main.csv',
    index=False,
    sep='\t',
)

Wall time: 1min 29s
