# Базовый пример с логистической регрессией

In [1]:
import os

import pandas as pd
import numpy as np

import pyarrow.parquet as parquet
# Used to train document embeddings
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Used to train the baseline model
from sklearn.linear_model import LogisticRegression


# Root folder with the input data, relative to the notebook
input_path = '../input'
# Show which top-level entries exist under the data root
top_level_entries = os.listdir(input_path)
print(top_level_entries)

['train', 'test', 'texts']


Посмотри на все данные, что у нас есть (аккуратно, очень большой список!)

In [2]:
%%sh
# Recursively list ../input in long, human-readable form and flatten the output:
# every line ending in ":" is a directory header printed by ls -R, whose path
# (without the trailing colon) is remembered in s; every following non-blank
# line is printed prefixed with that path and "/", so each entry shows the
# directory it belongs to. Blank separator lines are dropped.
ls -lhR ../input | awk '
/:$/&&f{s=$0;f=0}
/:$/&&!f{sub(/:$/,"");s=$0;f=1;next}
NF&&f{ print s"/"$0 }'

../input/total 12K
../input/drwxr-xr-x  2 root root 4.0K Feb  8 18:05 test
../input/drwxr-xr-x  4 root root 4.0K Feb  8 18:05 texts
../input/drwxr-xr-x 50 root root 4.0K Feb  8 18:06 train
../input/test/total 18M
../input/test/-rw-r--r-- 1 root root    0 Feb  8 18:05 _SUCCESS
../input/test/-rw-r--r-- 1 root root  967 Feb  8 18:05 _common_metadata
../input/test/-rw-r--r-- 1 root root 8.9K Feb  8 18:05 _metadata
../input/test/-rw-r--r-- 1 root root 2.5M Feb  8 18:05 part-00000-b530ebcd-5cdf-4e1c-8099-65ebf5729ba1-c000.gz.parquet
../input/test/-rw-r--r-- 1 root root 2.5M Feb  8 18:05 part-00001-b530ebcd-5cdf-4e1c-8099-65ebf5729ba1-c000.gz.parquet
../input/test/-rw-r--r-- 1 root root 2.5M Feb  8 18:05 part-00002-b530ebcd-5cdf-4e1c-8099-65ebf5729ba1-c000.gz.parquet
../input/test/-rw-r--r-- 1 root root 2.5M Feb  8 18:05 part-00003-b530ebcd-5cdf-4e1c-8099-65ebf5729ba1-c000.gz.parquet
../input/test/-rw-r--r-- 1 root root 2.5M Feb  8 18:05 part-00004-b530ebcd-5cdf-4e1c-8099-65ebf5729ba1-c000.gz

In [3]:
# Load the test texts, keeping only the object id and the preprocessed tokens
test_texts_table = parquet.read_table(
    input_path + '/texts/test/',
    columns=['objectId', 'preprocessed'],
)
test_texts = test_texts_table.to_pandas()
test_texts.head(10)

Unnamed: 0,objectId,preprocessed
0,517288,"[квартирник, нтв, маргулис, групп, пилот]"
1,9501964,"[родител, очен, трогательн, песн, артур, халат]"
2,23007371,"[сух, суперджет, откажет, западн, комплект, те..."
3,38353886,"[сгорел, сара, гор, хат]"
4,21192138,"[живодёр, отруб, лап, собак]"
5,26415073,"[ажурн, маков, кулич, так, нежн, воздушн, мяки..."
6,36734526,"[друг, уснул, бар, теб, нужн, тащ, дом, очен, ..."
7,8699823,"[никифор, ден, дат, год, март, понедельник, др..."
8,12236843,[]
9,38393782,"[днр, лнр, новост, войск, берут, кольц, донбас..."


In [4]:
%%time
# Build document embeddings for text documents
doc2vec = Doc2Vec(
    [TaggedDocument(lines,'tag') for lines in test_texts.preprocessed], 
    vector_size=5, 
    window=2, 
    min_count=1, 
    workers=4
)

CPU times: user 18min 31s, sys: 1min 28s, total: 20min
Wall time: 9min 14s


In [5]:
# Load one day of the training data as a Pandas dataframe
day_table = parquet.read_table(
    input_path + '/train/date=2018-02-07',
    columns=['instanceId_objectId', 'feedback'],
)
data = day_table.to_pandas().rename(columns={'instanceId_objectId': 'objectId'})
# Binary target: 1.0 when "Liked" is contained in the feedback entry, else 0.0
data['label'] = data['feedback'].apply(
    lambda feedback: 1.0 if "Liked" in feedback else 0.0
).values
data = data[['objectId', 'label']]
data.head(10)

Unnamed: 0,objectId,label
0,22429313,0.0
1,14676953,0.0
2,11562101,0.0
3,20892119,0.0
4,16063005,0.0
5,14576490,0.0
6,11811947,0.0
7,1936012,1.0
8,20867189,0.0
9,28967185,0.0


In [6]:
%%time

parts = []
# Get unique object ids
ids = data.groupby('objectId').count()

# In order to save memory iterate part by part
for (dirpath, dirnames, filenames) in os.walk(input_path + '/texts/train/'):
    for name in filenames:
        if name.startswith('part'):
            # Read single part
            texts = parquet.read_table(
                input_path + '/texts/train/' + name,
                columns = ['objectId','preprocessed']
            ).to_pandas()            
            # Filter documents we need
            joined = ids.join(texts.set_index('objectId'), how='inner', on='objectId')
            # Evaluate embeddings
            joined['embedding'] = joined.preprocessed.apply(doc2vec.infer_vector)
            # Memorize
            parts.append(joined[['embedding']])
            print('Done with ' + name)

Done with part-00021-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00025-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00026-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00020-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00004-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00006-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00007-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00022-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00012-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00023-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00011-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00001-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00016-1b50c8f5-87db-4a53-9677-17f1113c3f8d-c000.gz.parquet
Done with part-00024-1b50c8f5-87db-4a5

In [7]:
# Combine all the parts and attach the embedding to every labelled row.
# An inner join is used so that rows whose objectId has no embedding are
# dropped instead of carrying NaN, which could not be stacked into a
# feature matrix later on.
embeddings = pd.concat(parts)
train = data.join(embeddings, on='objectId', how='inner')
train.head(10)

Unnamed: 0,objectId,label,embedding
0,22429313,0.0,"[-0.37136218, 2.6387086, 2.3155334, -1.041994,..."
1,14676953,0.0,"[-0.6552614, 1.6004632, 1.4893707, -0.70169014..."
2,11562101,0.0,"[0.017401226, 0.46140304, 0.09371229, -0.15623..."
3,20892119,0.0,"[-0.6175911, 0.84971684, 0.6825317, -0.3112013..."
4,16063005,0.0,"[-0.20742297, 3.2966564, 2.0907254, -1.3507222..."
5,14576490,0.0,"[0.14290676, 0.1726916, -0.03486367, -0.214590..."
6,11811947,0.0,"[-0.22111274, 0.56780833, 0.21943146, -0.08558..."
7,1936012,1.0,"[-0.166379, 0.5966222, -0.056178145, -0.049690..."
8,20867189,0.0,"[-0.7364866, 1.7211171, 2.021003, -0.76432395,..."
9,28967185,0.0,"[-0.12567963, 0.0719028, 0.1733252, 0.05765563..."


In [8]:
# Target vector: the 'label' column (liked objects) as a NumPy array
label_column = train['label']
y = label_column.values

In [9]:
# Feature matrix: stack the per-row embedding vectors along a new first axis
embedding_vectors = train['embedding'].values
X = np.stack(embedding_vectors)

In [10]:
# Train the logistic regression on the embeddings, then inspect its coefficients
model = LogisticRegression(solver='lbfgs', C=0.01, random_state=23)
model.fit(X, y)
model.coef_

array([[ 0.37023498,  0.07842812, -0.06003187,  0.40423709, -0.12813174]])

In [11]:
%%time
# Weight the test documents
test_texts['weight'] = model.predict_proba(
    np.stack(
        test_texts.preprocessed.apply(lambda x : doc2vec.infer_vector(x))
    )
)[:, 1]
test_texts.head(10)

CPU times: user 9min 47s, sys: 168 ms, total: 9min 47s
Wall time: 9min 47s


In [12]:
# Load the test set, keeping only the user id and object id columns
test_table = parquet.read_table(
    input_path + '/test',
    columns=['instanceId_userId', 'instanceId_objectId'],
)
test = test_table.to_pandas().rename(columns={'instanceId_objectId': 'objectId'})
test.head(10)

Unnamed: 0,instanceId_userId,objectId
0,1006,34577503
1,1006,37520199
2,1618,546086
3,1618,546086
4,1618,546086
5,1618,35981492
6,1618,26764305
7,1810,958605
8,1810,20479574
9,1810,36254478


In [13]:
# Attach the document weight to every test row and eliminate possible
# duplicates by keeping the maximum weight per (user, object) pair
weight_by_object = test_texts[['objectId', 'weight']].set_index('objectId')
scored_rows = test.join(weight_by_object, on='objectId', how='inner')
scores = scored_rows.groupby(['instanceId_userId', 'objectId']).max()
scores.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
instanceId_userId,objectId,Unnamed: 2_level_1
316,17997084,0.147213
316,37758420,0.164279
631,15478935,0.179994
631,30513650,0.175571
631,38118098,0.16638
742,10672856,0.145232
742,24302446,0.172535
742,28816291,0.182678
742,34685448,0.169407
868,11640701,0.157995


In [14]:
# Order the rows by user and, within a user, by weight (both descending),
# then turn the index levels back into ordinary columns
sort_keys = ['instanceId_userId', 'weight']
sorted_scores = scores.sort_values(by=sort_keys, ascending=False)
result = sorted_scores.reset_index()
result.head(10)

Unnamed: 0,instanceId_userId,objectId,weight
0,15717370,28363245,0.185091
1,15717370,37835786,0.179396
2,15717313,30924653,0.187674
3,15717313,28362991,0.179801
4,15717313,36954076,0.175919
5,15717307,19531285,0.152437
6,15717307,36100559,0.139437
7,15717292,36638522,0.17037
8,15717292,34627018,0.169067
9,15717286,30720981,0.188865


In [15]:
# Gather the object ids of each user into one list per user
objects_by_user = result.groupby('instanceId_userId')['objectId']
submit = objects_by_user.apply(list)
submit.head(10)

instanceId_userId
316                                  [37758420, 17997084]
631                        [15478935, 30513650, 38118098]
742              [28816291, 24302446, 34685448, 10672856]
868     [30143153, 29193052, 35655697, 30882080, 22115...
979                                   [7996257, 37950972]
1006                                 [37520199, 34577503]
1276                       [22812401, 31000576, 36856262]
1444                                 [36806487, 20963755]
1483                                 [38036543, 34991228]
1618                         [546086, 35981492, 26764305]
Name: objectId, dtype: object

In [16]:
# Write the submission to a gzip-compressed CSV file without a header row
submit.to_csv(
    'text_submit.csv.gz',
    compression='gzip',
    header=False,
)