In [1]:
%load_ext watermark

In [2]:
%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn -g

CPython 3.7.3
IPython 7.5.0

numpy 1.16.3
scipy 1.2.1
pandas 0.24.2
matplotlib 3.0.3
statsmodels 0.9.0
sklearn 0.21.0

compiler   : MSC v.1915 64 bit (AMD64)
system     : Windows
release    : 7
machine    : AMD64
processor  : Intel64 Family 6 Model 69 Stepping 1, GenuineIntel
CPU cores  : 2
interpreter: 64bit
Git hash   :


In [1]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import os
import pickle
#pip install tqdm
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from collections import defaultdict

In [2]:
PATH_TO_DATA = 'capstone_user_identification'

In [3]:
user31_data = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                       '10users/user0031.csv'))

In [6]:
#user31_data.head()
user31_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7600 entries, 0 to 7599
Data columns (total 2 columns):
timestamp    7600 non-null object
site         7600 non-null object
dtypes: object(2)
memory usage: 118.8+ KB


# Часть 1. Подготовка обучающей выборки

In [17]:
def _read_visited_sites(csv_path):
    """Return the ``site`` column of one ``timestamp,site`` CSV file as a list.

    The first line is treated as the header and skipped; blank lines
    (e.g. a trailing newline at the end of the file) are ignored.
    """
    with open(csv_path) as f:
        next(f, None)  # skip the header row
        return [line.rstrip().split(',')[1] for line in f if line.strip()]


def prepare_train_set(path_to_csv_files, session_length=10):
    """Build fixed-length browsing sessions and a site-frequency dictionary.

    Every file matching ``user????.csv`` in ``path_to_csv_files`` is read once.
    Sites are ranked by total visit count (descending, ties broken
    alphabetically) and numbered from 1, so more frequent sites get smaller
    IDs. Each user's visits are then cut into consecutive, non-overlapping
    sessions of ``session_length`` site IDs; the last session of a user is
    right-padded with 0.

    Parameters
    ----------
    path_to_csv_files : str
        Directory that contains the per-user CSV files.
    session_length : int, default 10
        Number of site IDs per session (must be >= 1).

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame has columns ``site1`` .. ``site<session_length>`` and
        ``user_id`` (parsed from the four characters before ``.csv``).
        The dict maps ``site -> (site_id, visit_count)`` and is filled in
        ranking order (most frequent site first).

    Raises
    ------
    ValueError
        If ``session_length`` is smaller than 1.
    """
    if session_length < 1:
        raise ValueError('session_length must be a positive integer')

    # glob() returns files in arbitrary order; sort so the row order of the
    # resulting frame is reproducible across runs and platforms.
    files = sorted(glob(os.path.join(path_to_csv_files, 'user????.csv')))
    visits = [(int(fname[-8:-4]), _read_visited_sites(fname)) for fname in files]

    # Pass 1: count visits per site and assign IDs by descending frequency.
    site_counts = defaultdict(int)
    for _, sites in visits:
        for site in sites:
            site_counts[site] += 1

    ranked = sorted(site_counts.items(), key=lambda item: (-item[1], item[0]))
    dictOut = {site: (site_id, count)
               for site_id, (site, count) in enumerate(ranked, start=1)}

    site_to_id = {site: id_and_count[0] for site, id_and_count in dictOut.items()}
    site_to_id[''] = 0  # an empty site name is encoded as 0, like the padding

    # Pass 2: translate sites to IDs and slice them into padded sessions.
    lOut = []
    for user_id, sites in visits:
        ids = [site_to_id[site] for site in sites]
        for start in range(0, len(ids), session_length):
            session = ids[start:start + session_length]
            session += [0] * (session_length - len(session))
            lOut.append(session + [user_id])

    dfOut = pd.DataFrame(
        lOut,
        columns=['site{}'.format(x) for x in range(1, session_length + 1)] + ['user_id'])
    return (dfOut, dictOut)


In [8]:
!type $PATH_TO_DATA\3users\user0001.csv

timestamp,site
2013-11-15 09:28:17,vk.com
2013-11-15 09:33:04,oracle.com
2013-11-15 09:52:48,oracle.com
2013-11-15 11:37:26,geo.mozilla.org
2013-11-15 11:40:32,oracle.com
2013-11-15 11:40:34,google.com
2013-11-15 11:40:35,accounts.google.com
2013-11-15 11:40:37,mail.google.com
2013-11-15 11:40:40,apis.google.com
2013-11-15 11:41:35,plus.google.com
2013-11-15 12:40:35,vk.com
2013-11-15 12:40:37,google.com
2013-11-15 12:40:40,google.com
2013-11-15 12:41:35,google.com


In [9]:
!type $PATH_TO_DATA\3users\user0002.csv

timestamp,site
2013-11-15 09:28:17,vk.com
2013-11-15 09:33:04,oracle.com
2013-11-15 09:52:48,football.kulichki.ru
2013-11-15 11:37:26,football.kulichki.ru
2013-11-15 11:40:32,oracle.com


In [10]:
!type $PATH_TO_DATA\3users\user0003.csv

timestamp,site
2013-11-15 09:28:17,meduza.io
2013-11-15 09:33:04,google.com
2013-11-15 09:52:48,oracle.com
2013-11-15 11:37:26,google.com
2013-11-15 11:40:32,oracle.com
2013-11-15 11:40:34,google.com
2013-11-15 11:40:35,google.com
2013-11-15 11:40:37,mail.google.com
2013-11-15 11:40:40,yandex.ru
2013-11-15 11:41:35,meduza.io
2013-11-15 12:28:17,meduza.io
2013-11-15 12:33:04,google.com
2013-11-15 12:52:48,oracle.com


In [30]:
%%time
train_data_toy, site_freq_3users = prepare_train_set(os.path.join(PATH_TO_DATA, '3users'), 
                                                     session_length=10)

Wall time: 10.2 ms


In [9]:
train_data_toy

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,user_id
0,4,2,2,9,2,1,7,6,8,10,1
1,4,1,1,1,0,0,0,0,0,0,1
2,4,2,5,5,2,0,0,0,0,0,2
3,3,1,2,1,2,1,1,6,11,3,3
4,3,1,2,0,0,0,0,0,0,0,3


In [6]:
train_data_toy

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,user_id
0,4,2,2,9,2,1,7,6,8,10,1
1,4,1,1,1,0,0,0,0,0,0,1
2,4,2,5,5,2,0,0,0,0,0,2
3,3,1,2,1,2,1,1,6,11,3,3
4,3,1,2,0,0,0,0,0,0,0,3


In [7]:
site_freq_3users

{'google.com': (1, 9),
 'oracle.com': (2, 8),
 'meduza.io': (3, 3),
 'vk.com': (4, 3),
 'football.kulichki.ru': (5, 2),
 'mail.google.com': (6, 2),
 'accounts.google.com': (7, 1),
 'apis.google.com': (8, 1),
 'geo.mozilla.org': (9, 1),
 'plus.google.com': (10, 1),
 'yandex.ru': (11, 1)}

Вычисляем и сохраняем значения для 10 и 150 пользователей

In [18]:
%%time
train_data_10, site_freq_10users = prepare_train_set(os.path.join(PATH_TO_DATA, '10users'), 
                                                     session_length=10)

Wall time: 1.01 s


In [13]:
%%time
train_data_150, site_freq_150users = prepare_train_set(os.path.join(PATH_TO_DATA, '150users'), 
                                                     session_length=10)

Wall time: 8.31 s


In [17]:
# Export the session tables for 10 and 150 users to CSV inside PATH_TO_DATA.
# The frame index is written as the first column under the name 'session_id'.
# NOTE(review): float_format only affects floating-point columns; the frames are
# presumably all-integer, in which case it has no effect - confirm.
train_data_10.to_csv(os.path.join(PATH_TO_DATA, 'train_data_10users.csv'), 
                        index_label='session_id', float_format='%d')
train_data_150.to_csv(os.path.join(PATH_TO_DATA, 'train_data_150users.csv'), 
                        index_label='session_id', float_format='%d')

In [55]:
def save_answer(fnameNum, val):
    """Write ``val`` (rendered with ``str.format``) to ``dminik6_w1_<fnameNum>.txt``.

    The file is created in the current working directory and overwritten
    if it already exists.
    """
    answer_path = 'dminik6_w1_{}.txt'.format(fnameNum)
    with open(answer_path, "w") as fout:
        text = '{}'.format(val)
        fout.write(text)

def save_answer_str_Array(fnameNum, array):
    """Write the items of ``array`` to ``dminik6_w1_<fnameNum>.txt``, space-separated.

    Each item is rendered with ``str.format``; the file is created in the
    current working directory and overwritten if it already exists.
    """
    answer_path = 'dminik6_w1_{}.txt'.format(fnameNum)
    with open(answer_path, "w") as fout:
        pieces = []
        for el in array:
            pieces.append('{}'.format(el))
        fout.write(" ".join(pieces))

### Вопрос 1. Сколько уникальных сессий из 10 сайтов в выборке с 10 пользователями?

In [18]:
train_data_10.shape

(14061, 11)

In [38]:
print('Число уникальных сессий из 10 сайтов в выборке с 10 пользователями: {}'.format(len(train_data_10)))
save_answer(1, len(train_data_10))

Число уникальных сессий из 10 сайтов в выборке с 10 пользователями: 14061


### Вопрос 2. Сколько всего уникальных сайтов в выборке из 10 пользователей?

In [39]:
print('Число уникальных сайтов в выборке из 10 пользователей: {}'.format(len(site_freq_10users)))
save_answer(2, len(site_freq_10users))

Число уникальных сайтов в выборке из 10 пользователей: 4913


### Вопрос 3. Сколько уникальных сессий из 10 сайтов в выборке с 150 пользователями?

In [40]:
train_data_150.shape

(137019, 11)

In [41]:
print('Число уникальных сессий из 10 сайтов в выборке с 150 пользователями: {}'.format(len(train_data_150)))
save_answer(3, len(train_data_150))

Число уникальных сессий из 10 сайтов в выборке с 150 пользователями: 137019


### Вопрос 4. Сколько всего уникальных сайтов в выборке из 150 пользователей?

In [42]:
print('Число уникальных сайтов в выборке из 150 пользователей: {}'.format(len(site_freq_150users)))
save_answer(4, len(site_freq_150users))

Число уникальных сайтов в выборке из 150 пользователей: 27797


### Вопрос 5. Какой из этих сайтов НЕ входит в топ-10 самых популярных сайтов среди посещенных 150 пользователями?

In [56]:
test_sites = ['www.google.fr', 'www.youtube.com', 
              'safebrowsing-cache.google.com', 'www.linkedin.com']
# Each value of site_freq_150users is (site_id, count) and site_id 1 is the most
# visited site, so rank by site_id explicitly instead of relying on the
# insertion order of the dict keys.
list_top = sorted(site_freq_150users,
                  key=lambda site: site_freq_150users[site][0])[:10]
for site in test_sites:
    if site not in list_top:
        print('Сайт {} не входит в топ-10 самых популярных сайтов'.format(site))

save_answer_str_Array(5, list_top)
#save_answer(5, 'www.linkedin.com')

Сайт www.linkedin.com не входит в топ-10 самых популярных сайтов


# Часть 2. Работа с разреженным форматом данных

In [8]:
X_toy, y_toy = train_data_toy.iloc[:, :-1].values, train_data_toy.iloc[:, -1].values

In [9]:
X_toy

array([[ 4,  2,  2,  9,  2,  1,  7,  6,  8, 10],
       [ 4,  1,  1,  1,  0,  0,  0,  0,  0,  0],
       [ 4,  2,  5,  5,  2,  0,  0,  0,  0,  0],
       [ 3,  1,  2,  1,  2,  1,  1,  6, 11,  3],
       [ 3,  1,  2,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)

In [10]:
def dat_for_csr_00(aDat):
    """Build ``(data, indices, indptr)`` lists for ``csr_matrix`` row by row.

    Every element of every row of ``aDat`` becomes a column index with the
    value 1; ``indptr`` records the running number of elements after each row.
    """
    row_bounds = [0]
    column_ids = []
    for session in aDat:
        column_ids.extend(session)
        row_bounds.append(len(column_ids))
    ones = [1] * len(column_ids)
    return (ones, column_ids, row_bounds)

In [7]:
def dat_for_csr(aDat):
    """Return ``(data, indices, indptr)`` describing ``aDat`` for ``csr_matrix``.

    ``csr_matrix((data, indices, indptr))`` is the standard CSR representation:
    the column indices of row ``i`` are ``indices[indptr[i]:indptr[i+1]]`` and
    their values are ``data[indptr[i]:indptr[i+1]]``. Here every element of the
    2-D array ``aDat`` is used as a column index with the value 1, and each row
    contributes exactly ``aDat.shape[1]`` entries.

    Parameters
    ----------
    aDat : 2-D array-like of non-negative integers
        One row per session, one site ID per cell.

    Returns
    -------
    tuple of three 1-D numpy arrays ``(data, indices, indptr)``.
    """
    aDat = np.asarray(aDat)
    n_rows, n_cols = aDat.shape
    # The builtin ``int`` replaces the ``np.int`` alias, which was removed
    # from NumPy; both select the same platform integer dtype.
    data = np.ones(n_rows * n_cols, dtype=int)
    indices = aDat.reshape(n_rows * n_cols)
    # Multiplying instead of using a step keeps this valid for zero columns,
    # where ``np.arange`` with step 0 would fail.
    indptr = np.arange(n_rows + 1) * n_cols
    return (data, indices, indptr)

In [15]:
data, indices, indptr = dat_for_csr(X_toy)

In [35]:
print(len(data))
print (data)
print((np.ones(X_toy.shape[0] * X_toy.shape[1])))

50
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]


In [30]:
print(len(indices))
print (indices)
X_toy.reshape(X_toy.shape[0] * X_toy.shape[1])

50
[4, 2, 2, 9, 2, 1, 7, 6, 8, 10, 4, 1, 1, 1, 0, 0, 0, 0, 0, 0, 4, 2, 5, 5, 2, 0, 0, 0, 0, 0, 3, 1, 2, 1, 2, 1, 1, 6, 11, 3, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0]


array([ 4,  2,  2,  9,  2,  1,  7,  6,  8, 10,  4,  1,  1,  1,  0,  0,  0,
        0,  0,  0,  4,  2,  5,  5,  2,  0,  0,  0,  0,  0,  3,  1,  2,  1,
        2,  1,  1,  6, 11,  3,  3,  1,  2,  0,  0,  0,  0,  0,  0,  0],
      dtype=int64)

In [25]:
print(indptr)
print(np.arange(0, (X_toy.shape[0] + 1)* X_toy.shape[1], X_toy.shape[1] ))

[0, 10, 20, 30, 40, 50]
[ 0 10 20 30 40 50]


In [22]:
X_toy.shape

(5, 10)

In [37]:
X_sparse_toy = csr_matrix(dat_for_csr(X_toy))[:,1:]

In [38]:
X_sparse_toy.todense()

matrix([[1, 3, 0, 1, 0, 1, 1, 1, 1, 1, 0],
        [3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0],
        [4, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [29]:
X_sparse_toy.todense()

matrix([[1, 3, 0, 1, 0, 1, 1, 1, 1, 1, 0],
        [3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0],
        [4, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

Образец из задания.
matrix([[1, 3, 1, 0, 1, 0, 1, 1, 1, 1, 0],
        [3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0],
        [4, 2, 0, 2, 1, 0, 0, 0, 0, 0, 1],
        [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [20]:
# Split both session tables into the site-ID matrix (all columns but the last)
# and the target vector (last column, user_id).
X_10users, y_10users = train_data_10.iloc[:, :-1].values, train_data_10.iloc[:, -1].values
# X_150users / y_150users are required by the later sparse-matrix and pickle
# cells, so they must be defined here for a clean top-to-bottom run.
X_150users, y_150users = train_data_150.iloc[:, :-1].values, train_data_150.iloc[:, -1].values
with open(os.path.join(PATH_TO_DATA, 'y_10users.pkl'), 'wb') as y10_pkl:
    pickle.dump(y_10users, y10_pkl, protocol=2)

In [42]:
X_sparse_10users = csr_matrix(dat_for_csr(X_10users))[:,1:]
X_sparse_150users = csr_matrix(dat_for_csr(X_150users))[:,1:]

In [32]:
# Persist every prepared artifact into PATH_TO_DATA. Pickle protocol 2 is used
# for all of them; the file names and the write order are unchanged.
pickle_targets = [
    ('X_sparse_10users.pkl', X_sparse_10users),
    ('y_10users.pkl', y_10users),
    ('X_sparse_150users.pkl', X_sparse_150users),
    ('y_150users.pkl', y_150users),
    ('site_freq_3users.pkl', site_freq_3users),
    ('site_freq_10users.pkl', site_freq_10users),
    ('site_freq_150users.pkl', site_freq_150users),
]
for pkl_name, pkl_obj in pickle_targets:
    with open(os.path.join(PATH_TO_DATA, pkl_name), 'wb') as pkl_file:
        pickle.dump(pkl_obj, pkl_file, protocol=2)

In [43]:
assert X_sparse_10users.shape[1] == len(site_freq_10users)

In [44]:
assert X_sparse_150users.shape[1] == len(site_freq_150users)

Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,user_id
0,4,2,2,9,2,1,7,6,8,10,1
1,4,1,1,1,0,0,0,0,0,0,1
2,4,2,5,5,2,0,0,0,0,0,2
3,3,1,2,1,2,1,1,6,11,3,3
4,3,1,2,0,0,0,0,0,0,0,3


## Добавлено на 4 неделе, для формирования данных

In [None]:
def prepare_train_set_with_fe(path_to_csv_files, site_freq_path, 
                                    session_length=10, window_size=10):
    """Build (possibly overlapping) sessions of site IDs with a sliding window.

    Site names are translated to IDs with the pickled frequency dictionary
    stored at ``site_freq_path`` (``site -> (site_id, count)``; only the ID is
    used). For every ``user????.csv`` file in ``path_to_csv_files`` a session
    starts at positions ``0, window_size, 2 * window_size, ...`` of the user's
    visit list and contains up to ``session_length`` site IDs; sessions that
    run past the end of the list are right-padded with 0.

    Parameters
    ----------
    path_to_csv_files : str
        Directory that contains the per-user ``timestamp,site`` CSV files.
    site_freq_path : str
        Path to the pickled site-frequency dictionary.
    session_length : int, default 10
        Number of site IDs per session (must be >= 1).
    window_size : int, default 10
        Step between the starts of consecutive sessions (must be >= 1).

    Returns
    -------
    pandas.DataFrame
        Integer frame with columns ``site1`` .. ``site<session_length>`` and
        ``user_id`` (parsed from the four characters before ``.csv``).

    Raises
    ------
    ValueError
        If ``session_length`` or ``window_size`` is smaller than 1
        (a non-positive window would never advance).
    KeyError
        If a CSV file contains a site missing from the frequency dictionary.
    """
    if session_length < 1 or window_size < 1:
        raise ValueError('session_length and window_size must be positive integers')

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # frequency dictionaries produced by this notebook / a trusted source.
    with open(site_freq_path, 'rb') as f:
        countdict = pickle.load(f)

    # glob() returns files in arbitrary order; sort for reproducible row order.
    files = sorted(glob(os.path.join(path_to_csv_files, 'user????.csv')))
    lOut = []

    for fname in files:
        user_id = int(fname[-8:-4])
        with open(fname) as f:
            next(f, None)  # skip the header row
            site_ids = [countdict[line.rstrip().split(',')[1]][0]
                        for line in f if line.strip()]

        for start in range(0, len(site_ids), window_size):
            session = site_ids[start:start + session_length]
            session += [0] * (session_length - len(session))
            lOut.append(session + [user_id])

    # The builtin ``int`` replaces the ``np.int`` alias, which was removed
    # from NumPy; both select the same platform integer dtype.
    dfOut = pd.DataFrame(
        lOut,
        columns=['site{}'.format(x) for x in range(1, session_length + 1)] + ['user_id'],
        dtype=int)

    return dfOut

In [26]:
def prepare_pk(window_size, session_length, nusers):
    """Prepare and pickle the sparse feature matrix and targets for one setup.

    Sessions are built by ``prepare_train_set_with_fe`` from the
    ``<nusers>users`` directory and the ``site_freq_<nusers>users.pkl``
    dictionary under ``PATH_TO_DATA``. The site-ID columns are converted to a
    CSR matrix (column 0 is dropped) and written, together with the
    ``user_id`` vector, to ``PATH_TO_DATA`` with pickle protocol 2.
    Returns ``None``.
    """
    users_dir = os.path.join(PATH_TO_DATA, '{}users'.format(nusers))
    freq_pkl = os.path.join(PATH_TO_DATA, 'site_freq_{}users.pkl'.format(nusers))
    sessions = prepare_train_set_with_fe(users_dir,
                                         site_freq_path=freq_pkl,
                                         session_length=session_length,
                                         window_size=window_size)

    site_matrix = sessions.iloc[:, :-1].values
    targets = sessions.iloc[:, -1].values
    sparse_matrix = csr_matrix(dat_for_csr(site_matrix))[:, 1:]

    outputs = (('X_sparse_{}users_s{:02d}_w{:02d}.pkl', sparse_matrix),
               ('y_{}users_s{:02d}_w{:02d}.pkl', targets))
    for name_template, payload in outputs:
        pkl_name = name_template.format(nusers, session_length, window_size)
        with open(os.path.join(PATH_TO_DATA, pkl_name), 'wb') as pkl_file:
            pickle.dump(payload, pkl_file, protocol=2)

In [29]:
%%time
import itertools
# Generate pickles for every (window_size, session_length) combination below.
# (5,5), (7,7), (10,10) - for 150 users
nusers = 10

for window_size, session_length in itertools.product([10, 7, 5], [15, 10, 7, 5]):
# for window_size, session_length in [(5,5), (7,7), (10,10)]:
    # Only combinations where the window step does not exceed the session length are built.
    # NOTE(review): the recorded output lists only (5,5), (7,7), (10,10), which
    # matches the commented-out loop above rather than the active product loop.
    if window_size <= session_length:
        print(window_size, session_length)
        prepare_pk(window_size, session_length, nusers)

5 5
7 7
10 10
Wall time: 8.77 s
