# Неделя 1. Подготовка данных к анализу и построению моделей

[link](https://nbviewer.jupyter.org/github/Yorko/mlcourse_open/blob/master/jupyter_russian/project_alice/week1_prepare_dataset.ipynb?flush_cache=true)

In [1]:
%load_ext watermark

In [2]:
%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn -g

CPython 3.7.6
IPython 7.11.1

numpy 1.18.1
scipy 1.3.2
pandas 0.25.3
matplotlib 3.1.1
statsmodels 0.10.1
sklearn 0.22.1

compiler   : GCC 7.3.0
system     : Linux
release    : 5.3.0-28-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   :


In [3]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import os
import pickle
#pip install tqdm
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

Посмотрим на один из файлов с данными о посещенных пользователем (номер 31) веб-страницах

In [4]:
!ls

10users				  X_sparse_10users_s15_w7.pkl
150users			  X_sparse_10users_s5_w5.pkl
1_answer			  X_sparse_10users_s7_w5.pkl
2_answer			  X_sparse_10users_s7_w7.pkl
3_answer			  X_sparse_150users.pkl
3users				  X_sparse_150users_s10_w5.pkl
4_answer			  X_sparse_150users_s10_w7.pkl
5_answer			  X_sparse_150users_s15_w10.pkl
capstone_user_identification.zip  X_sparse_150users_s15_w5.pkl
__pycache__			  X_sparse_150users_s15_w7.pkl
site_freq_10users.pkl		  X_sparse_150users_s5_w5.pkl
site_freq_150users.pkl		  X_sparse_150users_s7_w5.pkl
site_freq_3users.pkl		  X_sparse_150users_s7_w7.pkl
task1_1.ipynb			  y_10users.pkl
task1_2.ipynb			  y_10users_s10_w5.pkl
task2_1_answer			  y_10users_s10_w7.pkl
task2_1.ipynb			  y_10users_s15_w10.pkl
task2_2_answer			  y_10users_s15_w5.pkl
task2_3_answer			  y_10users_s15_w7.pkl
task2_4_answer			  y_10users_s5_w5.pkl
task2_5_answer			  y_10users_s7_w5.pkl
train_data_10users.csv		  y_10users_s7_w7.pkl
train_data_150users.csv		  y_150us

In [5]:
# Поменяйте на свой путь к данным
PATH_TO_DATA = '.'
user31_data = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                       '10users/user0031.csv'))

In [6]:
user31_data.head()

Unnamed: 0,timestamp,site
0,2013-11-15 08:12:07,fpdownload2.macromedia.com
1,2013-11-15 08:12:17,laposte.net
2,2013-11-15 08:12:17,www.laposte.net
3,2013-11-15 08:12:17,www.google.com
4,2013-11-15 08:12:18,www.laposte.net


Поставим задачу классификации: идентифицировать пользователя по сессии из 10 подряд посещенных сайтов. Объектом в этой задаче будет сессия из 10 сайтов, последовательно посещенных одним и тем же пользователем, признаками – индексы этих 10 сайтов (чуть позже здесь появится "мешок" сайтов, подход Bag of Words). Целевым классом будет id пользователя.

Реализуйте функцию prepare_train_set, которая принимает на вход путь к каталогу с csv-файлами path_to_csv_files и параметр session_length – длину сессии, а возвращает 2 объекта:
- DataFrame, в котором строки соответствуют уникальным сессиям из session_length сайтов, session_length столбцов – индексам этих session_length сайтов и последний столбец – ID пользователя
- частотный словарь сайтов вида {'site_string': [site_id, site_freq]}, например для недавнего игрушечного примера это будет {'vk.com': (1, 2), 'google.com': (2, 2), 'yandex.ru': (3, 3), 'facebook.com': (4, 1)}


In [7]:
from collections import Counter


def get_sites(path_to_csv_files, session_length=10):
    """
    Returns :
    - dictionary of sites used by users;
    - sessions counter;
    - list of .csv files.
    """

#     ----- Variables:

#     Dictionary of sites. Count frequencies of use.
    sites = Counter()
#     Sessions counter
    sessions_counter = 0
#     List of .csv files
    fns = glob(path_to_csv_files+"*")

#     ----- Fill `sites` counter. Calculate sessions.

#     for every file
    for f in fns:
#         read user data
        ud = pd.read_csv(os.path.join(PATH_TO_DATA, f))
#         determine visited sites
        sites.update(Counter(ud['site']))
#         up session counter
        sessions_counter += int(np.ceil(len(ud)/session_length))
    
    
#     ----- Return dictionary with sorted and enumerated sites.
    result = {}
    for i, x in enumerate(sites.most_common()):
        result[x[0]] = (i+1, x[1])
    
    return result, sessions_counter, fns

In [17]:
def prepare_train_set(path_to_csv_files, session_length=10):
    """Returns:
    train_data : training table;
    sites : dictionary of sites"""

#     ----- Get dictionary of sites
    sites, session_counter, fns = get_sites(path_to_csv_files, session_length)

#     ----- Get columns for train table
    site_columns = []
    for i in range(session_length):
        site_columns.append(f"site{i+1}")
    site_columns.append("uid")

#     -----  Construct training table with shape (session_counter, len(site_columns))
    train_data = np.zeros(shape=(session_counter, len(site_columns)), dtype = 'int')

#     ----- Fill training table
#     Variables:
#     Row counter
    active_row = 0

#     For every user file (.csv)
    for f in fns:
#         Read sites usage data to DataFrame
        d = pd.read_csv(os.path.join(PATH_TO_DATA, f))
#         Add 'sid' as 'site id' to the table.
        d['sid'] = d['site'].apply(lambda x: sites[x][0])
#         Determine number of sessions:
        raws_number = int(np.ceil(len(d)/session_length))
#         Write session data to train table:
        for i, s in enumerate(d['sid']):
            row = active_row + i // session_length
            col = i % session_length
            train_data[row, col] = s
#         Write user data to train table
        for i in range(active_row, active_row+raws_number):
            train_data[i, session_length] = int(f[-8:-4])
#         Update row counter
        active_row += raws_number

    return pd.DataFrame(data = train_data, columns = site_columns), sites

In [9]:
!ls | grep users

10users
150users
3users
site_freq_10users.pkl
site_freq_150users.pkl
site_freq_3users.pkl
train_data_10users.csv
train_data_150users.csv
X_sparse_10users.pkl
X_sparse_10users_s10_w5.pkl
X_sparse_10users_s10_w7.pkl
X_sparse_10users_s15_w10.pkl
X_sparse_10users_s15_w5.pkl
X_sparse_10users_s15_w7.pkl
X_sparse_10users_s5_w5.pkl
X_sparse_10users_s7_w5.pkl
X_sparse_10users_s7_w7.pkl
X_sparse_150users.pkl
X_sparse_150users_s10_w5.pkl
X_sparse_150users_s10_w7.pkl
X_sparse_150users_s15_w10.pkl
X_sparse_150users_s15_w5.pkl
X_sparse_150users_s15_w7.pkl
X_sparse_150users_s5_w5.pkl
X_sparse_150users_s7_w5.pkl
X_sparse_150users_s7_w7.pkl
y_10users.pkl
y_10users_s10_w5.pkl
y_10users_s10_w7.pkl
y_10users_s15_w10.pkl
y_10users_s15_w5.pkl
y_10users_s15_w7.pkl
y_10users_s5_w5.pkl
y_10users_s7_w5.pkl
y_10users_s7_w7.pkl
y_150users.pkl
y_150users_s10_w5.pkl
y_150users_s10_w7.pkl
y_150users_s15_w10.pkl
y_150users_s15_w5.pkl
y_150users_s15_w7.pkl
y_150users_s5_w5.pkl

In [10]:
!ls ./3users/

user0001.csv  user0002.csv  user0003.csv


Test on small data

In [16]:
%%time
path = "3users/"
train_data_toy, site_freq_3users = prepare_train_set(path)
train_data_toy

CPU times: user 8.34 ms, sys: 8.96 ms, total: 17.3 ms
Wall time: 17 ms


Unnamed: 0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10,uid
0,3,2,2,7,2,1,8,5,9,10,1
1,3,1,1,1,0,0,0,0,0,0,1
2,4,1,2,1,2,1,1,5,11,4,3
3,4,1,2,0,0,0,0,0,0,0,3
4,3,2,6,6,2,0,0,0,0,0,2


In [18]:
%%time
path = "10users/"
train_data_10users, site_freq_10users = prepare_train_set(path)

CPU times: user 364 ms, sys: 47.9 ms, total: 412 ms
Wall time: 411 ms


In [19]:
len(train_data_10users)

14061

In [20]:
len(site_freq_10users)

4913

In [21]:
%%time
path = "150users/"
train_data_150users, site_freq_150users = prepare_train_set(path)

CPU times: user 3.86 s, sys: 283 ms, total: 4.14 s
Wall time: 4.28 s


In [22]:
len(train_data_150users)

137019

In [23]:
len(site_freq_150users)

27797

In [24]:
import itertools
top10 = tuple(itertools.islice(site_freq_150users.items(),10))
answer  = " ".join([x[0] for x in top10])
answer

'www.google.fr www.google.com www.facebook.com apis.google.com s.youtube.com clients1.google.com mail.google.com plus.google.com safebrowsing-cache.google.com www.youtube.com'

In [25]:
def write_answer(i, answer):
    with open(f"{i}_answer", mode = "w") as file:
        file.write(str(answer))

In [26]:
write_answer(1, 14061)
write_answer(2, 4913)
write_answer(3, 137019)
write_answer(4, 27797)
write_answer(5, answer)

In [27]:
!ls | grep answer

1_answer
2_answer
3_answer
4_answer
5_answer
task2_1_answer
task2_2_answer
task2_3_answer
task2_4_answer
task2_5_answer


In [28]:
pd.DataFrame(train_data_10users).to_csv(os.path.join(PATH_TO_DATA, 
                                       'train_data_10users.csv'), 
                        index_label='session_id', float_format='%d')
pd.DataFrame(train_data_150users).to_csv(os.path.join(PATH_TO_DATA, 
                                        'train_data_150users.csv'), 
                         index_label='session_id', float_format='%d')

In [29]:
pd.DataFrame(train_data_toy).to_csv(os.path.join(PATH_TO_DATA, 
                                       'train_data_toy.csv'), 
                        index_label='session_id', float_format='%d')

Saving

In [30]:
with open(os.path.join(PATH_TO_DATA, 'site_freq_3users.pkl'), 'wb') as site_freq_3users_pkl:
    pickle.dump(site_freq_3users, site_freq_3users_pkl, protocol=2)
with open(os.path.join(PATH_TO_DATA, 'site_freq_10users.pkl'), 'wb') as site_freq_10users_pkl:
    pickle.dump(site_freq_10users, site_freq_10users_pkl, protocol=2)
with open(os.path.join(PATH_TO_DATA, 'site_freq_150users.pkl'), 'wb') as site_freq_150users_pkl:
    pickle.dump(site_freq_150users, site_freq_150users_pkl, protocol=2)