# Неделя 1. Подготовка данных к анализу и построению моделей

[link](https://nbviewer.jupyter.org/github/Yorko/mlcourse_open/blob/master/jupyter_russian/project_alice/week1_prepare_dataset.ipynb?flush_cache=true)

In [1]:
%load_ext watermark

In [2]:
%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn -g

CPython 3.7.6
IPython 7.11.1

numpy 1.18.1
scipy 1.3.2
pandas 0.25.3
matplotlib 3.1.1
statsmodels 0.10.1
sklearn 0.22.1

compiler   : GCC 7.3.0
system     : Linux
release    : 5.3.0-26-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   :


In [3]:
from __future__ import division, print_function
# отключим всякие предупреждения Anaconda
import warnings
warnings.filterwarnings('ignore')
from glob import glob
import os
import pickle
#pip install tqdm
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

Посмотрим на один из файлов с данными о посещенных пользователем (номер 31) веб-страницах

In [4]:
!ls

10users   4_answer			    task1.ipynb
150users  5_answer			    task2.ipynb
1_answer  capstone_user_identification	    train_data_10users.csv
2_answer  capstone_user_identification.zip  train_data_150users.csv
3_answer  main.py			    users.py
3users	  __pycache__			    week1_prepare_dataset.ipynb


In [5]:
# Поменяйте на свой путь к данным
PATH_TO_DATA = '.'
user31_data = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                       '10users/user0031.csv'))

In [6]:
user31_data.head()

Unnamed: 0,timestamp,site
0,2013-11-15 08:12:07,fpdownload2.macromedia.com
1,2013-11-15 08:12:17,laposte.net
2,2013-11-15 08:12:17,www.laposte.net
3,2013-11-15 08:12:17,www.google.com
4,2013-11-15 08:12:18,www.laposte.net


Поставим задачу классификации: идентифицировать пользователя по сессии из 10 подряд посещенных сайтов. Объектом в этой задаче будет сессия из 10 сайтов, последовательно посещенных одним и тем же пользователем, признаками – индексы этих 10 сайтов (чуть позже здесь появится "мешок" сайтов, подход Bag of Words). Целевым классом будет id пользователя.

Реализуйте функцию prepare_train_set, которая принимает на вход путь к каталогу с csv-файлами path_to_csv_files и параметр session_length – длину сессии, а возвращает 2 объекта:
- DataFrame, в котором строки соответствуют уникальным сессиям из session_length сайтов, session_length столбцов – индексам этих session_length сайтов и последний столбец – ID пользователя
- частотный словарь сайтов вида {'site_string': [site_id, site_freq]}, например для недавнего игрушечного примера это будет {'vk.com': (1, 2), 'google.com': (2, 2), 'yandex.ru': (3, 3), 'facebook.com': (4, 1)}


In [7]:
from collections import Counter

def get_sites2(path_to_csv_files, session_length=10):
    """Returns :
    - dictionary of sites were used by users.
    """
    # Vars:

    # dictionary of site id's
    sites = Counter()
    # sessions counter
    sessions_counter = 0
    # list of user files
    fns = glob(path_to_csv_files+"*")

    # for every file
    for f in fns:
        # read user data
        ud = pd.read_csv(os.path.join(PATH_TO_DATA, f))
        # determine sites were visited
        sites.update(Counter(ud['site']))
#       up session counter
        sessions_counter += int(np.ceil(len(ud)/session_length))
    # return dictionary with sorted and enumerated sites
    result = {}
    for i, x in enumerate(sites.most_common()):
        result[x[0]] = (i+1, x[1])
    return result, sessions_counter, fns

In [8]:
def prepare_train_set2(path_to_csv_files, session_length=10):
    """Returns:
    train_data : training table;
    sites : dictionary of sites"""

#     Get dictionary of sites
    sites, session_counter, fns = get_sites2(path_to_csv_files, session_length)

#     Get columns for train table
    site_columns = []
    for i in range(session_length):
        site_columns.append(f"site{i+1}")
    site_columns.append("uid")

#     Training table
    train_data = np.zeros(shape=(session_counter, len(site_columns)))
#     train_data = csr_matrix((session_counter, len(site_columns)))

#     Row counter
    active_row = 0

#     For every user file
    for f in fns:
        #         Read user data to DataFrame
        d = pd.read_csv(os.path.join(PATH_TO_DATA, f))

#         Add site id from dictionary of sites
        d['sid'] = d['site'].apply(lambda x: sites[x][0])

#         Determine number of new rows:
        raws_number = int(np.ceil(len(d)/session_length))

#         Write session data to train table:
        for i, s in enumerate(d['sid']):
            row = active_row + i // session_length
            col = i % session_length
            train_data[row, col] = s
            
#         Write user data to train table
        for i in range(active_row, active_row+raws_number):
            train_data[i, session_length] = int(f[-8:-4])

#         Update row counter
        active_row += raws_number

    return train_data, sites

In [9]:
!ls | grep users

10users
150users
3users
train_data_10users.csv
train_data_150users.csv
users.py


In [10]:
!ls ./3users/

user0001.csv  user0002.csv  user0003.csv


Test on small data

In [11]:
%%time
path = "3users/"
train_data_toy, _ = prepare_train_set2(path)
train_data_toy

CPU times: user 30.6 ms, sys: 0 ns, total: 30.6 ms
Wall time: 29.2 ms


array([[ 3.,  2.,  2.,  7.,  2.,  1.,  8.,  5.,  9., 10.,  1.],
       [ 3.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 4.,  1.,  2.,  1.,  2.,  1.,  1.,  5., 11.,  4.,  3.],
       [ 4.,  1.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.],
       [ 3.,  2.,  6.,  6.,  2.,  0.,  0.,  0.,  0.,  0.,  2.]])

In [12]:
%%time
path = "10users/"
train_data_10users, site_freq_10users = prepare_train_set2(path)

CPU times: user 399 ms, sys: 48.4 ms, total: 448 ms
Wall time: 446 ms


In [13]:
len(train_data_10users)

14061

In [14]:
len(site_freq_10users)

4913

In [15]:
%%time
path = "150users/"
train_data_150users, site_freq_150users = prepare_train_set2(path)

CPU times: user 4.04 s, sys: 259 ms, total: 4.3 s
Wall time: 4.3 s


In [16]:
len(train_data_150users)

137019

In [17]:
len(site_freq_150users)

27797

In [18]:
import itertools
top10 = tuple(itertools.islice(site_freq_150users.items(),10))
answer  = " ".join([x[0] for x in top10])
answer

'www.google.fr www.google.com www.facebook.com apis.google.com s.youtube.com clients1.google.com mail.google.com plus.google.com safebrowsing-cache.google.com www.youtube.com'

In [19]:
def write_answer(i, answer):
    with open(f"{i}_answer", mode = "w") as file:
        file.write(str(answer))

In [20]:
write_answer(1, 14061)
write_answer(2, 4913)
write_answer(3, 137019)
write_answer(4, 27797)
write_answer(5, answer)

In [21]:
!ls | grep answer

1_answer
2_answer
3_answer
4_answer
5_answer


In [22]:
pd.DataFrame(train_data_10users).to_csv(os.path.join(PATH_TO_DATA, 
                                       'train_data_10users.csv'), 
                        index_label='session_id', float_format='%d')
pd.DataFrame(train_data_150users).to_csv(os.path.join(PATH_TO_DATA, 
                                        'train_data_150users.csv'), 
                         index_label='session_id', float_format='%d')

In [24]:
pd.DataFrame(train_data_toy).to_csv(os.path.join(PATH_TO_DATA, 
                                       'train_data_toy.csv'), 
                        index_label='session_id', float_format='%d')