### Data Splitting

This notebook contains code to split the data into training and test sets. The data can also be split into training, validation and test sets. See the inline comments for more detail.

In [51]:
import json   

#### Create package to id dict for packages. Maps each package to a unique id.

In [None]:
# Give every package a consecutive integer id, in order of first appearance.
with open('manifest-list-trimmed-unique.json', 'r') as f:
    content = json.load(f)
    x = {}
    for manifest in content[0].get('package_list'):
        for name in manifest:
            # Empty package names get no id. setdefault leaves an already
            # assigned id untouched, so len(x) is only used for new packages.
            if name:
                x.setdefault(name, len(x))

In [None]:
# Persist the package -> id mapping as JSON.
with open('package-to-id-dict-normalized.json', 'w') as out_file:
    json.dump(x, out_file)

#### Create item id to package mapping

In [None]:
# Invert the package -> id mapping so that an id can be turned back into its package name.
id_to_package_dict = dict(zip(x.values(), x.keys()))

In [None]:
# Persist the id -> package mapping as JSON.
with open('id-to-package-dict.json', 'w') as out_file:
    json.dump(id_to_package_dict, out_file)

#### Create id to manifest mapping. Each unique manifest is mapped to a unique id.

In [None]:
# Number the manifests 0, 1, 2, ... in the order they appear in the input file.
with open('manifest-list-trimmed-unique.json', 'r') as f:
    content = json.load(f)
    id_to_manifest_dict = dict(enumerate(content[0].get('package_list')))

In [None]:
# Persist the id -> manifest mapping as JSON.
with open('id-to-manifest-dict.json', 'w') as out_file:
    json.dump(id_to_manifest_dict, out_file)

#### Create a manifest to id mapping

In [None]:
# Reverse lookup: the set of packages in a manifest -> the manifest's id.
manifest_to_id_dict = {}
for manifest_id, packages in id_to_manifest_dict.items():
    # frozenset is hashable, so it can be used as a dictionary key.
    manifest_to_id_dict[frozenset(packages)] = manifest_id

In [None]:
# Persist the manifest -> id mapping. Pickle handles the frozenset keys,
# which JSON object keys cannot represent.
import pickle

with open('manifest-to-id.pickle', 'wb') as out_file:
    pickle.dump(manifest_to_id_dict, out_file, pickle.HIGHEST_PROTOCOL)

In [52]:
import pandas

In [None]:
# Read the id -> manifest mapping back and lay it out with one row per manifest id.
with open('id-to-manifest-dict.json', 'r') as f:
    content = json.load(f)
df = pandas.DataFrame.from_dict(content, orient='index')

In [None]:
# Drop the reference to the parsed JSON; it is not used again below.
del content

#### Create a dataframe with three columns (UserId, ItemId, Count) covering every manifest. Each manifest is treated as a user, its packages are the items, and the count is always 1.

In [None]:
data_mapping_list = []

In [None]:
# Build one {"UserId", "ItemId", "Count"} record per (manifest, package) pair
# from the two JSON files written earlier in this notebook.
with open('id-to-manifest-dict.json', 'r') as m, open('package-to-id-dict-normalized.json', 'r') as p:
    content_man = json.load(m)
    content_pack = json.load(p)

# Start from an empty list in this cell, so that re-running the cell does not
# append every record a second time.
data_mapping_list = []
for k, v in content_man.items():
    # JSON object keys are strings; turn the manifest id back into an int.
    userId = int(k)
    for package in v:
        # Empty package names were never given an id when the package -> id
        # mapping was built, so looking them up would raise KeyError.
        if not package:
            continue
        itemId = content_pack[package]
        data_mapping_list.append(
            {
                "UserId": userId,
                "ItemId": itemId,
                "Count": 1
            }
        )

In [None]:
# Persist the list of user/item/count records as JSON.
with open('user-item-matrix.json', 'w') as out_file:
    json.dump(data_mapping_list, out_file)

#### The model accepts the data in form of a dataframe. Hence load the user item matrix as a dataframe.

In [6]:
# Each JSON record becomes one row with the columns Count, ItemId and UserId.
with open('user-item-matrix.json', 'r') as matrix_file:
    df = pandas.read_json(matrix_file, orient='records')

In [7]:
import numpy as np

In [8]:
df.head()

Unnamed: 0,Count,ItemId,UserId
0,1,0,0
1,1,1,0
2,1,2,0
3,1,3,0
4,1,4,1


#### Group the dataframe by UserId, since we want every user to be part of the training data. This is because the model can only validate the log likelihood on users and items it has already seen.

In [9]:
# One group per user, so that each user's rows can be split independently.
df_user_id = df.groupby(by="UserId")

In [10]:
df_user_id.head()

Unnamed: 0,Count,ItemId,UserId
0,1,0,0
1,1,1,0
2,1,2,0
3,1,3,0
4,1,4,1
5,1,2,1
6,1,5,1
7,1,6,1
8,1,7,2
9,1,8,2


#### This creates a train, validation and test split of the data: 60% goes to training, and 20% each to validation and test.

In [11]:
def train_test_validate_split(df, random_state=None):
    """Shuffle ``df`` and cut it into train, validate and test pieces.

    The first 60% of the shuffled rows form the training piece, the next 20%
    the validation piece and the remaining rows the test piece. The cut
    points are truncated to integers, so a very small frame can yield empty
    leading pieces (a single-row frame puts its only row in the last piece).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` gives a
        different shuffle on every call; pass a value for a repeatable split.

    Returns
    -------
    list of pandas.DataFrame
        ``[train, validate, test]``; rows keep their original index labels.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    n_rows = len(shuffled)
    train_end = int(.6 * n_rows)
    validate_end = int(.8 * n_rows)
    # Positional slicing yields the same pieces as np.split, without going
    # through the deprecated DataFrame.swapaxes that np.split relies on.
    return [
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validate_end],
        shuffled.iloc[validate_end:],
    ]

#### This creates only train and test split. 80% is training and 20% is testing.

In [12]:
def train_test_split(df, random_state=None):
    """Shuffle ``df`` and cut it into a train piece and a test piece.

    The first 80% of the shuffled rows form the training piece and the
    remaining rows the test piece. The cut point is truncated to an integer,
    so a single-row frame yields an empty training piece and a one-row test
    piece.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` gives a
        different shuffle on every call; pass a value for a repeatable split.

    Returns
    -------
    list of pandas.DataFrame
        ``[train, test]``; rows keep their original index labels.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    train_end = int(.8 * len(shuffled))
    # Positional slicing yields the same pieces as np.split, without going
    # through the deprecated DataFrame.swapaxes that np.split relies on.
    return [shuffled.iloc[:train_end], shuffled.iloc[train_end:]]

In [13]:
df_user_id.head()

Unnamed: 0,Count,ItemId,UserId
0,1,0,0
1,1,1,0
2,1,2,0
3,1,3,0
4,1,4,1
5,1,2,1
6,1,5,1
7,1,6,1
8,1,7,2
9,1,8,2


In [14]:
len(df_user_id)

69134

#### Split the dataframe grouped by UserId into train and test

In [15]:
"""This is slow somehow. Need to optimize it."""
# Calls train_test_split once per UserId group; each entry of the result
# holds the list of pieces returned for that user.
dataframe = df_user_id.apply(train_test_split)

In [16]:
dataframe[5]

[    Count  ItemId  UserId
 51      1      45       5
 58      1      52       5
 52      1      46       5
 63      1      57       5
 34      1      12       5
 59      1      53       5
 57      1      51       5
 42      1       3       5
 49      1      44       5
 61      1      55       5
 56      1      50       5
 36      1      33       5
 68      1      62       5
 29      1      28       5
 40      1      37       5
 62      1      56       5
 64      1      58       5
 67      1      61       5
 37      1      34       5
 53      1      47       5
 65      1      59       5
 44      1      40       5
 35      1      32       5
 38      1      35       5
 31      1      30       5
 48      1      43       5
 55      1      49       5
 43      1      39       5
 39      1      36       5
 47      1      42       5
 66      1      60       5
 60      1      54       5,     Count  ItemId  UserId
 30      1      29       5
 32      1      10       5
 33      1      31       5
 

In [28]:
# Per-user pieces are collected in these lists and concatenated into full frames later.
list_df_train, list_df_test, list_df_validate = [], [], []

#### This logic is used to make sure that every user is in training. For details on how data is split into train, test and validation sets for implicit feedback systems, have a look [here](https://jessesw.com/Rec-System/).

In [24]:
# """This is how this works:

#     Data is split into train, validate and test parts: 60% train, 20% validate and 20% test data.
#     A user may end up with no entries in the training data (for example, when the user interacts with only a single item);
#     in that case the user's entries are in test or validate instead, since it's a random split.
    
#     So, in that case we need to add those entries to the training data and remove them from test or validate, since it's mandatory to have all the
#     users in the training set. The item interactions for a specific user, if there are enough of them, are split between train, test and validate.
# """

# for s in dataframe:
#     """If training data is empty."""
#     if s[0].empty:
#         """If test data is empty."""
#         if s[2].empty:
#             """Add the validation data to training data."""
#             list_df_train.append(s[1])
#         else:
#             """Add test data to training data."""
#             list_df_train.append(s[2])
#             if not s[1].empty:
#                 """Add validation data to validation df."""
#                 list_df_validate.append(s[1])
#     else:
#         """Add respective splits in their respective data frames."""
#         list_df_train.append(s[0])
#         if not s[1].empty:
#             list_df_validate.append(s[1])
#         if not s[2].empty:
#             list_df_test.append(s[2])
            

#### Same explanation as above goes here, it's just that we split into train and test here.

In [29]:
# Rebuild both lists in this cell, so that re-running it cannot append the
# same per-user pieces a second time.
list_df_train = []
list_df_test = []

for s in dataframe:
    train_part, test_part = s[0], s[1]
    if train_part.empty:
        # The user has no training rows, so the test rows are promoted to
        # training and nothing from this user goes to test.
        list_df_train.append(test_part)
    else:
        list_df_train.append(train_part)
        if not test_part.empty:
            list_df_test.append(test_part)

In [30]:
"""This has to be same as the number of unique manifests."""
len(list_df_train)

69134

In [31]:
len(list_df_test)

66807

In [32]:
len(list_df_validate)

0

In [33]:
list_df_test[0]

Unnamed: 0,Count,ItemId,UserId
0,1,0,0


In [35]:
# Stack the per-user training pieces into one frame with a fresh 0..n-1 index.
training_data = pandas.concat(objs=list_df_train, ignore_index=True)

In [36]:
len(set(training_data.UserId))

69134

In [37]:
len(set(training_data.ItemId))

16959

In [38]:
len(training_data)

575278

In [39]:
training_data.head()

Unnamed: 0,Count,ItemId,UserId
0,1,2,0
1,1,3,0
2,1,1,0
3,1,2,1
4,1,6,1


In [40]:
# Stack the per-user test pieces into one frame with a fresh 0..n-1 index.
test_data = pandas.concat(objs=list_df_test, ignore_index=True)

In [41]:
len(test_data)

176179

In [42]:
test_data.head()

Unnamed: 0,Count,ItemId,UserId
0,1,0,0
1,1,4,1
2,1,12,2
3,1,8,2
4,1,17,3


In [53]:
# validate_data = pandas.concat(list_df_validate, ignore_index=True)

#### A sanity check that the splits share no rows. There shouldn't be any overlap between train, test and validate data.


In [45]:
# An inner join on every column keeps only rows present in both frames,
# so an empty result means train and test share no row.
x = training_data.merge(test_data, on=['Count', 'ItemId', 'UserId'], how='inner')
assert x.empty
# x = pandas.merge(training_data, validate_data, how='inner', on=['Count', 'ItemId', 'UserId'])
# assert x.empty
# x = pandas.merge(test_data, validate_data, how='inner', on=['Count', 'ItemId', 'UserId'])
# assert x.empty

In [46]:
len(x)

0

In [48]:
# Persist the training split next to the notebook.
training_data.to_pickle(path='./training-data.pkl')

In [49]:
# Persist the test split next to the notebook.
test_data.to_pickle(path='./test-data.pkl')

In [50]:
# validate_data.to_pickle('./validation-data.pkl')