# Store recommendations in Azure Cosmos DB

In this notebook, we train our best model, run the predictions, and store the results in Azure Cosmos DB.

The recommendations are queried by the [Azure Function](https://github.com/fleuryc/oc_p9_function "Azure Function") consumed by the [Mobile App](https://github.com/fleuryc/oc_p9_mobile-app "Mobile App").

In [1]:
## Download raw data

!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
## Import libraries

import os
import sys

# Add source directory to python path
sys.path.append(os.path.abspath("../"))


from datetime import datetime
from pathlib import Path
from azure.cosmos import CosmosClient, PartitionKey

import pandas as pd
import src.data.helpers as data_helpers
from implicit.als import AlternatingLeastSquares
from scipy import sparse
from tqdm import tqdm
from dotenv import load_dotenv

# Load variables from a local `.env` file (if any) into the process environment,
# then read the Cosmos DB connection settings. Both values are None when the
# corresponding variable is not set.
load_dotenv()
AZURE_COSMOSDB_URI = os.environ.get("AZURE_COSMOSDB_URI")
AZURE_COSMOSDB_KEY = os.environ.get("AZURE_COSMOSDB_KEY")


# Use Plotly as the pandas plotting backend
pd.set_option("plotting.backend", "plotly")

# Directory holding the raw data files, relative to the notebook
RAW_DATA_PATH = "../data/raw"

# In-memory cache shared across cells (starts empty)
CACHE = {}

In [3]:
## Load click data

# Human-readable labels for the numeric codes stored in the click files.
CLICK_ENVIRONMENT_LABELS = {
    "1": "1 - Facebook Instant Article",
    "2": "2 - Mobile App",
    "3": "3 - AMP (Accelerated Mobile Pages)",
    "4": "4 - Web",
}

CLICK_DEVICE_GROUP_LABELS = {
    "1": "1 - Tablet",
    "2": "2 - TV",
    "3": "3 - Empty",
    "4": "4 - Mobile",
    "5": "5 - Desktop",
}

CLICK_OS_LABELS = {
    "1": "1 - Other",
    "2": "2 - iOS",
    "3": "3 - Android",
    "4": "4 - Windows Phone",
    "5": "5 - Windows Mobile",
    "6": "6 - Windows",
    "7": "7 - Mac OS X",
    "8": "8 - Mac OS",
    "9": "9 - Samsung",
    "10": "10 - FireHbbTV",
    "11": "11 - ATV OS X",
    "12": "12 - tvOS",
    "13": "13 - Chrome OS",
    "14": "14 - Debian",
    "15": "15 - Symbian OS",
    "16": "16 - BlackBerry OS",
    "17": "17 - Firefox OS",
    "18": "18 - Android",
    "19": "19 - Brew MP",
    "20": "20 - Chromecast",
    "21": "21 - webOS",
    "22": "22 - Gentoo",
    "23": "23 - Solaris",
}

# Per-column value replacements applied to every click file after loading.
CLICK_VALUE_LABELS = {
    "click_environment": CLICK_ENVIRONMENT_LABELS,
    "click_deviceGroup": CLICK_DEVICE_GROUP_LABELS,
    "click_os": CLICK_OS_LABELS,
}

# Column dtypes requested from the CSV reader.
CLICK_COLUMN_DTYPES = {
    "user_id": "category",
    "session_id": "category",
    "session_size": "int",
    "click_article_id": "category",
    "click_environment": "category",
    "click_deviceGroup": "category",
    "click_os": "category",
    "click_country": "category",
    "click_region": "category",
    "click_referrer_type": "category",
}


def _parse_epoch_millis(x):
    """Convert an epoch timestamp expressed in milliseconds to a ``datetime``.

    The value is truncated to whole seconds. NOTE: ``datetime.fromtimestamp``
    interprets the timestamp in the local time zone of the machine running the
    kernel, so the resulting naive datetimes depend on that machine's settings.
    """
    return datetime.fromtimestamp(int(int(x) / 1000))


def _read_click_file(click_file_path):
    """Read one click CSV file and replace the coded values with their labels.

    Args:
        click_file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with the two timestamp columns parsed and the environment,
        device group and OS codes replaced by human-readable labels.
    """
    hourly_clicks = pd.read_csv(
        click_file_path,
        parse_dates=["session_start", "click_timestamp"],
        date_parser=_parse_epoch_millis,
        dtype=CLICK_COLUMN_DTYPES,
    )
    return hourly_clicks.replace(CLICK_VALUE_LABELS)


# Hourly click files, processed in sorted file-name order.
click_file_paths = sorted(
    Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv")
)

clicks = pd.concat(
    [_read_click_file(click_file_path) for click_file_path in tqdm(click_file_paths)],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

# Force both timestamp columns to datetime64[ns] before handing the frame to the
# project helper (presumably it downcasts dtypes to save memory; see
# src.data.helpers for the exact behaviour).
clicks = data_helpers.reduce_dataframe_memory_usage(
    clicks.astype(
        {
            timestamp_column: "datetime64[ns]"
            for timestamp_column in ("session_start", "click_timestamp")
        }
    )
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [01:26<00:00,  4.44it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 16:17:08.013155328,3.901885,,2017-10-08 16:51:05.070374400,,,,,,
min,,,2017-10-01 04:37:03,2.0,,2017-10-01 05:00:00,,,,,,
25%,,,2017-10-04 15:35:52,2.0,,2017-10-04 16:20:52,,,,,,
50%,,,2017-10-08 22:09:00,3.0,,2017-10-08 22:35:30,,,,,,
75%,,,2017-10-11 21:16:54,4.0,,2017-10-11 21:43:24,,,,,,
max,,,2017-10-17 05:36:19,124.0,,2017-11-13 21:04:14,,,,,,


In [9]:
## Create implicit ratings

# The implicit rating of a (user, article) pair is the number of click rows
# recorded for it. `reset_index()` exposes the row index as an "index" column,
# which is what gets counted within each group.
ratings = (
    clicks.reset_index()
    .groupby(["user_id", "click_article_id"])["index"]
    .count()
    .rename("rating")
    .reset_index()
)
ratings

Unnamed: 0,user_id,click_article_id,rating
0,0,157541,1
1,0,160158,1
2,0,233470,1
3,0,313996,1
4,0,68866,1
...,...,...,...
2950705,99998,64329,1
2950706,99999,168784,1
2950707,99999,225055,1
2950708,99999,272143,1


In [10]:
## Create implicit ratings sparse matrix

# Row index = user id, column index = article id, value = click count.
user_rows = ratings["user_id"].astype("int")
article_cols = ratings["click_article_id"].astype("int")

ratings_sparse = sparse.csr_matrix((ratings["rating"], (user_rows, article_cols)))

ratings_sparse

<322897x364047 sparse matrix of type '<class 'numpy.int64'>'
	with 2950710 stored elements in Compressed Sparse Row format>

In [12]:
## Train ALS model

# Seed the model's random initialisation so that re-running the notebook trains
# comparable factors and stores comparable recommendations instead of a
# different set on every run.
ALS_RANDOM_STATE = 42

# All other hyper-parameters are left at the library defaults.
model = AlternatingLeastSquares(random_state=ALS_RANDOM_STATE)
model.fit(ratings_sparse)

  0%|          | 0/15 [00:00<?, ?it/s]

In [14]:
## Test ALS model

# Sanity check on a single user: 5890 is the most frequent user_id reported by
# the click-data summary displayed earlier in this notebook.
sample_user_id = 5890

# Returns the ids of the 10 recommended articles and their scores.
model.recommend(sample_user_id, ratings_sparse[sample_user_id], N=10)

(array([ 70591, 202493, 206735, 236444, 208150, 236207, 207720,  36399,
        209013,  70646], dtype=int32),
 array([1.5931561, 1.4138165, 1.1736898, 1.1341774, 1.1058738, 1.0358618,
        0.9959603, 0.9771067, 0.9610278, 0.9574686], dtype=float32))

In [31]:
## Create CosmosDB client and database

# Fail fast with an actionable message: the connection settings are read with
# os.getenv(), which yields None for unset variables, and passing None on to the
# Cosmos SDK produces a much less obvious error.
missing_settings = [
    setting_name
    for setting_name, setting_value in {
        "AZURE_COSMOSDB_URI": AZURE_COSMOSDB_URI,
        "AZURE_COSMOSDB_KEY": AZURE_COSMOSDB_KEY,
    }.items()
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing Azure Cosmos DB settings: "
        + ", ".join(missing_settings)
        + ". Define them in the environment or in a .env file."
    )

client = CosmosClient(AZURE_COSMOSDB_URI, AZURE_COSMOSDB_KEY)

# Both calls are idempotent: existing resources are reused, missing ones created.
database = client.create_database_if_not_exists(id="Recommendation")

# Items in this container are partitioned on their own `id` field.
container = database.create_container_if_not_exists(
    id="UserArticles",
    partition_key=PartitionKey(path="/id"),
)

In [None]:
## Insert recommendations into CosmosDB

# Compute the top-10 recommendations for every user that has at least one rating.
# `recos` holds one row of article ids per entry of `user_ids`, in the same order;
# the scores returned alongside are not stored.
user_ids = ratings["user_id"].unique().astype("int")
recos, _ = model.recommend(user_ids, ratings_sparse[user_ids], N=10)

# Wrapping a bare generator hides its length from tqdm, so pass `total`
# explicitly to get a real progress bar with percentage and ETA.
for user_id, article_ids in tqdm(zip(user_ids, recos), total=len(user_ids)):
    # One document per user, keyed by the user id; upsert makes re-runs overwrite
    # previous recommendations instead of failing on existing ids.
    container.upsert_item(
        body={
            "id": str(user_id),
            "articles": [str(article_id) for article_id in article_ids],
        }
    )