In [266]:
import collections
import csv
import io

from types import SimpleNamespace

In [257]:
def csv_to_rows_of_strings(csv_string=None, filehandle=None, path=None):
    """Read CSV text from one of three sources and return its rows.

    Specify ONE of csv_string, filehandle, or path. If several are given,
    the first one that is not None wins, in that order. When passing a
    filehandle, follow the csv module instructions (open the file with
    newline='') and make sure its encoding is correct. Paths are opened as
    utf-8. The CSV is parsed with the default csv module settings.

    :param csv_string: str, A string containing the contents of a CSV file.
    :param filehandle: An open file object (in string mode) to a CSV file.
        It is read to the end but not closed.
    :param path: str, Path on disk to a CSV file.
    :return: list of rows, each row a list of cell strings. The header row,
        if the CSV has one, is simply the first row.
    :raises ValueError: if no source is provided.
    """
    if csv_string is not None:
        text = csv_string
    elif filehandle is not None:
        text = filehandle.read()
    elif path is not None:
        with open(path, encoding='utf8', newline='') as csvfile:
            text = csvfile.read()
    else:
        # ValueError (still an Exception subclass) matches the error that
        # FileMetrics raises for the same "no source" condition.
        raise ValueError("Must provide a source for data!")

    # Buffer the text in an in-memory StringIO so that all three sources
    # are handed to the CSV reader in exactly the same way.
    return list(csv.reader(io.StringIO(text)))

In [259]:
class RowColumnView:
    """Gives row index or column-name indexable lists of cell values.

    The first row is treated as the header row and is separated from the
    data rows. Input with fewer than two rows (no header, or a header with
    no data rows) is rejected as empty.

    Supports:
        - mydata.headers()
        - "ColumnName" in mydata  # Check if CSV has header
        - mydata[0]               # Row by index (a copy)
        - for row in mydata:
              # Do something with the row
        - for cell in mydata["ColumnName"]
              # Do something with the cell
        - Lazy load rows/columns with rowsi(), columni(), columnsi()

    Rows, columns and headers are always handed out as copies, so callers
    cannot modify the view's internal lists by accident.
    """

    def __init__(self, rows_of_strings):
        """Split rows_of_strings into a header row and data rows.

        :param rows_of_strings: list of rows (lists of str); row 0 is the
            header row.
        :raises Exception: if there are fewer than two rows.
        """
        if len(rows_of_strings) < 2:
            raise Exception('Empty CSV!')
        self._headers = rows_of_strings[0]
        self._rows = rows_of_strings[1:]

    def _header_index(self, name):
        # Position of a column name in the header row (first match wins).
        if name not in self._headers:
            raise ValueError("Column name must be in known headers()!")
        return self._headers.index(name)

    def __getitem__(self, item):
        """Return a column (str column name) or a row (int row index).

        :raises ValueError: for an unknown column name or an unsupported
            key type.
        :raises IndexError: for a row index that is out of range.
        """
        # Column names return a column
        if isinstance(item, str):
            index = self._header_index(item)
            return [row[index] for row in self._rows]
        # Integers return a copy of that data row
        if isinstance(item, int):
            return list(self._rows[item])
        raise ValueError("Must provide a string column name or row index!")

    def __contains__(self, item):
        """True if item is one of the header (column) names."""
        return item in self._headers

    def __len__(self):
        """Number of data rows (the header row is not counted)."""
        return len(self._rows)

    def __iter__(self):
        """Iterate over copies of the data rows."""
        return self.rowsi()

    def headers(self):
        """Return a copy of the header row."""
        return list(self._headers)

    def rowsi(self):
        """Iterator (lazy load) over copies of the data rows."""
        return (list(row) for row in self._rows)

    def rows(self):
        """Return all data rows as a list of row copies."""
        return list(self.rowsi())

    def columni(self, item):
        """Iterator (lazy load) over one column.

        :param item: str column name or int column index.
        :raises ValueError: for an unknown column name.
        :raises TypeError: if item is neither str nor int.
        """
        if isinstance(item, str):
            index = self._header_index(item)
        elif isinstance(item, int):
            index = item
        else:
            raise TypeError("Must provide a string column name or row index!")
        return (row[index] for row in self._rows)

    def columnsi(self):
        """List of iterators (lazy load), one per header column."""
        return [self.columni(index) for index in range(len(self._headers))]

    def columns(self):
        """Return every column as a list of cell values, in header order."""
        # Built by position rather than by name, so that columns sharing a
        # header name are each returned (a name lookup would always find
        # the first one).
        return [list(column) for column in self.columnsi()]

In [262]:
# Parse the traffic and search analytics CSV files (expected in the current
# working directory) into raw rows of strings, header row included.
traffic_rows = csv_to_rows_of_strings(path=r'readthedocs_traffic_analytics_jupyterlab_2023-10-20_2024-01-18.csv')
search_rows = csv_to_rows_of_strings(path=r'readthedocs_search_analytics_jupyterlab_2023-10-20_2024-01-18.csv')
# Wrap each set of rows so it can be accessed by row index or column name.
traffic_view = RowColumnView(traffic_rows)
search_view = RowColumnView(search_rows)

In [265]:
# Display the header row of each file to check the column names.
traffic_view.headers(), search_view.headers()

(['Date', 'Version', 'Path', 'Views'],
 ['Created Date', 'Query', 'Total Results'])

In [320]:
class FileMetrics:
    """Summary metrics over one analytics CSV file (traffic or search).

    The file is classified when it is loaded, from its header row: it is a
    TRAFFIC file if it has all of TRAFFIC_HDR_LIST, otherwise a SEARCH file
    if it has all of SEARCH_HDR_LIST. Extra columns are allowed.

    The traffic methods (total_views, most_popular_pages,
    most_popular_versions) need the traffic columns and
    most_popular_queries needs the search columns; calling one on a file
    that lacks those columns raises ValueError from the header lookup.
    """

    TYPES = SimpleNamespace(
        TRAFFIC='TRAFFIC',
        SEARCH='SEARCH',
    )
    TRAFFIC_HEADERS = SimpleNamespace(
        DATE='Date',
        VERSION='Version',
        PATH='Path',
        VIEWS='Views'
    )
    THDRS = TRAFFIC_HEADERS
    TRAFFIC_HDR_LIST = [TRAFFIC_HEADERS.DATE, TRAFFIC_HEADERS.VERSION,
                        TRAFFIC_HEADERS.PATH, TRAFFIC_HEADERS.VIEWS]
    SEARCH_HEADERS = SimpleNamespace(
        CREATED_DATE='Created Date',
        QUERY='Query',
        TOTAL_RESULTS='Total Results',
    )
    SHDRS = SEARCH_HEADERS
    SEARCH_HDR_LIST = [SEARCH_HEADERS.CREATED_DATE, SEARCH_HEADERS.QUERY,
                       SEARCH_HEADERS.TOTAL_RESULTS]

    def __init__(self, csv_string=None, filehandle=None, path=None):
        """Load a CSV from ONE of csv_string, filehandle, or path.

        :param csv_string: str, A string containing the contents of a CSV file.
        :param filehandle: An open file object (in string mode) to a CSV file.
        :param path: str, Path on disk to a CSV file.
        :raises ValueError: if no source is given, or if the header row
            matches neither the traffic nor the search header set.
        """
        if csv_string is None and filehandle is None and path is None:
            raise ValueError("Must provide a data source!")

        rows = csv_to_rows_of_strings(csv_string, filehandle, path)
        self._sheet = RowColumnView(rows)

        # Superset checks: the file may carry extra columns beyond the
        # expected ones. Traffic is tested first, so it wins if both match.
        present = set(self._sheet.headers())
        if present >= set(FileMetrics.TRAFFIC_HDR_LIST):
            self._type = FileMetrics.TYPES.TRAFFIC
        elif present >= set(FileMetrics.SEARCH_HDR_LIST):
            self._type = FileMetrics.TYPES.SEARCH
        else:
            raise ValueError("Ingested CSV is missing expected headers!")

    def type(self):
        """Return the detected file type, one of the TYPES values."""
        return self._type

    def is_traffic(self):
        """True if the file was detected as a traffic file."""
        return self._type == FileMetrics.TYPES.TRAFFIC

    def is_search(self):
        """True if the file was detected as a search file."""
        return self._type == FileMetrics.TYPES.SEARCH

    def headers(self):
        """Return the header row of the loaded file."""
        return self._sheet.headers()

    def col_index(self, column_name):
        """Return the position of column_name in the header row.

        :raises ValueError: if the file has no such column.
        """
        return self._sheet.headers().index(column_name)

    def rows_with_val(self, column_name, value):
        """Return a list of rows where the specified column value == value"""
        column_index = self.col_index(column_name)
        return [row for row in self._sheet if row[column_index] == value]

    def by_column_val(self, column_name):
        """Return a dict of {column_val: row_list} for the given column name"""
        column_index = self.col_index(column_name)
        rows_by_value = {}
        for row in self._sheet:
            rows_by_value.setdefault(row[column_index], []).append(row)
        return rows_by_value

    def _sum_by_column(self, key_column, value_column):
        # Sum int(value_column) for each distinct key_column value and
        # return (key, total) pairs ordered from largest total to smallest.
        key_index = self.col_index(key_column)
        value_index = self.col_index(value_column)
        totals = collections.Counter()
        for row in self._sheet:
            totals[row[key_index]] += int(row[value_index])
        return totals.most_common()

    def total_views(self):
        """Return the sum of the 'Views' column over all rows."""
        view_index = self.col_index(FileMetrics.THDRS.VIEWS)
        return sum(int(row[view_index]) for row in self._sheet)

    def most_popular_queries(self):
        """Return (query, summed 'Total Results') pairs, largest sum first."""
        # NOTE(review): this ranks each query by the sum of its
        # 'Total Results' values, not by how many rows contain the query.
        # Confirm that this is the intended measure of popularity.
        return self._sum_by_column(FileMetrics.SHDRS.QUERY,
                                   FileMetrics.SHDRS.TOTAL_RESULTS)

    def most_popular_pages(self):
        """Return (path, summed 'Views') pairs, largest sum first."""
        return self._sum_by_column(FileMetrics.THDRS.PATH,
                                   FileMetrics.THDRS.VIEWS)

    def most_popular_versions(self):
        """Return (version, summed 'Views') pairs, largest sum first."""
        return self._sum_by_column(FileMetrics.THDRS.VERSION,
                                   FileMetrics.THDRS.VIEWS)

In [321]:
# Load the traffic analytics CSV; FileMetrics classifies the file as
# traffic or search from its header row.
met = FileMetrics(path='readthedocs_traffic_analytics_jupyterlab_2023-10-20_2024-01-18.csv')
met.headers()

['Date', 'Version', 'Path', 'Views']

In [322]:
# Load the search analytics CSV; FileMetrics classifies the file as
# traffic or search from its header row.
smet = FileMetrics(path='readthedocs_search_analytics_jupyterlab_2023-10-20_2024-01-18.csv')
smet.headers()

['Created Date', 'Query', 'Total Results']

In [325]:
# Paths ranked by their summed 'Views', highest first.
met.most_popular_pages()

[('/index.html', 93495),
 ('/getting_started/installation.html', 87974),
 ('/getting_started/starting.html', 47081),
 ('/getting_started/overview.html', 33396),
 ('/privacy_policies.html', 19053),
 ('/user/extensions.html', 16403),
 ('/user/interface.html', 15071),
 ('/getting_started/faq.html', 14677),
 ('/user/debugger.html', 14214),
 ('/user/files.html', 11785),
 ('/user/index.html', 9327),
 ('/extension/extension_dev.html', 8945),
 ('/user/urls.html', 8565),
 ('/user/notebook.html', 7898),
 ('/user/directories.html', 7713),
 ('/user/terminal.html', 7486),
 ('/user/rtc.html', 6987),
 ('/search.html', 6481),
 ('/user/toc.html', 6040),
 ('/extension/extension_tutorial.html', 5916),
 ('/getting_started/changelog.html', 5905),
 ('/user/commands.html', 5763),
 ('/user/export.html', 5298),
 ('/user/running.html', 4780),
 ('/user/file_editor.html', 4699),
 ('/user/jupyterhub.html', 4658),
 ('/user/code_console.html', 4517),
 ('/developer/contributing.html', 4439),
 ('/user/file_formats.htm

In [324]:
# Queries ranked by their summed 'Total Results', highest first.
smet.most_popular_queries()

[('pip install -e .', 18857),
 ('pip install jupyterlab', 8647),
 ('generative-ai-jupyterlab', 8628),
 ('download jupyterlab', 6558),
 ('ad -xyz', 6489),
 ('python -m build', 5892),
 ('could not determine jupyterlab build status without nodejs', 4456),
 ('-ip', 4410),
 ('jupyterlab desktop', 4380),
 ('uninstall jupyterlab', 4379),
 ('jupyterlab server', 4316),
 ('jupyterlab', 4313),
 ('jupyter lab -f', 3729),
 ('notebook', 3030),
 ('extension', 2451),
 ("import { reactwidget } from '@jupyterlab/apputils';", 2230),
 ('jupyterlab-git', 2228),
 ('run jupyterlab in an virtual environment', 2228),
 ('activate a virtual environment for jupyterlab', 2228),
 ('how to run jupyterlab remotely', 2227),
 ('jupyterlab download', 2226),
 ('jupyterlab 下载', 2226),
 ('@jupyterlab/services', 2226),
 ('jupyterlab whl', 2226),
 ('check for jupyterlab updates', 2226),
 ('@jupyterlab/application-extension:logo', 2225),
 ('updating jupyterlab', 2223),
 ('pip install --upgrade jupyterlab', 2223),
 ('captainji

In [301]:
# Group the traffic rows by their 'Path' value: {path: [row, ...]}.
by_path = met.by_column_val('Path')

In [302]:
# Paths ranked by their summed 'Views', highest first.
met.most_popular_pages()

[('/index.html', 93495),
 ('/getting_started/installation.html', 87974),
 ('/getting_started/starting.html', 47081),
 ('/getting_started/overview.html', 33396),
 ('/privacy_policies.html', 19053),
 ('/user/extensions.html', 16403),
 ('/user/interface.html', 15071),
 ('/getting_started/faq.html', 14677),
 ('/user/debugger.html', 14214),
 ('/user/files.html', 11785),
 ('/user/index.html', 9327),
 ('/extension/extension_dev.html', 8945),
 ('/user/urls.html', 8565),
 ('/user/notebook.html', 7898),
 ('/user/directories.html', 7713),
 ('/user/terminal.html', 7486),
 ('/user/rtc.html', 6987),
 ('/search.html', 6481),
 ('/user/toc.html', 6040),
 ('/extension/extension_tutorial.html', 5916),
 ('/getting_started/changelog.html', 5905),
 ('/user/commands.html', 5763),
 ('/user/export.html', 5298),
 ('/user/running.html', 4780),
 ('/user/file_editor.html', 4699),
 ('/user/jupyterhub.html', 4658),
 ('/user/code_console.html', 4517),
 ('/developer/contributing.html', 4439),
 ('/user/file_formats.htm