Add metrics #841

Merged · 19 commits · Nov 13, 2023
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -127,7 +127,7 @@ jobs:
- name: Export examples
run: jupyter nbconvert --to python examples/*/*.ipynb --output-dir example_scripts
- name: Download test data
- run: curl https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip -o Bike-Sharing-Dataset.zip &&
+ run: curl -k https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip -o Bike-Sharing-Dataset.zip &&
unzip Bike-Sharing-Dataset.zip -d Bike-Sharing-Dataset
- name: Run examples
run: python example_test.py
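Note on this change and the matching notebook edits below: `curl -k` and `requests.get(..., verify=False)` both skip TLS certificate verification for the UCI download, presumably because certificate validation against archive.ics.uci.edu was failing in CI (the diff itself does not state the motivation). A minimal sketch of the notebook-side pattern, which also silences the `InsecureRequestWarning` that `verify=False` otherwise emits on every request:

```python
import requests
import urllib3

# verify=False skips certificate validation, so requests warns on each call;
# disable the warning explicitly since the insecure fetch is intentional here.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip"
content = requests.get(url, verify=False).content
```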
2 changes: 1 addition & 1 deletion examples/data_stories/bicycle_demand_monitoring.ipynb
@@ -69,7 +69,7 @@
},
"outputs": [],
"source": [
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\").content\n",
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\", verify=False).content\n",
"with zipfile.ZipFile(io.BytesIO(content)) as arc:\n",
" raw_data = pd.read_csv(arc.open(\"hour.csv\"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')"
]
4 changes: 2 additions & 2 deletions examples/data_stories/bicycle_demand_monitoring_setup.ipynb
@@ -121,7 +121,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\").content\n",
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\", verify=False).content\n",
"with zipfile.ZipFile(io.BytesIO(content)) as arc:\n",
" raw_data = pd.read_csv(arc.open(\"hour.csv\"), header=0, sep=',', parse_dates=['dteday']) "
],
@@ -916,4 +916,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
- }
+ }
4 changes: 2 additions & 2 deletions examples/data_stories/bicycle_demand_testing.ipynb
@@ -60,7 +60,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\").content\n",
"content = requests.get(\"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\", verify=False).content\n",
"with zipfile.ZipFile(io.BytesIO(content)) as arc:\n",
" raw_data = pd.read_csv(arc.open(\"hour.csv\"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')"
],
@@ -423,4 +423,4 @@
},
"nbformat": 4,
"nbformat_minor": 1
- }
+ }
314 changes: 314 additions & 0 deletions examples/how_to_questions/how_to_run_recsys_metrics.ipynb
@@ -0,0 +1,314 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8aJYBZFNMyXc",
"collapsed": true
},
"outputs": [],
"source": [
"try:\n",
" import evidently\n",
"except:\n",
" !pip install git+https://github.com/evidentlyai/evidently.git"
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import io\n",
"import os\n",
"import zipfile\n",
"\n",
"import requests"
],
"metadata": {
"id": "UfuNPLwjO99K"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install implicit"
],
"metadata": {
"id": "8A_dH0K0082d"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# load data"
],
"metadata": {
"id": "KjF_x-wfcZOI"
}
},
{
"cell_type": "code",
"source": [
"content = requests.get(\"http://files.grouplens.org/datasets/movielens/ml-100k.zip\").content\n",
"\n",
"with zipfile.ZipFile(io.BytesIO(content)) as arc:\n",
" train = arc.read(\"ml-100k/ua.base\").decode().split(\"\\n\")\n",
" test = arc.read(\"ml-100k/ua.test\").decode().split(\"\\n\")\n",
" movies = arc.read(\"ml-100k/u.item\").decode(encoding='latin-1').split(\"\\n\")\n",
" users = arc.read(\"ml-100k/u.user\").decode(encoding='latin-1').split(\"\\n\")"
],
"metadata": {
"id": "f1wLolXpM02U"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"columns = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
"\n",
"data = [[x for x in e.split('\\t')] for e in train]\n",
"train = pd.DataFrame(data, columns=columns).dropna().astype(int)\n",
"\n",
"data = [[x for x in e.split('\\t')] for e in test]\n",
"test = pd.DataFrame(data, columns=columns).dropna().astype(int)\n",
"\n",
"columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n",
"\n",
"data = [[x for x in e.split('|')] for e in users]\n",
"users = pd.DataFrame(data, columns=columns).dropna().astype({'user_id': int, 'age': int})\n",
"\n",
"genres = ['unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'noir',\n",
" 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']\n",
"columns = ['movie_id', 'title', 'year', '-', 'url'] + genres\n",
"data = [[x for x in e.split('|')] for e in movies]\n",
"movies = pd.DataFrame(data, columns=columns).dropna().astype({'movie_id': int})\n",
"movies.drop(columns=['-', 'url'], inplace=True)\n",
"movies[genres] = movies[genres].astype(int)\n",
"movies['moive_age'] = (pd.to_datetime(movies.year).max() - pd.to_datetime(movies.year)).dt.days / 365"
],
"metadata": {
"id": "-V1w4P5LeV4X"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Utils"
],
"metadata": {
"id": "luiAKltK9ze3"
}
},
{
"cell_type": "code",
"source": [
"def transform_predictions(k, user_ids, item_ids):\n",
" return pd.DataFrame(\n",
" data=np.c_[np.repeat(user_ids, k), item_ids.flatten(), [i + 1 for i in range(k)] * len(user_ids)],\n",
" columns=['user_id', 'movie_id', 'rank']\n",
" )\n",
"\n",
"\n",
"def prepare_prediction_df(k, user_ids, item_ids, true):\n",
" preds = transform_predictions(k, user_ids, item_ids)\n",
" preds = preds.merge(true, on=['user_id', 'movie_id'], how='outer')\n",
" preds['rank'] = preds.groupby('user_id')['rank'].transform(lambda x: x.fillna(x.max() + 1))\n",
" return preds\n",
"\n",
"\n",
"def get_embeddings(model, movies_list, users_list, factors):\n",
" item_factors = pd.DataFrame(\n",
" data=np.column_stack((movies_list, model.item_factors)),\n",
" columns=['movie_id'] + [f'item_factor_{i+1}' for i in range(factors)]\n",
" )\n",
" user_factors = pd.DataFrame(\n",
" data=np.column_stack((users_list, model.user_factors)),\n",
" columns=['user_id'] + [f'user_factor_{i+1}' for i in range(factors)]\n",
" )\n",
" return item_factors, user_factors\n",
"\n",
"\n",
"def get_full_df(df, item_factors, user_factors):\n",
" df = df.merge(movies, on=['movie_id'], how='left')\n",
" df = df.merge(users, on=['user_id'], how='left')\n",
" df = df.merge(item_factors, on=['movie_id'], how='left')\n",
" df = df.merge(user_factors, on=['user_id'], how='left')\n",
" return df"
],
"metadata": {
"id": "MqP6bLDv92hY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Train model"
],
"metadata": {
"id": "J4Z6jBMZcwnJ"
}
},
{
"cell_type": "markdown",
"source": [
"Let's get predictions from two models - ALS model and most common item recommender"
],
"metadata": {
"id": "XXba3z0w_y7p"
}
},
{
"cell_type": "code",
"source": [
"from implicit.cpu.als import AlternatingLeastSquares\n",
"from scipy.sparse import csr_matrix\n",
"pivot_table = train.pivot_table(index=['user_id'], columns=['movie_id'], values=\"rating\").fillna(0)\n",
"\n",
"als_model = AlternatingLeastSquares(factors=20, iterations=5, random_state=0)\n",
"als_model.fit(csr_matrix(pivot_table))"
],
"metadata": {
"id": "-FQDsHEA3OKw"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"ids, scores = als_model.recommend(test.user_id.unique() - 1, csr_matrix(pivot_table.loc[test.user_id.unique()]), N=30, filter_already_liked_items=True)\n",
"als_df = prepare_prediction_df(30, test.user_id.unique(), ids, test)"
],
"metadata": {
"id": "1kDXlE-FAEij"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"most_popular_top = list(train.movie_id.value_counts()[:30])\n",
"rec_array = np.array([most_popular_top] * len(test.user_id.unique()))\n",
"most_popular_df = prepare_prediction_df(30, test.user_id.unique(), rec_array, test)"
],
"metadata": {
"id": "j6z-vy8jArCq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"item_factors, user_factors = get_embeddings(als_model, pivot_table.columns, pivot_table.index, 20)\n",
"als_df = get_full_df(als_df, item_factors, user_factors)\n",
"most_popular_df = get_full_df(most_popular_df, item_factors, user_factors)\n",
"train = get_full_df(train, item_factors, user_factors)"
],
"metadata": {
"id": "3vuUgdDmAW5o"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"item_features = [f'item_factor_{i+1}' for i in range(20)]"
],
"metadata": {
"id": "VlKggz4OCysy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from evidently.metrics import PrecisionTopKMetric\n",
"from evidently.metrics import RecallTopKMetric\n",
"from evidently.metrics import FBetaTopKMetric\n",
"from evidently.metrics import MAPKMetric\n",
"from evidently.metrics import NDCGKMetric\n",
"from evidently.metrics import DiversityMetric\n",
"from evidently.metrics import ItemBiasMetric\n",
"from evidently.metrics import NoveltyMetric\n",
"from evidently.metrics import PersonalisationMetric\n",
"from evidently.metrics import PopularityBias\n",
"from evidently.metrics import SerendipityMetric\n",
"from evidently.metrics import UserBiasMetric\n",
"from evidently.pipeline.column_mapping import ColumnMapping\n",
"from evidently.report import Report"
],
"metadata": {
"id": "vxU6s88ism0_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"report = Report(metrics=[\n",
" PrecisionTopKMetric(k=5),\n",
" RecallTopKMetric(k=5),\n",
" FBetaTopKMetric(k=5),\n",
" MAPKMetric(k=5),\n",
" NDCGKMetric(k=5),\n",
" DiversityMetric(k=5, item_features=item_features),\n",
" NoveltyMetric(k=5),\n",
" PersonalisationMetric(k=5),\n",
" SerendipityMetric(k=5, item_features=item_features),\n",
" PopularityBias(k=5),\n",
" ItemBiasMetric(k=5, column_name='moive_age'),\n",
" ItemBiasMetric(k=5, column_name='crime'),\n",
" UserBiasMetric(column_name='age'),\n",
" UserBiasMetric(column_name='gender')\n",
"\n",
"\n",
"])\n",
"column_mapping=ColumnMapping(recommendations_type='rank', target='rating', prediction='rank', item_id='title', user_id='user_id')\n",
"report.run(\n",
" reference_data=most_popular_df.dropna(subset=['title', 'user_id']).fillna(0),\n",
" current_data=als_df.dropna(subset=['title', 'user_id']).fillna(0),\n",
" column_mapping=column_mapping,\n",
" additional_datasets={'current_train_data': train.dropna(subset=['title', 'user_id'])}\n",
" )\n",
"report"
],
"metadata": {
"id": "7KIQreI6tKEA"
},
"execution_count": null,
"outputs": []
}
]
}
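The notebook ends by rendering the report inline. Not part of the diff, but handy when adapting it: evidently's standard `Report` export methods persist the same results (the method names below come from the library's public API, not from this PR):

```python
# Export the computed report instead of (or in addition to) inline rendering.
report.save_html("recsys_metrics_report.html")  # standalone HTML dashboard
results = report.as_dict()                      # metric results as a Python dict
print(report.json())                            # the same results serialized to JSON
```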
2 changes: 1 addition & 1 deletion examples/how_to_questions/how_to_use_column_mapping.ipynb
@@ -136,7 +136,7 @@
"source": [
"#Timeseries dataset\n",
"url = \"https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip\"\n",
"with zipfile.ZipFile(io.BytesIO(requests.get(url).content)) as arc:\n",
"with zipfile.ZipFile(io.BytesIO(requests.get(url, verify=False).content)) as arc:\n",
" raw_data = pd.read_csv(arc.open(\"hour.csv\"), header=0, sep=',', parse_dates=['dteday'], index_col='dteday')\n",
"\n",
"raw_data.index = raw_data.index + pd.to_timedelta(raw_data['hr'], unit='h')\n",
2 changes: 0 additions & 2 deletions src/evidently/base_metric.py
@@ -112,7 +112,6 @@ class GenericInputData:
current_data: object
column_mapping: ColumnMapping
data_definition: DataDefinition
-
additional_datasets: Dict[str, Any]


@@ -124,7 +123,6 @@ class InputData:
current_additional_features: Optional[pd.DataFrame]
column_mapping: ColumnMapping
data_definition: DataDefinition
-
additional_datasets: Dict[str, Any]

@staticmethod
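For context on the `base_metric.py` hunks: `additional_datasets: Dict[str, Any]` is the field that receives whatever mapping is passed to `Report.run(additional_datasets=...)`, e.g. the `{'current_train_data': ...}` used by the recsys notebook above. A rough sketch, not from this PR, of how a custom metric could read it through the `Metric`/`InputData` interface (the result class, metric class, and dataset key are illustrative, and the report renderer a real metric would also need is omitted):

```python
from evidently.base_metric import InputData, Metric, MetricResult


class TrainSizeResult(MetricResult):
    size: int


class TrainSizeMetric(Metric[TrainSizeResult]):
    def calculate(self, data: InputData) -> TrainSizeResult:
        # Anything passed to Report.run(additional_datasets=...) lands here;
        # 'current_train_data' is the key the recsys notebook uses.
        train = data.additional_datasets.get("current_train_data")
        return TrainSizeResult(size=0 if train is None else len(train))
```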