# Generate Pet Dataset

### Imports

In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import os

## Pet dataset

Assumes...

* Discrete uniform distribution of ratings per user
* Each user rated more than 1/3 of documents


### Presets

In [2]:
fake = Faker()
fake.seed(23)
np.random.seed(23)

num_users = 100
num_docs = 1000
doc_uris = []
user_ratings = {}

### Generate fake URIs

In [3]:
for _ in range(num_docs):
    doc_uris.append(fake.uri())
user_ratings["doc_uri"] = doc_uris

### Generate fake users and 5-star ratings

In [4]:
for _ in range(num_users):
    #TBD
    ratings = np.random.randint(0, 6, size=num_docs).tolist() # discr uniform ratings
    num_na = np.random.randint(0, int(num_docs / 1.5) + 1) # 0 < num_na <= 2/3 of docs
    random_ixs = np.random.choice(range(num_docs), size=num_na, replace=False) # mask
    for i in random_ixs:
        if i:
            ratings[i] = np.NaN
    user_ratings[fake.name()] = ratings

### Dataframe

In [5]:
df = pd.DataFrame.from_dict(user_ratings).set_index("doc_uri")
df.head()

Unnamed: 0_level_0,Aaron Keith III,Aaron Mills,Abigail Wong,Adam Ramirez,Adam Rogers,Adam Williams,Albert Paul,Alexis Levy,Alicia Garcia,Alicia Wiley,...,Tina Fisher,Tonya Long,Travis Montgomery,Travis Montoya,Veronica Jackson,Veronica Walker,Victoria Perez,William Carpenter,William Vaughn,Zachary Miles
doc_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
http://www.vargas.biz/login.php,1.0,4.0,0.0,0.0,5.0,2.0,2.0,4.0,0.0,2.0,...,3.0,5.0,5.0,0.0,4.0,1.0,0.0,1.0,4.0,1.0
http://wallace-walker.info/index/,1.0,,,,,,3.0,5.0,5.0,3.0,...,,,,,1.0,,3.0,5.0,3.0,3.0
http://www.jimenez.biz/,3.0,0.0,3.0,0.0,,,,,0.0,1.0,...,,1.0,3.0,,2.0,1.0,,4.0,,2.0
http://www.logan.com/about.html,0.0,,,2.0,5.0,,1.0,5.0,5.0,4.0,...,3.0,,,,2.0,3.0,,2.0,,1.0
http://cox.org/list/tag/faq.html,5.0,,0.0,,2.0,,3.0,2.0,,4.0,...,,2.0,,5.0,,2.0,5.0,,0.0,5.0


## Persist dataset

In [6]:
f_name = "petdata_1000_100.csv"
path = os.path.join("../data", f_name)

df.to_csv(path)