In [1]:
from faker import Faker
# Faker generator limited to the zh_TW locale; later cells use it for names and company prefixes.
fake = Faker(['zh_TW'])
# Seed Faker so the generated values are reproducible from run to run.
Faker.seed(7777)

In [2]:
# Print five generated names, one per line, as a quick look at the locale's output.
print('\n'.join(fake.name() for _ in range(5)))

何雅玲
王郁雯
胡鈺婷
呂懿
熊家豪


In [3]:
import random
from random import randint, choices
import pandas as pd

# Seed Python's global random generator, which the randint/choices calls in later cells draw from.
random.seed(7777)

In [4]:
# Map each user id (0-999) to a generated name; names are drawn in ascending id order.
id_dict = {user_id: fake.name() for user_id in range(1000)}

list(id_dict.items())[:5]

[(0, '靳瑋婷'), (1, '劉佳玲'), (2, '王冠廷'), (3, '奚淑貞'), (4, '孟淑娟')]

In [5]:
def save(dataframe, filename):
    """Write ``dataframe`` to ``filename`` as UTF-8 encoded CSV, omitting the index column."""
    csv_options = {'index': False, 'encoding': 'utf-8'}
    dataframe.to_csv(filename, **csv_options)

In [6]:
# Load the raw store table. Per the preview below, each row carries a store_name
# plus 0/1 flag columns (手沖, 義式, 有貓, 酒吧, 飲料, 咖啡) that later cells filter on.
raw = pd.read_csv('./raw.csv')
raw.head()

Unnamed: 0,store_name,手沖,義式,有貓,酒吧,飲料,咖啡
0,甜在心咖啡館,1,1,0,0,0,1
1,曉咖啡,1,1,1,0,0,1
2,Dr.Bean,1,1,0,0,0,1
3,咖啡平方 東安店,0,1,0,0,0,1
4,1/2P Café 陶甕烘焙咖啡,1,0,0,0,0,1


In [7]:
# Names of every raw store whose 咖啡 flag is 1.
cafe = raw.loc[raw['咖啡'] == 1, 'store_name'].tolist()
cafe[:5]

['甜在心咖啡館', '曉咖啡', 'Dr.Bean', '咖啡平方 東安店', '1/2P Café 陶甕烘焙咖啡']

In [8]:
# Names of every raw store whose 酒吧 flag is 1.
bar = raw.loc[raw['酒吧'] == 1, 'store_name'].tolist()
bar[:5]

['在島之後After Island. 餐酒館',
 '鯤島 Khuntor',
 'Bar TCRC 前科累累俱樂部',
 'Bären Biergelden 貝倫啤酒館',
 '朝茶暮酒 Bar - MinLife Store']

In [9]:
# Names of every raw store whose 飲料 flag is 1.
drinks = raw.loc[raw['飲料'] == 1, 'store_name'].tolist()
drinks[:5]

['双生綠豆沙牛奶', '甜又鮮飲料', 'Silver Coin 銀兩', '木子樑室', '黃氏波霸奶茶專賣店']

In [10]:
def choose_store(type):
    """Build a synthetic store name of the form ``'<company prefix>_<raw store name>'``.

    Args:
        type: Store category. 0 picks from ``cafe``, 1 picks from ``bar`` and
            any other value picks from ``drinks``.

    Returns:
        A Faker company prefix joined by ``'_'`` to one randomly chosen raw
        store name of the requested category.
    """
    # The prefix is drawn before the store name, so Faker is consulted first.
    prefix = fake.company_prefix()

    if type == 0:
        pool = cafe
    elif type == 1:
        pool = bar
    else:
        pool = drinks

    picked = choices(pool)[0]
    return f'{prefix}_{picked}'

def create_stores(num):
    """Generate ``num`` synthetic stores with a random type and a generated name.

    Each store gets a sequential ``store_id`` (0 .. num-1), a ``type`` drawn
    uniformly from {0, 1, 2}, and a ``name`` produced by ``choose_store`` for
    that type. Random draws happen in the order type -> name for each store.

    Args:
        num: Number of stores to generate.

    Returns:
        A DataFrame with integer columns ``store_id`` and ``type`` and a
        ``name`` column. When ``num`` is 0 the frame still has these three
        columns, just no rows.
    """
    columns = ['store_id', 'type', 'name']

    # Collect plain records and build the frame once: growing a DataFrame one
    # cell at a time with .loc (and re-casting dtypes every iteration) copies
    # the data on each step and scales quadratically with `num`.
    records = []
    for store_id in range(num):
        store_type = randint(0, 2)
        records.append({
            'store_id': store_id,
            'type': store_type,
            'name': choose_store(store_type),
        })

    data = pd.DataFrame(records, columns=columns)
    # Single cast at the end keeps the id/type columns integer even when empty.
    return data.astype({'store_id': int, 'type': int})

# Generate 100 stores, order them by store_id and persist them as CSV.
stores = create_stores(100).sort_values(by=['store_id'])
save(stores, './stores.csv')

In [11]:
# Preview the first generated stores.
stores.head()

Unnamed: 0,store_id,type,name
0,0,0,國中鋼鐵_猫毛珈琲（原猫門）
1,1,1,天中電視_Bären Biergelden 貝倫啤酒館
2,2,1,發聯科技_Bären Biergelden 貝倫啤酒館
3,3,2,美奧廣告_鮮汁霸果汁
4,4,0,遠西百貨_猫毛珈琲（原猫門）


In [12]:
def search_store_from_raw(store_name):
    """Return the rows of ``raw`` whose ``store_name`` contains the given store.

    Args:
        store_name: A generated name of the form ``'<prefix>_<raw name>'`` as
            built by ``choose_store``. A string without ``'_'`` is used as the
            raw name unchanged.

    Returns:
        The subset of ``raw`` whose ``store_name`` contains the raw name as a
        literal substring (possibly empty, possibly several rows).
    """
    # Split on the first underscore only, so raw names that themselves contain
    # '_' are not truncated; [-1] also covers input without any underscore.
    raw_name = store_name.split('_', 1)[-1]

    # regex=False: store names contain characters such as '.', '(' or '+' that
    # would otherwise be interpreted as a regular expression and either match
    # the wrong rows or raise. na=False treats missing names as non-matching.
    matches = raw['store_name'].str.contains(raw_name, regex=False, na=False)
    return raw[matches]

In [13]:
def rating_algo(user_id, store_name):
    """Draw a rating between 1 and 5 for a store from randomly sampled preferences.

    The store's flags are read from the first row returned by
    ``search_store_from_raw``. Preferences are sampled anew on every call;
    ``user_id`` is accepted but not consulted.

    Args:
        user_id: Identifier of the rating user (currently unused).
        store_name: Generated store name, ``'<prefix>_<raw name>'``.

    Returns:
        An integer rating, or ``None`` (after printing ``store_name``) when a
        flag had to be read but the lookup returned no rows.
    """
    store_info = search_store_from_raw(store_name)

    def flag(column):
        # Value of `column` in the first matching row; IndexError if no row matched.
        return store_info[column].values[0]

    # Sampled preferences: cat lover is a coin flip, the others are weighted.
    likes_cats = randint(0, 1)
    likes_cafe = choices([0, 1], weights=[0.2, 0.8])[0]
    likes_bar = choices([0, 1], weights=[0.4, 0.6])[0]
    likes_drinks = choices([0, 1], weights=[0.3, 0.7])[0]

    try:
        # A cat lover in a store with cats always gives the top score.
        if likes_cats and flag('有貓'):
            return 5
        if likes_cafe and flag('咖啡'):
            # Pour-over or espresso cafés score 4-5, other cafés 3-5.
            if flag('手沖') or flag('義式'):
                return randint(4, 5)
            return randint(3, 5)
        if likes_bar and flag('酒吧'):
            return randint(3, 5)
        if likes_drinks and flag('飲料'):
            return randint(3, 5)
        # No preference matched the store: fall back on the preference mix alone.
        if likes_cafe and likes_bar and likes_drinks:
            return randint(2, 4)
        if not (likes_cafe or likes_bar or likes_drinks):
            return randint(1, 3)
        return randint(1, 4)
    except IndexError:
        # The lookup was empty; report the offending name and yield no rating.
        print(store_name)
        return None

In [14]:
def create_ratings(num):
    """Generate ``num`` synthetic user ratings of the generated stores.

    For every rating a user id and a store id are drawn at random, the user's
    name is read from ``id_dict``, the store's name from ``stores``, and the
    score comes from ``rating_algo``. Random draws happen in the order
    user id -> store id -> rating, once per row.

    Args:
        num: Number of ratings to generate.

    Returns:
        A DataFrame with columns ``id``, ``name``, ``store_id``,
        ``store_name`` and ``rating``; ``id``, ``store_id`` and ``rating`` are
        integers.

    Raises:
        ValueError: If ``rating_algo`` returns no rating for a drawn store.
    """
    columns = ['id', 'name', 'store_id', 'store_name', 'rating']

    # Build the store_id -> name lookup once instead of scanning `stores` per
    # row. drop_duplicates keeps the first row per id, like `.values[0]` did.
    name_by_store = (
        stores.drop_duplicates('store_id')
        .set_index('store_id')['name']
        .to_dict()
    )

    # Ids are generated as 0 .. n-1, so derive the draw ranges from the data
    # rather than hard-coding 999 / 99; the bounds then follow any change to
    # the number of users or stores.
    max_user_id = len(id_dict) - 1
    max_store_id = len(stores) - 1

    # Collect records and build the frame once: assigning cell by cell with
    # .loc and re-casting dtypes on every iteration is quadratic in `num`.
    records = []
    for _ in range(num):
        user_id = randint(0, max_user_id)
        store_id = randint(0, max_store_id)
        store_name = name_by_store[store_id]
        rating = rating_algo(user_id, store_name)
        if rating is None:
            # Fail with a clear message instead of an obscure integer-cast error.
            raise ValueError(f'rating_algo returned no rating for store {store_name!r}')
        records.append({
            'id': user_id,
            'name': id_dict[user_id],
            'store_id': store_id,
            'store_name': store_name,
            'rating': rating,
        })

    data = pd.DataFrame(records, columns=columns)
    return data.astype({'id': int, 'store_id': int, 'rating': int})

# Generate 10000 ratings, order them by user id then store id, and persist them as CSV.
users = create_ratings(10000).sort_values(by=['id', 'store_id'])
save(users, './users.csv')

In [15]:
# Preview the first ratings after sorting by user id and store id.
users.head()

Unnamed: 0,id,name,store_id,store_name,rating
8105,0,靳瑋婷,32,鐵台_镹 coffee • dessert,5
3569,0,靳瑋婷,72,丹味企業_籠裏 Bar Lonely,4
4540,0,靳瑋婷,78,瑞輝大藥廠_籠裏 Bar Lonely,4
9632,0,靳瑋婷,79,家宜家居（KIEA）_啤酒超市,4
9095,1,劉佳玲,1,天中電視_Bären Biergelden 貝倫啤酒館,4
