# Populate Behance 

This is an auxiliary notebook that loads the [Behance Dataset](http://cseweb.ucsd.edu/~jmcauley/datasets.html#behance) into the RecSys database.

## Imports

In [1]:
import sys, os 
import numpy as np 
import pandas as pd 
from sqlalchemy import types
# Put the project's source folder on the module search path (once only) so the
# project-local import that follows can be resolved.
lib_path = './../Sources'
if sys.path.count(lib_path) == 0:
    sys.path.append(lib_path)
import lpsrec.database as db

## Getting Connection to Database

In [2]:
# Connection settings are taken from environment variables so that real
# credentials never have to be written into (and committed with) the notebook.
# The fallbacks are the local development values this notebook used before,
# so running it without any of the variables set behaves exactly as it did.
username = os.environ.get('RECSYS_DB_USER', 'postgres')
password = os.environ.get('RECSYS_DB_PASSWORD', 'admin')
dbname = os.environ.get('RECSYS_DB_NAME', 'RecSys')
hostname = os.environ.get('RECSYS_DB_HOST', 'localhost:5432')  # "host:port"
# `conn` is the handle every later cell passes to pandas / executes SQL on.
conn = db.get_database_connection(username, password, hostname, dbname)

## Registering Dataset

In [8]:
# Look up any row already registered for the BEH1M dataset version and preview it.
dataset_lookup_sql = "SELECT * FROM datasets.dataset WHERE version = 'BEH1M'"
df_dataset_info = pd.read_sql(sql=dataset_lookup_sql, con=conn)
df_dataset_info.head()

Unnamed: 0,id_dataset,name,url,registers,size,id_cluster,version,id_utility


In [18]:
# Resolve the id of the 'Behance' cluster, inserting the cluster first when no
# row with that tag exists yet.
cluster_query = "SELECT * FROM datasets.clusters WHERE tag = 'Behance'"
df_cluster = pd.read_sql(sql=cluster_query, con=conn)
if df_cluster.shape[0] == 0:
    conn.execute("INSERT INTO datasets.clusters (tag) VALUES ('Behance')")
    # Read the row back to obtain the id assigned by the database.
    id_cluster = pd.read_sql(sql=cluster_query, con=conn)['id_cluster'][0]
else:
    id_cluster = df_cluster['id_cluster'][0]

In [16]:
# Resolve the id of the 'art' utility, registering the utility first when it
# is not present in datasets.utility.
utility_query = "SELECT * FROM datasets.utility WHERE utility = 'art'"
df_utility = pd.read_sql(sql=utility_query, con=conn)
if df_utility.shape[0] == 0:
    conn.execute("insert into datasets.utility (utility, description) values ('art', 'art website likes')")
    # Re-query so the frame holds the freshly inserted row.
    df_utility = pd.read_sql(sql=utility_query, con=conn)
id_utility = df_utility['id_utility'][0]

In [23]:
# Resolve the id of the BEH1M dataset, inserting its row (linked to the cluster
# and utility resolved earlier) when it has not been registered yet.
dataset_query = "SELECT * FROM datasets.dataset WHERE version = 'BEH1M'"
df_dataset = pd.read_sql(sql=dataset_query, con=conn)
if df_dataset.shape[0] == 0:
    # Adjacent literals spell out the multi-line INSERT text exactly,
    # including its indentation and trailing blanks.
    sql_str = (
        '\n        INSERT INTO datasets.dataset ("name",url,id_cluster,"version", id_utility)\n'
        "        VALUES ('Behance Art Reviews',\n"
        "                'https://drive.google.com/drive/folders/0B9Ck8jw-TZUEc3NlMjVXdDlPU1k', \n"
        "                {cluster},\n"
        "                'BEH1M', \n"
        "                {utility})"
    ).format(cluster=id_cluster, utility=id_utility)
    conn.execute(sql_str)
    # Read the row back to obtain the id assigned by the database.
    id_dataset = pd.read_sql(sql=dataset_query, con=conn)['id_dataset'][0]
else:
    id_dataset = df_dataset['id_dataset'][0]


## Loading Dataset 

In [None]:
# Folder holding the raw Behance 1M files; the path is relative, so it is
# resolved against the directory the notebook is run from.
dataset_path = './../Datasets/Behance/Behance1M_raw/'
# List the folder contents as a quick check of which files are available.
os.listdir(dataset_path)

In [29]:
# Read the space-separated, header-less appreciates file and name its three
# columns, then report basic cardinalities before previewing the frame.
ratings_file = os.path.join(dataset_path, 'Behance_appreciate_1M')
df_ratings = pd.read_csv(ratings_file, sep=' ', header=None)
df_ratings.columns = ['id_user', 'id_item', 'timestamp']

n_ratings = len(df_ratings)
n_users = len(df_ratings['id_user'].unique())
n_items = len(df_ratings['id_item'].unique())
print("Number of ratings: ", n_ratings)
print("Number of users who have ratings: ", n_users)
print("Number of items who have ratings: ", n_items)
df_ratings.head()

Number of ratings:  1000000
Number of users who have ratings:  63497
Number of items who have ratings:  178788


Unnamed: 0,id_user,id_item,timestamp
0,276633,1588231,1307583271
1,1238354,1529213,1307583273
2,165550,485000,1307583337
3,2173258,776972,1307583340
4,165550,158226,1307583406


## Inserting Users

In [27]:
# One row per distinct user in the ratings, each tagged with this dataset's id.
user_ids = df_ratings['id_user'].unique()
df_datasets_user = pd.DataFrame({
    'id_dataset': np.repeat(id_dataset, len(user_ids)),
    'id_user_dataset': user_ids,
})
df_datasets_user.tail()

Unnamed: 0,id_dataset,id_user_dataset
63492,8,1719679
63493,8,3256217
63494,8,1694139
63495,8,1684834
63496,8,697018


Registering users on `datasets.user`

In [31]:
%%time
# SQL column types handed to to_sql for the two columns being written.
df_types = dict(id_dataset=types.INTEGER(), id_user_dataset=types.INTEGER())

# Remove rows left by an earlier load of this dataset: first the user_info rows
# that point at its users, then the users themselves.
purge_statements = (
    "delete from datasets.user_info where id_user in (select id_user from datasets.user where id_dataset = {})",
    "delete from datasets.user where id_user in (select id_user from datasets.user where id_dataset = {})",
)
for statement in purge_statements:
    conn.execute(statement.format(id_dataset))

# Append the freshly built user rows to datasets.user.
df_datasets_user.to_sql(name='user', con=conn, schema='datasets', if_exists='append', index=False, dtype=df_types)

Wall time: 9.47 s


In [32]:
# Count the users now stored for this dataset (compare with the unique-user total printed when loading).
user_count_sql = "SELECT count(1) FROM datasets.user WHERE id_dataset = {}".format(id_dataset)
pd.read_sql(sql=user_count_sql, con=conn)

Unnamed: 0,count
0,63497


## Inserting Items

In [33]:
# One row per distinct item in the ratings, each tagged with this dataset's id.
item_ids = df_ratings['id_item'].unique()
df_datasets_item = pd.DataFrame({
    'id_dataset': np.repeat(id_dataset, len(item_ids)),
    'id_item_dataset': item_ids,
})
df_datasets_item.tail()

Unnamed: 0,id_dataset,id_item_dataset
178783,8,1768808
178784,8,2496565
178785,8,2134380
178786,8,372177
178787,8,1345469


Registering items on `datasets.item`

In [34]:
%%time
# SQL column types handed to to_sql for the two columns being written.
df_types = dict(id_dataset=types.INTEGER(), id_item_dataset=types.INTEGER())

# Remove rows left by an earlier load of this dataset: first the item_info rows
# that point at its items, then the items themselves.
purge_statements = (
    "delete from datasets.item_info where id_item in (select id_item from datasets.item where id_dataset = {})",
    "delete from datasets.item where id_item in (select id_item from datasets.item where id_dataset = {})",
)
for statement in purge_statements:
    conn.execute(statement.format(id_dataset))

# Append the freshly built item rows to datasets.item.
df_datasets_item.to_sql(name='item', con=conn, schema='datasets', if_exists='append', index=False, dtype=df_types)

Wall time: 19.9 s


In [35]:
# Count the items now stored for this dataset (compare with the unique-item total printed when loading).
item_count_sql = "SELECT count(1) FROM datasets.item WHERE id_dataset = {}".format(id_dataset)
pd.read_sql(sql=item_count_sql, con=conn)

Unnamed: 0,count
0,178788


## Inserting Ratings

In [39]:
# Copy the raw ratings into the staging table transistory.beh1m_ratings,
# replacing any previous copy. The ids are stored as VARCHAR(50) and the
# timestamp as INTEGER.
ratings_sql_types = {
    'id_user': types.VARCHAR(50),
    'id_item': types.VARCHAR(50),
    'timestamp': types.INTEGER(),
}
df_ratings.to_sql(
    name='beh1m_ratings',
    con=conn,
    schema='transistory',
    if_exists='replace',
    index=False,
    dtype=ratings_sql_types,
)

In [40]:
# Read the staging table back and show its last rows to confirm the copy.
staged_ratings_sql = "SELECT * FROM transistory.beh1m_ratings"
df_ratings_temp = pd.read_sql(sql=staged_ratings_sql, con=conn)
df_ratings_temp.tail()

Unnamed: 0,id_user,id_item,timestamp
999995,697018,329834,1321254585
999996,3265059,704143,1321254605
999997,157213,1345469,1321254649
999998,19740,2240652,1321254672
999999,697018,1912005,1321254674


In [44]:
# Turn the staged ratings into datasets.feedback rows: each staged
# (user, item) pair is mapped to the internal ids registered for this dataset,
# the value is fixed at 1, the integer timestamp is converted with
# to_timestamp, and the feedback type is the one flagged 'implicit'.
# Adjacent literals spell out the statement text exactly, trailing blanks included.
sql_str = (
    '\n'
    'insert into datasets.feedback \n'
    '(id_user, id_item, value, "timestamp", id_feedback_type)\n'
    'select us.id_user, it.id_item, 1, to_timestamp("timestamp"), ft.id_feedback_type as type  \n'
    'from transistory.beh1m_ratings tran \n'
    'inner join datasets."user" us on us.id_user_dataset = tran.id_user and us.id_dataset = {ds} \n'
    'inner join datasets.item it on it.id_item_dataset = tran.id_item and it.id_dataset = {ds} \n'
    "inner join datasets.feedback_type ft on ft.info_type = 'implicit'"
).format(ds=id_dataset)
conn.execute(sql_str)

<sqlalchemy.engine.result.ResultProxy at 0x1fd80dfd7b8>

In [48]:
# Join the stored feedback back to its item, user and dataset rows, restricted
# to this dataset, and show the last rows as a sanity check.
sql_str = (
    "\n"
    "    SELECT * FROM datasets.feedback fb\n"
    "    INNER JOIN datasets.item it ON fb.id_item = it.id_item\n"
    "    INNER JOIN datasets.user us ON fb.id_user = us.id_user\n"
    "    INNER JOIN datasets.dataset ds ON it.id_dataset = ds.id_dataset AND us.id_dataset = ds.id_dataset\n"
    "    WHERE ds.id_dataset = {}"
).format(id_dataset)
df_feedback_check = pd.read_sql(sql=sql_str, con=conn)
df_feedback_check.tail()

Unnamed: 0,id_user,id_item,value,timestamp,id_feedback_type,id_item.1,id_dataset,id_item_dataset,id_user.1,id_dataset.1,id_user_dataset,id_dataset.2,name,url,registers,size,id_cluster,version,id_utility
999995,4188603,844681,1,2011-06-09 06:59:45,2,844681,8,1498639,4188603,8,654822,8,Behance Art Reviews,https://drive.google.com/drive/folders/0B9Ck8j...,,,4,BEH1M,4
999996,4188603,886310,1,2011-07-04 11:26:51,2,886310,8,1718465,4188603,8,654822,8,Behance Art Reviews,https://drive.google.com/drive/folders/0B9Ck8j...,,,4,BEH1M,4
999997,4188603,897199,1,2011-07-10 05:13:13,2,897199,8,1759040,4188603,8,654822,8,Behance Art Reviews,https://drive.google.com/drive/folders/0B9Ck8j...,,,4,BEH1M,4
999998,4188603,992017,1,2011-10-12 06:03:16,2,992017,8,2302088,4188603,8,654822,8,Behance Art Reviews,https://drive.google.com/drive/folders/0B9Ck8j...,,,4,BEH1M,4
999999,4188603,1010575,1,2011-11-02 10:47:43,2,1010575,8,2401834,4188603,8,654822,8,Behance Art Reviews,https://drive.google.com/drive/folders/0B9Ck8j...,,,4,BEH1M,4


## Update Datasets Stats 

In [56]:
# TODO: not implemented yet - this notebook does not update the dataset statistics.

## Updating USS and ISS Table

In [53]:
%%time
# Run the two stored procedures in order (USS first, then ISS), reporting each
# one as soon as its call returns.
for table_tag in ("uss", "iss"):
    conn.execute("CALL sparsity.update_{}()".format(table_tag))
    print("{} table updated".format(table_tag.upper()))

<sqlalchemy.engine.result.ResultProxy at 0x1fd870477b8>

## Getting Sparsity Dataset

In [55]:
# Fetch the BEH1M dataset through sparsity.get_dataset_from_sparsity and show its last rows.
# NOTE(review): the two 1.0 arguments presumably select the full user/item sparsity levels - confirm against the function definition.
sparsity_sql = "select * from sparsity.get_dataset_from_sparsity('BEH1M', 1.0, 1.0)"
df_sparsity = pd.read_sql(sql=sparsity_sql, con=conn)
df_sparsity.tail()

Unnamed: 0,id_user_dataset,id_item_dataset,value,timestamp,id_user,id_item
999995,1694139,1617781,1,2011-11-14 04:01:30,4251616,861263
999996,1684834,1064879,1,2011-11-14 04:03:52,4251617,849314
999997,1684834,915804,1,2011-11-14 04:06:16,4251617,849320
999998,697018,329834,1,2011-11-14 04:09:45,4251618,844882
999999,697018,1912005,1,2011-11-14 04:11:14,4251618,932713
