In [126]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Project directory on the mounted drive.
path = '/content/drive/MyDrive/CS6220'
# IPython magic: make `path` the working directory, so the relative dataset
# file name and output folder used in later cells resolve inside it.
%cd $path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CS6220


# Fraudulent User Detection Using Amazon Dataset
### Penghao Xu, Yuan Chen, Jiawei Wu, Haojing Lu

## Part 1. Dataset preprocessing

This notebook cleans the Amazon review dataset (http://jmcauley.ucsd.edu/data/amazon/links.html) and generates the data used by the baseline model and the newly proposed model.


In [127]:
import ast
import gzip
import json
import os
from collections import Counter

import numpy as np
import pandas as pd

Download data if needed

In [128]:
# Uncomment to download data
# !wget http://snap.stanford.edu/data/amazon/productGraph/kcore_5.json.gz

In [129]:
## 5-core data is used in this study
# DO NOT extract the dataset. gzip format is required because the file is
# streamed through gzip.open when it is parsed.
filename = 'kcore_5.json.gz'
# filename = 'reviews_Musical_Instruments.json.gz'
assert filename.endswith('gz'), 'Gzipped dataset is required!'

# set output folder
folder = 'dataset'
# makedirs(exist_ok=True) is safe to re-run, has no check-then-create race,
# and also works if `folder` is later changed to a nested path.
os.makedirs(folder, exist_ok=True)

## 1. Generate rating-only dataset
The rating-only dataset has 3 columns: user (`reviewerID`), item (`asin`), and rating. This dataset is used for the baseline model REV2.

In [130]:
# Process data and generate helpfulness score
def parse(path):
    """Yield one review record (a dict) per non-empty line of a gzipped dump.

    Each line is first decoded as strict JSON; if that fails it is decoded as
    a Python literal (the "loose JSON" form with single-quoted strings).

    Parameters
    ----------
    path : str
        Path to the gzip-compressed file, one record per line.

    Yields
    ------
    dict
        The decoded record.

    Raises
    ------
    ValueError, SyntaxError
        If a line is neither valid JSON nor a valid Python literal.
    """
    # NOTE(security): this used to call eval() on every line, which executes
    # arbitrary code embedded in a downloaded file. json.loads and
    # ast.literal_eval only ever build plain data structures.
    # The context manager closes the gzip handle even if the consumer stops
    # iterating early or a line fails to decode.
    with gzip.open(path, 'rb') as g:
        for l in g:
            text = l.decode('utf-8').strip()
            if not text:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            try:
                record = json.loads(text)
            except ValueError:
                record = ast.literal_eval(text)
            yield record

def get_df(path, benign=0.8, fraudulent=0.2, max_reviews=1000000, preview=9):
    """Load reviews from a gzipped dump into a per-review feature DataFrame.

    Parameters
    ----------
    path : str
        Gzipped review file; records are read through ``parse``.
    benign, fraudulent : float
        Not used by this function; kept so existing calls keep working.
    max_reviews : int or None
        Maximum number of reviews to load. Defaults to 1,000,000, the cap
        that used to be hard-coded. ``None`` loads the whole file.
    preview : int
        Number of leading raw records to print for inspection (default 9,
        matching the previous behavior). Use 0 to print nothing.

    Returns
    -------
    pandas.DataFrame
        Indexed 1..n (review order in the file) with columns
        ``reviewerID``, ``asin``, ``rating``, ``helpful`` and ``votes``.
        The columns are present even when no review was read.
    """
    columns = ['reviewerID', 'asin', 'rating', 'helpful', 'votes']
    records = []
    for i, d in enumerate(parse(path), start=1):
        if max_reviews is not None and i > max_reviews:
            break
        # report every 5m
        if not i % 5000000:
            print(f'{i} reviews processed!')
        # show the first few raw records
        if i <= preview:
            print(d)
        records.append({
            'reviewerID': d['reviewerID'],
            'asin': d['asin'],
            # rescale the star rating linearly: 1 -> -1.0, 3 -> 0.0, 5 -> 1.0
            'rating': (d['overall'] - 3) / 2,
            # d['helpful'] is a pair: [helpful votes, total votes]
            'helpful': d['helpful'][0],
            'votes': d['helpful'][1],
        })
    # Build the frame once from the collected records; keep the 1-based index.
    df = pd.DataFrame(records, columns=columns,
                      index=pd.RangeIndex(1, len(records) + 1))
    return df
df = get_df(filename)
df

{'reviewerID': 'ACNGUPJ3A3TM9', 'asin': '0000013714', 'reviewerName': 'GCM', 'helpful': [0, 0], 'reviewText': "We use this type of hymnal at church.  I was looking for the same one; however, this wasn't it. It is a good hymnal, but there isn't enough information to find the version I need.", 'overall': 4.0, 'summary': 'Nice Hymnal', 'unixReviewTime': 1386028800, 'reviewTime': '12 3, 2013'}
{'reviewerID': 'A2SUAM1J3GNN3B', 'asin': '0000013714', 'reviewerName': 'J. McDonald', 'helpful': [2, 3], 'reviewText': 'I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!', 'overall': 5.0, 'summary': 'Heavenly Highway Hymns', 'unixReviewTime': 1252800000, 'reviewTime': '09 13, 2009'}
{'reviewerID': 'APOZ15IEYQRRR', 'asin': '0000013714', 'reviewerName': 'maewest64', 'helpful': [0, 0], 'reviewText': 'This is 

Unnamed: 0,reviewerID,asin,rating,helpful,votes
1,ACNGUPJ3A3TM9,0000013714,0.5,0,0
2,A2SUAM1J3GNN3B,0000013714,1.0,2,3
3,APOZ15IEYQRRR,0000013714,1.0,0,0
4,AYEDW3BFK53XK,0000013714,1.0,0,0
5,A1KLCGLCXYP1U1,0000013714,0.0,0,0
...,...,...,...,...,...
999996,A25U4ZSWP74LYL,0143038583,1.0,0,1
999997,AYTBAXJ2Q958V,0143038583,0.0,5,7
999998,A3NU1NMLU2PR2X,0143038583,1.0,0,1
999999,A33ESXDBS05VUR,0143038583,-1.0,13,38


Check the benign and fraudulent user counts in the original dataset.

In [131]:
# Total helpful votes and total votes received by each reviewer.
reviewer_df = df.groupby('reviewerID')[['helpful', 'votes']].sum()

# Only reviewers with at least 20 votes get a label.
enough_votes = reviewer_df['votes'] >= 20
# Fraction of a reviewer's votes that were "helpful". Reviewers with 0 votes
# produce NaN here, which compares False below and is excluded anyway.
helpful_ratio = reviewer_df['helpful'] / reviewer_df['votes']

# Combine both conditions into ONE mask. Chaining two boolean selections
# (frame[mask_a][mask_b]) makes pandas reindex the second mask and emit a
# "Boolean Series key will be reindexed" warning.
benign = reviewer_df[enough_votes & (helpful_ratio > 0.8)]
fraudulent = reviewer_df[enough_votes & (helpful_ratio < 0.2)]
print(f'Benign users: {len(benign)}')
print(f'Fraudulent users: {len(fraudulent)}')

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Benign users: 17015
Fraudulent users: 2662


In [132]:
# users = df.groupby('reviewerID').helpfulness.mean()
# benign = users[users > 0.8]
# fraudulent = users[users < 0.2]
# print(f'Benign users: {len(benign)}')
# print(f'Fraudulent users: {len(fraudulent)}')

Select only benign and fraudulent users. Discard the other users without label.

In [133]:
# Keep only the reviews written by labeled reviewers and tag each review with
# its author's label; reviews from unlabeled reviewers are dropped.
benign_ids = set(benign.index)
fraudulent_ids = set(fraudulent.index)

df_benign = df.loc[df['reviewerID'].isin(benign_ids)].assign(label='Benign')
df_fra = df.loc[df['reviewerID'].isin(fraudulent_ids)].assign(label='Fraudulent')

# Benign reviews first, then fraudulent ones.
df = pd.concat([df_benign, df_fra])
df

Unnamed: 0,reviewerID,asin,rating,helpful,votes,label
24,A1MYJDE1JEKLHX,0000031887,0.5,0,0,Benign
52,A12OFS8WQP86O5,0000031887,1.0,20,23,Benign
167,A38AAPXSJN4C5G,0000230022,1.0,0,0,Benign
205,A27ZH1AQORJ1L,000100039X,1.0,3,3,Benign
207,A26GKZPS079GFF,000100039X,1.0,2,3,Benign
...,...,...,...,...,...,...
999395,A1VTW4K2KM0D0A,0143038583,-0.5,10,59,Fraudulent
999572,A1JAV22A2FJAHY,0143038583,1.0,7,53,Fraudulent
999581,AM76JU1FHC8HY,0143038583,-1.0,26,204,Fraudulent
999613,A913X3CZ55YAA,0143038583,0.5,7,24,Fraudulent


Check the number of reviews from benign and fraudulent users.

In [134]:
# Tally reviews per label; a label with no reviews reads as 0 from a Counter.
counts = Counter(df['label'])
print(
    f'Reviews from benign users: {counts["Benign"]}',
    f'Reviews from fraudulent users: {counts["Fraudulent"]}',
    sep='\n',
)

Reviews from benign users: 136040
Reviews from fraudulent users: 5778


Generate k-core dataset

In [135]:
def generate_kcore(df, k):
    """Return the k-core of a review table as a new DataFrame.

    Reviewers with fewer than ``k`` reviews are dropped, then items with
    fewer than ``k`` reviews are dropped, and the two passes are repeated
    until a full round removes nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Review table with ``reviewerID`` and ``asin`` columns.
    k : int
        Minimum number of reviews a reviewer and an item must keep.

    Returns
    -------
    pandas.DataFrame
        A copy of the surviving rows.
    """
    def _keep_frequent(frame, key, counted):
        # Keep rows whose `key` value has at least k (non-null) `counted` entries.
        tally = frame.groupby(key)[counted].count()
        survivors = set(tally[tally >= k].index)
        return frame[frame[key].isin(survivors)]

    while True:
        size_before = len(df)
        df = _keep_frequent(df, 'reviewerID', 'asin')
        df = _keep_frequent(df, 'asin', 'reviewerID')
        if len(df) == size_before:
            # a whole round removed nothing: fixed point reached
            break
    return df.copy()

In [136]:
# k = 3
split_rate = (0.4, 0.3, 0.3)
output_base = f'{folder}/processed_80_20'

# Shuffle the reviews with a fixed seed so the split is reproducible.
np.random.seed(3407)
shuffled_order = np.random.permutation(len(df))
df = df.iloc[shuffled_order]

# Train and validation sizes come from split_rate; the test split takes
# whatever rows remain after them.
n_train = int(len(df) * split_rate[0])
n_val = int(len(df) * split_rate[1])
val_end = n_train + n_val
dfs = {
    'train': df.iloc[:n_train],
    'val': df.iloc[n_train:val_end],
    'test': df.iloc[val_end:],
    'all': df,
}

# Write the rating-only view (user, item, rating) of every split.
for curr, split_df in dfs.items():
    # dfs[curr] = generate_kcore(dfs[curr], k)
    df_out = split_df[['reviewerID', 'asin', 'rating']]
    df_out.to_csv(f'{output_base}_{curr}.csv', index=False)
    print(df_out)

            reviewerID        asin  rating
709911   AMFYF3C08JMWD  0071760520     0.5
336053  A2B8GXSCB1R05T  0061230936     0.0
271490  A1MC81HLJ6Z9ZQ  0060935766    -0.5
768865   AHUT55E980RDR  0123855489     1.0
824129  A1CNXVJ4KWIOTY  0140231617     1.0
...                ...         ...     ...
133006  A2ODBHT4URXVXQ  0060517662     1.0
751571   A6M8E1M7OSY1Q  0099481685     1.0
894756  A3V6Z4RCDGRC44  0141187204     1.0
472072  A1AIJUCU29SMXE  006195876X     0.5
795494  A3V1EPSE6XDC0C  0137147783     0.5

[56727 rows x 3 columns]
            reviewerID        asin  rating
716041  A3HYKSKAT77SFK  0072125004     0.5
830658   AMCUV2IFH93OO  014026759X     0.5
15770    AE31M52VLKOG6  0006550436     1.0
157279  A30KEXFT9SILL6  0060562382     0.5
523242   ABUY5XK1R4YYZ  0062027786     0.0
...                ...         ...     ...
166775  A2F3M93RRLFQNJ  0060577304     1.0
611675  A2ANWGPM3M2JPD  0062219480     1.0
563192  A2NC18AD9TSCT4  0062085743     0.5
591563   AT0E6UBVJMJ6W  0062

Check the number of reviews from benign and fraudulent users again

In [137]:
# From here on `benign` / `fraudulent` map a split name to the set of
# reviewer IDs carrying that label within the split.
benign, fraudulent = {}, {}
for t in ('train', 'val', 'test', 'all'):
    df = dfs[t]
    for label_name, users_by_split in (('Benign', benign), ('Fraudulent', fraudulent)):
        of_label = df['label'] == label_name
        users_by_split[t] = set(df.loc[of_label, 'reviewerID'].unique())
    counts = Counter(df['label'])
    print(t)
    print(f'Benign users: {len(benign[t])}')
    print(f'Fraudulent users: {len(fraudulent[t])}')
    print(f'Reviews from benign users: {counts["Benign"]}')
    print(f'Reviews from fraudulent users: {counts["Fraudulent"]}')

train
Benign users: 12199
Fraudulent users: 1500
Reviews from benign users: 54410
Reviews from fraudulent users: 2317
val
Benign users: 10676
Fraudulent users: 1187
Reviews from benign users: 40826
Reviews from fraudulent users: 1719
test
Benign users: 10744
Fraudulent users: 1208
Reviews from benign users: 40804
Reviews from fraudulent users: 1742
all
Benign users: 17015
Fraudulent users: 2662
Reviews from benign users: 136040
Reviews from fraudulent users: 5778


Output user labels

In [138]:
# Write one CSV per split listing every labeled reviewer in that split with
# their fairness score (helpful votes / total votes) and label.
userfile_base = f'{folder}/user_label'

# Compute the score for all reviewers in one vectorized division instead of
# doing two row lookups on reviewer_df for every user of every split.
fairness = reviewer_df['helpful'] / reviewer_df['votes']

for t in ['train', 'val', 'test', 'all']:
    with open(userfile_base + f'_{t}.csv', 'w') as fw:
        fw.write('reviewerID,fairness,label\n')
        # benign reviewers first, then fraudulent ones
        for label, users_in_split in (('Benign', benign[t]),
                                      ('Fraudulent', fraudulent[t])):
            for u in users_in_split:
                fw.write(f'{u},{fairness[u]},{label}\n')

## 2. Generate toy datasets for coding
Here, toy datasets are generated to speed up model design and debugging.

In [139]:
# # select users
# n_benign = 50000
# n_fraudulent = 5000


# # output names
# toy_out_base = f'{folder}/toy_80_20'

# # generate toy train val and test data
# toys = {}
# curr_benign = 0
# curr_fraudulent = 0
# for t in ['train', 'val', 'test']:
#     toy_users = set(sorted(list(benign['all']))[curr_benign:curr_benign+n_benign] + \
#                         sorted(list(fraudulent['all']))[curr_fraudulent:curr_fraudulent+n_fraudulent])
#     curr_benign += n_benign
#     curr_fraudulent += n_fraudulent
#     df = dfs['all']
#     toys[t] = df[df.reviewerID.isin(toy_users)].copy()
#     toys[t] = generate_kcore(toys[t], k)
#     df_toy_out = toys[t][['reviewerID', 'asin', 'rating']]
#     df_toy_out.to_csv(f'{toy_out_base}_{t}.csv', index=False)
#     toys[t]

Statistics for toy dataset.

In [140]:
# toy_benign = {}
# toy_fraudulent = {}
# for t in ['train', 'val', 'test']:
#     toy_benign[t] = set(toys[t][toys[t].label == 'Benign'].reviewerID.unique())
#     toy_fraudulent[t] = set(toys[t][toys[t].label == 'Fraudulent'].reviewerID.unique())
#     print(t)
#     print(f'Benign users: {len(toy_benign[t])}')
#     print(f'Fraudulent users: {len(toy_fraudulent[t])}')
#     counts = Counter(toys[t].label)
#     print(f'Reviews from benign users: {counts["Benign"]}')
#     print(f'Reviews from fraudulent users: {counts["Fraudulent"]}')

Output user labels

In [141]:
# toy_user_base = f'{folder}/toy_label'
# for t in ['train', 'val', 'test']:
#     with open(toy_user_base + f'_{t}.csv', 'w') as fw:
#         fw.write('reviewerID,fairness,label\n')
#         for u in benign[t]:
#             fw.write(f'{u},{users[u]},Benign\n')
#         for u in fraudulent[t]:
#             fw.write(f'{u},{users[u]},Fraudulent\n')

## 3. Generate dataset with text reviews

Here, we generate a version of the dataset that also includes the review text for the same reviews, which lets us incorporate text embeddings.

In [142]:
# # output name
# output_base = f'{folder}/processed_80_20'
# output_toy_base = f'{folder}/toy_80_20'

# # Only output the reviews from selected entries for processed dataset
# products = {}
# reviewers = {}
# fws = {}
# for t in ['train', 'val', 'test', 'all']:
#     products[t] = set(dfs[t].asin.unique())
#     reviewers[t] = set(dfs[t].reviewerID.unique())
#     fws[t] = open(f'{output_base}_{t}_with_text.csv', 'w')
#     fws[t].write('reviewerID,asin,rating,reviewText\n')
    
# # toy dataset
# toy_products = {}
# toy_reviewers = {}
# toy_fws = {}
# for t in ['train', 'val', 'test']:
#     toy_products[t] = set(toys[t].asin.unique())
#     toy_reviewers[t] = set(toys[t].reviewerID.unique())
#     toy_fws[t] = open(f'{output_toy_base}_{t}_with_text.csv', 'w')
#     toy_fws[t].write('reviewerID,asin,rating,reviewText\n')

# # check all reviews
# i=0
# for d in parse(filename):
#     i += 1
#     # report every 5m
#     if not i % 5000000:
#         print(f'{i} reviews processed!')
#     if not d['helpful'][1]:
#         continue
#     for t in ['train', 'val', 'test', 'all']:
#         if d['reviewerID'] in reviewers[t] and d['asin'] in products[t]:
#             fws[t].write(','.join([d['reviewerID'], d['asin'], str((d['overall']-3)/2), \
#                            d['reviewText'].replace('\n',' ').replace(',', ' ')]) + '\n')
#     for t in ['train', 'val', 'test']:
#         if d['reviewerID'] in toy_reviewers[t] and d['asin'] in toy_products[t]:
#             toy_fws[t].write(','.join([d['reviewerID'], d['asin'], str((d['overall']-3)/2), \
#                            d['reviewText'].replace('\n',' ').replace(',', ' ')]) + '\n')

# # close files
# for fw in fws.values():
#     fw.close()
# for fw in toy_fws.values():
#     fw.close()