In [1]:
!pip install faker

Collecting faker
  Downloading Faker-13.11.1-py3-none-any.whl (1.5 MB)
[?25l[K     |▏                               | 10 kB 30.0 MB/s eta 0:00:01[K     |▍                               | 20 kB 36.4 MB/s eta 0:00:01[K     |▋                               | 30 kB 43.8 MB/s eta 0:00:01[K     |▉                               | 40 kB 12.8 MB/s eta 0:00:01[K     |█                               | 51 kB 12.8 MB/s eta 0:00:01[K     |█▎                              | 61 kB 15.0 MB/s eta 0:00:01[K     |█▌                              | 71 kB 11.3 MB/s eta 0:00:01[K     |█▊                              | 81 kB 12.4 MB/s eta 0:00:01[K     |██                              | 92 kB 13.7 MB/s eta 0:00:01[K     |██▏                             | 102 kB 12.5 MB/s eta 0:00:01[K     |██▍                             | 112 kB 12.5 MB/s eta 0:00:01[K     |██▋                             | 122 kB 12.5 MB/s eta 0:00:01[K     |██▊                             | 133 kB 12.5 MB/s eta 0:0

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import datetime
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

In [2]:
def load_data():
  """Download the four input relations of the pipeline from GitHub.

  Every file is read with its first CSV column as the DataFrame index.

  Returns:
    A tuple ``(reviews, ratings, products, categories)`` of pandas DataFrames.
  """
  print("  Data access - Loading all four input relations from github")

  dataset_root = 'https://github.com/schelterlabs/arguseyes/raw/freamon/arguseyes/example_pipelines/datasets/freamon'

  def fetch(file_name, **read_options):
    # All relations live below the same root and share the index column setting.
    return pd.read_csv(f'{dataset_root}/{file_name}', index_col=0, **read_options)

  # Only the reviews file is gzip-compressed.
  reviews = fetch('reviews.csv.gz', compression='gzip')
  ratings = fetch('ratings.csv')
  products = fetch('products.csv')
  categories = fetch('categories.csv')

  return reviews, ratings, products, categories

In [3]:
def random_subset(arr):
  """Draw a random, non-empty subset of ``arr`` without replacement.

  The subset size is drawn first (between 1 and ``len(arr)`` inclusive), then the
  elements; both draws use numpy's global random state.

  Returns:
    A list with the chosen elements converted to ``str``.
  """
  exclusive_upper_bound = len(arr) + 1
  subset_size = np.random.randint(low=1, high=exclusive_upper_bound)
  picked = np.random.choice(arr, size=subset_size, replace=False)
  return list(map(str, picked))

In [4]:
def integrate_data(reviews, ratings, products, categories, fake):
  """Join the four input relations into a single review-level DataFrame.

  Steps: drop reviews dated before a randomly drawn start date, join the remaining
  reviews with their ratings, join products with categories, keep only products
  from a random subset of categories, and finally join reviews with products.

  Args:
    reviews: DataFrame with at least ``review_date``, ``review_id``, ``product_id``.
    ratings: DataFrame joined to the reviews on ``review_id``.
    products: DataFrame with at least ``category_id`` and ``product_id``.
    categories: DataFrame with at least ``id`` and ``category``.
    fake: object providing ``date_between(start_date=..., end_date=...)``.

  Returns:
    The joined DataFrame of reviews, ratings, products and categories.
  """
  earliest_start = datetime.date(year=2011, month=1, day=1)
  latest_start = datetime.date(year=2013, month=6, day=1)
  start_date = fake.date_between(start_date=earliest_start, end_date=latest_start)
  # review_date is compared as a 'YYYY-MM-DD' string.
  cutoff = start_date.strftime('%Y-%m-%d')

  print(f"  Data Integration - Dropping reviews written before {cutoff}")
  recent_reviews = reviews[reviews.review_date >= cutoff]

  reviews_with_ratings = recent_reviews.merge(ratings, on='review_id')
  products_with_categories = products.merge(categories, left_on='category_id', right_on='id')

  random_categories = random_subset(list(categories.category))
  print(f"  Data Integration - restricting products to the following categories {random_categories}")
  in_selected_category = products_with_categories.category.isin(random_categories)
  products_with_categories = products_with_categories[in_selected_category]

  reviews_with_products_and_ratings = reviews_with_ratings.merge(products_with_categories, on='product_id')

  print("  Data Integration - joined reviews, ratings, products & categories")
  return reviews_with_products_and_ratings

In [5]:
def compute_feature_and_label_data(reviews_with_products_and_ratings, final_columns, fake):
  """Derive the text feature and the label, project, and split by date.

  Note: the input DataFrame is modified in place. Missing values in the three text
  columns are replaced by '' and the columns ``text`` and ``is_helpful`` are added.

  Args:
    reviews_with_products_and_ratings: joined review data; must contain
      ``product_title``, ``review_headline``, ``review_body``, ``helpful_votes``
      and every column named in ``final_columns`` other than the derived ones.
    final_columns: columns to keep; must include ``is_helpful`` and ``review_date``.
    fake: object providing ``date_between(start_date=..., end_date=...)``.

  Returns:
    ``(train_data, train_labels, test_data, test_labels)``; rows dated on or before
    the random split date form the training set, later rows the test set.
  """
  frame = reviews_with_products_and_ratings  # alias of the caller's object, not a copy
  text_candidates = ['product_title', 'review_headline', 'review_body']

  for candidate in text_candidates:
    frame[candidate] = frame[candidate].fillna(value='')

  # Draw how many text columns to use, then which ones (order matters for the RNG).
  num_text_columns = np.random.randint(low=1, high=4)
  random_text_columns = np.random.choice(text_candidates, size=num_text_columns, replace=False)

  print(f"  Data preparation - using columns {random_text_columns} as textual feature ")
  frame['text'] = ' '
  for text_column in random_text_columns:
      frame['text'] = frame['text'] + ' ' + frame[text_column]

  frame['is_helpful'] = frame['helpful_votes'].gt(0)

  print(f"  Projecting data to the feature and label columns {final_columns}")
  projected_reviews = frame[final_columns]

  earliest_split = datetime.date(year=2013, month=12, day=1)
  latest_split = datetime.date(year=2015, month=1, day=1)
  split_date = fake.date_between(start_date=earliest_split, end_date=latest_split)

  print(f"  Data preparation - temporal train/test split based on date {split_date}")
  # review_date is compared as a 'YYYY-MM-DD' string.
  split_day = split_date.strftime('%Y-%m-%d')

  train_data = projected_reviews[projected_reviews.review_date <= split_day]
  test_data = projected_reviews[projected_reviews.review_date > split_day]

  label_classes = [True, False]
  train_labels = label_binarize(train_data['is_helpful'], classes=label_classes).ravel()
  test_labels = label_binarize(test_data['is_helpful'], classes=label_classes).ravel()

  return train_data, train_labels, test_data, test_labels

In [6]:
def define_model(numerical_columns, categorical_columns):
  """Build the (unfitted) feature-encoding + classifier pipeline.

  Numerical columns are standardised, categorical columns are one-hot encoded
  (unknown categories are ignored at prediction time), and the ``text`` column is
  hashed into 100 features using word 1- to 3-grams. All other columns are dropped.

  Args:
    numerical_columns: names of the columns to scale.
    categorical_columns: names of the columns to one-hot encode.

  Returns:
    An sklearn ``Pipeline`` ending in an L1-regularised logistic regression
    trained with SGD.
  """
  # Local imports: only needed for the scikit-learn version check below.
  import re
  import sklearn

  print(f"  Feature encoding - Setting up feature transformations")
  feature_transformation = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), numerical_columns),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('textual_features', HashingVectorizer(ngram_range=(1, 3), n_features=100), 'text'),
  ], remainder="drop")

  # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
  # removed the old name, so pick whichever spelling the installed version accepts.
  version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
  has_log_loss_name = (version_match is not None
                       and (int(version_match.group(1)), int(version_match.group(2))) >= (1, 1))
  logistic_loss = 'log_loss' if has_log_loss_name else 'log'

  print(f"  Modeling - defining a logistic regression model")
  sklearn_model = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss=logistic_loss, penalty='l1', max_iter=1000))])

  return sklearn_model


### Task 1 - Group Fairness

__Compute the fairness of the pipeline with respect to third party reviews__. In particular, compute the [equal opportunity](https://en.wikipedia.org/wiki/Fairness_(machine_learning)#Group_Fairness_criteria) metric (the difference in false negative rates) between reviews from a third party and reviews not from a third party.

### Task 2 - Data Usage

__Compute which records from the ratings and products relations are used to train the classifier__. Compute two boolean arrays, one per relation, whose length equals the cardinality of the respective relation and where the entry at position i denotes whether the i-th record is included in the training data of the classifier.


In [7]:
# Single reference run of the pipeline. The variables defined here are kernel
# globals and are inspected by the cells that follow.

# Seed both sources of randomness: Faker (random dates) and numpy (random subsets).
seed = 1234
fake = Faker()
fake.seed_instance(seed)
np.random.seed(seed)

print('---------------------------------------------------------------------')
print(f'Executing pipeline with seed {seed}')
print('---------------------------------------------------------------------')

# Randomly choose which numerical / categorical features the model will use.
numerical_columns = random_subset(['total_votes', 'star_rating'])
categorical_columns = random_subset(['customer_id', 'product_id', 'vine', 'category'])
# 'text' and 'is_helpful' are derived later; 'review_date' is needed for the split.
final_columns = numerical_columns + categorical_columns + ['text', 'is_helpful', 'review_date']

reviews, ratings, products, categories = load_data()

# integrated_data is modified in place by compute_feature_and_label_data
# (it gains the 'text' and 'is_helpful' columns).
integrated_data = integrate_data(reviews, ratings, products, categories, fake)
train_data, train_labels, test_data, test_labels = \
    compute_feature_and_label_data(integrated_data, final_columns, fake)

sklearn_model = define_model(numerical_columns, categorical_columns)

model = sklearn_model.fit(train_data, train_labels)

# TODO these must be computed by you
# Placeholders only: in this cell the fairness metric and the usage masks are not
# computed yet, so the printed values below are 0.0 / 0.
equal_opportunity = 0.0
ratings_usage = np.full(len(ratings), False)
products_usage = np.full(len(products), False)

print('---------------------------------------------------------------------')
print('Train accuracy', model.score(train_data, train_labels))
print('Test accuracy', model.score(test_data, test_labels))
print(f'Equal opportunity w.r.t. third party reviews {equal_opportunity}')
print(f'# Number of ratings used {np.sum(ratings_usage)}')
print(f'# Number of products used {np.sum(products_usage)}')
print('---------------------------------------------------------------------\n\n')

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
Test accuracy 0.

In [8]:
# Rows of the integrated data that ended up in the training set. train_data is a
# column projection plus boolean row filter of integrated_data, so it keeps the
# same row index; selecting by index is exact, whereas matching on the
# (non-unique) 'text' column could also pull in test rows with identical text.
used_integrated_data = integrated_data.loc[train_data.index]

In [9]:
# Display the integrated rows selected as training data in the previous cell.
used_integrated_data

Unnamed: 0,marketplace,customer_id,review_id,product_id,vine,third_party,review_headline,review_body,review_date,star_rating,helpful_votes,total_votes,product_parent,product_title,category_id,id,category,text,is_helpful
36,US,50428972,R2E54ZEF8L3FYJ,B00DNHLFQA,N,Y,No need to keep track of Original CDs,All the CNC games are here and available for d...,2014-02-27,4,1,2,951665344,Command & Conquer The Ultimate Collection [Ins...,0,0,Digital_Video_Games,All the CNC games are here and available for...,True
37,US,16278166,R3ELKU92GYL3GS,B00DNHLFQA,N,Y,Amazing,"Amazing bargain, really fun games. I own mult...",2014-02-17,5,1,2,951665344,Command & Conquer The Ultimate Collection [Ins...,0,0,Digital_Video_Games,"Amazing bargain, really fun games. I own mu...",True
38,US,48363151,R2SKIFVC94BDY7,B00DNHLFQA,N,Y,C&C RULES,Come on its C&C! One of the original and class...,2014-01-28,5,0,0,951665344,Command & Conquer The Ultimate Collection [Ins...,0,0,Digital_Video_Games,Come on its C&C! One of the original and cla...,False
39,US,5163405,R3SX6I5B221VB7,B00DNHLFQA,N,Y,Awesome game and good price,I always loved the Command and Conquer series ...,2014-01-21,5,0,0,951665344,Command & Conquer The Ultimate Collection [Ins...,0,0,Digital_Video_Games,I always loved the Command and Conquer serie...,False
40,US,8231249,R251A6ZAF4MIDG,B00DNHLFQA,N,Y,Bundle,"All the game in one, if you buy them separatel...",2014-01-03,5,0,0,951665344,Command & Conquer The Ultimate Collection [Ins...,0,0,Digital_Video_Games,"All the game in one, if you buy them separat...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49390,US,19270294,R1GWT2ZQL717C0,B005MKZQXY,N,Y,TERRIBLE!!!,i got this crazy machines game to go straight ...,2012-11-16,1,1,1,440199763,Crazy Machines 1: The Wacky Contraptions Game,0,0,Digital_Video_Games,i got this crazy machines game to go straigh...,True
49391,US,41918149,R2QSO750FOO5F,B009GKT27C,N,Y,Really Installing Steam.,When I purchased this game I expected a .exe f...,2012-11-16,1,3,56,463448240,Rochard [Online Game Code],0,0,Digital_Video_Games,When I purchased this game I expected a .exe...,True
49392,US,37993350,R17PD379UR87JB,B009DNVXQ0,N,Y,Exactly what I expected,Apparently there aren't any reviews here but a...,2012-11-16,4,8,9,242614574,R.A.W: Realms of Ancient War [Online Game Code],0,0,Digital_Video_Games,Apparently there aren't any reviews here but...,True
49393,US,9909207,R3NK89CA8SZ7BY,B009ETPKK8,N,N,Worked Instantly!,Thanks for the extra bit of DLC! I added this...,2012-11-16,5,2,2,89991915,Warlock Master of the Arcane Master of Artifac...,0,0,Digital_Video_Games,Thanks for the extra bit of DLC! I added th...,True


In [10]:
# Ratings whose review_id occurs among the integrated rows used for training.
ratings[ratings.review_id.isin(used_integrated_data.review_id)]

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes
102692,R3MYFPQS43VI4U,1,0,1
64718,R2X6C7MF5UNVHR,5,0,0
93674,R26ZROGA3BXGD8,5,0,0
71664,R1U3SW7MFTWNY7,5,1,1
105105,R11B5NUL1Y5PD7,4,0,3
...,...,...,...,...
110108,RDRRDZOGF6Y2Z,5,0,0
68826,R31ZU225EBLRV7,5,0,0
94357,R3BZ9VTST4H4MY,5,0,1
92029,RC8AEW15R66CM,4,0,0


In [11]:
# Products whose product_id occurs among the integrated rows used for training.
products[products.product_id.isin(used_integrated_data.product_id)]

Unnamed: 0,product_id,product_parent,product_title,category_id
1,B00F4CEHNK,341969535,Xbox Live Gift Card,0
2,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,0
3,B004RMK5QG,395682204,Playstation Plus Subscription,0
7,B004RMK4BC,384246568,Playstation Network Card,0
8,B00K59HKIQ,384246568,Playstation Network Card,0
...,...,...,...,...
120987,B005LJYJ4S,811090288,Alpha Polaris [Game Download],0
121123,B009VJJICC,383478126,Darksiders II DLC - Abyssal Forge [Online Game...,0
121192,B00A3XV23I,820398554,Prototype [Online Game Code],0
121312,B001KC01EC,814738431,Hyperspace Invader [Download],0


In [12]:
from sklearn.metrics import confusion_matrix
def _false_negative_rate(true_labels, predicted_labels):
  """Return FN / (FN + TP) for binary 0/1 labels, treating label 1 as positive.

  Returns NaN when ``true_labels`` contains no positive example, because the rate
  is undefined in that case.
  """
  # labels=[0, 1] forces a 2x2 matrix even if a group lacks one of the classes;
  # ravel() then yields the counts in the order tn, fp, fn, tp.
  _, _, false_negatives, true_positives = confusion_matrix(
      true_labels, predicted_labels, labels=[0, 1]).ravel()
  num_positives = false_negatives + true_positives
  if num_positives == 0:
    return float('nan')
  return false_negatives / num_positives


def execute_pipeline(seed):
  """Run the whole pipeline for one seed and print accuracy, fairness and data usage.

  The seed controls both Faker (random dates) and numpy (random feature, text
  column and category choices).

  Printed results:
    * train and test accuracy;
    * equal opportunity w.r.t. third party reviews: false negative rate on the
      third-party test reviews minus that on the non-third-party test reviews
      (NaN if either group has no positive test example);
    * number of ratings and of products that contribute to the training data.

  NOTE(review): the false negative rate treats label 1 as the positive class. The
  labels come from ``label_binarize(..., classes=[True, False])``; confirm which of
  helpful / not helpful that encodes as 1 before interpreting the sign.

  Args:
    seed: integer seed for Faker and numpy.
  """
  fake = Faker()
  fake.seed_instance(seed)
  np.random.seed(seed)

  print('---------------------------------------------------------------------')
  print(f'Executing pipeline with seed {seed}')
  print('---------------------------------------------------------------------')

  numerical_columns = random_subset(['total_votes', 'star_rating'])
  categorical_columns = random_subset(['customer_id', 'product_id', 'vine', 'category'])
  final_columns = numerical_columns + categorical_columns + ['text', 'is_helpful', 'review_date']

  reviews, ratings, products, categories = load_data()

  integrated_data = integrate_data(reviews, ratings, products, categories, fake)
  train_data, train_labels, test_data, test_labels = \
      compute_feature_and_label_data(integrated_data, final_columns, fake)

  sklearn_model = define_model(numerical_columns, categorical_columns)

  model = sklearn_model.fit(train_data, train_labels)

  # Task 1 - group fairness.
  # train_data / test_data are column projections plus boolean row filters of
  # integrated_data, so they keep its row index. Looking rows up by index is exact;
  # matching on the non-unique 'text' column could put a review into both groups.
  test_predictions = model.predict(test_data)
  third_party_flags = integrated_data.loc[test_data.index, 'third_party'].to_numpy()
  is_third_party = third_party_flags == 'Y'
  is_not_third_party = third_party_flags == 'N'

  third_party_fnr = _false_negative_rate(
      test_labels[is_third_party], test_predictions[is_third_party])
  non_third_party_fnr = _false_negative_rate(
      test_labels[is_not_third_party], test_predictions[is_not_third_party])
  equal_opportunity = third_party_fnr - non_third_party_fnr

  # Task 2 - data usage.
  # A rating / product is used if its key occurs in an integrated row that ended
  # up in the training set. The masks are positionally aligned with the relations.
  used_rows = integrated_data.loc[train_data.index]
  ratings_usage = ratings['review_id'].isin(used_rows['review_id']).to_numpy()
  products_usage = products['product_id'].isin(used_rows['product_id']).to_numpy()

  print('---------------------------------------------------------------------')
  print('Train accuracy', model.score(train_data, train_labels))
  print('Test accuracy', model.score(test_data, test_labels))
  print(f'Equal opportunity w.r.t. third party reviews {equal_opportunity}')
  print(f'# Number of ratings used {np.sum(ratings_usage)}')
  print(f'# Number of products used {np.sum(products_usage)}')
  print('---------------------------------------------------------------------\n\n')


In [13]:
# Smoke-test the function on the same seed as the step-by-step run above.
execute_pipeline(seed=1234)

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
Test accuracy 0.

In [94]:
# Execute the pipeline once per seed; every seed yields a different random
# configuration (features, text columns, categories, date cut-offs).
seeds_to_evaluate = [1234, 5678, 91011, 121314, 151617]
for seed in seeds_to_evaluate:
  execute_pipeline(seed)

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
Test accuracy 0.