In [1]:
!pip install faker

Collecting faker
  Downloading Faker-13.11.1-py3-none-any.whl (1.5 MB)
[?25l[K     |▏                               | 10 kB 18.9 MB/s eta 0:00:01[K     |▍                               | 20 kB 12.2 MB/s eta 0:00:01[K     |▋                               | 30 kB 9.4 MB/s eta 0:00:01[K     |▉                               | 40 kB 8.5 MB/s eta 0:00:01[K     |█                               | 51 kB 4.2 MB/s eta 0:00:01[K     |█▎                              | 61 kB 5.0 MB/s eta 0:00:01[K     |█▌                              | 71 kB 5.5 MB/s eta 0:00:01[K     |█▊                              | 81 kB 4.1 MB/s eta 0:00:01[K     |██                              | 92 kB 4.5 MB/s eta 0:00:01[K     |██▏                             | 102 kB 5.0 MB/s eta 0:00:01[K     |██▍                             | 112 kB 5.0 MB/s eta 0:00:01[K     |██▋                             | 122 kB 5.0 MB/s eta 0:00:01[K     |██▊                             | 133 kB 5.0 MB/s eta 0:00:01[K   

In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import datetime
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

In [3]:
def load_data():
  """Download the four input relations and return them as DataFrames.

  Returns:
    A 4-tuple ``(reviews, ratings, products, categories)``. Every file is
    read with its first CSV column as the index; the reviews file is
    gzip-compressed.
  """
  print("  Data access - Loading all four input relations from github")

  dataset_root = 'https://github.com/schelterlabs/arguseyes/raw/freamon/arguseyes/example_pipelines/datasets/freamon'

  def fetch(file_name, **read_options):
    # All relations live under the same root and share the index column setting.
    return pd.read_csv(f'{dataset_root}/{file_name}', index_col=0, **read_options)

  reviews = fetch('reviews.csv.gz', compression='gzip')
  ratings = fetch('ratings.csv')
  products = fetch('products.csv')
  categories = fetch('categories.csv')

  return reviews, ratings, products, categories

In [4]:
def random_subset(arr):
  """Draw a random, non-empty subset of ``arr`` and return it as strings.

  The subset size is drawn first (between 1 and ``len(arr)`` inclusive), then
  that many distinct elements are sampled without replacement. Both draws use
  NumPy's global random state, so results depend on ``np.random.seed``.

  Args:
    arr: sequence of candidate items.

  Returns:
    A list with the ``str()`` of each sampled item, in sampling order.
  """
  subset_size = np.random.randint(low=1, high=len(arr)+1)
  sampled = np.random.choice(arr, size=subset_size, replace=False)
  return list(map(str, sampled))

In [5]:
def integrate_data(reviews, ratings, products, categories, fake):
  """Filter and join the four relations into one review-level frame.

  Steps:
    1. Draw a random start date between 2011-01-01 and 2013-06-01 from
       ``fake`` and keep reviews whose ``review_date`` is not earlier.
    2. Join the remaining reviews with ``ratings`` on ``review_id``.
    3. Join ``products`` with ``categories`` (``category_id`` -> ``id``) and
       keep only products from a random subset of the categories.
    4. Join the rated reviews with the selected products on ``product_id``.

  Args:
    reviews, ratings, products, categories: the input DataFrames.
    fake: object providing ``date_between(start_date=..., end_date=...)``.

  Returns:
    The joined DataFrame.
  """
  earliest = datetime.date(year=2011, month=1, day=1)
  latest = datetime.date(year=2013, month=6, day=1)
  start_date = fake.date_between(start_date=earliest, end_date=latest)
  # review_date is compared against the formatted string
  # (assumes the column holds 'YYYY-MM-DD' strings - TODO confirm).
  cutoff = start_date.strftime('%Y-%m-%d')

  print(f"  Data Integration - Dropping reviews written before {cutoff}")
  recent_reviews = reviews[reviews.review_date >= cutoff]

  rated_reviews = recent_reviews.merge(ratings, on='review_id')
  categorized_products = products.merge(categories, left_on='category_id', right_on='id')

  random_categories = random_subset(list(categories.category))
  print(f"  Data Integration - restricting products to the following categories {random_categories}")
  in_selected_category = categorized_products.category.isin(random_categories)
  selected_products = categorized_products[in_selected_category]

  joined = rated_reviews.merge(selected_products, on='product_id')

  print("  Data Integration - joined reviews, ratings, products & categories")
  return joined

In [7]:
def compute_feature_and_label_data(reviews_with_products_and_ratings, final_columns, fake):
  """Derive the text feature and label, project, and split by date.

  The input frame is copied first, so the caller's DataFrame is not modified.

  Args:
    reviews_with_products_and_ratings: joined review data; must contain the
      columns 'product_title', 'review_headline', 'review_body',
      'helpful_votes', and every column named in ``final_columns`` that is
      not derived here.
    final_columns: columns to keep in the returned frames; must include
      'is_helpful' and 'review_date', which are read after the projection.
    fake: object providing ``date_between(start_date=..., end_date=...)``,
      used to draw the split date between 2013-12-01 and 2015-01-01.

  Returns:
    ``(train_data, train_labels, test_data, test_labels)``: rows with
    ``review_date`` up to and including the split date form the training
    set, later rows the test set. Labels are flat arrays produced by
    ``label_binarize`` from 'is_helpful'.
  """
  text_candidates = ['product_title', 'review_headline', 'review_body']

  # Copy so that the column writes below never leak into the caller's frame.
  prepared = reviews_with_products_and_ratings.copy()
  for text_column in text_candidates:
      prepared[text_column] = prepared[text_column].fillna(value='')

  num_text_columns = np.random.randint(low=1, high=4)
  random_text_columns = np.random.choice(text_candidates,
                                          size=num_text_columns, replace=False)

  print(f"  Data preparation - using columns {random_text_columns} as textual feature ")
  # Concatenate the chosen text columns, separated by single spaces.
  prepared['text'] = ' '
  for text_column in random_text_columns:
      prepared['text'] = prepared['text'] + ' ' + prepared[text_column]

  prepared['is_helpful'] = prepared['helpful_votes'] > 0

  print(f"  Projecting data to the feature and label columns {final_columns}")
  projected_reviews = prepared[final_columns]

  split_date = fake.date_between(start_date=datetime.date(year=2013, month=12, day=1),
                                  end_date=datetime.date(year=2015, month=1, day=1))

  print(f"  Data preparation - temporal train/test split based on date {split_date}")
  # review_date is compared against the formatted string
  # (assumes the column holds 'YYYY-MM-DD' strings - TODO confirm).
  split_day = split_date.strftime('%Y-%m-%d')

  # NOTE(review): with classes=[True, False], label_binarize appears to encode
  # is_helpful == False as label 1 - confirm before interpreting
  # positives/negatives (e.g. when computing false negative rates).
  train_data = projected_reviews[projected_reviews.review_date <= split_day]
  train_labels = label_binarize(train_data['is_helpful'], classes=[True, False]).ravel()

  test_data = projected_reviews[projected_reviews.review_date > split_day]
  test_labels = label_binarize(test_data['is_helpful'], classes=[True, False]).ravel()

  return train_data, train_labels, test_data, test_labels

In [8]:
def define_model(numerical_columns, categorical_columns):
  """Build the (unfitted) feature-encoding + classifier pipeline.

  Args:
    numerical_columns: column names passed to a StandardScaler.
    categorical_columns: column names passed to a OneHotEncoder that ignores
      categories unseen during fitting.

  Returns:
    A scikit-learn Pipeline: a ColumnTransformer (which additionally hashes
    the 'text' column into 100 features using 1- to 3-grams and drops all
    remaining columns) followed by an L1-penalised SGDClassifier with
    logistic loss.
  """
  # Local imports: only needed to choose the loss name for the installed release.
  import re
  import sklearn

  print(f"  Feature encoding - Setting up feature transformations")
  feature_transformation = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), numerical_columns),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('textual_features', HashingVectorizer(ngram_range=(1, 3), n_features=100), 'text'),
  ], remainder="drop")

  # The logistic loss is called 'log_loss' since scikit-learn 1.1; the old
  # name 'log' was deprecated then and removed in 1.3. Older releases only
  # know 'log', so fall back to it when the version is older or unparsable.
  version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
  has_log_loss = version_match is not None and \
      (int(version_match.group(1)), int(version_match.group(2))) >= (1, 1)
  logistic_loss = 'log_loss' if has_log_loss else 'log'

  print(f"  Modeling - defining a logistic regression model")
  sklearn_model = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss=logistic_loss, penalty='l1', max_iter=1000))])

  return sklearn_model


### Task 1 - Group Fairness

__Compute the fairness of the pipeline with respect to third party reviews__. In particular, compute the [equal opportunity](https://en.wikipedia.org/wiki/Fairness_(machine_learning)#Group_Fairness_criteria) metric (the difference in false negative rates) between reviews from a third party and reviews not from a third party.

### Task 2 - Data Usage

__Compute which records from the ratings and products relations are used to train the classifier__. Compute two boolean arrays, one per relation, whose length equals the number of records in that relation, where the entry at position i denotes whether the i-th record is included in the training data of the classifier.


In [26]:
def execute_pipeline(seed):
  """Run the whole pipeline once for ``seed`` and print its metrics.

  Seeds both the Faker instance and NumPy's global random state, picks random
  numerical / categorical feature columns, loads and integrates the data,
  trains the model and prints train/test accuracy together with the task
  metrics.

  Args:
    seed: integer seed for Faker and NumPy.

  Returns:
    None; all results are printed.
  """
  fake = Faker()
  fake.seed_instance(seed)
  np.random.seed(seed)

  print('---------------------------------------------------------------------')
  print(f'Executing pipeline with seed {seed}')
  print('---------------------------------------------------------------------')

  numerical_columns = random_subset(['total_votes', 'star_rating'])
  categorical_columns = random_subset(['customer_id', 'product_id', 'vine', 'category'])
  # 'is_helpful' and 'review_date' are needed for labelling and splitting;
  # 'third_party' is kept for the group fairness task.
  final_columns = numerical_columns + categorical_columns + ['text', 'is_helpful', 'review_date', 'third_party']

  reviews, ratings, products, categories = load_data()

  integrated_data = integrate_data(reviews, ratings, products, categories, fake)
  train_data, train_labels, test_data, test_labels = \
      compute_feature_and_label_data(integrated_data, final_columns, fake)

  sklearn_model = define_model(numerical_columns, categorical_columns)

  model = sklearn_model.fit(train_data, train_labels)

  # TODO these must be computed by you
  # Placeholders: the values below are constants, not computed results.
  equal_opportunity = 0.0
  ratings_usage = np.full(len(ratings), False)
  products_usage = np.full(len(products), False)

  print('---------------------------------------------------------------------')
  print('Train accuracy', model.score(train_data, train_labels))
  print('Test accuracy', model.score(test_data, test_labels))
  print(f'Equal opportunity w.r.t. third party reviews {equal_opportunity}')
  print(f'# Number of ratings used {np.sum(ratings_usage)}')
  print(f'# Number of products used {np.sum(products_usage)}')
  print('---------------------------------------------------------------------\n\n')

execute_pipeline(1234)

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date', 'third_party']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
T

In [10]:
seeds_to_evaluate = [1234, 5678, 91011, 121314, 151617]
for seed in seeds_to_evaluate:
  execute_pipeline(seed)

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
Test accuracy 0.

In [12]:
 reviews, ratings, products, categories = load_data()

  Data access - Loading all four input relations from github


In [13]:
reviews

Unnamed: 0,marketplace,customer_id,review_id,product_id,vine,third_party,review_headline,review_body,review_date
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,N,Y,Five Stars,Awesome,2015-08-31
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,N,Y,Five Stars,Perfect,2015-08-31
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,N,Y,Five Stars,Awesome!,2015-08-31
...,...,...,...,...,...,...,...,...,...
101831,US,41754720,R19OFJV91M7D8X,B000YMR61A,N,N,"Easy to use, 1 comment 1 serious problem",I chose the deluxe version CD because of mortg...,2008-02-11
101832,US,51669529,R1I6G894K5AGG5,B000YMR61A,N,N,Schedule C IS for business- figures it would ...,"Schedule C IS for business, so figures it wou...",2008-02-08
101833,US,24731012,R17OE43FFEP81I,B000YMR5X4,N,N,Hassel to download,I wish that companies can test several scenari...,2008-02-05
101834,US,16049580,R15MGDDK63B52Z,B000YMR61A,N,N,beware of vista,i just installed turbotax deluxe 2007. If you ...,2008-02-05


In [28]:
# Top-level rerun of the pipeline steps so that the fitted model and the
# train/test frames stay in the notebook namespace (test_data is copied into
# my_testdata further down).

# Define the seed explicitly: it used to be picked up implicitly from the loop
# variable of the multi-seed cell, whose last value was 151617 (this matches
# the recorded output of this cell). Without it the cell fails on a fresh kernel.
seed = 151617

fake = Faker()
fake.seed_instance(seed)
np.random.seed(seed)

print('---------------------------------------------------------------------')
print(f'Executing pipeline with seed {seed}')
print('---------------------------------------------------------------------')

numerical_columns = random_subset(['total_votes', 'star_rating'])
categorical_columns = random_subset(['customer_id', 'product_id', 'vine', 'category'])
# 'third_party' is kept so the test rows can be grouped for the fairness task.
final_columns = numerical_columns + categorical_columns + ['text', 'is_helpful', 'review_date', 'third_party']

reviews, ratings, products, categories = load_data()

integrated_data = integrate_data(reviews, ratings, products, categories, fake)
train_data, train_labels, test_data, test_labels = \
    compute_feature_and_label_data(integrated_data, final_columns, fake)

sklearn_model = define_model(numerical_columns, categorical_columns)

model = sklearn_model.fit(train_data, train_labels)

# TODO these must be computed by you
# Placeholders: the values below are constants, not computed results.
equal_opportunity = 0.0
ratings_usage = np.full(len(ratings), False)
products_usage = np.full(len(products), False)

print('---------------------------------------------------------------------')
print('Train accuracy', model.score(train_data, train_labels))
print('Test accuracy', model.score(test_data, test_labels))
print(f'Equal opportunity w.r.t. third party reviews {equal_opportunity}')
print(f'# Number of ratings used {np.sum(ratings_usage)}')
print(f'# Number of products used {np.sum(products_usage)}')
print('---------------------------------------------------------------------\n\n')


---------------------------------------------------------------------
Executing pipeline with seed 151617
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2011-10-26
  Data Integration - restricting products to the following categories ['Digital_Software', 'Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title' 'review_headline'] as textual feature 
  Projecting data to the feature and label columns ['star_rating', 'total_votes', 'vine', 'category', 'text', 'is_helpful', 'review_date', 'third_party']
  Data preparation - temporal train/test split based on date 2014-06-26
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accu

In [29]:

# predicted = model.predict(test_data)


In [30]:
# print(len(predicted))
#print(test_labels)
#print(test_data)
# gapminder[gapminder['year']==2002]

In [31]:
my_testdata = test_data.copy()

In [32]:
my_testdata

Unnamed: 0,star_rating,total_votes,vine,category,text,is_helpful,review_date,third_party
0,2,3,N,Digital_Video_Games,I keep buying madden every year hoping they ...,True,2015-08-31,N
1,5,4,N,Digital_Video_Games,"Worked great!!!!! Best way to buy a game, on...",True,2015-08-27,Y
2,5,0,N,Digital_Video_Games,If you are prepping for the end of the world...,False,2015-08-31,Y
3,2,3,N,Digital_Video_Games,after i forgot my password for my origin acc...,False,2015-08-19,Y
4,5,0,N,Digital_Video_Games,Works great used to play these games when i ...,False,2015-07-16,Y
...,...,...,...,...,...,...,...,...
93282,5,0,N,Digital_Software,I am a long time fan of WordPerfect. The &#3...,False,2014-06-30,Y
93284,3,1,N,Digital_Software,Used to use this a lot to keep in touch with...,False,2014-06-29,N
93296,1,0,N,Digital_Software,Although the download apparently worked OK t...,False,2014-06-28,Y
93298,1,1,N,Digital_Software,Bought this for my Mom's memorial video at h...,False,2014-06-28,Y


In [34]:
my_testdata.loc[(my_testdata['is_helpful'] == True) & (my_testdata['third_party'] == 'Y') ]

Unnamed: 0,star_rating,total_votes,vine,category,text,is_helpful,review_date,third_party
1,5,4,N,Digital_Video_Games,"Worked great!!!!! Best way to buy a game, on...",True,2015-08-27,Y
5,1,2,N,Digital_Video_Games,Gaming codes for two of the games didn't dow...,True,2015-07-05,Y
11,1,2,N,Digital_Video_Games,None of this works well with windows 8 and w...,True,2015-01-14,Y
12,1,4,N,Digital_Video_Games,I got this to play zero hour but it doesn't ...,True,2014-12-27,Y
20,1,4,N,Digital_Video_Games,Doesn't work on Windows 7. Don't waste your ...,True,2014-08-08,Y
...,...,...,...,...,...,...,...,...
93256,3,3,N,Digital_Software,Not a well-written program. The editor attem...,True,2014-07-10,Y
93258,1,1,N,Digital_Software,"The charged promptly, it downloaded quickly,...",True,2014-07-09,Y
93261,1,3,N,Digital_Software,"As much as it cost, this product does nothin...",True,2014-07-09,Y
93269,2,1,N,Digital_Software,For the discounted price it is a good tool t...,True,2014-07-07,Y


In [35]:
my_testdata.loc[(my_testdata['is_helpful'] == True) & (my_testdata['third_party'] == 'N') ]

Unnamed: 0,star_rating,total_votes,vine,category,text,is_helpful,review_date,third_party
0,2,3,N,Digital_Video_Games,I keep buying madden every year hoping they ...,True,2015-08-31,N
6,1,3,N,Digital_Video_Games,Found the game for half the price I paid. Ca...,True,2015-04-17,N
112,1,2,N,Digital_Video_Games,Were my code Playstation Plus Subscription O...,True,2015-07-21,N
197,1,11,N,Digital_Video_Games,Nobody should buy this membership. If you do...,True,2015-05-17,N
555,1,16,N,Digital_Video_Games,Do Research Before Buying a PS Plus Subscrip...,True,2014-11-26,N
...,...,...,...,...,...,...,...,...
93098,1,1,N,Digital_Software,"I found that, every once in a while, my Mac ...",True,2014-07-20,N
93171,1,7,N,Digital_Software,Is this a joke? Am I now supposed to pay for...,True,2014-07-17,N
93260,5,2,N,Digital_Software,"There is an amazing solution for writers, th...",True,2014-07-09,N
93266,5,2,N,Digital_Software,I have been in the video business for 30+ ye...,True,2014-07-04,N


In [36]:
my_testdata.loc[(my_testdata['is_helpful'] == False) & (my_testdata['third_party'] == 'Y') ]

Unnamed: 0,star_rating,total_votes,vine,category,text,is_helpful,review_date,third_party
2,5,0,N,Digital_Video_Games,If you are prepping for the end of the world...,False,2015-08-31,Y
3,2,3,N,Digital_Video_Games,after i forgot my password for my origin acc...,False,2015-08-19,Y
4,5,0,N,Digital_Video_Games,Works great used to play these games when i ...,False,2015-07-16,Y
7,3,1,N,Digital_Video_Games,I absolutely love the Command and Conquer ga...,False,2015-04-06,Y
8,5,0,N,Digital_Video_Games,Offered this game to my dad that brought me ...,False,2015-03-10,Y
...,...,...,...,...,...,...,...,...
93273,3,1,N,Digital_Software,I miss the clipart that my other desktop pub...,False,2014-07-03,Y
93282,5,0,N,Digital_Software,I am a long time fan of WordPerfect. The &#3...,False,2014-06-30,Y
93296,1,0,N,Digital_Software,Although the download apparently worked OK t...,False,2014-06-28,Y
93298,1,1,N,Digital_Software,Bought this for my Mom's memorial video at h...,False,2014-06-28,Y


In [37]:
my_testdata.loc[(my_testdata['is_helpful'] == False) & (my_testdata['third_party'] == 'N') ]

Unnamed: 0,star_rating,total_votes,vine,category,text,is_helpful,review_date,third_party
54,5,0,N,Digital_Video_Games,Good Playstation Plus Subscription Five Stars,False,2015-08-31,N
79,5,0,N,Digital_Video_Games,''We are aware that users are experiencing s...,False,2015-08-10,N
82,5,0,N,Digital_Video_Games,If you have a current PlayStation device in ...,False,2015-08-08,N
84,4,0,N,Digital_Video_Games,I only chose four stars because of the Plays...,False,2015-08-08,N
87,5,0,N,Digital_Video_Games,If you have a current PlayStation device in ...,False,2015-08-07,N
...,...,...,...,...,...,...,...,...
93193,1,0,N,Digital_Software,I find the policy of making a download avail...,False,2014-07-13,N
93202,5,0,N,Digital_Software,The best anti-virus you can get.<br />Fast s...,False,2014-07-12,N
93247,5,0,N,Digital_Software,There's still no challenger for Corel Painte...,False,2014-07-10,N
93265,5,0,N,Digital_Software,The Movavi Video Converter is an awesome pro...,False,2014-07-07,N
