<a href="https://colab.research.google.com/gist/effyli/e4e855cc3eb819114eda53d24966e03f/copy-of-freamon-user-study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%pip install faker

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Collecting faker
  Downloading Faker-13.11.1-py3-none-any.whl (1.5 MB)
     |████████████████████████████████| 1.5 MB 2.5 MB/s            
Installing collected packages: faker
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
Successfully in

In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import datetime
from sklearn.preprocessing import OneHotEncoder, label_binarize, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline

In [4]:
def load_data():
  """Download the four input relations of the pipeline and return them.

  Each file is read with pandas from the `freamon` dataset folder of the
  arguseyes GitHub repository, using the first CSV column as the index.

  Returns:
    A tuple `(reviews, ratings, products, categories)` of DataFrames.
  """
  print("  Data access - Loading all four input relations from github")

  dataset_root = 'https://github.com/schelterlabs/arguseyes/raw/freamon/arguseyes/example_pipelines/datasets/freamon'

  def fetch(file_name, **read_options):
    # Every relation lives under the same folder and shares the index column.
    return pd.read_csv(f'{dataset_root}/{file_name}', index_col=0, **read_options)

  # Only the reviews file is gzip-compressed.
  reviews = fetch('reviews.csv.gz', compression='gzip')
  ratings = fetch('ratings.csv')
  products = fetch('products.csv')
  categories = fetch('categories.csv')

  return reviews, ratings, products, categories

In [5]:
def random_subset(arr):
  """Return a random, non-empty selection of the items in `arr`, as strings.

  The number of items is drawn first (between 1 and `len(arr)` inclusive),
  then that many items are sampled without replacement. Both draws use
  NumPy's global random state, so seeding it makes the result reproducible.

  Args:
    arr: sequence of items to choose from.

  Returns:
    A list with the `str()` representation of each chosen item.
  """
  how_many = np.random.randint(low=1, high=len(arr) + 1)
  picked = np.random.choice(arr, size=how_many, replace=False)
  return list(map(str, picked))

In [6]:
def integrate_data(reviews, ratings, products, categories, fake):
  """Filter and join the four input relations into one DataFrame.

  Steps, in order:
    1. Draw a random start date via `fake.date_between` and keep only the
       reviews whose `review_date` is on or after it (the comparison is made
       against the 'YYYY-MM-DD' string form of the date).
    2. Merge the remaining reviews with `ratings` on `review_id`.
    3. Merge `products` with `categories` (`category_id` against `id`) and
       keep only products whose category is in a random subset of categories.
    4. Merge the rated reviews with the remaining products on `product_id`.

  Args:
    reviews, ratings, products, categories: the input DataFrames.
    fake: object providing `date_between(start_date=..., end_date=...)`.

  Returns:
    The joined DataFrame of reviews, ratings, products and categories.
  """
  earliest_date = fake.date_between(start_date=datetime.date(year=2011, month=1, day=1),
                                    end_date=datetime.date(year=2013, month=6, day=1))
  cutoff = earliest_date.strftime('%Y-%m-%d')

  print(f"  Data Integration - Dropping reviews written before {cutoff}")
  recent_reviews = reviews[reviews.review_date >= cutoff]

  rated_reviews = recent_reviews.merge(ratings, on='review_id')
  categorised_products = products.merge(categories, left_on='category_id', right_on='id')

  random_categories = random_subset(list(categories.category))
  print(f"  Data Integration - restricting products to the following categories {random_categories}")
  in_chosen_category = categorised_products.category.isin(random_categories)
  categorised_products = categorised_products[in_chosen_category]

  joined = rated_reviews.merge(categorised_products, on='product_id')

  print(f"  Data Integration - joined reviews, ratings, products & categories")
  return joined

In [7]:
def compute_feature_and_label_data(reviews_with_products_and_ratings, final_columns, fake):
  """Derive the text feature and the label, then split into train and test sets.

  The input frame is copied first, so the caller's DataFrame is left untouched
  (the derived `text` and `is_helpful` columns and the NaN-filled text columns
  only exist on the copy).

  Steps, in order:
    1. Replace missing values in the three text columns by the empty string.
    2. Pick a random, non-empty subset of the text columns (NumPy global
       random state) and concatenate them into a new `text` column.
    3. Derive `is_helpful` as `helpful_votes > 0`.
    4. Project to `final_columns`.
    5. Draw a random split date via `fake.date_between`; rows with
       `review_date` up to and including that date form the training set,
       later rows form the test set (string comparison on 'YYYY-MM-DD').

  Args:
    reviews_with_products_and_ratings: joined input DataFrame; not modified.
    final_columns: columns to keep; must include `is_helpful` and
      `review_date`, which are read after the projection.
    fake: object providing `date_between(start_date=..., end_date=...)`.

  Returns:
    `(train_data, train_labels, test_data, test_labels)` where the label
    arrays are the flattened output of `label_binarize`.
  """
  text_columns = ['product_title', 'review_headline', 'review_body']

  # Work on a private copy: the column assignments below would otherwise
  # silently change the frame owned by the caller.
  prepared = reviews_with_products_and_ratings.copy()

  for text_column in text_columns:
    prepared[text_column] = prepared[text_column].fillna(value='')

  # Keep the order of the random draws (size first, then the columns) so the
  # results for a given seed stay the same.
  num_text_columns = np.random.randint(low=1, high=4)
  random_text_columns = np.random.choice(text_columns, size=num_text_columns, replace=False)

  print(f"  Data preparation - using columns {random_text_columns} as textual feature ")
  prepared['text'] = ' '
  for text_column in random_text_columns:
    prepared['text'] = prepared['text'] + ' ' + prepared[text_column]

  prepared['is_helpful'] = prepared['helpful_votes'] > 0

  print(f"  Projecting data to the feature and label columns {final_columns}")
  projected_reviews = prepared[final_columns]

  split_date = fake.date_between(start_date=datetime.date(year=2013, month=12, day=1),
                                  end_date=datetime.date(year=2015, month=1, day=1))
  split_day = split_date.strftime('%Y-%m-%d')

  print(f"  Data preparation - temporal train/test split based on date {split_date}")
  # NOTE(review): with classes=[True, False] the label value 1 presumably marks
  # is_helpful == False (the last listed class) - confirm against the
  # label_binarize documentation before interpreting "positive" predictions.
  train_data = projected_reviews[projected_reviews.review_date <= split_day]
  train_labels = label_binarize(train_data['is_helpful'], classes=[True, False]).ravel()

  test_data = projected_reviews[projected_reviews.review_date > split_day]
  test_labels = label_binarize(test_data['is_helpful'], classes=[True, False]).ravel()

  return train_data, train_labels, test_data, test_labels

In [8]:
def define_model(numerical_columns, categorical_columns):
  """Build the (unfitted) feature-encoding + logistic-regression pipeline.

  Features are encoded with a `ColumnTransformer`:
    * `numerical_columns` are standardised,
    * `categorical_columns` are one-hot encoded (unknown categories ignored),
    * the `text` column is hashed into 100 features using 1- to 3-grams,
  and all other columns are dropped. The learner is an L1-regularised
  `SGDClassifier` with logistic loss.

  Args:
    numerical_columns: names of the columns to scale.
    categorical_columns: names of the columns to one-hot encode.

  Returns:
    An unfitted `Pipeline` with the steps `features` and `learner`.
  """
  # Local imports: only needed for the version check below.
  import re
  import sklearn

  print(f"  Feature encoding - Setting up feature transformations")
  feature_transformation = ColumnTransformer(transformers=[
    ('numerical_features', StandardScaler(), numerical_columns),
    ('categorical_features', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('textual_features', HashingVectorizer(ngram_range=(1, 3), n_features=100), 'text'),
  ], remainder="drop")

  # scikit-learn renamed the logistic loss from 'log' to 'log_loss' in 1.1 and
  # removed the old name in 1.3, so pick the spelling the installed version
  # understands. An unparsable version string is treated as a recent release.
  version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
  is_pre_rename = version is not None and tuple(map(int, version.groups())) < (1, 1)
  logistic_loss = 'log' if is_pre_rename else 'log_loss'

  print(f"  Modeling - defining a logistic regression model")
  sklearn_model = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss=logistic_loss, penalty='l1', max_iter=1000))])

  return sklearn_model

In [9]:
def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells for 0/1 labels.

    The two sequences are compared position by position over the length of
    `y_hat`. Predictions other than 0 or 1 are not counted in any cell.

    Args:
        y_actual: true labels, indexable by position.
        y_hat: predicted labels, indexable by position.

    Returns:
        The tuple `(TP, FP, TN, FN)`.
    """
    true_pos = false_pos = true_neg = false_neg = 0

    for position in range(len(y_hat)):
        actual, predicted = y_actual[position], y_hat[position]
        correct = actual == predicted

        if predicted == 1:
            if correct:
                true_pos += 1
            else:
                false_pos += 1
        elif predicted == 0:
            if correct:
                true_neg += 1
            else:
                false_neg += 1

    return (true_pos, false_pos, true_neg, false_neg)

In [10]:
def compute_eq_oppo(test_data, test_labels, model):
    """Equal-opportunity gap between third-party and non-third-party reviews.

    For each group the false negative rate FN / (FN + TP) is computed from the
    model's predictions, where "positive" is the label value 1. The result is
    the absolute difference between the rate of the rows with
    `third_party == 'Y'` and the rate of the rows with `third_party == 'N'`.

    Args:
        test_data: DataFrame passed to `model.predict`; must contain a
            `third_party` column holding 'Y' / 'N'.
        test_labels: 0/1 true labels, positionally aligned with `test_data`.
        model: fitted estimator exposing `predict(test_data)`.

    Returns:
        The absolute difference of the two false negative rates as a float.
        NaN if either group contains no actual positives, because the rate is
        undefined in that case.
    """
    y_true = np.asarray(test_labels).ravel()
    y_pred = np.asarray(model.predict(test_data)).ravel()
    third_party = test_data['third_party'].to_numpy()

    def false_negative_rate(in_group):
        # Restrict to the actual positives of the group; the rate is the share
        # of those that the model predicted as 0.
        actual_positive = in_group & (y_true == 1)
        false_negatives = np.count_nonzero(actual_positive & (y_pred == 0))
        true_positives = np.count_nonzero(actual_positive & (y_pred == 1))
        positives = false_negatives + true_positives
        if positives == 0:
            return float('nan')
        return false_negatives / positives

    third_rate = false_negative_rate(third_party == 'Y')
    not_third_rate = false_negative_rate(third_party == 'N')
    return abs(third_rate - not_third_rate)

In [11]:
def train_ratings(test_data):
    """Return the index labels of `test_data` as a NumPy array.

    Args:
        test_data: DataFrame whose index is read.

    Returns:
        A NumPy array holding the index labels in their original order.
    """
    return np.array(test_data.index.tolist())

In [12]:
# NOTE: no cell defines `test_data` at notebook level before this point (it is
# only a local variable inside `execute_pipeline`), so this call raises the
# NameError shown in the cell output.
train_ratings(test_data)

NameError: name 'test_data' is not defined


### Task 1 - Group Fairness

__Compute the fairness of the pipeline with respect to third party reviews__. In particular, compute the [equal opportunity](https://en.wikipedia.org/wiki/Fairness_(machine_learning)#Group_Fairness_criteria) metric (the difference in false negative rates) between reviews from a third party and reviews not from a third party.

### Task 2 - Data Usage

__Compute which records from the ratings and products relations are used to train the classifier__. Compute two boolean arrays, one per relation, whose length equals the cardinality of the corresponding relation and whose entry at position i denotes whether the i-th record is included in the training data of the classifier.


In [13]:
def execute_pipeline(seed):
  """Run the whole pipeline once for the given seed and print a report.

  Both the Faker instance and NumPy's global random state are seeded with
  `seed`. The run then draws the random feature columns, loads and integrates
  the data, builds the train/test split, fits the model and prints train/test
  accuracy, the equal-opportunity metric and the (still placeholder) data
  usage counts.

  Args:
    seed: value used to seed Faker and NumPy.
  """
  rule = '---------------------------------------------------------------------'

  fake = Faker()
  fake.seed_instance(seed)
  np.random.seed(seed)

  print(rule)
  print(f'Executing pipeline with seed {seed}')
  print(rule)

  # The feature columns are drawn before any data is loaded; keep this order
  # so the random draws for a given seed stay the same.
  numerical_columns = random_subset(['total_votes', 'star_rating'])
  categorical_columns = random_subset(['customer_id', 'product_id', 'vine', 'category'])
  final_columns = [*numerical_columns, *categorical_columns,
                   'text', 'is_helpful', 'review_date', 'third_party']

  reviews, ratings, products, categories = load_data()

  integrated_data = integrate_data(reviews, ratings, products, categories, fake)
  split = compute_feature_and_label_data(integrated_data, final_columns, fake)
  train_data, train_labels, test_data, test_labels = split

  pipeline = define_model(numerical_columns, categorical_columns)
  model = pipeline.fit(train_data, train_labels)

  # TODO these must be computed by you
  equal_opportunity = compute_eq_oppo(test_data, test_labels, model)
  ratings_usage = np.full(len(ratings), False)
  products_usage = np.full(len(products), False)

  print(rule)
  print('Train accuracy', model.score(train_data, train_labels))
  print('Test accuracy', model.score(test_data, test_labels))
  print(f'Equal opportunity w.r.t. third party reviews {equal_opportunity}')
  print(f'# Number of ratings used {np.sum(ratings_usage)}')
  print(f'# Number of products used {np.sum(products_usage)}')
  print(f'{rule}\n\n')


In [14]:
# Single run of the full pipeline with one fixed seed.
execute_pipeline(1234)

---------------------------------------------------------------------
Executing pipeline with seed 1234
---------------------------------------------------------------------
  Data access - Loading all four input relations from github
  Data Integration - Dropping reviews written before 2012-11-15
  Data Integration - restricting products to the following categories ['Digital_Video_Games']
  Data Integration - joined reviews, ratings, products & categories
  Data preparation - using columns ['review_body' 'product_title'] as textual feature 
  Projecting data to the feature and label columns ['total_votes', 'star_rating', 'category', 'vine', 'customer_id', 'text', 'is_helpful', 'review_date', 'third_party']
  Data preparation - temporal train/test split based on date 2014-03-01
  Feature encoding - Setting up feature transformations
  Modeling - defining a logistic regression model
---------------------------------------------------------------------
Train accuracy 0.8447098976109215
T

In [None]:
# Run the pipeline once per seed; every run reloads the data and prints its own
# report. The first seed (1234) is the same one used in the single run above.
seeds_to_evaluate = [1234, 5678, 91011, 121314, 151617]
for seed in seeds_to_evaluate:
  execute_pipeline(seed)