In [1]:
!unzip hw5_part_two_release.zip

Archive:  hw5_part_two_release.zip
  inflating: data/questions-words.txt  
  inflating: data/definitional_pairs.json  
  inflating: data/equalize_pairs.json  
  inflating: data/gender_specific_full.json  
  inflating: data/profession_words.json  
  inflating: hw5_part2.ipynb         
  inflating: hw5_part1.py            
  inflating: hw5_part1_utils.py      
  inflating: hw5_part2.py            
  inflating: hw5_part3.py            
  inflating: adult.npz               
  inflating: homework.pdf            


In [2]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [0]:
import hw5_part1_utils

from typing import Tuple
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import numpy as np

from tqdm import tqdm


In [0]:
# YOUR IMPLEMENTATION FOR THE SHADOW MODEL ATTACK GOES HERE ###################


def synthesize_attack_data(
    target_model: hw5_part1_utils.TargetModel,
    shadow_data: np.ndarray,
    shadow_labels: np.ndarray,
    num_shadow_models: int = 4
):
    """Synthesize attack data.

    Arguments:

        target_model {TargetModel} -- an instance of the TargetModel class;
          behaves as a keras model but additionally has a train_shadow_model
          function, which takes a subset of the shadow data and labels and
          returns a model with identical architecture and hyperparameters to
          the original target model, but that is trained on the given shadow
          data.

        shadow_data {np.ndarray} -- data available to the attack to train
          shadow models. If the arget model's training set is size N x D,
          shadow_data is 2N x D.

        shadow_labels {np.ndarray} -- the corresponding labels to the
          shadow_data, given as a numpy array of 2N integers in the range 0 to
          C where C is the number of classes.

        num_shadow_models {int} -- the number of shadow models to use when
          constructing the attack model's dataset.

    Returns: three np.ndarrays; let M = 2N * num_shadow_models

        attack_data {np.ndarray} [M, 2C] -- shadow data label probability and
           label one-hot

        attack_classes {np.ndarray} [M, 1 of {0,1,...,C-1}] -- shadow data
           labels

        attack_labels {np.ndarray} [M, 1 of {0,1}] -- attack data labels
           (training membership)

    """

    C = shadow_labels.max() + 1

    attack_data: np.ndarray = None
    attack_classes: np.ndarray = None
    attack_labels: np.ndarray = None

    # SOLUTION
    A = []
    S_classes = []
    S_labels = []
    for i in range(num_shadow_models):
        data_split = hw5_part1_utils.DataSplit(shadow_labels, np.random.seed(i)) #get in and out indexes 
        S_in_idx = data_split.in_idx 
        S_out_idx = data_split.out_idx

        S_in = shadow_data[S_in_idx] #get the S in data using the S in indexes for the shadow data
        S_in_labels = shadow_labels[S_in_idx] #get the S in labels using the S in indexes for the shadow labels
        S_in_labels_one_hot = to_categorical(S_in_labels, 10) #get one hot encondings of S in labels
 
        S_out = shadow_data[S_out_idx] #get the S out data using the S out indexes for the shadow data
        S_out_labels = shadow_labels[S_out_idx] #get the S out labels using the S out indexes for the shadow labels
        S_out_labels_one_hot = to_categorical(shadow_labels[S_out_idx], 10) #get one hot encondings of S out labels
        
        trained_model = target_model.train_shadow_model(S_in, S_in_labels) #train the shadow model with generated S in and S in labels data
        S_in_preds = trained_model.predict(S_in) #predict the output from trained model using S in data
        S_out_preds = trained_model.predict(S_out) #predict the output from trained model using S out data
        A_in = np.hstack((S_in_preds, S_in_labels_one_hot)) #get A in data by stacking the preds and labels
        A_out = np.hstack((S_out_preds, S_out_labels_one_hot)) #get A out data by stacking the preds and labels
        A.append(np.vstack((A_in, A_out)))
        S_classes.append(np.hstack((S_in_labels, S_out_labels))) #append stacked S in and out labels for attack_classes
        S_labels.append(np.hstack((np.ones(len(S_in_labels)), np.zeros(len(S_out_labels))))) #one hot vector for attack classes
    attack_data = np.vstack((A))
    attack_classes = np.hstack((S_classes))
    attack_labels = np.hstack((S_labels))
    # END OF SOLUTION

    return attack_data, attack_classes, attack_labels


def build_attack_models(
    target_model: hw5_part1_utils.TargetModel,
    shadow_data: np.ndarray,
    shadow_labels: np.ndarray,
    num_shadow_models: int = 4
):
    """Build attacker models.

    Arguments:

        target_model {TargetModel} -- an instance of the TargetModel class;
          behaves as a keras model but additionally has a train_shadow_model
          function, which takes a subset of the shadow data and labels and
          returns a model with identical architecture and hyperparameters to
          the original target model, but that is trained on the given shadow
          data.

        shadow_data {np.ndarray} -- data available to the attack to train
          shadow models. If the arget model's training set is size N x D,
          shadow_data is 2N x D.

        shadow_labels {np.ndarray} -- the corresponding labels to the
          shadow_data, given as a numpy array of 2N integers in the range 0 to
          C where C is the number of classes.

        num_shadow_models {int} -- the number of shadow models to use when
          constructing the attack model's dataset.

    Returns:

        {tuple} -- a tuple of C keras models, where the c^th model predicts the
        probability that an instance of class c was a training set member.

    """

    attack_data, attack_classes, attack_labels = \
        synthesize_attack_data(
            target_model,
            shadow_data,
            shadow_labels,
            num_shadow_models=4
        )

    # to return
    attack_models: Tuple[Model] = None

    C = shadow_labels.max() + 1

    # SOLUTION
    attack_models = []
    for i in range(C): #range over class numbers
        attack_model = get_attack_architecture(C) #initialize attack model for class
        attack_model.fit(attack_data, attack_labels) #fit model with attack data and labels
        attack_models.append(attack_model) #append attack model
    attack_models = tuple(attack_models) #turn into a tuple
    # END OF SOLUTION

    return attack_models

'''Initial model architecture taken from hw5_part1_utils.py and modified to meet the 
   assigment requests'''
def get_attack_architecture(C):
    l_in = Input((2*C,)) #input layer size based on class number
    l_inter = Dense(4*C, activation='relu')(l_in) #relu activation with hidden layer dependendent on class
    l_out = Dense(1, activation='sigmoid')(l_inter) #output layer of size 1

    m = Model(l_in, l_out) #initialize model

    m.compile(
        loss='binary_crossentropy', #use binary cross entropy
        optimizer='adam',
        metrics=['accuracy'],
        experimental_run_tf_function=False 
    )

    return m


def evaluate_membership(attack_models, y_pred, y):
    """Evaluate the attacker about the membership inference

    Arguments:

        attack_model {tuple} -- a tuple of C keras models, where C is the
          number of classes.

        y_pred {np.ndarray} -- an N x C numpy array with the predictions of the
          model on the N instances we are performing the inference attack on.

        y {np.ndarray} -- the true labels for each of the instances given as a
          numpy array of N integers.

    Returns:

        {np.ndarray} -- an array of N floats in the range [0,1] representing
          the estimated probability that each of the N given instances is a
          training set member.

    """

    # To return
    preds: np.ndarray = None

    # SOLUTION
    predictions = []
    attack_models = list(attack_models)
    #used to give proper y value size to model for predictions
    y_for_model = [np.hstack((yp, to_categorical(Y, 10))) for yp, Y in zip(y_pred, y)] 
    for i in range(len(y)):
        if i%100 == 0:
            print("Index: {}".format(i))
        predictions.append(attack_models[y[i]].predict(np.asmatrix(y_for_model[i])))
    preds = np.array(predictions)

    # END OF SOLUTION

    return preds

# YOU DO NOT NEED TO MODIFY THE REST OF THIS CODE. ############################


if __name__ == '__main__':
    # Load the dataset.
    data = hw5_part1_utils.CIFARData()

    # Make a target model for the dataset.
    target_model = \
        hw5_part1_utils.CIFARModel(
            epochs=48,
            batch_size=2048,
            noload=True, # prevents loading an existing pre-trained target
                         # model
        ).init(
            data.train, data.labels_train,
            # data.test, data.labels_test # validation data
        )

    tqdm.write('Building attack model...')
    attack_models = build_attack_models(
        target_model,
        data.shadow,
        data.labels_shadow
    )

    tqdm.write('Evaluating target model...')
    y_pred_in = target_model.predict(data.train)
    y_pred_out = target_model.predict(data.test)

    tqdm.write('  Train Accuracy: {:.4f}'.format(
        (y_pred_in.argmax(axis=1) == data.labels_train).mean()))
    tqdm.write('  Test Accuracy:  {:.4f}'.format(
        (y_pred_out.argmax(axis=1) == data.labels_test).mean()))

    in_preds = evaluate_membership(
        attack_models,
        y_pred_in,
        data.labels_train
    )
    out_preds = evaluate_membership(
        attack_models,
        y_pred_out,
        data.labels_test
    )

    wrongs_in = y_pred_in.argmax(axis=1) != data.labels_train
    wrongs_out = y_pred_out.argmax(axis=1) != data.labels_test

    true_positives = (in_preds > 0.5).mean()
    true_negatives = (out_preds < 0.5).mean()
    attack_acc = (true_positives + true_negatives) / 2.

    attack_precision = (in_preds > 0.5).sum() / (
        (in_preds > 0.5).sum() + (out_preds > 0.5).sum()
    )

    # Compare to a baseline that merely guesses correct classified instances
    # are in and incorrectly classified instances are out.
    baseline_true_positives = \
        (y_pred_in.argmax(axis=1) == data.labels_train).mean()
    baseline_true_negatives = \
        (y_pred_out.argmax(axis=1) != data.labels_test).mean()
    baseline_attack_acc = \
        (baseline_true_positives + baseline_true_negatives) / 2.

    baseline_precision = \
        (y_pred_in.argmax(axis=1) == data.labels_train).sum() / (
            (y_pred_in.argmax(axis=1) == data.labels_train).sum() +
            (y_pred_out.argmax(axis=1) == data.labels_test).sum()
        )

    tqdm.write(
      f"\nTrue positive rate: {true_positives:0.4f}, " +
      f"true negative rate: {true_negatives:0.4f}"
    )
    tqdm.write(
      f"Shadow Attack Accuracy: {attack_acc:0.4f}, precision: {attack_precision:0.4f} " +
      f"(baseline: {baseline_attack_acc:0.4f}, {baseline_precision:0.4f})"
    )

Training target model...

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Building attack model...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Train on 80000 samples
Evaluating target model...
  Train Accuracy: 0.9529
  Test Accuracy:  0.4144
Index: 0
Index: 100
Index: 200
Index: 300
Index: 400
Index: 500
Index: 600
Index: 700
Index: 800
Index: 900
Index: 1000
Index: 1100
Index: 1200
Index: 1300
Index: 1400
Index: 1500
Index: 1600
Index: 1700
Index: 1800
Index: 1900
Index: 2000
Index: 2100
Index: 2200
Index: 2300
Index: 2400
Index: 2500
Index: 2600
Index: 2700
Index: 2800
Index: 2900
Index: 3000
Index: 3100
Index: 3200
Index: 3300
Index: 3400
Index: 3500
Index: 3600
Index: 3700
Index: 3800
Index: 3900

In [0]:
N = 3
C = 10
y_pred = np.random.rand(N, C)
y = np.random.rand(N)
y_for_model = [np.hstack((yp, to_categorical(Y, 10))) for yp, Y in zip(y_pred, y)]
# for yp, Y in zip(y_pred, y):
    #y_for_model = np.hstack((yp, to_categorical(Y, 10)))
for i in range(len(y)):
    print(np.asmatrix(y_for_model[i]).shape)

(1, 20)
(1, 20)
(1, 20)


Question 3

In [0]:
!pip install gensim json
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin.gz
!mv GoogleNews-vectors-negative300.bin data

[31mERROR: Could not find a version that satisfies the requirement json (from versions: none)[0m
[31mERROR: No matching distribution found for json[0m
--2020-05-10 14:51:59--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.88.117
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.88.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-05-10 14:52:35 (43.9 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
import gensim.models
import numpy as np
from sklearn.decomposition import PCA
import json
from tqdm import tqdm

In [0]:
class WordEmbeddingDebiaser:

    def __init__(
        self,
        embedding_file_path,
        definitional_file_path='./data/definitional_pairs.json',
        equalize_file_path='./data/equalize_pairs.json',
        gender_specific_file_path='./data/gender_specific_full.json'
    ):

        self.model = gensim.models.KeyedVectors.load_word2vec_format(
            embedding_file_path, binary=True
        )

        # collect first 300000 words
        self.words = sorted([w for w in self.model.vocab],
                            key=lambda w: self.model.vocab[w].index)[:300000]

        # all vectors in an array (same order as self.words)
        self.vecs = np.array([self.model[w] for w in self.words])
        tqdm.write('vectors loaded')
        # should take 2-5 min depending on your machine

        self.n, self.d = self.vecs.shape

        # word to index dictionary
        self.w2i = {w: i for i, w in enumerate(self.words)}

        # Some relevant words sets required for debiasing
        with open(definitional_file_path, "r") as f:
            self.definition_pairs = json.load(f)

        with open(equalize_file_path, "r") as f:
            self.equalize_pairs = json.load(f)

        with open(gender_specific_file_path, "r") as f:
            self.gender_specific_words = json.load(f)
        self._normalize()

    # Some potentially helpful functions, you don't have to use/implement them.
    def _normalize(self):
        """
        normalize self.vecs
        """
        self.vecs /= np.linalg.norm(self.vecs, axis=1)[:, np.newaxis]

    def _drop(self, u, v):
        """
        remove a direction v from u
        """
        return u - v * u.dot(v) / v.dot(v)

    def w2v(self, word):
        """
        for a word, return its corresponding vector
        """
        return self.vecs[self.w2i[word]]

    def debias(self):
        self.gender_direction = self.identify_gender_subspace()
        self.neutralize()
        self.equalize()

    def identify_gender_subspace(self):
        """Using self.definitional_pairs to identify a gender axis (1 dimensional).

          Output: a gender direction using definitonal pairs

        ****Note****

         no other unimported packages listed above are allowed, please use
         numpy.linalg.svd for PCA

        """
        # SOLUTION
        #get indexes of 'female' and 'male' related words in definitional pairs
        index_list = [[self.words.index(f), self.words.index(m)] \
                      for f,m in self.definition_pairs] 
        
        center_list = [[(self.vecs[pair[0]] + self.vecs[pair[1]])/2] \
                       for pair in index_list] #get average the data pair
        
        differences = [[self.vecs[index_list[i][0]] - center_list[i], \
                        self.vecs[index_list[i][1]] - center_list[i]] \
                       for i in range(len(center_list))] #calc dist away from mean
        
        differences = np.array([pair[i][0] for pair in differences for i in range(len(pair))]) #turn into array
     
        _, _, Vh = np.linalg.svd(differences, full_matrices=False) #SVD to compute PCA on the data
        self.gend_direct = -Vh[0] #gender direction denoted by 1st eigenvector
        '''Checked my self.pca against the code below for sklearn.decomposition.PCA method
        pca = PCA(n_components=1)
        pca = pca.fit(differences)
        pca_vals = pca.components_[0]
        print(self.gend_direct.round(2) == pca_vals.round(2)) #True
        '''
        # END OF SOLUTION

    def neutralize(self):
        """Performing the neutralizing step: projecting all gender neurtal words away
        from the gender direction

        No output, please adjust self.vecs

        """
        # SOLUTION
        #modify self.vecs only if the word is not in the gender specific words
        self.vecs = np.array([self._drop(self.vecs[i], self.gend_direct) \
                              if self.words[i] not in self.gender_specific_words \
                              else self.vecs[i] for i in range(len(self.words))])
        self._normalize()
        # END OF SOLUTION

    def equalize(self):
        """Performing the equalizing step: make sure all equalized pairs are
        equaldistant to the gender direction.

        No output, please adapt self.vecs

        """
        # SOLUTION
        dist = [(self.w2v(f) + self.w2v(m)) / 2 for f,m in self.equalize_pairs] #dist for words in equalize pairs
        remove = [self._drop(dist[i], self.gend_direct) for i in range(len(dist))] #removing directions
        
        #distance metric to know how far to move the words to make them equidistant in step below
        v = [np.sqrt(1 - np.linalg.norm(remove[i]) ** 2) if np.dot(dist[i]*2, self.gend_direct) < 0\
             else -np.sqrt(1 - np.linalg.norm(remove[i]) ** 2) for i in range(len(remove))]
       
        idx = 0
        '''update self.vecs accordingly equalizing distance between words in 
           equalize pairs and gender neutral words for the gender subspace'''  
        for f, m in self.equalize_pairs:
            self.vecs[self.words.index(f)] = v[idx] * self.gend_direct + remove[idx]
            self.vecs[self.words.index(m)] = -v[idx] * self.gend_direct + remove[idx]
            idx += 1
        self._normalize()
        # END OF SOLUTION

    def compute_analogy(self, w3, w1='woman', w2='man'):
        """input: w3, w1, w2, satifying the analogy w1: w2 :: w3 : w4

        output: w4(a word string) which is the solution to the analogy (w4 is
          constrained to be different from w1, w2 and w3)

        """
        diff = self.w2v(w2) - self.w2v(w1)
        vec = diff / np.linalg.norm(diff) + self.w2v(w3)
        vec = vec / np.linalg.norm(vec)
        if w3 == self.words[np.argsort(vec.dot(self.vecs.T))[-1]]:
            return self.words[np.argsort(vec.dot(self.vecs.T))[-2]]
        return self.words[np.argmax(vec.dot(self.vecs.T))]


if __name__ == '__main__':

    # Original Embedding

    we = WordEmbeddingDebiaser('./data/GoogleNews-vectors-negative300.bin')

    print('=' * 50)
    print('Original Embeddings')
    # she-he analogy evaluation
    w3s1 = [
        'her', 'herself', 'spokeswoman', 'daughter', 'mother', 'niece',
        'chairwoman', 'Mary', 'sister', 'actress'
    ]
    w3s2 = [
        'nurse', 'dancer', 'feminist', 'baking', 'volleyball', 'softball',
        'salon', 'blond', 'cute', 'beautiful'
    ]

    w4s1 = [we.compute_analogy(w3) for w3 in w3s1]
    w4s2 = [we.compute_analogy(w3) for w3 in w3s2]

    print('Appropriate Analogies')
    for w3, w4 in zip(w3s1, w4s1):
        print("'woman' is to '%s' as 'man' is to '%s'" % (w3, w4))

    print('Potentially Biased Analogies')
    for w3, w4 in zip(w3s2, w4s2):
        print("'woman' is to '%s' as 'man' is to '%s'" % (w3, w4))

    we.debias()

    print('=' * 50)
    print('Debiased  Embeddings')
    # she-he analogy evaluation
    w4s1 = [we.compute_analogy(w3) for w3 in w3s1]
    w4s2 = [we.compute_analogy(w3) for w3 in w3s2]

    print('Appropriate Analogies')
    for w3, w4 in zip(w3s1, w4s1):
        print("'woman' is to '%s' as 'man' is to '%s'" % (w3, w4))

    print('Potentially Biased Analogies')
    for w3, w4 in zip(w3s2, w4s2):
        print("'woman' is to '%s' as 'man' is to '%s'" % (w3, w4))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


vectors loaded
Original Embeddings
Appropriate Analogies
'woman' is to 'her' as 'man' is to 'his'
'woman' is to 'herself' as 'man' is to 'himself'
'woman' is to 'spokeswoman' as 'man' is to 'spokesman'
'woman' is to 'daughter' as 'man' is to 'son'
'woman' is to 'mother' as 'man' is to 'father'
'woman' is to 'niece' as 'man' is to 'nephew'
'woman' is to 'chairwoman' as 'man' is to 'chairman'
'woman' is to 'Mary' as 'man' is to 'Paul'
'woman' is to 'sister' as 'man' is to 'brother'
'woman' is to 'actress' as 'man' is to 'actor'
Potentially Biased Analogies
'woman' is to 'nurse' as 'man' is to 'medic'
'woman' is to 'dancer' as 'man' is to 'magician'
'woman' is to 'feminist' as 'man' is to 'anarchist'
'woman' is to 'baking' as 'man' is to 'roasting'
'woman' is to 'volleyball' as 'man' is to 'football'
'woman' is to 'softball' as 'man' is to 'baseball'
'woman' is to 'salon' as 'man' is to 'barber_shop'
'woman' is to 'blond' as 'man' is to 'burly'
'woman' is to 'cute' as 'man' is to 'goofy'
