In [None]:
import sys
import numpy as np # Numpy is Python's built in library for matrix operations.
from scipy.stats import mode
import sys
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
from scipy.io import loadmat
import array as arr

# **Find the Class Priors for a Simple Array**
In the project, you will need to write a function that determines the class priors for baby names. For example, if you have a simple array of binary labels that represents a training dataset of baby names, where ‚Äú0‚Äù indicates a boy name and ‚Äú1‚Äù indicates a girl name, your sample might look like this:


[0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0]


Given this example array, you could easily count the number of occurrences and determine that the class prior for boy names is 7/12 and for girl names is 5/12. But what if you have a dataset that is much larger? How might you calculate the class prior efficiently?
Instructions:
Launch the iPython interpreter by typing ipython at the command prompt.
Import NumPy with import numpy as np.
Import the data set ‚Äúlabels‚Äù with from helper import labels. This is a list of all the labels in our dataset, which has 100 labeled examples.
Determine the class priors for the two classes, boy and girl. Boys are labeled with 0, and girls with 1. Hint: use np.mean.
Answer the multiple choice questions below to check your solution.

In [None]:
labels = np.array([0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0])



Class prior for girls = sum of girls / total = sum of 1 / total numbers in array = sum of 1s / 100

In [None]:
girls = np.mean(labels)

print(girls)

0.49


Class prior for guys = sum of guys / 100 = total - sum of girls / 100 = 1 - girls

In [None]:
boys = 1-girls

print(boys)

0.51


Take the log of the class priors, so it is easier to sum those later

In [None]:
NaiveBayesPY_boys = np.log(boys)
NaiveBayesPY_girls = np.log(girls)

# **Find the Conditional Probability**
In the final project of this course, you will need to write a function that assumes conditional independence and determines class-conditional probabilities.

Imagine you have the following vector of probabilities of 5 features given a class label c:
P(x|y=c) = [0.212, 0.234, 0.155, .021, .04]

Since you will assume independence, you can simply multiply these probabilities to get the class conditional probability of observing this data point (i.e. this combination of feature values).

Instructions:
Launch the iPython interpreter by typing ipython at the command prompt.
Import NumPy by running import numpy as np.

Create the array np.array([0.212, 0.234, 0.155, .021, .04]), which corresponds to the likelihood of observing the features x_1 through x_5 conditional on the class label y=c.

Compute the product, sum of the logarithm, and exponentiated sum of the logarithm of the probabilities.

You may find np.prod, np.sum, np.log and np.exp helpful.
Think about what this product signifies, in terms of probabilities.

Answer the multiple choice questions below to check your solution.

Note: You will notice that the product becomes very small quickly, whereas the logarithm stays well above machine precision and is therefore more computationally tractable!

In [None]:
p_x_given_y_is_c = np.array([0.212, 0.234, 0.155, .021, .04])

Compute product of all features given y = c

In [None]:
prod_p_x_given_y_is_c = np.prod(p_x_given_y_is_c)
print(prod_p_x_given_y_is_c)

6.458961600000001e-06


Compute sum of the log of features given y = c

In [None]:
sum_of_logs = np.sum(np.log(p_x_given_y_is_c))
print(sum_of_logs)

-11.950041996124366


Compute exponent of sum of logs

In [None]:
exp_sum_of_logs = np.exp(sum_of_logs)
print(exp_sum_of_logs)

6.458961599999997e-06


# **Extract and Hash Binary Features**
In the project, you will need to extract and hash features of names.
Let‚Äôs say you have the following list of names and you want to determine whether each name in the list ends with a vowel (a, e, i, o, u, y) or a consonant.
 names = ['Natalia', 'Anastasia', 'Emilia', 'Marie', 'Jonas', 'Jordan', 'Brett']

Given this list, you should return a ‚Äú1‚Äù for ‚ÄúNatalia‚Äù since that name ends in a vowel. Since ‚ÄúJonas‚Äù ends in a consonant, you would return a "0". For the exercise, you‚Äôll practice writing code that can extract a binary feature representing whether each name ends in a vowel or a consonant.

Instructions:
Launch the iPython interpreter by typing ipython at the command prompt.
Import a list of names from the helper file using from helper import names. Extract a binary feature that indicates whether the name ends in a vowel (1) or a consonant (0).
Check the sum of all features across all names. (i.e., Determine the total number of names ending in a vowel.)
Answer the multiple choice questions below to check your solution.

In [None]:
import numpy as np

# List of names
names = np.array(['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'])

# Define a function to check if a name ends in a vowel
def ends_in_vowel(name):
    vowels = np.array(['a', 'e', 'i', 'o', 'u'])
    if name[-1].lower() in vowels:
        return True
    else:
        return False

# Use np.vectorize to apply ends_in_vowel to each element of names
v_ends_in_vowel = np.vectorize(ends_in_vowel)
vowels_mask = v_ends_in_vowel(names)

# Use the vowels_mask to filter the original array of names and convert the filtered names to binary
binary_names = np.array([bin(ord(name))[2:] for name in names[vowels_mask]]) #???

# Print binary names
print(binary_names)

TypeError: ignored

In [None]:
names = ['Natalia', 'Anastasia', 'Emilia', 'Marie', 'Jonas', 'Jordan', 'Brett']


In [None]:
def isVowel(ch):
    return 1*(ch == 'a' or ch == 'e' or ch == 'i' or ch == 'o' or
            ch == 'u')

In [None]:
def last_letter_is_vowel(name):
  vowels = np.array(['a', 'e', 'i', 'o', 'u', 'y'])
  if name[-1].lower() in vowels:
        return 1*True
  else:
        return 1*False


In [None]:
last_vowel_array = np.vectorize(last_letter_is_vowel)

In [None]:
np.sum(last_vowel_array(names), axis=0)

4

In [None]:
last_letter_is_vowel('Elena')

1

In [None]:
last_letter_is_vowel('jeff')

0

In [None]:
def last_letter(x):
  return x[len(x)-1]

In [None]:
last_letter('Elena')

'a'

In [None]:
ch = last_letter('Elena')

In [None]:
isVowel(ch)

1

In [None]:
isVowel(last_letter('Jeff'))

0

In [None]:
def last_vowel(x):
    return isVowel(last_letter(x))

In [None]:
last_vowel_array(names)

array([1, 1, 1, 1, 0, 0, 0])

In [None]:
def bin_function(x):
  y = np.array(x, dtype=str)
  l = []
  for i in y:
    index = ord(i)
    np.append(l, last_vowel(i))

In [None]:
print(bin_function(names))

None


In [None]:
x = np.array(names, dtype=str)
print(x)

['Natalia' 'Anastasia' 'Emilia' 'Marie' 'Jonas' 'Jordan' 'Brett']


In [None]:
list = np.empty(len(names))
print(list)

[0. 0. 0. 0. 0. 0. 0.]


In [None]:
list = np.empty(len(names))



In [None]:
np.insert(arr = list, obj = 0, values = last_vowel('Elena'), axis = 0)

array([1, 0, 0, 0, 0, 0, 0, 0])

In [None]:
zeros = np.zeros(len(names), dtype=int)
print(zeros)


[0 0 0 0 0 0 0]


In [None]:
list = np.array(names)
print(list)

['Natalia' 'Anastasia' 'Emilia' 'Marie' 'Jonas' 'Jordan' 'Brett']


In [None]:
np.insert(arr = list, obj = 0, values = last_vowel(list[0]), axis = 0)

array(['1', 'Natalia', 'Anastasia', 'Emilia', 'Marie', 'Jonas', 'Jordan',
       'Brett'], dtype='<U9')

In [None]:
print(np.append(list, last_vowel(list[0])))

['Natalia' 'Anastasia' 'Emilia' 'Marie' 'Jonas' 'Jordan' 'Brett' '1']


In [None]:
import sys
sys.path.append('/home/codio/workspace/.guides/hf')
from helper import *

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
print('You\'re running python %s' % sys.version.split(' ')[0])

You're running python 3.9.16


# **The hashfeatures and name2features Functions**

Below, the hashfeatures and name2features functions will take the plain text names and convert them to binary feature vectors (vectors with 0s and 1s) so that you'll be able to work with the data effectively.

**Converting non-numeric training data to numeric vectors**

The hashfeatures function relies on Python's inbuilt hash function, which converts any data type in Python to a large integer. The hash function ensures that the same objects have the same hash output in a single Python session. Across different Python sessions, you can ensure hash outputs the same integer for the same objects by seeding it.

In our case, hash will convert string objects to integers, which we will "truncate" to our desired dimension d.

hashfeatures has been implemented for you. It works as follows: It splits the baby name FIX times such that split 0 < m <= FIX gives 2 strings. The first is a prefix of baby of length m and the second is the suffix of baby of length m.

For example, with debug=True, calling hashfeatures on Addisyn with d=128 and FIX=3 will give strings:

(A>, <n)

(Ad>, <yn)

(Add>, <syn)

Let's call each split's output (prefix, suffix). hashfeatures then converts both prefix and suffix to large integers P and S using hash. However, since we need a binary feature vector of d dimensions, hashfeatures takes the remainder of P and S with d, and sets the (P % d)th and (S % d)th dimensions of the feature vector to 1.

We define the hashfeatures function to output binary vectors, but they need not be binary; they can also have floating point values. In the Challenge section later in this assignment, you will be able to try your own name2features function to improve the model's performance.

In [None]:
def hashfeatures(baby, d, FIX, debug=False):
    """
    Input:
        baby : a string representing the baby's name to be hashed
        d: the number of dimensions to be in the feature vector
        FIX: the number of chunks to extract and hash from each string
        debug: a bool for printing debug values (default False)

    Output:
        v: a feature vector representing the input string
    """
    v = np.zeros(d)
    for m in range(1, FIX+1):
        prefix = baby[:m] + ">"
        P = hash(prefix) % d
        v[P] = 1

        suffix = "<" + baby[-m:]
        S = hash(suffix) % d
        v[S] = 1

        if debug:
            print(f"Split {m}/{FIX}:\t({prefix}, {suffix}),\t1s at indices [{P}, {S}]")
    if debug:
        print(f"Feature vector for {baby}:\n{v.astype(int)}\n")
    return v

# **Changing hashfeatures arguments**

**Changing d**

It is likely that different strings get hashed to different numbers, but trigger the same dimension in the feature vector to be 1, thus causing collisions. This is because the length of our feature vector d is generally much smaller than the magnitudes of hash outputs. We can reduce the number of collisions by increasing d and reducing the likelihood of P % d or S % d for different strings being equal. If FIX is left unchanged, then short baby names will trigger fewer dimensions of this longer feature vector, resulting in sparsity. Although the resulting vectors are more expressive, we will likely face the curse of dimensionality if we carelessly increase d. Moreover, if the model is also expressive enough, expressive feature vectors often result in model overfitting: model not generalizing well to unseen test data because it fits training data almost too well.

On the other hand, reducing d causes the number of collisions to generally increase by increasing the likelihood of P % d or S % d of different strings to be equal. This creates dense vectors that are not very expressive (feature vectors for different baby names will start having 1s in the same dimensions as d decreases). When feature vectors are not very expressive, it becomes difficult for models to learn classification boundaries between different classes, causing model underfitting: model not generalizing well to unseen test data because it cannot learn decision boundaries effectively.

**Changing FIX**

On the lines of overfitting and underfitting, take some time and think what the effect of changing FIX will be on feature vector density, and possibly model performance down-the-road.

In the trivial case, we do not want FIX to be very large. If FIX is larger than the length of baby name, then we will be running the loop in hashfeatures for extra iterations but computing the same prefixes and suffixes (running Max with FIX=5 will give (Max>, <Max) for iterations m=3 onwards.

The non-trivial case is when FIX is generally smaller than lengths of most baby names. FIX=1 will only split once and give sparse feature vectors if d is reasonably large, whereas FIX=5 will split 5 times and give denser vectors.

Let's test hashfeatures on a few examples, varying baby, d, and FIX.

In [None]:
v = hashfeatures("Addisyn", d=128, FIX=3, debug=True)
v = hashfeatures("Addisyn", d=4, FIX=3, debug=True)
v = hashfeatures("Addisyn", d=128, FIX=7, debug=True)
v = hashfeatures("Max", d=128, FIX=4, debug=True)

Split 1/3:	(A>, <n),	1s at indices [56, 9]
Split 2/3:	(Ad>, <yn),	1s at indices [77, 84]
Split 3/3:	(Add>, <syn),	1s at indices [75, 34]
Feature vector for Addisyn:
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

Split 1/3:	(A>, <n),	1s at indices [0, 1]
Split 2/3:	(Ad>, <yn),	1s at indices [1, 0]
Split 3/3:	(Add>, <syn),	1s at indices [3, 2]
Feature vector for Addisyn:
[1 1 1 1]

Split 1/7:	(A>, <n),	1s at indices [56, 9]
Split 2/7:	(Ad>, <yn),	1s at indices [77, 84]
Split 3/7:	(Add>, <syn),	1s at indices [75, 34]
Split 4/7:	(Addi>, <isyn),	1s at indices [46, 33]
Split 5/7:	(Addis>, <disyn),	1s at indices [99, 105]
Split 6/7:	(Addisy>, <ddisyn),	1s at indices [37, 113]
Split 7/7:	(Addisyn>, <Addisyn),	1s at indices [119, 39]
Feature vector for Addisyn:
[0 0 0 0 0 0 0 0 0 1 0

The key takeaway here is that the feature extraction process is highly tunable and will almost always impact model performance. While you may sometimes be able to manually select key features by perusing training data, it is often good practice to run multiple experiments with different approaches to define the set of features.

In [None]:
def name2features(filename, d=128, FIX=3, LoadFile=True, debug=False):
    """
    Output:
        X : n feature vectors of dimension d, (nxd)
    """
    # read in baby names
    if LoadFile:
        with open(filename, 'r') as f:
            babynames = [x.rstrip() for x in f.readlines() if len(x) > 0]
    else:
        babynames = filename.split('\n')
    n = len(babynames)
    X = np.zeros((n, d))
    for i in range(n):
        X[i,:] = hashfeatures(babynames[i], d, FIX)
    return (X, babynames) if debug else X

In the code cell above, name2features reads every name in the given file and converts it into a 128-dimensional feature vector by first assembling substrings (based on the parameter FIX), then hashing these assembled substrings and modifying the feature vector index (the modulo of the number of dimensions d) that corresponds to this hash value.

More often than not, having a good understanding of the training data can help in training a good Machine Learning model. Let's check out the feature matrix visually. On the Y-axis are baby names, and on the X-axis are features for each baby name. The heatmap is white when the feature value is 1 and black otherwise.

You will notice that certain feature indices are white for many baby names. What could be the reason?

In [None]:
Xboys, namesBoys = name2features("boys.train", d=128, FIX=3, debug=True)
Xgirls, namesGirls = name2features("girls.train", d=128, FIX=3, debug=True)
X = np.concatenate([Xboys[:20], Xgirls[:20]], axis=0)

plt.figure(figsize=(20, 8))
ax = sns.heatmap(X.astype(int), cbar=False)
ax.set_xlabel('feature indices')
ax.set_ylabel('baby names')
ticks = ax.set_yticks(np.arange(40, dtype=int))
ticklabels = ax.set_yticklabels(namesBoys[:20] + namesGirls[:20])
plt.show()

FileNotFoundError: ignored

# **The genTrainFeatures Function**
We have provided you with a python function genTrainFeatures, which transforms the names into features and loads them into memory.

In [None]:
def genTrainFeatures(dimension=128):
    """
    Input:
        dimension: desired dimension of the features
    Output:
        X: n feature vectors of dimensionality d (nxd)
        Y: n labels (-1 = girl, +1 = boy) (n)
    """

    # Load in the data
    Xgirls = name2features("girls.train", d=dimension)
    Xboys = name2features("boys.train", d=dimension)
    X = np.concatenate([Xgirls, Xboys])

    # Generate Labels
    Y = np.concatenate([-np.ones(len(Xgirls)), np.ones(len(Xboys))])

    # shuffle data into random order
    ii = np.random.permutation([i for i in range(len(Y))])

    return X[ii, :], Y[ii]

You can call the following command to return two vectors, one holding all the concatenated feature vectors and one holding the labels of all boys and girls names.

In [None]:
X, Y = genTrainFeatures(128)
print(f'Shape of training data: {X.shape}')
print(f'X:\n{X.astype(int)}')
print(f'Y:\n{Y.astype(int)}')

FileNotFoundError: ignored

In [None]:
Shape of training data: (1200, 128)
X:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 1 0]
 ...
 [1 0 0 ... 0 0 1]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]]
Y:
[-1 -1  1 ...  1  1  1]

Shape of X = (1200, 128)
Shape of Y = (1, 1200)

# **The Na√Øve Bayes Classifier **
The Na√Øve Bayes classifier is a linear classifier based on Bayes Rule. The following cells will walk you through steps and ask you to finish the necessary functions in a pre-defined order. *As a general rule, you should avoid tight loops at all costs.*

# **Part One: Class Probability**
Estimate the class probability  ùëÉ(ùë¶)
  in naivebayesPY. This should return the probability that a sample in the training set is positive or negative, independent of its features.

In [None]:
X = np.array([[0,1,1,0,1],
            [1,0,0,1,0],
            [1,1,1,1,0],
            [0,1,1,0,1],
            [1,0,1,0,0],
            [0,0,1,0,0],
            [1,1,1,0,1]])

Y = np.array([1,-1, 1, 1,-1,-1, 1])

In [None]:
def naivebayesPY(X, Y):
    """
    naivebayesPY(X, Y) returns [pos,neg]

    Computation of P(Y)
    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        pos: probability p(y=1)
        neg: probability p(y=-1)
    """

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    Y = np.concatenate([Y, [-1,1]])
    n = len(Y)

    return [np.sum(Y == 1)/n, np.sum(Y == -1)/n]

    raise NotImplementedError()

In [None]:
naivebayesPY(X,Y)

[0.5555555555555556, 0.4444444444444444]

Introducing Laplace smooothing does not match the output expected in tests below anymore - why?

But it works with

np.sum(Y == 1)/n, np.sum(Y == -1)/n

In [None]:
# The following tests will check that the probabilities returned by your function sum to 1 (test1) and return the correct probabilities for a given set of input vectors (tests 2-4)

# Check that probabilities sum to 1
def naivebayesPY_test1():
    pos, neg = naivebayesPY(X,Y)
    return np.linalg.norm(pos + neg - 1) < 1e-5

# Test the Naive Bayes PY function on a simple example
def naivebayesPY_test2():
    x = np.array([[0,1],[1,0]])
    y = np.array([-1,1])
    pos, neg = naivebayesPY(x,y)
    pos0, neg0 = .5, .5
    test = np.linalg.norm(pos - pos0) + np.linalg.norm(neg - neg0)
    return test < 1e-5

# Test the Naive Bayes PY function on another example
def naivebayesPY_test3():
        x = np.array([[0,1,1,0,1],
            [1,0,0,1,0],
            [1,1,1,1,0],
            [0,1,1,0,1],
            [1,0,1,0,0],
            [0,0,1,0,0],
            [1,1,1,0,1]])
        y = np.array([1,-1, 1, 1,-1,-1, 1])
        pos, neg = naivebayesPY(x,y)
        pos0, neg0 = 5/9., 4/9.
        test = np.linalg.norm(pos - pos0) + np.linalg.norm(neg - neg0)
        return test < 1e-5

# Tests plus-one smoothing
def naivebayesPY_test4():
    x = np.array([[0,1,1,0,1],[1,0,0,1,0]])
    y = np.array([1,1])
    pos, neg = naivebayesPY(x,y)
    pos0, neg0 = 3/4., 1/4.
    test = np.linalg.norm(pos - pos0) + np.linalg.norm(neg - neg0)
    return test < 1e-5

# **Part Two: Conditional Probability**
Estimate the conditional probabilities  ùëÉ([ùê±]ùõº=1|ùëå=ùë¶)
  in naivebayesPXY. Notice that by construction, our features are binary categorical features. Use a **categorical** distribution as model and return the probability vectors for each feature being 1 given a class label. Note that the result will be two vectors of length d (the number of features), where the values represent the probability that feature i is equal to 1.

Here we compute the fraction of counts that a feature is hot or not ( [ùê±]ùõº=1
  or  [ùê±]ùõº=0
 ) conditioned on gender. For example, if  [ùê±]1=[1,0,1,0,1]
  and  ùëå=[1,1,1,1,‚àí1]
  (boy= 1
  and girl= ‚àí1
 ), then  ùëÉ([ùê±]1=1|ùëå=1)=(1+0+1+0)/4=0.5
  and  ùëÉ([ùê±]1=1|ùëå=‚àí1)=(1)/1=1
 . You need to compute this for each dimension 0 <= i < d for each gender.

In [None]:
def naivebayesPXY(X,Y): #incorrect
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
    """

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    n, d = X.shape
    X = np.concatenate([X, np.ones((2,d)), np.zeros((2,d))])
    Y = np.concatenate([Y, [-1,1,-1,1]])
    X_boys = X[Y == 1]
    X_girls = X[Y == -1]

    for i in range(0, X.shape[1]):

      #sum of 1s at feature i across all boy names
      posprob = np.vectorize((np.sum(X_boys[:,i] == 1))/ X_boys.shape[0])

      #sum of 1s at feature i across all girl names
      negprob = np.vectorize((np.sum(X_girls[:,i] == 1))/ X_girls.shape[0])

    return [posprob, negprob]

    raise NotImplementedError()

**Possible error: **

Certainly! It sounds like you're dealing with a NumPy error related to the type of the objects you're passing into a function.

The error message you provided suggests that you might be passing a NumPy vectorized function or a list containing NumPy vectorized functions to a function that doesn't support those types of objects.

NumPy vectorized functions are functions that can apply element-wise operations on NumPy arrays. They can be created using the numpy.vectorize method, and can be very useful for performing operations on arrays without having to use loops.

If you're passing a NumPy vectorized function to a function that doesn't support it, you might need to modify your code to handle the vectorized function differently. You could try applying the vectorized function to the inputs before passing them to the function, or you could modify the function to accept vectorized functions as arguments.

Alternatively, if you're passing a list containing NumPy vectorized functions to a function, you might need to iterate over the list and apply each function to the inputs separately. Again, you could modify the function to accept a list of vectorized functions as an argument if that's what you need.

here are a couple of issues with the code you posted:

The vectorize function is being used incorrectly. The purpose of vectorize is to apply a scalar function to each element of a vectorized input. In this code, np.sum(X_boys[:,i] == 1) is already a scalar value, so wrapping it with np.vectorize() is not necessary. Instead, you can just call np.sum(X_boys[:,i] == 1) / X_boys.shape[0] directly.
The return statement is inside the for loop, so the function will only iterate once before returning the results. You should move the return statement outside the loop, after the two probability vectors have been computed.
There is an error in the input data preprocessing. The code adds four extra rows to X and Y, but it only needs to add two. Specifically, it should add one row for each class (positive and negative), rather than two rows for each class.
Here is a corrected version of the code:

In [None]:
def naivebayesPXY(X,Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob] #list of arrays

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
    """

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    n, d = X.shape
    X = np.concatenate([X, np.ones((1,d)), np.zeros((1,d))])
    Y = np.concatenate([Y, [-1,1]])
    X_boys = X[Y == 1]
    X_girls = X[Y == -1]

    posprob = np.zeros(d)
    negprob = np.zeros(d)

    for i in range(d):
        # sum of 1s at feature i across all boy names
        posprob[i] = np.sum(X_boys[:,i] == 1) / X_boys.shape[0]

        # sum of 1s at feature i across all girl names
        negprob[i] = np.sum(X_girls[:,i] == 1) / X_girls.shape[0]

    return [posprob, negprob]

    raise NotImplementedError()

In [None]:
X_boys = X[Y == 1]
print(X_boys)

[[0 1 1 0 1]
 [1 1 1 1 0]
 [0 1 1 0 1]
 [1 1 1 0 1]]


In [None]:
naivebayesPXY(X,Y)

NameError: ignored

To avoid using a for loop

In this version of the code, posprob and negprob are computed using np.sum() with axis=0 to sum across the rows (i.e., examples) and compute the fraction of 1s for each feature. This gives you the same results as the loop-based approach, but it should be faster for large datasets.

In [None]:
def naivebayesPXY(X,Y):
    """
    naivebayesPXY(X, Y) returns [posprob,negprob]

    Input:
        X : n input vectors of d dimensions (nxd)
        Y : n labels (-1 or +1) (n)

    Output:
        posprob: probability vector of p(x_alpha = 1|y=1)  (d)
        negprob: probability vector of p(x_alpha = 1|y=-1) (d)
    """

    # add one positive and negative example to avoid division by zero ("plus-one smoothing")
    n, d = X.shape
    X = np.concatenate([X, np.ones((1,d)), np.zeros((1,d))])
    Y = np.concatenate([Y, [-1,1]])
    X_boys = X[Y == 1]
    X_girls = X[Y == -1]

    # compute probability vectors using vectorized operations
    posprob = np.sum(X_boys, axis=0) / X_boys.shape[0]
    negprob = np.sum(X_girls, axis=0) / X_girls.shape[0]

    return [posprob, negprob]

In [None]:
naivebayesPXY(X,Y)

[array([0.4, 0.8, 0.8, 0.2, 0.6]), array([0.75, 0.25, 0.75, 0.5 , 0.25])]

In [None]:
posprob = naivebayesPXY(X,Y)[0]
negprob = naivebayesPXY(X,Y)[1]

In [None]:
naivebayesPXY(X,Y)[0].shape

(5,)

In [None]:
np.reshape(naivebayesPXY(X,Y)[0],(5, 1))

array([[0.4],
       [0.8],
       [0.8],
       [0.2],
       [0.6]])

In [None]:
np.reshape(naivebayesPXY(X,Y)[0],(1, 5))

array([[0.4, 0.8, 0.8, 0.2, 0.6]])

# **Part Three: Log Likelihood**
Calculate the log likelihood  logùëÉ(ùê±|ùëå=ùë¶)
  for each point in X_test given label Y_test in loglikelihood.

Recall
Na√Øve Bayes assumption: the likelihood  ùëÉ(ùê±|ùëå=ùë¶)
  of a data point  ùê±
  is equal to the product of the conditional probabilities of each feature  [ùê±]ùõº
  having value  ùë•ùõº
 , i.e.,
ùëÉ(ùê±|ùëå=ùë¶)=‚àèùõº=1ùëëùëÉ([ùê±]ùõº=ùë•ùõº|ùëå=ùë¶).

For example, with  ùê±=[1,0,1]
  and corresponding label  ùëå=1
 , you will calculate likelihood  ùëÉ(ùê±|ùëå=1)
  as  ùëÉ([ùê±]1=1|ùëå=1)‚ãÖùëÉ([ùê±]2=0|ùëå=1)‚ãÖùëÉ([ùê±]3=1|ùëå=1)
 .

Given probabilities:

posprob vector:  ùëÉ([ùê±]ùõº=1|ùëå=1)

negprob vector:  ùëÉ([ùê±]ùõº=1|ùëå=‚àí1)

Fact  log(ùëéùëè)=logùëé+logùëè
 .

To simplify your code, we recommend calculating log likelihoods for positive points ( ùê±
  with  ùëå=1
 ) and those for negative points separately.

In [None]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    Input:
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)
        Y_test : labels (-1 or +1) (n)

    Output:
        loglikelihood of each point in X_test (n)
    """

    X_test_boys = X_test[Y_test == 1]
    X_test_girls = X_test[Y_test == -1]

# for each feature a in X_test, if [x]a in X_test is 1, given it is a boy, multiply it with conditional probability posprob from training data. If it is a zero, multiply it by 1 - conditional probability posprob. Sum the two probabilities together. This should output a vector (d) of probabilities for that particular test point, given it is a boy

# for each feature a in X_test, if [x]a in X_test is 1, given it is a girl, multiply it with conditional probability negprob from training data. If it is a zero, multiply it by 1 - conditional probability negprob. Sum the two probabilities together. This should output a vector (d) of probabilities for that particular test point, given it is a boy

#log each nxd output matrix and sum across columns to obtain 1xn loglikelihood for each data point in X_test, given it either boys or girls

    raise NotImplementedError()

You can avoid using a for loop by taking advantage of NumPy's array broadcasting and vectorization capabilities. Here's how you can modify the function to avoid using a for loop:

Here, we use NumPy's multiply function to element-wise multiply X_test_boys and posprob, and then multiply the complement of X_test_boys and the complement of posprob. We then add these two arrays together to get the probabilities for each feature for the positive class. We repeat the same process for the negative class.

We then use NumPy's log function to take the natural logarithm of each probability, and then sum across columns using np.sum to obtain the loglikelihood for each data point in X_test_boys and X_test_girls.

Finally, we concatenate the loglikelihoods for boys and girls using np.concatenate to obtain the loglikelihood for each data point in X_test.

Here, we use the argsort method in NumPy to obtain the indices that would sort Y_test. We then use these indices to reorder the loglikelihood array using array indexing. The resulting loglikelihood array will have the same order as the original X_test input.

Note that this implementation assumes that posprob and negprob are 1D arrays of length d. If they are 2D arrays, you may need to modify the implementation accordingly.

Error - Positive Logarithm outputs

Positive logarithm outputs can occur when probabilities that are supposed to be between 0 and 1 are not properly computed or when the input values to the logarithm function are not within the appropriate range.

Here are a few potential issues in the code that could be causing the positive logarithm outputs:

Conditional probabilities outside the range [0, 1]: Check that the values of posprob and negprob are between 0 and 1. If they are not, then the conditional probabilities are not being properly calculated, which could lead to positive logarithm outputs.

Incorrect multiplication of probabilities: Check that the multiplication of X_test with posprob and negprob is being performed correctly. If this calculation is incorrect, then the resulting probabilities may be greater than 1 or less than 0, which could lead to positive logarithm outputs.

Inappropriate input values to the logarithm function: Check that the input values to the np.log function are within the appropriate range. The logarithm of values less than or equal to zero is undefined, so if any of the values are outside this range, you will get positive logarithm outputs.

In [None]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    Input:
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)
        Y_test : labels (-1 or +1) (n)

    Output:
        loglikelihood of each point in X_test (n)
    """

    # Split X_test into boys and girls
    X_test_boys = X_test[Y_test == 1]
    X_test_girls = X_test[Y_test == -1]

    # Calculate loglikelihood for boys
    prob_boys = np.multiply(X_test_boys, posprob) + np.multiply(1 - X_test_boys, 1 - posprob)
    logprob_boys = np.log(prob_boys)
    loglikelihood_boys = np.sum(logprob_boys, axis=1)

    # Calculate loglikelihood for girls
    prob_girls = np.multiply(X_test_girls, negprob) + np.multiply(1 - X_test_girls, 1 - negprob)
    logprob_girls = np.log(prob_girls)
    loglikelihood_girls = np.sum(logprob_girls, axis=1)

    # Combine loglikelihood for boys and girls
    loglikelihood = np.concatenate((loglikelihood_boys, loglikelihood_girls))

    # Reshuffle loglikelihood to match original order of data points in X_test
    idx = np.argsort(Y_test) # get the indices that sort Y_test
    loglikelihood = loglikelihood[idx] # reorder loglikelihood using the sorted indices

    return loglikelihood

    raise NotImplementedError()


In [None]:
loglikelihood(posprob, negprob, X, Y)

array([-3.88830648, -2.94248776, -1.84387547, -1.6910819 , -1.6910819 ,
       -2.09654701, -2.94248776])

In [None]:
X = np.array([[0,1,1,0,1],
            [1,0,0,1,0],
            [1,1,1,1,0],
            [0,1,1,0,1],
            [1,0,1,0,0],
            [0,0,1,0,0],
            [1,1,1,0,1]])

Y = np.array([1,-1, 1, 1,-1,-1, 1])

Likelihood given name is boys'

In [None]:
X[Y==1]

array([[0, 1, 1, 0, 1],
       [1, 1, 1, 1, 0],
       [0, 1, 1, 0, 1],
       [1, 1, 1, 0, 1]])

In [None]:
posprob

array([0.4, 0.8, 0.8, 0.2, 0.6])

In [None]:
np.multiply(X[Y == 1], posprob)

array([[0. , 0.8, 0.8, 0. , 0.6],
       [0.4, 0.8, 0.8, 0.2, 0. ],
       [0. , 0.8, 0.8, 0. , 0.6],
       [0.4, 0.8, 0.8, 0. , 0.6]])

In [None]:
np.log(np.multiply(X[Y == 1], posprob))

  np.log(np.multiply(X[Y == 1], posprob))


array([[       -inf, -0.22314355, -0.22314355,        -inf, -0.51082562],
       [-0.91629073, -0.22314355, -0.22314355, -1.60943791,        -inf],
       [       -inf, -0.22314355, -0.22314355,        -inf, -0.51082562],
       [-0.91629073, -0.22314355, -0.22314355,        -inf, -0.51082562]])

In [None]:
np.sum(np.log(np.multiply(X[Y == 1], posprob)))

  np.sum(np.log(np.multiply(X[Y == 1], posprob)))


-inf

In [None]:
np.log(posprob)

array([-0.91629073, -0.22314355, -0.22314355, -1.60943791, -0.51082562])

In [None]:
np.multiply(X[Y == 1], np.log(posprob))

array([[-0.        , -0.22314355, -0.22314355, -0.        , -0.51082562],
       [-0.91629073, -0.22314355, -0.22314355, -1.60943791, -0.        ],
       [-0.        , -0.22314355, -0.22314355, -0.        , -0.51082562],
       [-0.91629073, -0.22314355, -0.22314355, -0.        , -0.51082562]])

In [None]:
np.sum(np.multiply(X[Y == 1], np.log(posprob)), axis=1)

array([-0.95711273, -2.97201575, -0.95711273, -1.87340346])

In [None]:
prob_boys_positive = np.sum(np.multiply(X[Y == 1], np.log(posprob)), axis=1)

In [None]:
X[Y == 1]@np.log(posprob)

array([-0.95711273, -2.97201575, -0.95711273, -1.87340346])

In [None]:
1 - X[Y == 1]

array([[1, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

In [None]:
1 - posprob

array([0.6, 0.2, 0.2, 0.8, 0.4])

In [None]:
np.log(1-posprob)

array([-0.51082562, -1.60943791, -1.60943791, -0.22314355, -0.91629073])

In [None]:
np.multiply(1 - X[Y == 1], np.log(1 - posprob))

array([[-0.51082562, -0.        , -0.        , -0.22314355, -0.        ],
       [-0.        , -0.        , -0.        , -0.        , -0.91629073],
       [-0.51082562, -0.        , -0.        , -0.22314355, -0.        ],
       [-0.        , -0.        , -0.        , -0.22314355, -0.        ]])

In [None]:
prob_boys_complement = np.sum(np.multiply(1 - X[Y == 1], np.log(1 - posprob)), axis=1)

In [None]:
prob_boys_likelihood = prob_boys_positive + prob_boys_complement
print(prob_boys_likelihood)

[-1.6910819  -3.88830648 -1.6910819  -2.09654701]


Likelihood given name is girls'

In [None]:
X[Y==-1]

array([[1, 0, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]])

In [None]:
negprob

array([0.75, 0.25, 0.75, 0.5 , 0.25])

In [None]:
np.log(negprob)

array([-0.28768207, -1.38629436, -0.28768207, -0.69314718, -1.38629436])

In [None]:
np.multiply(X[Y == -1], np.log(negprob))

array([[-0.28768207, -0.        , -0.        , -0.69314718, -0.        ],
       [-0.28768207, -0.        , -0.28768207, -0.        , -0.        ],
       [-0.        , -0.        , -0.28768207, -0.        , -0.        ]])

In [None]:
(X[Y == -1])@np.log(negprob)

array([-0.98082925, -0.57536414, -0.28768207])

In [None]:
prob_girls_positive = np.sum(np.multiply(X[Y == -1], np.log(negprob)), axis=1)
print(prob_girls_positive)

[-0.98082925 -0.57536414 -0.28768207]


In [None]:
1 - X[Y==-1]

array([[0, 1, 1, 0, 1],
       [0, 1, 0, 1, 1],
       [1, 1, 0, 1, 1]])

In [None]:
1 - negprob

array([0.25, 0.75, 0.25, 0.5 , 0.75])

In [None]:
np.log(1-negprob)

array([-1.38629436, -0.28768207, -1.38629436, -0.69314718, -0.28768207])

In [None]:
np.multiply(1 - X[Y == -1], np.log(1 - negprob))

array([[-0.        , -0.28768207, -1.38629436, -0.        , -0.28768207],
       [-0.        , -0.28768207, -0.        , -0.69314718, -0.28768207],
       [-1.38629436, -0.28768207, -0.        , -0.69314718, -0.28768207]])

In [None]:
prob_girls_complement = np.sum(np.multiply(1-X[Y == -1], np.log(1-negprob)), axis=1)
print(prob_girls_complement)

[-1.96165851 -1.26851133 -2.65480569]


In [None]:
prob_girls_likelihood = prob_girls_positive + prob_girls_complement
print(prob_girls_likelihood)

[-2.94248776 -1.84387547 -2.94248776]


In [None]:
likelihood = np.concatenate((prob_boys_likelihood, prob_girls_likelihood), axis = 0)
print(likelihood)

[-1.6910819  -3.88830648 -1.6910819  -2.09654701 -2.94248776 -1.84387547
 -2.94248776]


In [None]:
Y

array([ 1, -1,  1,  1, -1, -1,  1])

In [None]:
_,idx=np.unique(Y, axis=0,return_index=True)
print(idx)

[1 0]


In [None]:
likelihood[np.sort(idx)]

array([-1.6910819 , -3.88830648])

indices expected 0, *, *, 5, 1,

In [None]:
idx = np.sort(Y, axis=None) # get the indices that sort Y
print(idx)


[-1 -1 -1  1  1  1  1]


In [None]:
likelihood = likelihood[idx]
print(likelihood)

[-1.6910819  -3.88830648 -1.6910819  -2.09654701 -2.94248776 -1.84387547
 -2.94248776]


In [None]:
X

array([[0, 1, 1, 0, 1],
       [1, 0, 0, 1, 0],
       [1, 1, 1, 1, 0],
       [0, 1, 1, 0, 1],
       [1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [1, 1, 1, 0, 1]])

In [None]:
X_indexed = list(enumerate(X))
print(X_indexed)

[(0, array([0, 1, 1, 0, 1])), (1, array([1, 0, 0, 1, 0])), (2, array([1, 1, 1, 1, 0])), (3, array([0, 1, 1, 0, 1])), (4, array([1, 0, 1, 0, 0])), (5, array([0, 0, 1, 0, 0])), (6, array([1, 1, 1, 0, 1]))]


In [None]:
original_order_likelihood = [likelihood[i] for i,_ in X_indexed]
print(original_order_likelihood)

[-1.6910819014746106, -3.88830647881083, -1.6910819014746106, -2.0965470095827747, -2.9424877590351786, -1.8438754703670688, -2.9424877590351786]


In [None]:
X = [3, 7, 1, 5]
Y = [+1, -1, -1, +1]

# create a list of tuples with the index and value of each element in X
indexed_X = list(enumerate(X))

# sort the list of tuples based on the labels Y
sorted_X = sorted(indexed_X, key=lambda x: Y[x[0]])

# process the sorted list of tuples
processed_X = [process(x[1]) for x in sorted_X]

# sort the processed list of tuples back to the original order
original_order_X = [processed_X[i] for i, _ in indexed_X]


In [None]:
def loglikelihood(posprob, negprob, X_test, Y_test):
    """
    loglikelihood(posprob, negprob, X_test, Y_test) returns loglikelihood of each point in X_test

    Input:
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)
        Y_test : labels (-1 or +1) (n)

    Output:
        loglikelihood of each point in X_test (n)
    """

    n, d = X_test.shape
    loglikelihood = np.zeros(n)

    # Filter by boys and girls
    positive = (Y_test == 1)
    negative = (Y_test == -1)

    #use outerproduct to multiply vector element-wise to matrix and sum across rows, modify the zero-vector
    #impute the inverses for posprob and negprob
    loglikelihood[positive] = (X_test[positive])@np.log(posprob) + (1-X_test[positive])@np.log(1-posprob)
    loglikelihood[negative] = (X_test[negative])@np.log(negprob) + (1-X_test[negative])@np.log(1-negprob)

    return loglikelihood

    raise NotImplementedError()

In [None]:
loglikelihood(posprob, negprob, X, Y)

array([-1.6910819 , -2.94248776, -3.88830648, -1.6910819 , -1.84387547,
       -2.94248776, -2.09654701])

# **Part Four: Na√Øve Bayes Prediction**

Observe that for a test point  ùê±ùë°ùëíùë†ùë°
 , we should classify it as positive if the log ratio  log(ùëÉ(ùëå=1|ùê±=ùê±ùë°ùëíùë†ùë°)ùëÉ(ùëå=‚àí1|ùê±=ùê±ùë°ùëíùë†ùë°))>0
  and negative otherwise.
  
  Implement the naivebayes_pred by first calculating the log ratio  log(ùëÉ(ùëå=1|ùê±=ùê±ùë°ùëíùë†ùë°)ùëÉ(ùëå=‚àí1|ùê±=ùê±ùë°ùëíùë†ùë°))
  for each test point in  ùê±ùë°ùëíùë†ùë°
  using Bayes' rule and predict the label of the test points by looking at the log ratio.

Recall
Bayes' theorem:
ùëÉ(ùëå=ùë¶|ùê±=ùê±ùë°ùëíùë†ùë°)= ùëÉ(ùê±=ùê±ùë°ùëíùë†ùë°|ùëå=ùë¶)‚ãÖùëÉ(ùëå=ùë¶)ùëÉ(ùê±)

ùëÉ(ùëå=ùë¶|ùê±=ùê±ùë°ùëíùë†ùë°) posterior ‚àù ùëÉ(ùê±=ùê±ùë°ùëíùë†ùë°|ùëå=ùë¶)ikelihood‚ãÖùëÉ(ùëå=prior)

where  ‚àù
  is the proportionality symbol. Proportionality applies because we have dropped the denominator  
  
  ùëÉ(ùê±), which is just a multiplicative constant when finding  ùë¶
  that maximixes the posterior.

Given probabilities:

pos:  ùëÉ(ùëå=1)

neg:  ùëÉ(ùëå=‚àí1)

posprob vector:  ùëÉ([ùê±]ùõº=1|ùëå=1)

negprob vector:  ùëÉ([ùê±]ùõº=1|ùëå=‚àí1)

loglikelihood function you just implemented.

Facts  log(ùëéùëè)=logùëé+logùëè

  and  log(ùëéùëè)=logùëé‚àílogùëè

  (can simplify your calculations).

In [None]:
def naivebayes_pred(pos, neg, posprob, negprob, X_test):
    """
    naivebayes_pred(pos, neg, posprob, negprob, X_test) returns the prediction of each point in X_test

    Input:
        pos: class probability for the negative class
        neg: class probability for the positive class
        posprob: conditional probabilities for the positive class (d)
        negprob: conditional probabilities for the negative class (d)
        X_test : features (nxd)

    Output:
        prediction of each point in X_test (n)
    """

    n, d = X_test.shape

    #log of priors
    pos = np.log(naivebayesPY(X,Y))[0]
    neg = np.log(naivebayesPY(X,Y))[1]

    # create loglikelihoods for all test data being labelled as boy or girl, using conditinoal probabilities from training data
    loglikelihood_pos = loglikelihood(posprob,negprob, X_test, np.ones(n))
    loglikelihood_neg = loglikelihood(posprob, negprob, X_test, -np.ones(n))

    #naivebayes_pred quotient between the posteriors of boy vs girl
    naivebayes_pred_pos = pos + loglikelihood_pos
    naivebayes_pred_neg = neg + loglikelihood_neg
    quotient = naivebayes_pred_pos - naivebayes_pred_neg

    #prediction vector, if quotient <0, output is -1
    pred = -np.ones(n)
    pred[quotient > 0] = 1

    return pred

    raise NotImplementedError()

In [None]:
naivebayesPY(X, Y)

[0.5555555555555556, 0.4444444444444444]

In [None]:
np.log(naivebayesPY(X, Y))

array([-0.58778666, -0.81093022])

In [None]:
np.log(naivebayesPY(X,Y))[0]

-0.587786664902119

In [None]:
loglikelihood(posprob, negprob, X, Y)

array([-1.6910819 , -2.94248776, -3.88830648, -1.6910819 , -1.84387547,
       -2.94248776, -2.09654701])

In [None]:
loglikelihood[Y ==1] #why not?

TypeError: ignored

In [None]:
#avoid for-loops though
for i in range(n):
      if quotient >0:
        return i + "boy"
      else:
        return i + "girl"