
A person's name often hints at the country he or she comes from. Here we have 4000 (fake) names from four groups: Japanese, American, Arabic, and Greek. Implement a Naive Bayes (NB) classifier that predicts the country for a new
name.



In [None]:
# Reading the data
import os
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Mount Google Drive inside the Colab runtime; this must run before any path under /content/drive is used.
drive.mount('/content/drive')

# Base folder that the later cells prefix to every data file path.
datapath = '/content/drive/My Drive/Colab Notebooks/'
# Last expression of the cell: displays True when the folder is reachable after mounting.
os.path.exists(datapath)

Mounted at /content/drive


True

## Merging all the names in one CSV File

**File Name:-** names.csv

**Content:-**

  *Header:* Name, Country

  *Data:* Name from each txt file, Country to which the name belongs (American, Arabic, Greek, and Japanese)

In [None]:
# Rebuild names.csv from the four per-country text files.
# Every non-blank input line becomes one "<name>,<country>" row under a "Name,Country" header.
names_csv_path = datapath + 'AML_HW3/names.csv'

# (source file, country label) pairs; rows are written in this order.
country_files = [
    ("us.txt", "American"),
    ("arabic.txt", "Arabic"),
    ("greek.txt", "Greek"),
    ("japan.txt", "Japanese"),
]

head = "Name,Country\n"

# Mode 'w' truncates any previous copy, so re-running the cell never appends duplicate rows
# and no explicit delete is needed.
# UTF-8 is requested explicitly because the names contain non-ASCII text and the platform
# default encoding is not guaranteed to be UTF-8.
# NOTE(review): names are written unquoted; this assumes no name contains a comma.
with open(names_csv_path, 'w', encoding='utf-8') as csv_file:
    csv_file.write(head)

    for file_name, country_label in country_files:
        with open(datapath + "AML_HW3/" + file_name, encoding='utf-8') as txt_file:
            for line in txt_file:
                name = line.strip()
                if not name:
                    # A blank line would otherwise become a row with an empty Name.
                    continue
                csv_file.write(name + "," + country_label + "\n")


In [None]:
# Load the merged CSV and print summary statistics as a quick sanity check
names_csv = datapath + "AML_HW3/names.csv"
data = pd.read_csv(names_csv)
data_copy = data.copy()  # independent copy of the freshly loaded frame

summary = data.describe()
print(summary)

        Name   Country
count   4000      4000
unique  3775         4
top     鈴木 零  American
freq       5      1000


## Using CountVectorizer to vectorize the input names

In [None]:
# Bag-of-words encoding: every name becomes one row of token counts.
# NOTE(review): the vocabulary is fitted on all names, before the train/test split,
# so tokens that occur only in the test rows are still part of the feature space.
name_vectorizer = CountVectorizer()
data_X = name_vectorizer.fit_transform(data.Name)

# Look the learned vocabulary up once and reuse it for both printouts below.
feature_names = name_vectorizer.get_feature_names_out()

print(data_X.toarray())
print(data_X.shape)
print(len(feature_names))
print(feature_names)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(4000, 3043)
3043
['aaron' 'abbott' 'abigail' ... '青木' '香織' '高橋']


## Splitting the data into training (70%) and testing (30%) with shuffle = True

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_X,
    data.Country,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Report the feature-matrix shape and the label count for each split
for split_name, features, labels in (("Train", x_train, y_train), ("Test", x_test, y_test)):
    print("X " + split_name + ": ", features.shape, "Y " + split_name + ": ", len(labels))

X Train:  (2800, 3043) Y Train:  2800
X Test:  (1200, 3043) Y Test:  1200


In [None]:
# Densify both splits: the hand-written Naive Bayes cells below iterate over rows
# and multiply them with NumPy arrays.
X_train, X_test = x_train.toarray(), x_test.toarray()

## Performing a test with sklearn function

To check that a classifier can be trained on the vectorized data and produce predictions.

To obtain a reference score to compare against our own implementation.

In [None]:
# Reference model: scikit-learn's Multinomial Naive Bayes, used as the score to match.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training can be chained.
clf = MultinomialNB().fit(X_train, y_train)

# Last expression of the cell: mean accuracy on the held-out rows.
clf.score(X_test, y_test)

0.945

## Implementing the Multinomial Naive Bayes algorithm

Training the model

In [None]:
# Number of training samples
names_count = len(X_train)
print(names_count)

# Group the training rows by country, visiting the labels in the sorted order
# that np.unique returns; index i of every list/array below refers to the i-th label.
countries = []
for label in np.unique(y_train):
    rows_for_label = [row for row, target in zip(X_train, y_train) if target == label]
    countries.append(rows_for_label)
print(len(countries))

# Log prior of each country: log(rows of that country / all training rows)
countries_prior_log_p = []
for rows_for_label in countries:
    countries_prior_log_p.append(np.log(len(rows_for_label) / names_count))
print(countries_prior_log_p)

# Total count of every token per country, plus 1 everywhere (add-one smoothing)
# so that no token ends up with zero probability
token_totals = [np.array(rows_for_label).sum(axis=0) for rows_for_label in countries]
count_names_per_country = np.array(token_totals) + 1
print(count_names_per_country)

# Log likelihood of each token given the country:
# smoothed token count divided by that country's smoothed total
row_totals = count_names_per_country.sum(axis=1).reshape(-1, 1)
name_log_p = np.log(count_names_per_country / row_totals)
print(name_log_p)

2800
4
[-1.40795585790107, -1.3963446969733921, -1.377759458670053, -1.3636945292026497]
[[ 6  2  2 ...  1  1  1]
 [ 1  1  1 ...  1  1  1]
 [ 1  1  1 ...  1  1  1]
 [ 1  1  1 ...  5 22 39]]
[[-6.60800063 -7.70661291 -7.70661291 ... -8.39976009 -8.39976009
  -8.39976009]
 [-8.53030683 -8.53030683 -8.53030683 ... -8.53030683 -8.53030683
  -8.53030683]
 [-8.42463921 -8.42463921 -8.42463921 ... -8.42463921 -8.42463921
  -8.42463921]
 [-8.370316   -8.370316   -8.370316   ... -6.76087808 -5.27927354
  -4.70675435]]


Predicting the country of each test name using the model trained above

In [None]:
# Score every test row against every country:
# sum of (token count * token log likelihood) plus the country's log prior
predict_log_p = []
for token_counts in X_test:
    class_scores = (name_log_p * token_counts).sum(axis=1) + countries_prior_log_p
    predict_log_p.append(class_scores)

# The predicted class is the index of the highest-scoring country in each row
prediction = np.argmax(predict_log_p, axis=1)

In [None]:
# Show the predicted class indices followed by the true country labels
print(prediction, y_test, sep="\n")

[0 3 0 ... 1 0 3]
555     American
3491    Japanese
527     American
3925    Japanese
2989       Greek
          ...   
3856    Japanese
226     American
1612      Arabic
535     American
3848    Japanese
Name: Country, Length: 1200, dtype: object


Calculate the score and accuracy

In [None]:
# Map each country label to its class index.
# The indices follow the sorted label order of np.unique(y_train), which is the order the
# training cell used when grouping rows per country, so the mapping cannot drift from the model.
country = {label.strip(): index for index, label in enumerate(np.unique(y_train))}

# Encode the true test labels as class indices (labels are stripped of surrounding whitespace first).
# A label that never occurred in training maps to NaN and therefore counts as a wrong prediction.
true_index = y_test.str.strip().map(country).to_numpy()

# Count matches in one vectorized comparison instead of slicing the Series row by row.
correct = int((np.asarray(prediction) == true_index).sum())

# Fraction of correct predictions, and the same value as a percentage.
score = correct/len(y_test)
accuracy = 100 * score
print("Score: ", score)
print("Accuracy: ", accuracy)

Score:  0.945
Accuracy:  94.5
