## Beforehand...
** 1.1 NLTK Setup  **
   - Install the NLTK library (refer to the previous python file)
   - Once NLTK is installed, the text data files (corpora) should be downloaded.  See the following cell to start the download.

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


## Question 1

In [2]:
# code to build a classifier to classify names as male or female
# demonstrates the basics of feature extraction and model building
# retrieve dataset inside nltk library

import nltk
nltk.download('names')  #complete this

names = [(name, 'male') for name in nltk.corpus.names.words("male.txt")]       #complete this
names += [(name, 'female') for name in nltk.corpus.names.words("female.txt")]    #complete this
print (names[0:10])

#print (names)
print("\nNumber of male names:")
print (len(nltk.corpus.names.words('male.txt')))

print("\nNumber of female names:")
print (len(nltk.corpus.names.words('female.txt')))

male_names = nltk.corpus.names.words('male.txt')
print("\nFirst 10 male names:")
print (male_names[0:10])

female_names = nltk.corpus.names.words('female.txt')
print("\nFirst 10 female names:")
print (female_names[0:10])

[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male')]

Number of male names:
2943

Number of female names:
5001

First 10 male names:
['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim']

First 10 female names:
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']


[nltk_data] Downloading package names to
[nltk_data]     C:\Users\TARUMT\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


In [3]:
data_path = nltk.data.find('corpora/names')                                         #complete this
print("NLTK data directory:", data_path)

NLTK data directory: C:\Users\TARUMT\AppData\Roaming\nltk_data\corpora\names


## Question 2

In [4]:
 # feature extraction function (since the data is clean, we may skip the data cleaning)

def extract_gender_features(name):
    name = name.lower()
    features = {}
    features["suffix"] = name[-1:]
    features["suffix2"] = name[-2:] if len(name) > 1 else name[0]
    features["suffix3"] = name[-3:] if len(name) > 2 else name[0]
    features["suffix4"] = name[-4:] if len(name) > 3 else name[0]
    features["suffix5"] = name[-5:] if len(name) > 4 else name[0]
    features["suffix6"] = name[-6:] if len(name) > 5 else name[0]
    features["prefix"] = name[:1] #J
    features["prefix2"] = name[:2] if len(name) > 1 else name[0]
    features["prefix3"] = name[:3] if len(name) > 2 else name[0]
    features["prefix4"] = name[:4] if len(name) > 3 else name[0]
    features["prefix5"] = name[:5] if len(name) > 4 else name[0]

    print (features)
    return features

## Question 3

In [5]:
# perform feature extraction for every data

data = [(extract_gender_features(name), gender) for (name, gender) in names]                         #complete this
print (data)  #categarize features with corresponding genders

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Question 4

In [6]:
# Machine learning: data splitting

import random
random.shuffle(data)                          #complete this

dataCount = len(data)
trainCount = int(.8*dataCount)
print(trainCount)

trainData = data[:trainCount]
testData = data[trainCount:]
print(len(testData))

6355
1589


## Question 5

In [7]:
# Machine learning: data training

bayes = nltk.NaiveBayesClassifier.train(trainData)                           #complete this
print(bayes)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x0000028A8E492A60>


## Question 6

In [8]:
# Machine learning: accuracy

print("trainData accuracy=", nltk.classify.accuracy(bayes, trainData))                               #complete this
print("testData accuracy=", nltk.classify.accuracy(bayes, testData))                             #complete this

bayes.show_most_informative_features(25)

trainData accuracy= 0.9411487018095988
testData accuracy= 0.8376337319068596
Most Informative Features
                 suffix2 = 'na'           female : male   =     82.2 : 1.0
                 suffix2 = 'sa'           female : male   =     33.7 : 1.0
                 suffix2 = 'ia'           female : male   =     33.6 : 1.0
                  suffix = 'a'            female : male   =     30.8 : 1.0
                 suffix2 = 'rd'             male : female =     28.3 : 1.0
                  suffix = 'k'              male : female =     25.9 : 1.0
                 suffix2 = 'us'             male : female =     25.8 : 1.0
                 suffix2 = 'io'             male : female =     24.2 : 1.0
                 suffix3 = 'tta'          female : male   =     22.3 : 1.0
                 suffix2 = 'ra'           female : male   =     21.9 : 1.0
                 suffix3 = 'ana'          female : male   =     21.4 : 1.0
                 suffix2 = 'ta'           female : male   =     21.1 : 1

## Question 7

In [14]:
# Predict a new name

input_name = input("Name:")
print(bayes.classify(extract_gender_features(input_name)))                                    #complete this

Name:chua zhong hui
{'suffix': 'i', 'suffix2': 'ui', 'suffix3': 'hui', 'suffix4': ' hui', 'suffix5': 'g hui', 'suffix6': 'ng hui', 'prefix': 'c', 'prefix2': 'ch', 'prefix3': 'chu', 'prefix4': 'chua', 'prefix5': 'chua '}
male


## Question 8

In [10]:
# Display wrong prediction for improvement

errors = []

for (name,label) in names:
    if bayes.classify(extract_gender_features(name)) != label:
        errors.append({"name": name, "label": label})
# 
errors

{'suffix': 'r', 'suffix2': 'ir', 'suffix3': 'mir', 'suffix4': 'amir', 'suffix5': 'aamir', 'suffix6': 'a', 'prefix': 'a', 'prefix2': 'aa', 'prefix3': 'aam', 'prefix4': 'aami', 'prefix5': 'aamir'}
{'suffix': 'n', 'suffix2': 'on', 'suffix3': 'ron', 'suffix4': 'aron', 'suffix5': 'aaron', 'suffix6': 'a', 'prefix': 'a', 'prefix2': 'aa', 'prefix3': 'aar', 'prefix4': 'aaro', 'prefix5': 'aaron'}
{'suffix': 'y', 'suffix2': 'ey', 'suffix3': 'bey', 'suffix4': 'bbey', 'suffix5': 'abbey', 'suffix6': 'a', 'prefix': 'a', 'prefix2': 'ab', 'prefix3': 'abb', 'prefix4': 'abbe', 'prefix5': 'abbey'}
{'suffix': 'e', 'suffix2': 'ie', 'suffix3': 'bie', 'suffix4': 'bbie', 'suffix5': 'abbie', 'suffix6': 'a', 'prefix': 'a', 'prefix2': 'ab', 'prefix3': 'abb', 'prefix4': 'abbi', 'prefix5': 'abbie'}
{'suffix': 't', 'suffix2': 'ot', 'suffix3': 'bot', 'suffix4': 'bbot', 'suffix5': 'abbot', 'suffix6': 'a', 'prefix': 'a', 'prefix2': 'ab', 'prefix3': 'abb', 'prefix4': 'abbo', 'prefix5': 'abbot'}
{'suffix': 't', 'suffix2'

[{'name': 'Abbie', 'label': 'male'},
 {'name': 'Abby', 'label': 'male'},
 {'name': 'Addie', 'label': 'male'},
 {'name': 'Adrien', 'label': 'male'},
 {'name': 'Alex', 'label': 'male'},
 {'name': 'Alexis', 'label': 'male'},
 {'name': 'Ali', 'label': 'male'},
 {'name': 'Alix', 'label': 'male'},
 {'name': 'Allie', 'label': 'male'},
 {'name': 'Allyn', 'label': 'male'},
 {'name': 'Andie', 'label': 'male'},
 {'name': 'Andre', 'label': 'male'},
 {'name': 'Andrea', 'label': 'male'},
 {'name': 'Andy', 'label': 'male'},
 {'name': 'Angel', 'label': 'male'},
 {'name': 'Angie', 'label': 'male'},
 {'name': 'Antoine', 'label': 'male'},
 {'name': 'Antone', 'label': 'male'},
 {'name': 'Ari', 'label': 'male'},
 {'name': 'Arie', 'label': 'male'},
 {'name': 'Ashley', 'label': 'male'},
 {'name': 'Augustine', 'label': 'male'},
 {'name': 'Bealle', 'label': 'male'},
 {'name': 'Bela', 'label': 'male'},
 {'name': 'Bennie', 'label': 'male'},
 {'name': 'Bertie', 'label': 'male'},
 {'name': 'Billie', 'label': 'male

## Question 9

In [11]:
# Store the model into joblib using dump method
# complete this
