<a href="https://colab.research.google.com/github/eugenebaraka/66daysofdata_NLP/blob/main/day10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pos = {}
pos['colorless'] = 'ADJ'
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'

for key, val in pos.items():
  print(f"{key}:{val}")

colorless:ADJ
ideas:N
sleep:V
furiously:ADV


In [None]:
from collections import defaultdict
frequency = defaultdict(int)
# frequency['colorless'] = 4
# frequency['ideas']
frequency

defaultdict(int, {})

In [None]:
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
pos2 = defaultdict(list)
for key, value in pos.items():
  pos2[value].append(key)
# pos2 = [(key, ) for (key, value) in pos.items()]
pos2['ADV']

['furiously', 'peacefully']

In [8]:
## accessing keys from values in a dictionary - practice
from collections import defaultdict
example_dict = {'Name': 'Paul', 'Occupation': 'Teacher', 'Residence': 'Ontario'}

example_dict2 = defaultdict(list)
print(f"before: {example_dict2}")
for key, value in example_dict.items():
  example_dict2[value].append(key)

print("after:", example_dict2)


before: defaultdict(<class 'list'>, {})
after: defaultdict(<class 'list'>, {'Paul': ['Name'], 'Teacher': ['Occupation'], 'Ontario': ['Residence']})


In [9]:
example_dict2['Ontario']

['Residence']

## N-Gram Tagging

### Unigram tagging

In [16]:
import nltk
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories = 'news')
brown_sents = brown.sents(categories = 'news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

unigram_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  


0.9349006503968017

In [11]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

## Learning to classify text

Classification: choosing the correct label for a given input. 

- Multi-class classification: input has multiple labels
- Open-class classification: set of labels not defined in advance (unspervised?)
- Sequence classification: list of inputs are jointly classified (not independent from each other?)

Supervised Classification: Model trained on text with **correct** labels. 

Process of supervised classification:
- Training: We have input text and a feature extractor is used to convert each input value to a feature set. Pairs of feature sets are fed into the machine learning algorith to generate a model
- Prediction: Same feature extractor is used to convert unseen inputs to feature sets. Feature sets then fed to the produced model to generate predicted labels


In [17]:
## Gender identification:
### Names ending in a, e and i are likely to be female, while names ending in k, o, r, s and t are likely to be male

def gender_features(word):
  return {'last_letter': word[-1]}

## this function will build a feature set

In [22]:
from nltk.corpus import names
import random
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names) #shuffling names

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
## will train the model on the names after 500th name in the dataset

classifier = nltk.NaiveBayesClassifier.train(train_set)

In [34]:
## testing the classifier with new names

classifier.classify(gender_features('Audrey'))

'female'

In [35]:
## What's the model's accuracy?
print(nltk.classify.accuracy(classifier, test_set))

0.778


In [36]:
## most informative features for distinguishing names' genders
## LIKELIHOOD RATIOS
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     35.6 : 1.0
             last_letter = 'k'              male : female =     30.1 : 1.0
             last_letter = 'f'              male : female =     16.5 : 1.0
             last_letter = 'p'              male : female =     10.5 : 1.0
             last_letter = 'd'              male : female =     10.4 : 1.0


In [39]:
## We don't want to construct a list that contains every features of every instance IRL for memory constraints. 
## Use this instead

# Gender identification using names 
from nltk.classify import apply_features
from nltk import NaiveBayesClassifier 
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
classifier = NaiveBayesClassifier.train(train_set) #training classifier
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f"Model's accuracy: {accuracy}")


Model's accuracy: 0.778


In [40]:
names_list = ['Patrick', 'Audrey', 'Donald', 'Mary', 'Joy', 'Christopher']

for name in names_list:
  predicted_gender = classifier.classify(gender_features(name))
  print(f"{name} is {predicted_gender}")

Patrick is male
Audrey is female
Donald is male
Mary is female
Joy is female
Christopher is male


## Choosing the Right Features

In [46]:
## using the 'Kitchen sink' approach to determine which features will give up the best model
### This is a trial-and-error process
import string
def gender_features2(name):
  features = {}
  features['first_letter'] = name[0].lower()
  features['last_letter'] = name[-1].lower()
  for letter in string.ascii_lowercase:
    features[f'count({letter})'] = name.lower().count(letter)
    features[f'has({letter})'] = (letter in name.lower())
  return features


In [47]:
gender_features2("Eugene")

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 3,
 'count(f)': 0,
 'count(g)': 1,
 'count(h)': 0,
 'count(i)': 0,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 0,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 1,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'e',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': True,
 'has(f)': False,
 'has(g)': True,
 'has(h)': False,
 'has(i)': False,
 'has(j)': False,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': False,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': True,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'e'}