<a href="https://colab.research.google.com/github/davidrs/nltk-last-words/blob/master/nltk_last_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup Env

In [2]:
!pip install gender_guesser

Collecting gender_guesser
[?25l  Downloading https://files.pythonhosted.org/packages/13/fb/3f2aac40cd2421e164cab1668e0ca10685fcf896bd6b3671088f8aab356e/gender_guesser-0.4.0-py2.py3-none-any.whl (379kB)
[K    100% |████████████████████████████████| 389kB 24.6MB/s 
[?25hInstalling collected packages: gender-guesser
Successfully installed gender-guesser-0.4.0


In [0]:

import pandas as pd
import numpy as np
import random
from io import StringIO

import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import names, stopwords
import gender_guesser.detector as gender

In [4]:
# ntlk.downloads only need to happen once..I think.
nltk.download('stopwords')
nltk.download('punkt') #DRS: added because something complained it was missing.

from nltk.corpus import names, stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load data.

Upload the 'last-words.csv' dataset from Kaggle: https://www.kaggle.com/mykhe1097/last-words-of-death-row-inmates#Texas%20Last%20Statement%20-%20CSV.csv 

In [5]:
from google.colab import files

uploaded = files.upload()

FILE_NAME = list(uploaded.keys())[0]
print('Uploaded: {}'.format(FILE_NAME))

Saving last-words.csv to last-words.csv
User uploaded file "last-words.csv" with length 273392 bytes


In [30]:
# Convert uploaded file bytes into a string for pandas.
bytes_data = uploaded[FILE_NAME]
s= bytes_data.decode("utf-8", "replace")
data = StringIO(s) 
df = pd.read_csv(data)

df.head(3)

['Execution' 'LastName' 'FirstName' 'TDCJNumber' 'Age' 'Race'
 'CountyOfConviction' 'AgeWhenReceived' 'EducationLevel' 'NativeCounty '
 'PreviousCrime' 'Codefendants' 'NumberVictim' 'WhiteVictim'
 'HispanicVictim' 'BlackVictim' 'VictimOther Races' 'FemaleVictim'
 'MaleVictim' 'LastStatement']


Unnamed: 0,Execution,LastName,FirstName,TDCJNumber,Age,Race,CountyOfConviction,AgeWhenReceived,EducationLevel,NativeCounty,PreviousCrime,Codefendants,NumberVictim,WhiteVictim,HispanicVictim,BlackVictim,VictimOther Races,FemaleVictim,MaleVictim,LastStatement
0,545,Cardenas,Ruben,999275,47,Hispanic,Hidalgo,28.0,11.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,"This is my statement: My final words. First, I..."
1,544,Pruett,Robert,999411,38,White,Bee,22.0,8.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,I just want to let everyone in here to know I ...
2,543,Preyor,Taichin,999494,46,Black,Bexar,34.0,10.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,"First and foremost I'd like to say, ""Justice h..."


In [55]:

# Get the guessed gender for each person based on name.
detector = gender.Detector()
df['gender'] = df.apply(lambda x: detector.get_gender(x['FirstName']), axis=1)
df['is_male'] = df['gender'] == 'male'

def bucket_age(x):
  """Constants based on 25, 75% percentiles. """
  if x['Age']>44:
    return 'old'
  elif x['Age']<34:
    return 'young'
  else:
    return 'middle'
  
df['age_bucket'] = df.apply(bucket_age, axis=1)


# Describe data to check for outliers.
display(df.describe())

# Clean up the data and split paragraphs into word lists.

# Remove stop words:
stopwords_en = set(stopwords.words('english')) # Set checking is faster in Python than list.

def split_words(x):
    word_list = word_tokenize(x['LastStatement'])
    return word_list

df['LastStatement_words'] = df.apply(split_words, axis=1)
df['LastStatement_cleaned'] = df.apply(lambda x: [word for word in x['LastStatement_words'] if word not in stopwords_en], axis=1)
    
df.head(3)

Unnamed: 0,Execution,TDCJNumber,Age,AgeWhenReceived,EducationLevel,NativeCounty,PreviousCrime,Codefendants,NumberVictim,WhiteVictim,HispanicVictim,BlackVictim,VictimOther Races,FemaleVictim,MaleVictim
count,545.0,545.0,545.0,527.0,500.0,516.0,509.0,517.0,527.0,424.0,423.0,423.0,423.0,526.0,526.0
mean,273.0,517422.089908,39.33945,28.426945,10.148,0.368217,0.54224,0.736944,1.394687,0.915094,0.255319,0.20331,0.026005,0.678707,0.712928
std,157.47222,499381.226644,8.458398,7.828356,2.076227,0.539757,0.498703,1.187976,0.799126,0.826487,0.616567,0.628161,0.210585,0.784009,0.692342
min,1.0,511.0,24.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,137.0,808.0,33.0,22.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,273.0,999014.0,38.0,27.0,10.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0
75%,409.0,999246.0,44.0,33.0,12.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0
max,545.0,999555.0,67.0,57.0,16.0,6.0,1.0,13.0,7.0,5.0,4.0,6.0,3.0,6.0,4.0


Unnamed: 0,Execution,LastName,FirstName,TDCJNumber,Age,Race,CountyOfConviction,AgeWhenReceived,EducationLevel,NativeCounty,...,VictimOther Races,FemaleVictim,MaleVictim,LastStatement,gender,is_male,LastStatement_words,LastStatement_cleaned,featuresets,age_bucket
0,545,Cardenas,Ruben,999275,47,Hispanic,Hidalgo,28.0,11.0,1.0,...,0.0,1.0,0.0,"This is my statement: My final words. First, I...",male,True,"[This, is, my, statement, :, My, final, words,...","[This, statement, :, My, final, words, ., Firs...","{'contains(this)': False, 'contains(statement)...",old
1,544,Pruett,Robert,999411,38,White,Bee,22.0,8.0,0.0,...,0.0,0.0,1.0,I just want to let everyone in here to know I ...,male,True,"[I, just, want, to, let, everyone, in, here, t...","[I, want, let, everyone, know, I, love, much, ...","{'contains(this)': False, 'contains(statement)...",middle
2,543,Preyor,Taichin,999494,46,Black,Bexar,34.0,10.0,0.0,...,0.0,0.0,1.0,"First and foremost I'd like to say, ""Justice h...",unknown,False,"[First, and, foremost, I, 'd, like, to, say, ,...","[First, foremost, I, 'd, like, say, ,, ``, Jus...","{'contains(this)': False, 'contains(statement)...",old


In [0]:
# Prepare features based on word frequency.

# Create a vocabulary of all the words
all_words_raw = []
for words in df['LastStatement_cleaned'] :
    all_words_raw += words
    
# Construct a list of the X most frequent words in the overall corpus
all_words = nltk.FreqDist(w.lower() for w in all_words_raw)

# TODO: Verify this is sorted...example I followed claimed so.
word_features = list(all_words.keys())[:200] 

def document_features(document):
  """Create a feature for each 'common word' and whether last statement contains it."""
  document_words = set(document)
  features = {}
  for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
  return features

# Generate the feature sets for all people's statements.
df['featuresets'] = df['LastStatement_cleaned'].apply(document_features)


In [41]:
# Get ratio of males to females so we know class inbalance and what
# a good classifier would score. Ratio is about 80% is_male
print(df.pivot_table(columns='gender', values='FirstName', aggfunc=len))

gender     andy  female  male  mostly_female  mostly_male  unknown
FirstName     6      10   463              5           28       33


In [56]:
# Convert DF into Classifier format of tuples of features then label.
featuresets =[]
for _, row in df.iterrows():
    featuresets.append((row['featuresets'], row['age_bucket']))

# Shuffle before splitting test/train in case data is ordered in some way.
random.shuffle(featuresets)
    
# Define the train and test sets.
train_set, test_set = featuresets[100:], featuresets[:100]

# Train a naive bayes classifier with train set by nltk
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Get the accuracy of the naive bayes classifier with test set
print(nltk.classify.accuracy(classifier, test_set))

# Debug info: show top n most informative features
classifier.show_most_informative_features(15)

0.43
Most Informative Features
          contains(work) = True              old : middle =      8.4 : 1.0
        contains(trying) = True              old : middle =      5.8 : 1.0
          contains(made) = True           middle : old    =      5.7 : 1.0
            contains(wo) = True            young : old    =      4.2 : 1.0
    contains(understand) = True              old : middle =      3.6 : 1.0
     contains(everybody) = True            young : old    =      3.2 : 1.0
       contains(proceed) = True              old : middle =      3.2 : 1.0
          contains(till) = True              old : middle =      3.2 : 1.0
            contains('d) = True            young : old    =      3.0 : 1.0
       contains(someone) = True            young : old    =      3.0 : 1.0
       contains(forever) = True            young : middle =      3.0 : 1.0
      contains(attorney) = True              old : young  =      2.8 : 1.0
         contains(peace) = True           middle : old    =      2.7 