In [50]:
import pandas as pd
import numpy as np

In [2]:
# Load the Seattle pet-license records from the CSV file into a DataFrame.
df = pd.read_csv('Seattle_Pet_Licenses.csv')

In [3]:
df.head()

Unnamed: 0,License Issue Date,License Number,Animal's Name,Species,Primary Breed,Secondary Breed,ZIP Code
0,April 19 2003,200097.0,Tinkerdelle,Cat,Domestic Shorthair,,98116
1,February 07 2006,75432.0,Pepper,Cat,Manx,Mix,98103
2,August 31 2012,578859.0,Grey Fox,Cat,Siamese,Mix,98125
3,November 14 2013,832989.0,Hannah,Cat,Domestic Longhair,,98133
4,April 03 2014,433713.0,Daisy,Cat,Domestic Shorthair,,98117


In [4]:
# Parse the textual issue dates (e.g. "April 19 2003") into datetime values,
# replacing the original column in place, so they can be sorted chronologically.
df['License Issue Date'] = pd.to_datetime(df["License Issue Date"])

In [7]:
# Yay, we have 2018 data in there.
df.sort_values(by=['License Issue Date'], ascending=False)

Unnamed: 0,License Issue Date,License Number,Animal's Name,Species,Primary Breed,Secondary Breed,ZIP Code
55960,2018-03-10,902201.0,Nala,Dog,Kai Ken,,98103
19031,2018-03-10,140093.0,Mitchell,Cat,Domestic Shorthair,Mix,98121
38020,2018-03-10,140091.0,Eddie,Dog,Beagle,,98108
38019,2018-03-10,128110.0,Bugs,Dog,"Spaniel, American Cocker",,98125
38018,2018-03-10,110035.0,Grendel,Dog,Chow Chow,"Retriever, Labrador",
19030,2018-03-10,128111.0,Phoebe,Cat,Siamese,Desert Lynx,98125
19029,2018-03-10,104298.0,Scully,Cat,Domestic Shorthair,Mix,98144
19028,2018-03-10,104297.0,Mulder,Cat,Domestic Shorthair,Mix,98144
38021,2018-03-10,140092.0,Osbert,Dog,Maltese,Mix,98122
55959,2018-03-10,15970.0,Ziggie,Dog,Miniature Pinscher,,98106


In [56]:
# I think there needs to be some more data cleaning happening...
# Are there missing animal names? (This counts nulls across all species, not just cats.)
df["Animal\'s Name"].isnull().sum()

816

In [None]:
# !!! THERE ARE 816 ANIMALS WITHOUT NAMES!!!

In [66]:
df[pd.isnull(df["Animal\'s Name"])]

Unnamed: 0,License Issue Date,License Number,Animal's Name,Species,Primary Breed,Secondary Breed,ZIP Code
54,2015-08-06,578687.0,,Cat,Domestic Shorthair,,98144
83,2015-11-16,269837.0,,Cat,Domestic Shorthair,,98112
147,2015-11-30,21795.0,,Cat,Domestic Medium Hair,,98103
167,2015-12-02,140679.0,,Cat,Domestic Shorthair,,98103
214,2015-12-15,897476.0,,Cat,Domestic Shorthair,,98136
240,2015-12-18,896415.0,,Cat,Domestic Medium Hair,,98136
293,2016-01-08,822915.0,,Cat,Domestic Shorthair,Russian Blue,98199
300,2016-01-13,730389.0,,Cat,Domestic Shorthair,,98121
310,2016-01-25,896246.0,,Cat,Domestic Shorthair,,98125
333,2016-02-16,349529.0,,Cat,Domestic Shorthair,,98103


In [67]:
# Keep only the rows whose animal name is present.
df = df[df["Animal's Name"].notnull()]

In [8]:
# Let's create separate names for dogs and cats.

In [68]:
# Names of every licensed dog, selected with a single label-based lookup.
dog_names = df.loc[df['Species'] == 'Dog', "Animal's Name"]

In [69]:
# Write one name per line. header=False is explicit because the default for
# Series.to_csv differs between pandas versions; a header row would be read
# back later as if it were a dog name.
dog_names.to_csv('dog_names.csv', index=False, header=False)

In [70]:
# Names of every licensed cat, selected with a single label-based lookup.
cat_names = df.loc[df['Species'] == 'Cat', "Animal's Name"]

In [71]:
# Write one name per line. header=False is explicit because the default for
# Series.to_csv differs between pandas versions; a header row would otherwise
# end up in the file as if it were a cat name.
cat_names.to_csv('cat_names.csv', index=False, header=False)

In [72]:
!head dog_names.csv

Winston
Charlie
Louie
Finn
Cole
Moka
Bill
Moose
Hero
Boss


In [73]:
!head cat_names.csv

Tinkerdelle
Pepper
Grey Fox
Hannah
Daisy
Kukula
Stanleigh
Gunner
Rhiannon
Leto


In [19]:
# DOGS

In [74]:
# Read the whole names file into a single long string of dog names separated
# by newline characters.
# Decode explicitly as UTF-8 (the encoding pandas' to_csv writes by default)
# rather than relying on the platform's locale default.
with open('dog_names.csv', encoding='utf-8') as f:
    file = f.read()

In [None]:
len(file)

In [76]:
# Following pytorch example I used previously

In [77]:
import random
import string
import re

# Vocabulary of the character-level model: every character in
# string.printable. A character is encoded as its index in this string.
all_characters = string.printable
# Vocabulary size; passed to the model as both its input and output size.
n_characters = len(all_characters)

In [78]:
# Number of characters the training step consumes per sequence; chunks are
# sliced one character longer so inputs and targets can be offset by one.
chunk_len = 100
# Length, in characters, of the training text at the time this cell runs.
file_len = len(file)
def random_chunk(text=None, length=None):
    """Return a random slice of ``length + 1`` consecutive characters.

    The extra character lets a caller derive an input sequence (all but the
    last character) and a target sequence (all but the first character) that
    are each ``length`` characters long.

    Args:
        text: String to sample from. Defaults to the module-level ``file``.
        length: Number of input characters wanted. Defaults to the
            module-level ``chunk_len``.

    Returns:
        A substring of ``text`` that is exactly ``length + 1`` characters long.

    Raises:
        ValueError: If ``text`` has fewer than ``length + 1`` characters.
    """
    if text is None:
        text = file
    if length is None:
        length = chunk_len

    # random.randint includes BOTH endpoints, so the last valid start index is
    # len(text) - (length + 1); starting any later would give a short slice.
    # The length is measured here so a re-loaded text is never paired with a
    # stale, previously computed length.
    last_start = len(text) - length - 1
    if last_start < 0:
        raise ValueError(
            'text has %d characters but a chunk needs %d'
            % (len(text), length + 1)
        )

    start_index = random.randint(0, last_start)
    return text[start_index:start_index + length + 1]

In [79]:
random_chunk()

'o\nToby\nLuke\nJimmy Dean\nSedona\nWebster\nBode\nHampton\nMax\nTeddy\nDuke\nOtto\nMilo\nNova\nMax\nMax\nNazzy\nWalden'

In [80]:
# Build the Model
import torch
import torch.nn as nn
from torch.autograd import Variable

class RNN(nn.Module):
    """Single-step recurrent model: embedding -> GRU -> linear read-out."""

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        """Store the sizes and build the three sub-modules."""
        super().__init__()
        # Keep the constructor arguments available as plain attributes.
        self.input_size, self.hidden_size = input_size, hidden_size
        self.output_size, self.n_layers = output_size, n_layers

        # Sub-modules, created in the order the data flows through them.
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the network by one step.

        Returns a tuple ``(output, hidden)``: the decoder output reshaped to
        one row, and the hidden state returned by the GRU.
        """
        embedded = self.encoder(input.view(1, -1))
        gru_out, next_hidden = self.gru(embedded.view(1, 1, -1), hidden)
        scores = self.decoder(gru_out.view(1, -1))
        return scores, next_hidden

    def init_hidden(self):
        """Return an all-zero hidden state of shape (n_layers, 1, hidden_size)."""
        zeros = torch.zeros(self.n_layers, 1, self.hidden_size)
        return Variable(zeros)


In [81]:
def char_tensor(string_, alphabet=None):
    """Encode a string as a 1-D LongTensor of vocabulary indices.

    Args:
        string_: Text to encode.
        alphabet: Sequence of characters defining the vocabulary; a
            character's code is the index of its first occurrence. Defaults
            to the module-level ``all_characters``.

    Returns:
        A ``Variable`` wrapping a LongTensor that holds one index per
        character of ``string_`` (empty for an empty string).

    Raises:
        ValueError: If ``string_`` contains a character that is not in the
            vocabulary.
    """
    if alphabet is None:
        alphabet = all_characters

    # One dictionary lookup per character instead of a linear scan of the
    # alphabet; setdefault keeps the FIRST position of a repeated character,
    # which is what str.index would return.
    lookup = {}
    for position, char in enumerate(alphabet):
        lookup.setdefault(char, position)

    try:
        indices = [lookup[char] for char in string_]
    except KeyError as err:
        raise ValueError(
            'character %r is not in the vocabulary' % (err.args[0],)
        ) from None

    return Variable(torch.LongTensor(indices))

print(char_tensor('abcDEF'))

Variable containing:
 10
 11
 12
 39
 40
 41
[torch.LongTensor of size 6]



In [82]:
def random_training_set():
    """Draw a random chunk and return it as an (input, target) tensor pair.

    The input is the chunk without its last character and the target is the
    chunk without its first character, so each target is the character that
    follows the corresponding input character.
    """
    text = random_chunk()
    return char_tensor(text[:-1]), char_tensor(text[1:])

In [83]:
# Evaluating
def evaluate(prime_str='A', predict_len=100, temperature=0.8):
    """Generate text by sampling characters from the global ``decoder``.

    Args:
        prime_str: Non-empty seed text. It is fed through the model first and
            forms the beginning of the returned string.
        predict_len: Number of characters to sample after the seed.
        temperature: Positive divisor applied to the model's output scores
            before sampling; smaller values concentrate the sampling on the
            highest-scoring characters.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``prime_str`` is empty or ``temperature`` is not
            positive.
    """
    if not prime_str:
        raise ValueError('prime_str must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive, got %r' % (temperature,))

    hidden = decoder.init_hidden()
    prime_input = char_tensor(prime_str)
    predicted = prime_str

    # Feed every seed character except the last one to build up the hidden
    # state; the last seed character becomes the first sampling input.
    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[p], hidden)
    inp = prime_input[-1]

    for step in range(predict_len):
        output, hidden = decoder(inp, hidden)

        # Sampling weights are exp(score / temperature). Subtracting the
        # maximum first scales every weight by the same factor (so the
        # sampling probabilities are unchanged) and keeps exp() from
        # overflowing on large scores.
        scaled = output.data.view(-1).div(temperature)
        output_dist = (scaled - scaled.max()).exp()
        # int(...) gives a plain Python index whether indexing the sample
        # yields a number or a zero-dimensional tensor.
        top_i = int(torch.multinomial(output_dist, 1)[0])

        predicted_char = all_characters[top_i]
        predicted += predicted_char
        inp = char_tensor(predicted_char)

    return predicted

In [84]:
# Training
import time, math

def time_since(since):
    """Format the time elapsed since ``since`` as '<minutes>m <seconds>s'.

    ``since`` is a timestamp on the same clock as ``time.time()``.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

In [85]:
def train(inp, target):
    """Run one optimisation step on a single (input, target) sequence pair.

    Uses the global ``decoder``, ``criterion`` and ``decoder_optimizer``.

    Args:
        inp: 1-D tensor of input character indices.
        target: 1-D tensor of target character indices, one per element of
            ``inp``.

    Returns:
        The summed loss divided by the sequence length, as a Python number.

    Raises:
        ValueError: If ``inp`` is empty.
    """
    # Use the real sequence length rather than the global chunk_len so a
    # shorter-than-expected chunk cannot index past the end of the tensors.
    seq_len = inp.size(0)
    if seq_len == 0:
        raise ValueError('cannot train on an empty sequence')

    hidden = decoder.init_hidden()
    decoder.zero_grad()
    loss = 0

    for c in range(seq_len):
        output, hidden = decoder(inp[c], hidden)
        # view(1) gives the target an explicit batch dimension of one to
        # match the single-row output; it is a no-op where indexing already
        # returns a one-element tensor.
        loss += criterion(output, target[c].view(1))

    loss.backward()
    decoder_optimizer.step()

    # Newer PyTorch returns a zero-dimensional loss that cannot be indexed
    # with [0]; .item() is the supported accessor there. Fall back to the
    # old indexing form when .item() does not exist.
    total = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return total / seq_len

In [93]:
# Hyper-parameters and reporting cadence.
n_epochs = 2000
print_every = 100
plot_every = 10
hidden_size = 100
n_layers = 1
lr = 0.005
# Only the first few skipped steps are printed, to keep the output readable.
max_skip_reports = 5

decoder = RNN(n_characters, hidden_size, n_characters, n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

start = time.time()
all_losses = []
loss_avg = 0
n_skipped = 0

for epoch in range(1, n_epochs + 1):
    try:
        loss = train(*random_training_set())
    except (ValueError, IndexError) as err:
        # Best-effort training: a chunk that contains a character outside
        # the vocabulary (ValueError) or that is too short (IndexError) is
        # skipped. Any other exception, including KeyboardInterrupt, now
        # propagates instead of being silently discarded.
        n_skipped += 1
        if n_skipped <= max_skip_reports:
            print('[step %d skipped: %s: %s]' % (epoch, type(err).__name__, err))
        continue

    loss_avg += loss

    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print(evaluate('R', 100), '\n')

    if epoch % plot_every == 0:
        all_losses.append(loss_avg / plot_every)
        loss_avg = 0

if n_skipped:
    print('[skipped %d of %d training steps]' % (n_skipped, n_epochs))

[0m 14s (100 5%) 2.2591]
Rutzoe Budy
Benery Atine Wilie
Bika
Bugy
Niusta
Arallie
Cohe Bucy
Farikin
Brkiee
Cascita Ikge Cotoo
L 

[0m 29s (200 10%) 2.2650]
Razzs
Pebby
Jaxy
Scarler
Topy
Silly
parlie
Manlie
Pnisty
Jaka
Grarlie
Roxi
Allie
Anddy Bele
Maxie
Dan 

[0m 41s (300 15%) 2.2799]
Roe
Kachan
Ludy
Bosie
Mo Duddy
Sadie
Dacy
Chardie
Leolu
Chee
Madie
Caie
Greffanaele
Sumety
Busha
Gunz 

[0m 57s (400 20%) 2.1085]
Rog
Steo
Nalie
Rosie
Phippa
Bella
Glow
Maley
Noy
Edge
Mori
Broso
Jena
Sast
Roxie
Lunie
RuEn
Lury
Penn 

[1m 12s (500 25%) 1.9606]
RTick
Beana
Bangie
Jack
Shady
Samcka
Samy
Frie
Buntet
Ningie
Millie
Jakis
Stloe
Ostley
Hermo
Gora
Mil 

[1m 26s (600 30%) 2.1313]
Remer
Fran
Buddy
Manko
Kobina
Luna
Luna
Maggie
Bear
Marly
Skiter
Louie
Stella
Edillen
Ceria
Benra
Gee 

[1m 44s (700 35%) 2.0804]
R-!=die
Max
Toby
Roa
Kila
Stel
Luila
Willson
Pillie
Nella
Oliver
Henry
Gizmo Jasper
Ludelo Wala
Rosi
 

[1m 58s (800 40%) 1.8430]
Rere
Mazulle Mailey
Shattie
Zo
Sild
Penna
Billy
Clean
Gaxter
Bill

In [109]:
print(evaluate('f', 200, temperature=0.6))

fle Bell
Rosco
Bella
Margie
Stilla
Sillie
Ziggy
Buster
Poco
Otis
Gobby
Bear
Buther
Jacky
Molly
Mia
Chickon
Roxie
Sammy
Rosi
Lucy
Sophie
Bella
Sasha
Berttinn
King
Sadie
Elillie
Meana
Finn
Kollie
Shadie

