# DOGS: Further Data Preprocessing #

### Imports ###

In [1]:
import pandas as pd
import numpy as np
import datetime
import re

### Data Preview ###
Dog data was extracted from the data exported from the Champ_Preprocessing notebook.

In [2]:
# Load the previously exported table; the first CSV column is used as the row index.
data = pd.read_csv(filepath_or_buffer='preprocessed_data.csv', index_col=0)

In [3]:
# Keep only the rows labelled DOG, renumber them from 0, and discard the
# AnimalType column, which is constant after the filter.
dog_data = (
    data.loc[data['AnimalType'] == 'DOG']
    .reset_index(drop=True)
    .drop(columns='AnimalType')
)
print("Total Observations: ", dog_data.shape[0])
dog_data.head()

Total Observations:  52260


Unnamed: 0,AnimalID,IntakeType,IntakeSubtype,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,ReproductiveStatusAtIntake,OutcomeType,OutcomeSubtype,IntakeMonth,IntakeYear,IntakeDotW,OutcomeMonth,OutcomeYear,OutcomeDotW,IntakeAge,TimeUntilOutcome,LiveOutcome
0,A281756,OWNER SUR,OTC,WHITE,PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,TIME/SPACE,9,2006,0,9,2006,1,365.0,1,0
1,A256128,STRAY,FIELD,BROWN,AMERICAN PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,MEDICAL,11,2005,5,12,2005,3,,12,0
2,A316619,STRAY,FIELD,WHITE,LABRADOR RETRIEVER,MALE,FEARFUL,FERTILE,EUTH,TIME/SPACE,6,2007,4,7,2007,2,,5,0
3,A319056,STRAY,OTC,TRICOLOR,BEAGLE,MALE,NORMAL,ALTERED,EUTH,TIME/SPACE,7,2007,3,8,2007,1,,19,0
4,A258842,CONFISCATE,NEGLECT,WHITE,PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,BREED,12,2005,2,12,2005,3,,8,0


### Feature Value Reduction ###
The number of distinct values per feature was reduced to help with training the model later on and to merge values that may have been assigned inconsistently.

#### Combining IntakeType and IntakeSubtype ####

In [4]:
# Build a single "TYPE,SUBTYPE" column and retire the two columns it was built from.
dog_data = dog_data.assign(
    IntakeTypes=dog_data['IntakeType'].map(str) + ',' + dog_data['IntakeSubtype']
).drop(columns=['IntakeType', 'IntakeSubtype'])

#### Classifying the IntakeTypes into 10 categories ####
Many of the combined intake types were very similar and can be grouped together.

In [5]:
def dog_IntakeType(IntakeTypes):
    """Map a combined "TYPE,SUBTYPE" intake value onto one of ten intake groups.

    Parameters
    ----------
    IntakeTypes : object
        The combined intake value. It is converted with ``str()`` first, so
        non-string inputs (for example a missing value) are classified by
        their string form.

    Returns
    -------
    str
        The label of the first rule that has a prefix the value starts with,
        or ``'OTHER'`` when no rule applies.
    """
    IntakeTypes = str(IntakeTypes)

    # Ordered (label, prefixes) rules; the first rule with a matching prefix wins.
    # The prefixes are plain literals written one per entry, so no whitespace
    # from a wrapped source line can leak into them (a backslash continuation
    # inside a string literal would glue the next line's indentation onto the
    # preceding prefix and stop it from ever matching).
    rules = (
        ('CONFISCATE,CRUELTY', (
            'CONFISCATE,ABANDONED', 'CONFISCATE,NEGLECT', 'CONFISCATE,CRUELTY',
            'CONFISCATE,CHAINING', 'CONFISCATE,FIELD',
        )),
        ('CONFISCATE,OTHER', (
            'CONFISCATE,EVICTION', 'CONFISCATE,UNPERMITED', 'CONFISCATE,POLICE',
            'CONFISCATE,HOSPITAL', 'CONFISCATE,OWNER DIED', 'CONFISCATE,RESTRAINT',
            'CONFISCATE,COURT ORD',
        )),
        ('FOSTER', ('FOSTER,RETURN',)),
        ('OWNERSUR,FIELD', (
            'OWNER SUR,FIELD', 'KHS,OWNER SUR', 'OWNER SUR,NEGLECT',
            'OWNER SUR,FIELD OWN',
        )),
        ('OWNERSUR,OTC', ('OWNER SUR,OTC', 'OWNER SUR,OWNER SUR')),
        ('RETURN', (
            'OWNER SUR,RETURN 30', 'RETURN,ADOPTION', 'RETURN,FIELD',
            'RETURN,K HUMANE S', 'RETURN,OTC',
        )),
        ('STRAY,KHS', ('KHS,STRAY',)),
        ('STRAY,FIELD', ('STRAY,FIELD', 'STRAY,STRAY')),
        ('STRAY,OTC', ('STRAY,OTC', 'OWNER SUR,STRAY')),
    )

    for label, prefixes in rules:
        # str.startswith accepts a tuple and is true if any prefix matches.
        if IntakeTypes.startswith(prefixes):
            return label
    return 'OTHER'

In [6]:
# Replace each combined intake value with its group label, then show how many
# animals (non-null AnimalID values) fall in each group.
dog_data['IntakeTypes'] = dog_data['IntakeTypes'].map(dog_IntakeType)
dog_data.groupby('IntakeTypes')['AnimalID'].count()

IntakeTypes
CONFISCATE,CRUELTY     1861
CONFISCATE,OTHER        472
FOSTER                 1062
OTHER                   269
OWNERSUR,FIELD         3060
OWNERSUR,OTC          10149
RETURN                 1925
STRAY,FIELD           18646
STRAY,KHS               190
STRAY,OTC             14626
Name: AnimalID, dtype: int64

#### Categorizing dog colors ####

In [7]:
def dog_color_class(color):
    """Collapse a colour word into one of seven colour groups or 'OTHER'.

    The input is converted with ``str()`` and compared against each group's
    pattern in order using ``re.match``, i.e. the text must *begin* with one
    of the listed colour words. The first group that matches is returned;
    when none does the result is ``'OTHER'``.
    """
    # (pattern, group) pairs, checked top to bottom.
    color_groups = (
        ("ORANGE|APRICOT|RED", 'ORANGE'),
        ("GRAY|BLUE|SEAL|SILVER", 'GRAY'),
        ("CREAM|BUFF|GOLD|TAN|YELLOW|BLONDE|FAWN", 'CREAM'),
        ("BROWN|CHOCOLATE|MAHOGANY|SABLE|LIVER", 'BROWN'),
        ("WHITE", 'WHITE'),
        ("TRICOLOR", 'TRICOLOR'),
        ("BLACK", 'BLACK'),
    )
    text = str(color)
    matching_groups = (
        group for pattern, group in color_groups if re.match(pattern, text)
    )
    return next(matching_groups, 'OTHER')

In [8]:
# Classify the first whitespace-separated word of PrimaryColor into a colour
# group, store it as Color, and drop the original column.
dog_data = dog_data.assign(
    Color=dog_data['PrimaryColor'].str.split().str.get(0).map(dog_color_class)
).drop(columns='PrimaryColor')

In [9]:
# Number of animals (non-null AnimalID values) per colour group.
dog_data.groupby('Color')['AnimalID'].count()

Color
BLACK       16858
BROWN       13787
CREAM        7392
GRAY         2279
ORANGE       1984
OTHER          11
TRICOLOR     2523
WHITE        7426
Name: AnimalID, dtype: int64

#### Categorizing dog breeds ####

In [10]:
# Number of animals (non-null AnimalID values) per original breed label.
breed_counts = dog_data.groupby('PrimaryBreed')['AnimalID'].count()
breed_counts

PrimaryBreed
AFFENPINSCHER                    18
AIREDALE TERRIER                 50
AKITA                           224
ALASKAN HUSKY                   143
ALASKAN KLEE KAI                  2
                               ... 
WEST HIGHLAND WHITE TERRIER      52
WHIPPET                          44
WIRE-HAIRED POINTING GRIFFON      5
WOLF HYBRID                      11
YORKSHIRE TERRIER               522
Name: AnimalID, Length: 204, dtype: int64

In [11]:
def dog_breed(breed):
    """Map a breed label onto a consolidated breed group.

    The input is converted with ``str()``. The exact label
    'PARSON (JACK) RUSSELL TERRIER' becomes 'JACK RUSS TER'. Otherwise the
    groups below are tried in order and the first group owning a pattern that
    matches at the *start* of the label (``re.match``) is returned. A label
    that matches nothing is returned unchanged, as a string.

    NOTE(review): a few pattern spellings look like typos (e.g.
    "BLACK AND TAN COONOUND", "TENNESEE TREEING BRINDLE HOUND",
    "TIBERAN TERRIER"); presumably they mirror spellings present in the
    source data -- confirm before changing them.
    """
    name = str(breed)

    # The only rule that is an exact comparison rather than a prefix pattern.
    if name == 'PARSON (JACK) RUSSELL TERRIER':
        return 'JACK RUSS TER'

    # (group label, patterns) in priority order.
    breed_groups = (
        # different variants of the same breed
        ('COLLIE', (
            "BORDER COLLIE|COLLIE - ROUGH|COLLIE - SMOOTH",
        )),
        ('DACHSHUND', (
            "DACHSHUND|DACHSHUND - LONGHAIRED|DACHSHUND - WIREHAIRED",
        )),
        ('CHIHUAHUA', (
            "CHIHUAHUA - LONG HAIRED|CHIHUAHUA - SMOOTH COATED",
        )),

        # similar breeds
        ('HUSKY', (
            "SIBERIAN HUSKY|ALASKAN HUSKY|ALASKAN MALAMUTE",
        )),
        ('MASTIFF', (
            "MASTIFF|BULLMASTIFF|NEAPOLITAN MASTIFF|TIBETAN MASTIFF|BOERBOEL",
        )),
        ('TERRIER-SMALL', (
            "YORKSHIRE TERRIER|RAT TERRIER|BOSTON TERRIER|CAIRN TERRIER|FOX TERRIER - WIREHAIRED|BORDER TERRIER",
            "FOX TERRIER - SMOOTH|SCOTTISH TERRIER|PATTERDALE TERRIER|SILKY TERRIER|WEST HIGHLAND WHITE TERRIER",
            "MANCHESTER TERRIER|NORWICH TERRIER|NORFOLK TERRIER|LAKELAND TERRIER|TERRIER|IRISH TERRIER|BEDLINGTON TERRIER",
            "WELSH TERRIER|AUSTRALIAN TERRIER|TOY FOX TERRIER|DANDIE DINMONT TERRIER|BULL TERRIER - MINIATURE",
        )),
        ('PIT BULL', (
            "AMERICAN PIT BULL TERRIER|AMERICAN STAFFORDSHIRE TERRIER|AMERICAN BULLY|STAFFORDSHIRE BULL TERRIER|PIT BULL TERRIER",
        )),
        ('GERMAN SHEPHERD', (
            "GERMAN SHEPHERD DOG|BELGIAN MALINOIS",
        )),

        # uncommon and rare breeds, grouped by size (small, medium, large, giant)
        ('OTHER-SMALL', (
            "LOWCHEN|COTON DE TULEAR|PODENGO PORTUGUESO PEQUENO|JAPANESE CHIN|FEIST|CHINESE CRESTED DOG",
            "AFFENPINSCHER|ITALIAN GREYHOUND|HAVANESE|SCHIPPERKE|CAVALIER KING CHARLES SPANIEL|PEKINGESE|MALTESE",
            "ENGLISH TOY SPANIEL|TIBETAN SPANIEL|BRUSSELS GRIFFON|BICHON FRISE|PAPILLON|POODLE - TOY|LHASA APSO",
            "SCHNAUZER - MINIATURE|MINIATURE PINSCHER|PEKAPOO",
        )),
        ('OTHER-MEDIUM', (
            "SWISS HOUND|GERMAN PINSCHER|SWEDISH VALLHUND|PULI|BORZOI|KARELIAN BEAR DOG|ALASKAN KLEE KAI",
            "MUNSTERLANDER|NORWEGIAN BUHUND|POLISH LOWLAND SHEEPDOG|SALUKI|TREEING CUR|PORTUGUESE WATER DOG",
            "ENGLISH SHEPHERD|WIRE-HAIRED POINTING GRIFFON|NOVA SCOTIA DUCK-TOLLING RETRIEVER|BERNESE HOUND",
            "TENNESEE TREEING BRINDLE HOUND|CANAAN DOG|PHARAOH HOUND|AUSTRALIAN KELPIE|QUEENSLAND HEELER",
            "SAMOYED|FINNISH SPITZ|KEESHOND|NORWEGIAN ELKHOUND|VIZSLA|PETIT BASSET GRIFFON VENDEEN|KOREAN JINDO",
            "FRENCH BULLDOG|HARRIER|BRITTANY|CAROLINA DOG|BOYKIN SPANIEL|ENGLISH COCKER SPANIEL|WELSH CORGI - CARDIGAN",
            "ENGLISH SPRINGER SPANIEL|FIELD SPANIEL|SUSSEX SPANIEL|WELSH SPRINGER SPANIEL|ENGLISH COONHOUND",
            "BULLDOG|WHIPPET|SHIBA INU|PLOTT HOUND|BASENJI|SCHNAUZER - STANDARD|ENGLISH BULLDOG|AMERICAN ESKIMO",
            "POINTER|DALMATIAN|CHINESE SHARPEI|SHETLAND SHEEPDOG|BASSET HOUND|BEARDED COLLIE|WELSH CORGI - PEMBROKE",
            "TIBERAN TERRIER|SOFT-COATED WHEATEN TERRIER|GLEN OF IMAAL TERRIER|SKYE TERRIER|TIBETAN TERRIER",
        )),
        ('OTHER-LARGE', (
            "HOVAWART|KUVASZ|PICARDY|PICARDY SHEEPDOG|BEAUCERON|GERMAN WIREHAIRED POINTER|BRIARD|SPINONE ITALIANO",
            "BELGIAN SHEEPDOG|BELGIAN TERVUREN|DOGO ARGENTINO|WOLF HYBRID|BOUVIER DES FLANDRES|IBIZAN HOUND",
            "CHESAPEAKE BAY RETRIEVER|OLD ENGLISH BULLDOG|DUTCH SHEPHERD|OLD ENGLISH SHEEPDOG|IRISH SETTER",
            "SCHNAUZER - GIANT|GORDON SETTER|ENGLISH FOXHOUND|CLUMBER SPANIEL|BLACK MOUTH CUR|ENGLISH POINTER",
            "ENGLISH SETTER|GREYHOUND|AMERICAN FOXHOUND|WEIMARANER|GERMAN SHORTHAIRED POINTER|BLUETICK COONHOUND",
            "REDBONE COONHOUND|RHODESIAN RIDGEBACK|POODLE - STANDARD|FLAT-COATED RETRIEVER|TREEING WALKER COONHOUND",
            "TREEING WALKER COONHOUND|BLACK AND TAN COONOUND|DOBERMAN PINSCHER|AMERICAN BULLDOG",
            "AIREDALE TERRIER|BULL TERRIER|CATAHOULA LEOPARD HOUND",
        )),
        ('OTHER-GIANT', (
            "LANDSEER|FILA BRASILEIRO|NEWFOUNDLAND|GREAT DANE|ST BERNARD - SMOOTH COATED|ST BERNARD - ROUGH COATED",
            "SCOTTISH DEERHOUND|LEONBERGER|GREATER SWISS MOUNTAIN DOG|OTTERHOUND|BERNESE MOUNTAIN DOG|AKITA",
            "IRISH WOLFHOUND|DOGUE DE BORDEAUX|PRESA CANARIO|ANATOLIAN SHEPHERD|BLOODHOUND|GREAT PYRENEES|CANE CORSO",
        )),
    )

    for label, patterns in breed_groups:
        if any(re.match(pattern, name) for pattern in patterns):
            return label

    # Breeds not covered by any group keep their own label.
    return name

In [12]:
# Add the consolidated breed group, then leave out rows whose group is 'MIX'.
dog_data['Breed'] = dog_data['PrimaryBreed'].map(dog_breed)
dog_data = dog_data.loc[dog_data['Breed'].ne('MIX')]

In [13]:
# Number of animals (non-null AnimalID values) per consolidated breed group.
dog_data.groupby('Breed')['AnimalID'].count()

Breed
AUSTRALIAN CATTLE DOG      477
AUSTRALIAN SHEPHERD        555
BEAGLE                    3885
BOXER                     1865
CHIHUAHUA                 1995
CHOW CHOW                 1433
COCKER SPANIEL             466
COLLIE                    1590
DACHSHUND                  633
GERMAN SHEPHERD           3808
GOLDEN RETRIEVER           478
HUSKY                      814
JACK RUSS TER             1268
LABRADOR RETRIEVER        6617
MASTIFF                    337
OTHER-GIANT               1096
OTHER-LARGE               2018
OTHER-MEDIUM              2301
OTHER-SMALL               1790
PIT BULL                 12776
POMERANIAN                 452
POODLE - MINIATURE         608
PUG                        467
ROTTWEILER                1396
SHIH TZU                  1001
TERRIER-SMALL             2114
Name: AnimalID, dtype: int64

### Preprocessed Dog Data Preview ###

In [14]:
# Report the remaining row count and preview the first 20 rows.
print("Total Observations: ", dog_data.shape[0])
dog_data.head(n=20)

Total Observations:  52240


Unnamed: 0,AnimalID,PrimaryBreed,Gender,IntakeInternalStatus,ReproductiveStatusAtIntake,OutcomeType,OutcomeSubtype,IntakeMonth,IntakeYear,IntakeDotW,OutcomeMonth,OutcomeYear,OutcomeDotW,IntakeAge,TimeUntilOutcome,LiveOutcome,IntakeTypes,Color,Breed
0,A281756,PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,TIME/SPACE,9,2006,0,9,2006,1,365.0,1,0,"OWNERSUR,OTC",WHITE,PIT BULL
1,A256128,AMERICAN PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,MEDICAL,11,2005,5,12,2005,3,,12,0,"STRAY,FIELD",BROWN,PIT BULL
2,A316619,LABRADOR RETRIEVER,MALE,FEARFUL,FERTILE,EUTH,TIME/SPACE,6,2007,4,7,2007,2,,5,0,"STRAY,FIELD",WHITE,LABRADOR RETRIEVER
3,A319056,BEAGLE,MALE,NORMAL,ALTERED,EUTH,TIME/SPACE,7,2007,3,8,2007,1,,19,0,"STRAY,OTC",TRICOLOR,BEAGLE
4,A258842,PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,BREED,12,2005,2,12,2005,3,,8,0,"CONFISCATE,CRUELTY",WHITE,PIT BULL
5,A258896,AMERICAN PIT BULL TERRIER,MALE,NORMAL,FERTILE,EUTH,BREED,12,2005,3,12,2005,2,,6,0,"STRAY,OTC",BROWN,PIT BULL
6,A252704,AIREDALE TERRIER,FEMALE,NORMAL,FERTILE,EUTH,TIME/SPACE,10,2005,5,10,2005,3,,5,0,"STRAY,OTC",CREAM,OTHER-LARGE
7,A253104,PIT BULL TERRIER,MALE,INJURED,FERTILE,EUTH,TIME/SPACE,10,2005,2,10,2005,1,,6,0,"STRAY,FIELD",BLACK,PIT BULL
8,A000120,BEAGLE,FEMALE,NORMAL,FERTILE,EUTH,BREED,3,2005,1,3,2005,5,,4,0,"STRAY,OTC",BLACK,BEAGLE
9,A000604,CHINESE SHARPEI,MALE,FEARFUL,FERTILE,EUTH,BREED,3,2005,0,3,2005,5,,5,0,"STRAY,FIELD",BROWN,OTHER-MEDIUM


### Saving Preprocessed Dog Data ###

In [15]:
# Write the processed dog table (including its index) to disk.
dog_data.to_csv(path_or_buf='dog_data.csv')