# CATS: Further Data Preprocessing #

### Imports ###

In [1]:
import pandas as pd
import numpy as np
import datetime
import re

### Data Preview ###
Cat data was extracted from the data exported from the Champ_Preprocessing notebook.

In [2]:
# Read the preprocessed export; the first CSV column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer='preprocessed_data.csv',
    index_col=0,
)

In [3]:
# Keep only the rows whose AnimalType is 'CAT', renumber them from zero, and
# drop the AnimalType column, which is constant after the filter.
cat_data = (
    data.loc[data['AnimalType'].eq('CAT')]
    .reset_index(drop=True)
    .drop(columns='AnimalType')
)
print("Total Observations: ", cat_data.shape[0])
cat_data.head()

Total Observations:  57350


Unnamed: 0,AnimalID,IntakeType,IntakeSubtype,PrimaryColor,PrimaryBreed,Gender,IntakeInternalStatus,ReproductiveStatusAtIntake,OutcomeType,OutcomeSubtype,IntakeMonth,IntakeYear,IntakeDotW,OutcomeMonth,OutcomeYear,OutcomeDotW,IntakeAge,TimeUntilOutcome,LiveOutcome
0,A366370,STRAY,OTC,WHITE,DOMESTIC SHORTHAIR,MALE,FEARFUL,ALTERED,EUTH,FERAL,11,2008,4,11,2008,2,,5,0
1,A366531,STRAY,OTC,BLACK,DOMESTIC SHORTHAIR,UNKNOWN,NORMAL,UNKNOWN,EUTH,CONTAG DIS,11,2008,0,11,2008,2,,9,0
2,A314432,OWNER SUR,OTC,BROWN TABBY,DOMESTIC SHORTHAIR,UNKNOWN,NORMAL,UNKNOWN,EUTH,TIME/SPACE,6,2007,4,6,2007,4,,0,0
3,A317335,STRAY,OTC,GRAY,DOMESTIC SHORTHAIR,UNKNOWN,NORMAL,UNKNOWN,EUTH,TIME/SPACE,7,2007,3,7,2007,2,,6,0
4,A317486,STRAY,OTC,GRAY,DOMESTIC SHORTHAIR,FEMALE,NURSING,FERTILE,EUTH,TIME/SPACE,7,2007,4,7,2007,3,,6,0


### Feature Value Reduction ###
The number of distinct feature values was reduced to simplify model training later on and to group together values that may have been assigned inconsistently.

#### Combining IntakeType and IntakeSubtype ####

In [4]:
# Build a single "TYPE,SUBTYPE" string column and discard the two columns it
# was built from.
# NOTE(review): IntakeSubtype is not passed through str, so a missing subtype
# presumably leaves the combined value missing too — confirm that is intended.
cat_data = cat_data.assign(
    IntakeTypes=cat_data['IntakeType'].map(str) + ',' + cat_data['IntakeSubtype']
).drop(columns=['IntakeType', 'IntakeSubtype'])

#### Classifying the IntakeTypes into 10 categories ####
Many of the combined intake types are very similar and can be grouped together.

In [5]:
def cat_IntakeType(IntakeTypes):
    """Collapse a combined "IntakeType,IntakeSubtype" value into one of ten groups.

    The value is converted with ``str`` and tested against each rule in order.
    A rule applies when its pattern matches at the *start* of the text, and the
    label of the first applicable rule is returned. Anything that matches no
    rule is labelled ``'OTHER'``.
    """
    # (pattern, label) pairs, checked top to bottom; the first hit wins.
    rules = (
        ("CONFISCATE,ABANDONED|CONFISCATE,NEGLECT|CONFISCATE,CRUELTY",
         'CONFISCATE,CRUELTY'),
        ("CONFISCATE,EVICTION|CONFISCATE,UNPERMITED|CONFISCATE,POLICE|CONFISCATE,HOSPITAL|CONFISCATE,OWNER DIED|CONFISCATE,RESTRAINT",
         'CONFISCATE,OTHER'),
        ("FOSTER,RETURN|RETURN,FOSTER", 'FOSTER'),
        ("OWNER SUR,FIELD|KHS,OWNER SUR", 'OWNERSUR,FIELD'),
        ("OWNER SUR,OTC", 'OWNERSUR,OTC'),
        ("OWNER SUR,RETURN 30|RETURN,ADOPTION|RETURN,FIELD|RETURN,K HUMANE S|RETURN,OTC",
         'RETURN'),
        ("KHS,STRAY", 'STRAY,KHS'),
        ("STRAY,FIELD|STRAY,STRAY", 'STRAY,FIELD'),
        ("STRAY,OTC|OWNER SUR,STRAY", 'STRAY,OTC'),
    )

    text = str(IntakeTypes)
    for pattern, label in rules:
        if re.match(pattern, text):
            return label
    return 'OTHER'

In [6]:
# Replace each combined intake string with its grouped label, then show how
# many animals fall into each group.
cat_data = cat_data.assign(
    IntakeTypes=cat_data['IntakeTypes'].apply(cat_IntakeType)
)
cat_data.groupby('IntakeTypes')['AnimalID'].count()

IntakeTypes
CONFISCATE,CRUELTY      309
CONFISCATE,OTHER         45
FOSTER                 3301
OTHER                   208
OWNERSUR,FIELD         1723
OWNERSUR,OTC          12070
RETURN                  583
STRAY,FIELD           16834
STRAY,KHS               370
STRAY,OTC             21907
Name: AnimalID, dtype: int64

#### Categorizing cat colors ####

In [7]:
def cat_color_class(color):
    """Map a colour word to one of eight colour families, or ``'OTHER'``.

    The value is converted with ``str``; a family applies when one of its
    colour words matches at the *start* of the text. Families are tried in the
    order listed below and the first applicable one is returned.
    """
    # (family label, pattern of colour words belonging to it)
    families = (
        ('ORANGE', "ORANGE|APRICOT|FLAME|RED|TABBY"),
        ('GRAY', "GRAY|BLUE|SEAL|SILVER"),
        ('CREAM', "CREAM|BUFF|GOLD|TAN|YELLOW"),
        ('BROWN', "BROWN|CHOCOLATE|MAHOGANY|SABLE"),
        ('WHITE', "WHITE|LYNX"),
        ('TORTIE', "TORTIE|TORTI"),
        ('BLACK', "BLACK"),
        ('CALICO', "CALICO"),
    )

    name = str(color)
    hits = (label for label, pattern in families if re.match(pattern, name))
    # Fall back to 'OTHER' when no family claims the colour.
    return next(hits, 'OTHER')

In [8]:
# Classify each animal by the first word of its PrimaryColor, store the result
# as Color, and drop the original PrimaryColor column.
cat_data = cat_data.assign(
    Color=cat_data['PrimaryColor'].str.split().str[0].apply(cat_color_class)
).drop(columns='PrimaryColor')

In [9]:
# Number of animals (non-null AnimalID values) in each colour family.
cat_data.groupby('Color')['AnimalID'].count()

Color
BLACK     16915
BROWN      7192
CALICO     3285
CREAM      1921
GRAY      13341
ORANGE     6835
OTHER       166
TORTIE     2886
WHITE      4809
Name: AnimalID, dtype: int64

#### Categorizing cat breeds ####

In [10]:
def cat_breed(breed):
    """Map a PrimaryBreed value to a coarse breed label.

    Parameters
    ----------
    breed : object
        A single breed value, normally a string; may be missing (NaN/None).

    Returns
    -------
    str
        ``'LONG'``, ``'MED'``, ``'SHORT'``, ``'AMSHORT'`` or ``'SIAMESE'`` for
        the five named breeds, ``'NONE'`` when the breed is missing, and
        ``'OTHER'`` for every other breed.
    """
    # A missing value reaches this function as float NaN (or None), and
    # str(nan) is 'nan' -- comparing against the literal 'NaN' alone never
    # detects it, so test for missingness before stringifying. The literal
    # strings are still accepted in case the value was stringified upstream.
    if pd.isna(breed) or str(breed) in ('NaN', 'nan'):
        return 'NONE'

    labels = {
        'DOMESTIC LONGHAIR': 'LONG',
        'DOMESTIC MEDIUMHAIR': 'MED',
        'DOMESTIC SHORTHAIR': 'SHORT',
        'AMERICAN SHORTHAIR': 'AMSHORT',
        'SIAMESE': 'SIAMESE',
    }
    return labels.get(str(breed), 'OTHER')

In [11]:
# Derive the coarse Breed label from PrimaryBreed, then drop PrimaryBreed.
cat_data = cat_data.assign(
    Breed=cat_data['PrimaryBreed'].apply(cat_breed)
).drop(columns='PrimaryBreed')

In [12]:
# Number of animals (non-null AnimalID values) per breed label.
cat_data.groupby('Breed')['AnimalID'].count()

Breed
AMSHORT     1325
LONG        2646
MED         5073
OTHER       1174
SHORT      46076
SIAMESE     1056
Name: AnimalID, dtype: int64

### Preprocessed Cat Data Preview ###

In [13]:
# Report the row count and preview the first 20 rows of the processed frame.
print("Total Observations: ", cat_data.shape[0])
cat_data.head(n=20)

Total Observations:  57350


Unnamed: 0,AnimalID,Gender,IntakeInternalStatus,ReproductiveStatusAtIntake,OutcomeType,OutcomeSubtype,IntakeMonth,IntakeYear,IntakeDotW,OutcomeMonth,OutcomeYear,OutcomeDotW,IntakeAge,TimeUntilOutcome,LiveOutcome,IntakeTypes,Color,Breed
0,A366370,MALE,FEARFUL,ALTERED,EUTH,FERAL,11,2008,4,11,2008,2,,5,0,"STRAY,OTC",WHITE,SHORT
1,A366531,UNKNOWN,NORMAL,UNKNOWN,EUTH,CONTAG DIS,11,2008,0,11,2008,2,,9,0,"STRAY,OTC",BLACK,SHORT
2,A314432,UNKNOWN,NORMAL,UNKNOWN,EUTH,TIME/SPACE,6,2007,4,6,2007,4,,0,0,"OWNERSUR,OTC",BROWN,SHORT
3,A317335,UNKNOWN,NORMAL,UNKNOWN,EUTH,TIME/SPACE,7,2007,3,7,2007,2,,6,0,"STRAY,OTC",GRAY,SHORT
4,A317486,FEMALE,NURSING,FERTILE,EUTH,TIME/SPACE,7,2007,4,7,2007,3,,6,0,"STRAY,OTC",GRAY,SHORT
5,A318485,FEMALE,NORMAL,ALTERED,EUTH,TIME/SPACE,7,2007,5,7,2007,5,,0,0,"OWNERSUR,OTC",BLACK,LONG
6,A318496,UNKNOWN,NORMAL,UNKNOWN,EUTH,TIME/SPACE,7,2007,5,7,2007,3,,5,0,"STRAY,FIELD",BLACK,SHORT
7,A318533,MALE,SICK,ALTERED,EUTH,MEDICAL,7,2007,5,7,2007,6,,1,0,"STRAY,FIELD",BLACK,MED
8,A325393,FEMALE,NURSING,FERTILE,EUTH,CONTAG DIS,9,2007,5,9,2007,3,,5,0,"STRAY,OTC",GRAY,SHORT
9,A252809,MALE,NORMAL,FERTILE,EUTH,TIME/SPACE,10,2005,0,10,2005,5,,5,0,"STRAY,KHS",BLACK,SHORT


### Saving Preprocessed Cat Data ###

In [14]:
# Write the processed cat table (row index included) to disk.
cat_data.to_csv(path_or_buf='cat_data.csv')