In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read in data

In [2]:
austin_intakes = pd.read_csv('./datasets/raw_data/austin_animal_center_intakes.csv')

In [3]:
austin_intakes.head(2)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,*Brock,01/03/2019 04:19:00 PM,January 2019,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,Belle,07/05/2015 12:59:00 PM,July 2015,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver


In [4]:
austin_outcomes = pd.read_csv('./datasets/raw_data/austin_animal_center_outcomes.csv')

In [5]:
austin_outcomes.head(2)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown


In [6]:
austin_outcomes_2 = pd.read_csv('./datasets/raw_data/austin_animal_center_outcomes_2.csv')

In [7]:
austin_outcomes_2.head(2)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown


In [8]:
dallas_shelter = pd.read_csv('./datasets/raw_data/dallas_animal_shelter_data.csv')

In [9]:
dallas_shelter.head(2)

Unnamed: 0,Animal Id,Animal Type,Animal Breed,Kennel Number,Kennel Status,Tag Type,Activity Number,Activity Sequence,Source Id,Census Tract,...,Outcome Time,Receipt Number,Impound Number,Service Request Number,Outcome Condition,Chip Status,Animal Origin,Additional Information,Month,Year
0,A1093136,CAT,DOMESTIC SH,K01,IMPOUNDED,,,1,P0915082,5200,...,14:46:00,,K20-494003,,APP SICK,SCAN NO CHIP,OVER THE COUNTER,,JAN.2020,FY2020
1,A1046046,DOG,PIT BULL,AD 085,UNAVAILABLE,,A19-203263,1,P0740141,12900,...,14:56:00,R19-561166,K19-489088,,APP WNL,SCAN CHIP,FIELD,ADOPTION,NOV.2019,FY2020


# Filter datasets to only include dogs

## Austin Intakes

In [10]:
# Animal types other than dogs in austin_intakes

austin_intakes[austin_intakes['Animal Type']!='Dog']['Animal Type'].unique()

array(['Cat', 'Other', 'Bird', 'Livestock'], dtype=object)

In [11]:
austin_intakes[austin_intakes['Animal Type']!='Dog'].head(3)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
3,A665644,,10/21/2013 07:59:00 AM,October 2013,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico
7,A844350,*Ella,10/15/2021 11:40:00 AM,October 2021,2112 East William Cannon Drive in Austin (TX),Stray,Normal,Cat,Intact Female,6 months,Domestic Shorthair,Brown Tabby
9,A818975,,06/18/2020 02:53:00 PM,June 2020,Braker Lane And Metric in Travis (TX),Stray,Normal,Cat,Intact Male,4 weeks,Domestic Shorthair,Cream Tabby


In [12]:
#Drop rows with animal type other than 'Dog' to create new df with dogs only

austin_intakes = austin_intakes.drop(labels=austin_intakes[austin_intakes['Animal Type']!='Dog'].index)

In [13]:
austin_intakes.shape

(76769, 12)

## Austin Outcomes

#### Merge austin_outcomes and austin_outcomes2

In [14]:
austin_outcomes.shape

(136538, 12)

In [15]:
austin_outcomes_2.shape

(136570, 12)

In [16]:
# merge austin outcomes data sets
austin_outcomes = pd.concat([austin_outcomes, austin_outcomes_2])

In [17]:
# drop duplicated rows
austin_outcomes.drop_duplicates(keep='first', inplace=True)

In [18]:
austin_outcomes.head(3)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray


#### Filter austin_outcomes_merged to only include dogs

In [19]:
austin_outcomes[austin_outcomes['Animal Type']!='Dog']['Animal Type'].unique()

array(['Cat', 'Other', 'Bird', 'Livestock'], dtype=object)

In [20]:
austin_outcomes[austin_outcomes['Animal Type']!='Dog'].head(4)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,May 2019,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
2,A821648,,08/16/2020 11:38:00 AM,Aug 2020,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
4,A674754,,03/18/2014 11:47:00 AM,Mar 2014,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby
7,A689724,*Donatello,10/18/2014 06:52:00 PM,Oct 2014,08/01/2014,Adoption,,Cat,Neutered Male,2 months,Domestic Shorthair Mix,Black


In [21]:
austin_outcomes = austin_outcomes.drop(labels=austin_outcomes[austin_outcomes['Animal Type']!='Dog'].index)

In [22]:
austin_outcomes['Animal Type'].unique()

array(['Dog'], dtype=object)

In [23]:
austin_outcomes.shape

(76727, 12)

In [24]:
austin_intakes.shape

(76769, 12)

## dallas_shelter_data

In [25]:
# Examine columns of dallas_shelter_data

dallas_shelter.head(2)

Unnamed: 0,Animal Id,Animal Type,Animal Breed,Kennel Number,Kennel Status,Tag Type,Activity Number,Activity Sequence,Source Id,Census Tract,...,Outcome Time,Receipt Number,Impound Number,Service Request Number,Outcome Condition,Chip Status,Animal Origin,Additional Information,Month,Year
0,A1093136,CAT,DOMESTIC SH,K01,IMPOUNDED,,,1,P0915082,5200,...,14:46:00,,K20-494003,,APP SICK,SCAN NO CHIP,OVER THE COUNTER,,JAN.2020,FY2020
1,A1046046,DOG,PIT BULL,AD 085,UNAVAILABLE,,A19-203263,1,P0740141,12900,...,14:56:00,R19-561166,K19-489088,,APP WNL,SCAN CHIP,FIELD,ADOPTION,NOV.2019,FY2020


In [26]:
# Rows with 'Animal Type' other than 'DOG'

dallas_shelter[dallas_shelter['Animal Type']!='DOG']['Animal Type'].unique()

array(['CAT', 'BIRD', 'WILDLIFE', 'LIVESTOCK'], dtype=object)

In [27]:
dallas_shelter[dallas_shelter['Animal Type']!='DOG'].head(3)

Unnamed: 0,Animal Id,Animal Type,Animal Breed,Kennel Number,Kennel Status,Tag Type,Activity Number,Activity Sequence,Source Id,Census Tract,...,Outcome Time,Receipt Number,Impound Number,Service Request Number,Outcome Condition,Chip Status,Animal Origin,Additional Information,Month,Year
0,A1093136,CAT,DOMESTIC SH,K01,IMPOUNDED,,,1,P0915082,5200,...,14:46:00,,K20-494003,,APP SICK,SCAN NO CHIP,OVER THE COUNTER,,JAN.2020,FY2020
2,A1098758,BIRD,HAWK,RECEIVING,UNAVAILABLE,,A20-217177,1,P0922414,100,...,15:18:00,,K20-501362,,APP INJ,WILDLIFE - UNABLE TO SCAN,FIELD,,MAR.2020,FY2020
4,A1091970,CAT,DOMESTIC SH,CC 25,UNAVAILABLE,,,1,P0913319,16701,...,15:47:00,R19-561806,K19-492312,,APP WNL,SCAN NO CHIP,OVER THE COUNTER,ADOPTED,DEC.2019,FY2020


In [28]:
# Drops rows with 'Animal Type' other than 'DOG'

dallas_shelter = dallas_shelter.drop(labels=dallas_shelter[dallas_shelter['Animal Type']!='DOG'].index)

In [29]:
# Checkt that df only contains dogs

dallas_shelter['Animal Type'].unique()

array(['DOG'], dtype=object)

# Check for same animal across intakes and outcomes Dfs based on ID & try to merge

## Austin intakes & outcomes

In [30]:
austin_outcomes.head(2)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
1,A776359,Gizmo,07/18/2018 04:02:00 PM,Jul 2018,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
3,A720371,Moose,02/13/2016 05:59:00 PM,Feb 2016,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff


In [31]:
austin_intakes.head(2)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,*Brock,01/03/2019 04:19:00 PM,January 2019,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,Belle,07/05/2015 12:59:00 PM,July 2015,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver


In [32]:
# duplicate Animal IDs in austin intakes
austin_intakes[austin_intakes['Animal ID'].duplicated(keep=False)==True].sort_values(by='Animal ID').head()

# duplciates appear to be same animal that has gone through intake at the shelter multiple times

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
91423,A006100,Scamp,03/07/2014 02:26:00 PM,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
20233,A006100,Scamp,12/07/2017 02:07:00 PM,December 2017,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White
4344,A006100,Scamp,12/19/2014 10:21:00 AM,December 2014,8700 Research Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
99487,A245945,Boomer,07/03/2014 05:55:00 PM,July 2014,Garden And Mildred in Austin (TX),Stray,Normal,Dog,Neutered Male,14 years,Labrador Retriever Mix,Tan
122656,A245945,Boomer,05/20/2015 10:34:00 PM,May 2015,7403 Blessing Ave in Austin (TX),Stray,Normal,Dog,Neutered Male,15 years,Labrador Retriever Mix,Tan


In [33]:
# duplicate Animal IDs in austin outcomes
austin_outcomes[austin_outcomes['Animal ID'].duplicated(keep=False)==True].sort_values(by='Animal ID').head(10)

# duplciates appear to be same animal that has had recorded outcome at the shelter multiple times

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
101494,A006100,Scamp,12/07/2017 12:00:00 AM,Dec 2017,07/09/2007,Return to Owner,,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White
56686,A006100,Scamp,12/20/2014 04:35:00 PM,Dec 2014,07/09/2007,Return to Owner,,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
115237,A006100,Scamp,03/08/2014 05:10:00 PM,Mar 2014,07/09/2007,Return to Owner,,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
90697,A245945,Boomer,07/04/2014 03:26:00 PM,Jul 2014,05/23/2000,Return to Owner,,Dog,Neutered Male,14 years,Labrador Retriever Mix,Tan
69529,A245945,Boomer,05/25/2015 11:49:00 AM,May 2015,05/23/2000,Transfer,Partner,Dog,Neutered Male,15 years,Labrador Retriever Mix,Tan
71155,A287017,Stitch,08/16/2015 01:31:00 PM,Aug 2015,05/13/2001,Return to Owner,,Dog,Spayed Female,14 years,Chihuahua Shorthair Mix,Black/White
24409,A287017,Stitch,12/12/2014 04:49:00 PM,Dec 2014,05/13/2001,Return to Owner,,Dog,Spayed Female,13 years,Chihuahua Shorthair Mix,Black/White
94844,A307010,Cooper,06/05/2017 03:11:00 PM,Jun 2017,03/04/2003,Return to Owner,,Dog,Neutered Male,14 years,Beagle Mix,Tricolor
38142,A307010,Cooper,09/28/2016 09:36:00 AM,Sep 2016,03/04/2003,Return to Owner,,Dog,Neutered Male,13 years,Beagle Mix,Tricolor
53506,A322813,Tyson,12/24/2015 02:31:00 PM,Dec 2015,03/26/2003,Return to Owner,,Dog,Neutered Male,12 years,Rottweiler Mix,Black/Brown


In [34]:
austin_intakes.shape

(76769, 12)

In [35]:
austin_outcomes.shape

(76727, 12)

In [36]:
# Rows of austin intakes and austin outcomes appear to largely line up, however dates for each animal are not necessarily listed in the same order and intakes has ~40 more rows

In [37]:
austin_outcomes.sort_values(by='Animal ID').head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
115237,A006100,Scamp,03/08/2014 05:10:00 PM,Mar 2014,07/09/2007,Return to Owner,,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
101494,A006100,Scamp,12/07/2017 12:00:00 AM,Dec 2017,07/09/2007,Return to Owner,,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White
56686,A006100,Scamp,12/20/2014 04:35:00 PM,Dec 2014,07/09/2007,Return to Owner,,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
39142,A047759,Oreo,04/07/2014 03:12:00 PM,Apr 2014,04/02/2004,Transfer,Partner,Dog,Neutered Male,10 years,Dachshund,Tricolor
81491,A134067,Bandit,11/16/2013 11:54:00 AM,Nov 2013,10/16/1997,Return to Owner,,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White


In [38]:
austin_intakes.sort_values(by='Animal ID').head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
20233,A006100,Scamp,12/07/2017 02:07:00 PM,December 2017,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White
91423,A006100,Scamp,03/07/2014 02:26:00 PM,March 2014,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
4344,A006100,Scamp,12/19/2014 10:21:00 AM,December 2014,8700 Research Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
71055,A047759,Oreo,04/02/2014 03:55:00 PM,April 2014,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,10 years,Dachshund,Tricolor
96671,A134067,Bandit,11/16/2013 09:02:00 AM,November 2013,12034 Research Blvd in Austin (TX),Public Assist,Injured,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White


In [39]:
# 381 Animal IDs unique to outcomes df - over 76,000 present in both datasets
in_id = set(austin_intakes['Animal ID'])
out_id = set(austin_outcomes['Animal ID'])

len(list(out_id.difference(in_id)))

381

In [40]:
#List of all Animal IDs in austin outcomes and intakes (with duplicates - IDs in both lists not removed)

austin_in_out_ids = list(in_id.union(out_id))

In [41]:
# A few dogs are in the system a lot
austin_intakes['Animal ID'].value_counts().sort_values(ascending=False).head(10)

A721033    33
A718223    14
A718877    12
A706536    11
A761266     9
A737814     9
A717053     9
A700407     9
A716018     9
A616444     9
Name: Animal ID, dtype: int64

### Convert DateTime to epoch

In [42]:
austin_intakes[['DateTime']].head()

Unnamed: 0,DateTime
0,01/03/2019 04:19:00 PM
1,07/05/2015 12:59:00 PM
2,04/14/2016 06:43:00 PM
4,06/29/2014 10:38:00 AM
5,02/18/2017 12:46:00 PM


In [43]:
austin_intakes['DateTime'] = pd.to_datetime(austin_intakes['DateTime']).apply(lambda x: x.timestamp())

In [44]:
austin_intakes.sort_values(by='DateTime').head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
72982,A521520,Nina,1380614000.0,October 2013,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,7 years,Border Terrier/Border Collie,White/Tan
117677,A664233,Stevie,1380618000.0,October 2013,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,3 years,Pit Bull Mix,Blue/White
4788,A664234,,1380624000.0,October 2013,5400 Jimmy Clay in Austin (TX),Stray,Injured,Dog,Intact Male,8 years,Border Collie Mix,Black/White
89334,A664257,Pippin,1380625000.0,October 2013,Burleson in Travis (TX),Stray,Normal,Dog,Intact Female,4 years,Podengo Pequeno Mix,Black
24042,A664266,,1380626000.0,October 2013,Payton And 183 in Austin (TX),Stray,Normal,Dog,Intact Female,1 year,Chihuahua Shorthair Mix,Buff


In [45]:
austin_outcomes['DateTime'] = pd.to_datetime(austin_outcomes['DateTime']).apply(lambda x: x.timestamp())

## Merge Intake & Outcome Frames

In [46]:
austin_intakes.head(3)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color
0,A786884,*Brock,1546532000.0,January 2019,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor
1,A706918,Belle,1436101000.0,July 2015,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver
2,A724273,Runster,1460659000.0,April 2016,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White


In [47]:
austin_intakes.drop(columns=['MonthYear', 'Animal Type'], inplace=True)
austin_outcomes.drop(columns=['MonthYear', 'Animal Type'], inplace=True)

In [48]:
austin_intakes['Intake Type'].value_counts()

Stray                 52853
Owner Surrender       16804
Public Assist          6612
Abandoned               316
Euthanasia Request      183
Wildlife                  1
Name: Intake Type, dtype: int64

In [49]:
austin_intakes.sort_values(by=['Animal ID', 'DateTime'], inplace=True)
austin_intakes.reset_index(drop=True, inplace=True)

In [50]:
austin_outcomes.sort_values(by=['Animal ID', 'DateTime'], inplace=True)
austin_outcomes.reset_index(drop=True, inplace=True)

In [51]:
austin_intakes.head(3)

Unnamed: 0,Animal ID,Name,DateTime,Found Location,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Breed,Color
0,A006100,Scamp,1394202000.0,8700 Research in Austin (TX),Public Assist,Normal,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
1,A006100,Scamp,1418984000.0,8700 Research Blvd in Austin (TX),Public Assist,Normal,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
2,A006100,Scamp,1512656000.0,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White


In [52]:
austin_outcomes.head(3)

Unnamed: 0,Animal ID,Name,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A006100,Scamp,1394299000.0,07/09/2007,Return to Owner,,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
1,A006100,Scamp,1419093000.0,07/09/2007,Return to Owner,,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
2,A006100,Scamp,1512605000.0,07/09/2007,Return to Owner,,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White


In [53]:
# count ins and outs
in_count = austin_intakes['Animal ID'].value_counts()
out_count = austin_outcomes['Animal ID'].value_counts()
repeat_in = in_count[in_count > 1]
repeat_out = out_count[out_count > 1]

In [54]:
# separate ins
repeat_in = austin_intakes[austin_intakes['Animal ID'].isin(list(repeat_in.index))].copy(deep=True)
solo_in = austin_intakes[~austin_intakes['Animal ID'].isin(repeat_in['Animal ID'].unique())].copy(deep=True)

In [55]:
# separate outs
repeat_out = austin_outcomes[austin_outcomes['Animal ID'].isin(list(repeat_out.index))].copy(deep=True)
solo_out = austin_outcomes[~austin_outcomes['Animal ID'].isin(repeat_out['Animal ID'].unique())].copy(deep=True)

In [56]:
solo_in.shape[0] - solo_in['Animal ID'].isin(solo_out['Animal ID']).sum()   # missing outcome data on only 444 dogs that appeared once in the sets

444

### Merge Repeats

In [57]:
repeat_in.shape[0] - repeat_out.shape[0]    # probably only missing a few observations

64

In [58]:
repeat_out.drop(columns=['Name', 'Breed', 'Color'], inplace=True)

In [59]:
repeat_out.rename(columns={
    'Animal ID': 'Animal ID',
    'DateTime': 'date_out',
    'Date of Birth': 'dob',
    'Outcome Type': 'outcome',
    'Outcome Subtype': 'outcome_sub',
    'Sex upon Outcome': 'sex_out',
    'Age upon Outcome': 'age_out'
}, inplace=True)

In [60]:
# storage
in_hold = repeat_in.copy()
in_hold.set_index('Animal ID', inplace=True)
out_hold = repeat_out.copy()                                                                
out_hold.set_index('Animal ID', inplace=True)

# drop missing IDs
in_hold.drop(list(set(in_hold.index).difference(set(out_hold.index))), inplace=True)
out_hold.drop(list(set(out_hold.index).difference(set(in_hold.index))), inplace=True)

# iterate
for i in repeat_in['Animal ID'].unique():                                                   # select ids of intake animals
    if i in set(repeat_out['Animal ID']):
        ins = repeat_in[repeat_in['Animal ID'] == i]                                        # grab intake rows
        outs = repeat_out[repeat_out['Animal ID'] == i]                                     # grab outcome rows

        if ins.shape[0] != outs.shape[0]:                                                   # check if ins & outs match up
            if i in set(in_hold.index):
                in_hold.drop(i, inplace=True)
            if i in set(out_hold.index):
                out_hold.drop(i, inplace=True)



In [61]:
repeat = pd.merge(left=in_hold, right=out_hold, how='left', on='Animal ID')
repeat.reset_index(drop=False, inplace=True)

### Merge Solos

In [62]:
solo_in = solo_in[solo_in['Animal ID'].isin(solo_out['Animal ID'])].copy(deep=True) # drop missing solo outcomes

In [63]:
solo_out.drop(columns=['Name', 'Breed', 'Color'], inplace=True)

In [64]:
solo_out.rename(columns={
    'Animal ID': 'Animal ID',
    'DateTime': 'date_out',
    'Date of Birth': 'dob',
    'Outcome Type': 'outcome',
    'Outcome Subtype': 'outcome_sub',
    'Sex upon Outcome': 'sex_out',
    'Age upon Outcome': 'age_out'
}, inplace=True)

In [65]:
solo = pd.merge(left=solo_in, right=solo_out, how='left', on='Animal ID')

In [66]:
repeat_in.head()

Unnamed: 0,Animal ID,Name,DateTime,Found Location,Intake Type,Intake Condition,Sex upon Intake,Age upon Intake,Breed,Color
0,A006100,Scamp,1394202000.0,8700 Research in Austin (TX),Public Assist,Normal,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White
1,A006100,Scamp,1418984000.0,8700 Research Blvd in Austin (TX),Public Assist,Normal,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White
2,A006100,Scamp,1512656000.0,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White
25,A245945,Boomer,1404410000.0,Garden And Mildred in Austin (TX),Stray,Normal,Neutered Male,14 years,Labrador Retriever Mix,Tan
26,A245945,Boomer,1432161000.0,7403 Blessing Ave in Austin (TX),Stray,Normal,Neutered Male,15 years,Labrador Retriever Mix,Tan


### Merge Solo & Repeats

In [67]:
austin = pd.concat([solo, repeat], axis=0, ignore_index=True)
austin.reset_index(drop=True, inplace=True)
austin.shape

(110483, 16)

In [68]:
austin.rename(columns={
    'Animal ID': 'id',
    'Name': 'name',
    'DateTime': 'date_in',
    'Found Location': 'location',
    'Intake Type': 'intake_type',
    'Intake Condition': 'condition',
    'Sex upon Intake': 'sex_in',
    'Age upon Intake': 'age_in',
    'Breed': 'breed',
    'Color': 'color'
}, inplace=True)

### Drop NA

In [69]:
austin = austin[austin['outcome'].notna()].copy(deep=True)

### Impute Missing

In [70]:
austin['sex_in'].value_counts()

Intact Male      32299
Neutered Male    28636
Intact Female    27294
Spayed Female    21717
Unknown            510
Name: sex_in, dtype: int64

In [71]:
austin['sex_in'].fillna('Intact Male', inplace=True)

In [72]:
austin['sex_out'].value_counts()

Neutered Male    49564
Spayed Female    39679
Intact Male      11371
Intact Female     9332
Unknown            510
Name: sex_out, dtype: int64

In [73]:
austin['sex_out'].fillna('Neutered Male', inplace=True)

In [74]:
austin['age_out'].value_counts()[:5]

1 year      21968
2 years     21499
3 years     10283
2 months     7429
4 years      6221
Name: age_out, dtype: int64

In [75]:
austin['age_out'].fillna('1 year', inplace=True)

In [76]:
austin.reset_index(drop=True, inplace=True)

In [77]:
# convert month to decimil
austin.loc[austin[austin['age_out'].str.contains('month')].index.values, 'age_out'] = austin['age_out'][austin['age_out'].str.contains('month')].apply(lambda x: str(int(x.split()[0])/12))
austin.loc[austin[austin['age_in'].str.contains('month')].index.values, 'age_in'] = austin['age_in'][austin['age_in'].str.contains('month')].apply(lambda x: str(int(x.split()[0])/12))

In [78]:
# convert week to decimil
austin.loc[austin[austin['age_out'].str.contains('week')].index.values, 'age_out'] = austin['age_out'][austin['age_out'].str.contains('week')].apply(lambda x: str(int(x.split()[0])/52))
austin.loc[austin[austin['age_in'].str.contains('week')].index.values, 'age_in'] = austin['age_in'][austin['age_in'].str.contains('week')].apply(lambda x: str(int(x.split()[0])/52))

In [79]:
# convert day to decimil
austin.loc[austin[austin['age_out'].str.contains('day')].index.values, 'age_out'] = austin['age_out'][austin['age_out'].str.contains('day')].apply(lambda x: str(int(x.split()[0])/365))
austin.loc[austin[austin['age_in'].str.contains('day')].index.values, 'age_in'] = austin['age_in'][austin['age_in'].str.contains('day')].apply(lambda x: str(int(x.split()[0])/365))

In [80]:
# drop 'years'
austin.loc[austin[austin['age_out'].str.contains('year')].index.values, 'age_out'] = austin['age_out'][austin['age_out'].str.contains('year')].apply(lambda x: str(x.split()[0]))
austin.loc[austin[austin['age_in'].str.contains('year')].index.values, 'age_in'] = austin['age_in'][austin['age_in'].str.contains('year')].apply(lambda x: str(x.split()[0]))

In [81]:
# fix negative numbers
austin['age_out'] = austin['age_out'].str.replace('-', '')
austin['age_in'] = austin['age_in'].str.replace('-', '')

In [82]:
austin['age_out'] = round(austin['age_out'].astype(float), 2)
austin['age_in'] = round(austin['age_in'].astype(float), 2)

In [83]:
austin['sex_in'].value_counts()

Intact Male      32300
Neutered Male    28636
Intact Female    27294
Spayed Female    21717
Unknown            510
Name: sex_in, dtype: int64

In [84]:
austin['sex'] = austin['sex_in'].map({
    'Intact Male': 1,
    'Neutered Male': 1,
    'Intact Female': 0,
    'Spayed Female': 0,
    'Unknown': 1
})

In [85]:
austin['intact_in'] = austin['sex_in'].map({
    'Intact Male': 1,
    'Neutered Male': 0,
    'Intact Female': 1,
    'Spayed Female': 0,
    'Unknown': 1
})

In [86]:
austin['intact_out'] = austin['sex_out'].map({
    'Intact Male': 1,
    'Neutered Male': 0,
    'Intact Female': 1,
    'Spayed Female': 0,
    'Unknown': 1
})

In [87]:
austin['dob'] = pd.to_datetime(austin['dob']).apply(lambda x: x.timestamp())

In [88]:
austin.drop(columns=['sex_in', 'sex_out'], inplace=True)

In [89]:
austin.head(3)

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,outcome_sub,age_out,sex,intact_in,intact_out
0,A047759,Oreo,1396454000.0,Austin (TX),Owner Surrender,Normal,10.0,Dachshund,Tricolor,1396884000.0,1080864000.0,Transfer,Partner,10.0,1,0,0
1,A134067,Bandit,1384593000.0,12034 Research Blvd in Austin (TX),Public Assist,Injured,16.0,Shetland Sheepdog,Brown/White,1384603000.0,876960000.0,Return to Owner,,16.0,1,0,0
2,A141142,Bettie,1384613000.0,Austin (TX),Stray,Aged,15.0,Labrador Retriever/Pit Bull,Black/White,1384688000.0,896659200.0,Return to Owner,,15.0,0,0,0


In [90]:
austin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110457 entries, 0 to 110456
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           110457 non-null  object 
 1   name         97586 non-null   object 
 2   date_in      110457 non-null  float64
 3   location     110457 non-null  object 
 4   intake_type  110457 non-null  object 
 5   condition    110457 non-null  object 
 6   age_in       110457 non-null  float64
 7   breed        110457 non-null  object 
 8   color        110457 non-null  object 
 9   date_out     110457 non-null  float64
 10  dob          110457 non-null  float64
 11  outcome      110457 non-null  object 
 12  outcome_sub  28998 non-null   object 
 13  age_out      110457 non-null  float64
 14  sex          110457 non-null  int64  
 15  intact_in    110457 non-null  int64  
 16  intact_out   110457 non-null  int64  
dtypes: float64(5), int64(3), object(9)
memory usage: 14.3+ MB


In [91]:
austin.to_csv('./datasets/cleaned_data/austin.csv', index=False)

## Dallas data

In [92]:
dallas_shelter.head(2)

Unnamed: 0,Animal Id,Animal Type,Animal Breed,Kennel Number,Kennel Status,Tag Type,Activity Number,Activity Sequence,Source Id,Census Tract,...,Outcome Time,Receipt Number,Impound Number,Service Request Number,Outcome Condition,Chip Status,Animal Origin,Additional Information,Month,Year
1,A1046046,DOG,PIT BULL,AD 085,UNAVAILABLE,,A19-203263,1,P0740141,12900,...,14:56:00,R19-561166,K19-489088,,APP WNL,SCAN CHIP,FIELD,ADOPTION,NOV.2019,FY2020
3,A1061310,DOG,LABRADOR RETR,LFD 163,UNAVAILABLE,,A19-206245,1,P9991730,4800,...,13:33:00,R19-561090,K19-491175,,APP WNL,SCAN CHIP,FIELD,,DEC.2019,FY2020


In [93]:
austin.columns

Index(['id', 'name', 'date_in', 'location', 'intake_type', 'condition',
       'age_in', 'breed', 'color', 'date_out', 'dob', 'outcome', 'outcome_sub',
       'age_out', 'sex', 'intact_in', 'intact_out'],
      dtype='object')

In [94]:
dallas_shelter.columns

Index(['Animal Id', 'Animal Type', 'Animal Breed', 'Kennel Number',
       'Kennel Status', 'Tag Type', 'Activity Number', 'Activity Sequence',
       'Source Id', 'Census Tract', 'Council District', 'Intake Type',
       'Intake Subtype', 'Intake Total', 'Reason', 'Staff Id', 'Intake Date',
       'Intake Time', 'Due Out', 'Intake Condition', 'Hold Request',
       'Outcome Type', 'Outcome Subtype', 'Outcome Date', 'Outcome Time',
       'Receipt Number', 'Impound Number', 'Service Request Number',
       'Outcome Condition', 'Chip Status', 'Animal Origin',
       'Additional Information', 'Month', 'Year'],
      dtype='object')

In [95]:
dallas_shelter = dallas_shelter[['Animal Id', 'Animal Breed', 'Intake Type', 'Intake Subtype', 'Intake Date', 'Intake Condition',
                                 'Outcome Type', 'Outcome Subtype', 'Outcome Date']].copy(deep=True)

In [96]:
dallas_shelter.rename(columns={
    'Animal Id': 'id',
    'Animal Breed': 'breed',
    'Intake Type': 'intake_type',
    'Intake Subtype': 'intake_subtype',
    'Intake Date': 'date_in',
    'Intake Condition': 'condition',
    'Outcome Type': 'outcome',
    'Outcome Subtype': 'outcome_sub',
    'Outcome Date': 'date_out',
    
}, inplace=True)

In [97]:
dallas_shelter.head()

Unnamed: 0,id,breed,intake_type,intake_subtype,date_in,condition,outcome,outcome_sub,date_out
1,A1046046,PIT BULL,STRAY,CONFINED,11/28/19,APP WNL,ADOPTION,WALK IN,12/19/19
3,A1061310,LABRADOR RETR,STRAY,AT LARGE,12/18/19,APP WNL,RETURNED TO OWNER,WALK IN,12/18/19
5,A1091007,CHIHUAHUA SH,STRAY,AT LARGE,12/15/19,APP SICK,RETURNED TO OWNER,WALK IN,12/19/19
6,A1092433,GERM SHEPHERD,FOSTER,RETURN,1/12/20,APP WNL,TRANSFER,TRANSPORT,1/12/20
7,A1095357,PARSON RUSS TER,OWNER SURRENDER,GENERAL,2/6/20,APP WNL,TRANSFER,UNDERAGE,2/7/20


In [98]:
dallas_shelter.dropna(inplace=True)

In [99]:
dallas_shelter['date_out'].value_counts()[-20:]

5/3/20      25
10/1/20     25
5/7/20      24
5/1/20      24
10/6/19     24
8/21/20     24
10/3/20     23
8/11/20     23
6/22/20     23
12/25/19    22
10/1/19     22
4/26/20     21
10/4/20     20
7/19/20     18
4/12/20     18
9/10/20     17
10/5/20     14
10/6/20     12
11/28/19    11
10/7/20      8
Name: date_out, dtype: int64

In [100]:
dallas_shelter['date_out'] = dallas_shelter['date_out'].map(lambda x: datetime.strptime(x, '%m/%d/%y'))
dallas_shelter['date_in'] = dallas_shelter['date_in'].map(lambda x: datetime.strptime(x, '%m/%d/%y'))

In [101]:
dallas_shelter['date_out'] = dallas_shelter['date_out'].map(lambda x: x.timestamp())
dallas_shelter['date_in'] = dallas_shelter['date_in'].map(lambda x: x.timestamp())

In [102]:
dallas_shelter.head()

Unnamed: 0,id,breed,intake_type,intake_subtype,date_in,condition,outcome,outcome_sub,date_out
1,A1046046,PIT BULL,STRAY,CONFINED,1574899000.0,APP WNL,ADOPTION,WALK IN,1576714000.0
3,A1061310,LABRADOR RETR,STRAY,AT LARGE,1576627000.0,APP WNL,RETURNED TO OWNER,WALK IN,1576627000.0
5,A1091007,CHIHUAHUA SH,STRAY,AT LARGE,1576368000.0,APP SICK,RETURNED TO OWNER,WALK IN,1576714000.0
6,A1092433,GERM SHEPHERD,FOSTER,RETURN,1578787000.0,APP WNL,TRANSFER,TRANSPORT,1578787000.0
7,A1095357,PARSON RUSS TER,OWNER SURRENDER,GENERAL,1580947000.0,APP WNL,TRANSFER,UNDERAGE,1581034000.0


In [103]:
dallas_shelter.to_csv('./datasets/cleaned_data/dallas.csv', index=False)

# Merge Austin & Dallas Datasets

In [104]:
austin.head()

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,outcome_sub,age_out,sex,intact_in,intact_out
0,A047759,Oreo,1396454000.0,Austin (TX),Owner Surrender,Normal,10.0,Dachshund,Tricolor,1396884000.0,1080864000.0,Transfer,Partner,10.0,1,0,0
1,A134067,Bandit,1384593000.0,12034 Research Blvd in Austin (TX),Public Assist,Injured,16.0,Shetland Sheepdog,Brown/White,1384603000.0,876960000.0,Return to Owner,,16.0,1,0,0
2,A141142,Bettie,1384613000.0,Austin (TX),Stray,Aged,15.0,Labrador Retriever/Pit Bull,Black/White,1384688000.0,896659200.0,Return to Owner,,15.0,0,0,0
3,A163459,Sasha,1415978000.0,Ih 35 And 41St St in Austin (TX),Stray,Normal,15.0,Miniature Schnauzer Mix,Black/Gray,1415993000.0,940291200.0,Return to Owner,,15.0,0,1,1
4,A165752,Pep,1410780000.0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,Normal,15.0,Lhasa Apso Mix,Brown/White,1410799000.0,934934400.0,Return to Owner,,15.0,1,0,0


In [105]:
dallas_shelter.head()

Unnamed: 0,id,breed,intake_type,intake_subtype,date_in,condition,outcome,outcome_sub,date_out
1,A1046046,PIT BULL,STRAY,CONFINED,1574899000.0,APP WNL,ADOPTION,WALK IN,1576714000.0
3,A1061310,LABRADOR RETR,STRAY,AT LARGE,1576627000.0,APP WNL,RETURNED TO OWNER,WALK IN,1576627000.0
5,A1091007,CHIHUAHUA SH,STRAY,AT LARGE,1576368000.0,APP SICK,RETURNED TO OWNER,WALK IN,1576714000.0
6,A1092433,GERM SHEPHERD,FOSTER,RETURN,1578787000.0,APP WNL,TRANSFER,TRANSPORT,1578787000.0
7,A1095357,PARSON RUSS TER,OWNER SURRENDER,GENERAL,1580947000.0,APP WNL,TRANSFER,UNDERAGE,1581034000.0


In [106]:
austin.columns

Index(['id', 'name', 'date_in', 'location', 'intake_type', 'condition',
       'age_in', 'breed', 'color', 'date_out', 'dob', 'outcome', 'outcome_sub',
       'age_out', 'sex', 'intact_in', 'intact_out'],
      dtype='object')

In [107]:
dallas_shelter.columns

Index(['id', 'breed', 'intake_type', 'intake_subtype', 'date_in', 'condition',
       'outcome', 'outcome_sub', 'date_out'],
      dtype='object')

##### merged columns should be:
<br>
id, breed, color, dob, sex, date_in, age_in, intact_in, location, intake_type, intake_subtype, condition, date_out, age_out, intact_out, outcome, outcome_sub

## Add to/reorder austin columns

### Add columns

columns to be added to austin: intake_subtype

In [108]:
austin['intake_subtype'] = np.full_like(austin['id'], np.nan)

In [109]:
austin.head()

Unnamed: 0,id,name,date_in,location,intake_type,condition,age_in,breed,color,date_out,dob,outcome,outcome_sub,age_out,sex,intact_in,intact_out,intake_subtype
0,A047759,Oreo,1396454000.0,Austin (TX),Owner Surrender,Normal,10.0,Dachshund,Tricolor,1396884000.0,1080864000.0,Transfer,Partner,10.0,1,0,0,
1,A134067,Bandit,1384593000.0,12034 Research Blvd in Austin (TX),Public Assist,Injured,16.0,Shetland Sheepdog,Brown/White,1384603000.0,876960000.0,Return to Owner,,16.0,1,0,0,
2,A141142,Bettie,1384613000.0,Austin (TX),Stray,Aged,15.0,Labrador Retriever/Pit Bull,Black/White,1384688000.0,896659200.0,Return to Owner,,15.0,0,0,0,
3,A163459,Sasha,1415978000.0,Ih 35 And 41St St in Austin (TX),Stray,Normal,15.0,Miniature Schnauzer Mix,Black/Gray,1415993000.0,940291200.0,Return to Owner,,15.0,0,1,1,
4,A165752,Pep,1410780000.0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,Normal,15.0,Lhasa Apso Mix,Brown/White,1410799000.0,934934400.0,Return to Owner,,15.0,1,0,0,


### Reorder columns

In [110]:
austin[['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in', 'location', 'intake_type', 'intake_subtype', 'condition', 'date_out', 'age_out', 'intact_out', 'outcome', 'outcome_sub']]

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
0,A047759,Dachshund,Tricolor,1.080864e+09,1,1.396454e+09,10.00,0,Austin (TX),Owner Surrender,,Normal,1.396884e+09,10.00,0,Transfer,Partner
1,A134067,Shetland Sheepdog,Brown/White,8.769600e+08,1,1.384593e+09,16.00,0,12034 Research Blvd in Austin (TX),Public Assist,,Injured,1.384603e+09,16.00,0,Return to Owner,
2,A141142,Labrador Retriever/Pit Bull,Black/White,8.966592e+08,0,1.384613e+09,15.00,0,Austin (TX),Stray,,Aged,1.384688e+09,15.00,0,Return to Owner,
3,A163459,Miniature Schnauzer Mix,Black/Gray,9.402912e+08,0,1.415978e+09,15.00,1,Ih 35 And 41St St in Austin (TX),Stray,,Normal,1.415993e+09,15.00,1,Return to Owner,
4,A165752,Lhasa Apso Mix,Brown/White,9.349344e+08,1,1.410780e+09,15.00,0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,,Normal,1.410799e+09,15.00,0,Return to Owner,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110452,A850544,Basenji Mix,Tricolor,1.624666e+09,1,1.644230e+09,0.58,0,Austin (TX),Owner Surrender,,Normal,1.644335e+09,0.58,0,Adoption,
110453,A850561,German Shepherd Mix,Black/Brown,1.579997e+09,1,1.643218e+09,2.00,0,Austin (TX),Public Assist,,Normal,1.643307e+09,2.00,0,Adoption,
110454,A850561,German Shepherd Mix,Black/Brown,1.579997e+09,1,1.643218e+09,2.00,0,Austin (TX),Public Assist,,Normal,1.644078e+09,2.00,0,Adoption,
110455,A850561,German Shepherd Mix,Black/Brown,1.579997e+09,1,1.643650e+09,2.00,0,Austin (TX),Owner Surrender,,Normal,1.643307e+09,2.00,0,Adoption,


In [111]:
austin = austin[['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in', 'location', 'intake_type', 'intake_subtype', 'condition', 'date_out', 'age_out', 'intact_out', 'outcome', 'outcome_sub']]

In [112]:
austin.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),Owner Surrender,,Normal,1396884000.0,10.0,0,Transfer,Partner
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),Public Assist,,Injured,1384603000.0,16.0,0,Return to Owner,
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1384613000.0,15.0,0,Austin (TX),Stray,,Aged,1384688000.0,15.0,0,Return to Owner,
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1415978000.0,15.0,1,Ih 35 And 41St St in Austin (TX),Stray,,Normal,1415993000.0,15.0,1,Return to Owner,
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1410780000.0,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,,Normal,1410799000.0,15.0,0,Return to Owner,


## Add to/reorder Dallas columns

### Add columns

##### merged columns should be:
<br>
id, breed, color, dob, sex, date_in, age_in, intact_in, location, intake_type, intake_subtype, condition, date_out, age_out, intact_out, outcome, outcome_sub

In [113]:
dallas_shelter.columns

Index(['id', 'breed', 'intake_type', 'intake_subtype', 'date_in', 'condition',
       'outcome', 'outcome_sub', 'date_out'],
      dtype='object')

columns to add: color, dob, sex, age_in, intact_in, location, age_out, intact_out, date_out

In [114]:
columns_to_add = 'color', 'dob', 'sex', 'age_in', 'intact_in', 'location', 'age_out', 'intact_out'
for col in columns_to_add:
    dallas_shelter[col] = np.full_like(dallas_shelter['id'], np.nan)

In [115]:
dallas_shelter.head()

Unnamed: 0,id,breed,intake_type,intake_subtype,date_in,condition,outcome,outcome_sub,date_out,color,dob,sex,age_in,intact_in,location,age_out,intact_out
1,A1046046,PIT BULL,STRAY,CONFINED,1574899000.0,APP WNL,ADOPTION,WALK IN,1576714000.0,,,,,,,,
3,A1061310,LABRADOR RETR,STRAY,AT LARGE,1576627000.0,APP WNL,RETURNED TO OWNER,WALK IN,1576627000.0,,,,,,,,
5,A1091007,CHIHUAHUA SH,STRAY,AT LARGE,1576368000.0,APP SICK,RETURNED TO OWNER,WALK IN,1576714000.0,,,,,,,,
6,A1092433,GERM SHEPHERD,FOSTER,RETURN,1578787000.0,APP WNL,TRANSFER,TRANSPORT,1578787000.0,,,,,,,,
7,A1095357,PARSON RUSS TER,OWNER SURRENDER,GENERAL,1580947000.0,APP WNL,TRANSFER,UNDERAGE,1581034000.0,,,,,,,,


### Reorder columns

In [116]:
dallas_shelter[['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in', 'location', 'intake_type', 'intake_subtype', 'condition', 'date_out', 'age_out', 'intact_out', 'outcome', 'outcome_sub']]

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
1,A1046046,PIT BULL,,,,1.574899e+09,,,,STRAY,CONFINED,APP WNL,1.576714e+09,,,ADOPTION,WALK IN
3,A1061310,LABRADOR RETR,,,,1.576627e+09,,,,STRAY,AT LARGE,APP WNL,1.576627e+09,,,RETURNED TO OWNER,WALK IN
5,A1091007,CHIHUAHUA SH,,,,1.576368e+09,,,,STRAY,AT LARGE,APP SICK,1.576714e+09,,,RETURNED TO OWNER,WALK IN
6,A1092433,GERM SHEPHERD,,,,1.578787e+09,,,,FOSTER,RETURN,APP WNL,1.578787e+09,,,TRANSFER,TRANSPORT
7,A1095357,PARSON RUSS TER,,,,1.580947e+09,,,,OWNER SURRENDER,GENERAL,APP WNL,1.581034e+09,,,TRANSFER,UNDERAGE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31913,A1091105,CHIHUAHUA SH,,,,1.576541e+09,,,,STRAY,AT LARGE,UNKNOWN,1.576541e+09,,,RETURNED TO OWNER,FIELD
31914,A1091106,CHIHUAHUA LH,,,,1.576541e+09,,,,STRAY,AT LARGE,APP WNL,1.576541e+09,,,RETURNED TO OWNER,FIELD
31915,A1091115,FRENCH BULLDOG,,,,1.576541e+09,,,,STRAY,AT LARGE,UNKNOWN,1.576541e+09,,,RETURNED TO OWNER,FIELD
31916,A1091120,BULLDOG,,,,1.576541e+09,,,,STRAY,AT LARGE,APP WNL,1.576541e+09,,,RETURNED TO OWNER,FIELD


In [117]:
dallas_shelter = dallas_shelter[['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in', 'location', 'intake_type', 'intake_subtype', 'condition', 'date_out', 'age_out', 'intact_out', 'outcome', 'outcome_sub']]

In [118]:
dallas_shelter.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
1,A1046046,PIT BULL,,,,1574899000.0,,,,STRAY,CONFINED,APP WNL,1576714000.0,,,ADOPTION,WALK IN
3,A1061310,LABRADOR RETR,,,,1576627000.0,,,,STRAY,AT LARGE,APP WNL,1576627000.0,,,RETURNED TO OWNER,WALK IN
5,A1091007,CHIHUAHUA SH,,,,1576368000.0,,,,STRAY,AT LARGE,APP SICK,1576714000.0,,,RETURNED TO OWNER,WALK IN
6,A1092433,GERM SHEPHERD,,,,1578787000.0,,,,FOSTER,RETURN,APP WNL,1578787000.0,,,TRANSFER,TRANSPORT
7,A1095357,PARSON RUSS TER,,,,1580947000.0,,,,OWNER SURRENDER,GENERAL,APP WNL,1581034000.0,,,TRANSFER,UNDERAGE


## Check consistency of columns

In [119]:
austin.shape

(110457, 17)

In [120]:
dallas_shelter.shape

(22161, 17)

In [121]:
austin.columns

Index(['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in',
       'location', 'intake_type', 'intake_subtype', 'condition', 'date_out',
       'age_out', 'intact_out', 'outcome', 'outcome_sub'],
      dtype='object')

In [122]:
dallas_shelter.columns

Index(['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in',
       'location', 'intake_type', 'intake_subtype', 'condition', 'date_out',
       'age_out', 'intact_out', 'outcome', 'outcome_sub'],
      dtype='object')

## Merge dataframes

In [123]:
pd.concat([austin, dallas_shelter], axis=0).shape

(132618, 17)

In [124]:
texas_shelters = pd.concat([austin, dallas_shelter], axis=0)

## Saved merged data

In [125]:
texas_shelters.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),Owner Surrender,,Normal,1396884000.0,10.0,0,Transfer,Partner
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),Public Assist,,Injured,1384603000.0,16.0,0,Return to Owner,
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1384613000.0,15.0,0,Austin (TX),Stray,,Aged,1384688000.0,15.0,0,Return to Owner,
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1415978000.0,15.0,1,Ih 35 And 41St St in Austin (TX),Stray,,Normal,1415993000.0,15.0,1,Return to Owner,
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1410780000.0,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,,Normal,1410799000.0,15.0,0,Return to Owner,


In [127]:
texas_shelters.to_csv('./datasets/cleaned_data/texas_shelters.csv', index=False)

## Reset texas_shelters index

In [128]:
texas_shelters.index.duplicated().sum()

22161

In [129]:
texas_shelters.reset_index().drop(columns='index').index.duplicated().sum()

0

In [130]:
texas_shelters = texas_shelters.reset_index().drop(columns='index')

In [131]:
texas_shelters.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),Owner Surrender,,Normal,1396884000.0,10.0,0,Transfer,Partner
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),Public Assist,,Injured,1384603000.0,16.0,0,Return to Owner,
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1384613000.0,15.0,0,Austin (TX),Stray,,Aged,1384688000.0,15.0,0,Return to Owner,
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1415978000.0,15.0,1,Ih 35 And 41St St in Austin (TX),Stray,,Normal,1415993000.0,15.0,1,Return to Owner,
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1410780000.0,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),Stray,,Normal,1410799000.0,15.0,0,Return to Owner,


In [132]:
texas_shelters.index.duplicated().sum()

0

## Examine/homogenize columns

In [133]:
texas_shelters.columns

Index(['id', 'breed', 'color', 'dob', 'sex', 'date_in', 'age_in', 'intact_in',
       'location', 'intake_type', 'intake_subtype', 'condition', 'date_out',
       'age_out', 'intact_out', 'outcome', 'outcome_sub'],
      dtype='object')

### Breed

In [134]:
# breed column will be uodated based in breeds dataset

len(texas_shelters['breed'].unique())

2588

### Sex

In [135]:
texas_shelters['sex'].value_counts()

1    61446
0    49011
Name: sex, dtype: int64

### Date_in

In [136]:
texas_shelters['date_in']

0         1.396454e+09
1         1.384593e+09
2         1.384613e+09
3         1.415978e+09
4         1.410780e+09
              ...     
132613    1.576541e+09
132614    1.576541e+09
132615    1.576541e+09
132616    1.576541e+09
132617    1.576541e+09
Name: date_in, Length: 132618, dtype: float64

In [137]:
texas_shelters['date_in'].dtype

dtype('float64')

### Age_in

In [138]:
texas_shelters['age_in'].unique()

array([10.0, 16.0, 15.0, 18.0, 14.0, 17.0, 13.0, 12.0, 11.0, 9.0, 8.0,
       7.0, 6.0, 3.0, 19.0, 23.0, 5.0, 4.0, 2.0, 1.0, 0.92, 0.83, 0.75,
       0.58, 0.42, 0.5, 0.33, 0.67, 0.17, 0.25, 0.08, 0.01, 0.02, 0.0,
       0.06, 0.04, 0.1, 20.0, 24.0, nan], dtype=object)

### Intact_in

In [139]:
texas_shelters['intact_in'].unique()

array([0, 1, nan], dtype=object)

### Intake_type

In [140]:
texas_shelters['intake_type'].unique()

array(['Owner Surrender', 'Public Assist', 'Stray', 'Euthanasia Request',
       'Abandoned', 'Wildlife', 'STRAY', 'FOSTER', 'OWNER SURRENDER',
       'CONFISCATED', 'TREATMENT', 'TRANSFER', 'KEEPSAFE', 'DISPOS REQ'],
      dtype=object)

In [141]:
texas_shelters['intake_type'].map(
    {
        'Owner Surrender':'surrender', 'OWNER SURRENDER':'surrender',
        'Public Assist':'public_assist', 
        'Stray':'stray', 'STRAY':'stray',
        'Euthanasia Request':'euth_request',
        'Abandoned':'abandoned',
        'wildlife':'wildlife',
        'FOSTER':'foster',
        'CONFISCATED':'confiscated',
        'TREATMENT':'treatment',
        'TRANSFER':'transfer',
        'KEEPSAFE':'keep_safe',
        'DISPOS REQ':'disposal'
        }
    )

0             surrender
1         public_assist
2                 stray
3                 stray
4                 stray
              ...      
132613            stray
132614            stray
132615            stray
132616            stray
132617            stray
Name: intake_type, Length: 132618, dtype: object

In [142]:
texas_shelters['intake_type'] = texas_shelters['intake_type'].map(
    {
        'Owner Surrender':'surrender', 'OWNER SURRENDER':'surrender',
        'Public Assist':'public_assist', 
        'Stray':'stray', 'STRAY':'stray',
        'Euthanasia Request':'euth_request',
        'Abandoned':'abandoned',
        'wildlife':'wildlife',
        'FOSTER':'foster',
        'CONFISCATED':'confiscated',
        'TREATMENT':'treatment',
        'TRANSFER':'transfer',
        'KEEPSAFE':'keep_safe',
        'DISPOS REQ':'disposal'
        }
    )

In [143]:
texas_shelters['intake_type']

0             surrender
1         public_assist
2                 stray
3                 stray
4                 stray
              ...      
132613            stray
132614            stray
132615            stray
132616            stray
132617            stray
Name: intake_type, Length: 132618, dtype: object

### Intake_subtype

In [144]:
texas_shelters['intake_subtype'].unique()

array([nan, 'CONFINED', 'AT LARGE', 'RETURN', 'GENERAL', 'APPOINT', 'WEB',
       'QUARANTINE', 'SURGERY', 'HEART WORM', 'EUTHANASIA REQUESTED',
       'RETURN30', 'KEEP SAFE', 'OTC', 'TREATMENT',
       'KEEP SAFE DEAD ON ARRIVAL', 'DEAD ON ARRIVAL', 'OTHER',
       'QUARANTINE DEAD ON ARRIVAL', 'DIED', 'FIELD', 'CRUELTY',
       'DANGEROUS', 'SAC', 'SX POST OP', 'OWN HOSPIT', 'ALUMNI',
       'SPCA TEXAS', 'TRAP PROGRAM', 'AGG OPPS', 'OWN DECEAS', 'FOLLOWUP',
       'ILLNESS', 'EVICTION', 'INJURED', 'SPAY/NEUT', 'OWN ARREST',
       'FOR ADOPT', 'STRAY', 'ARC', 'VEHICLETOW', 'DISASTER',
       'SAC DEAD ON ARRIVAL'], dtype=object)

In [145]:
texas_shelters['intake_subtype']

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
            ...   
132613    AT LARGE
132614    AT LARGE
132615    AT LARGE
132616    AT LARGE
132617    AT LARGE
Name: intake_subtype, Length: 132618, dtype: object

In [146]:
texas_shelters['intake_subtype'][132613].lower().replace(' ', '_')

'at_large'

In [147]:
str(texas_shelters['intake_subtype'][0])

'nan'

In [148]:
for i in range(len(texas_shelters['intake_subtype'])):
    if str(texas_shelters['intake_subtype'][i]) != 'nan':
        texas_shelters['intake_subtype'][i] = texas_shelters['intake_subtype'][i].lower().replace(' ', '_')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['intake_subtype'][i] = texas_shelters['intake_subtype'][i].lower().replace(' ', '_')


In [149]:
texas_shelters['intake_subtype'].unique()

array([nan, 'confined', 'at_large', 'return', 'general', 'appoint', 'web',
       'quarantine', 'surgery', 'heart_worm', 'euthanasia_requested',
       'return30', 'keep_safe', 'otc', 'treatment',
       'keep_safe_dead_on_arrival', 'dead_on_arrival', 'other',
       'quarantine_dead_on_arrival', 'died', 'field', 'cruelty',
       'dangerous', 'sac', 'sx_post_op', 'own_hospit', 'alumni',
       'spca_texas', 'trap_program', 'agg_opps', 'own_deceas', 'followup',
       'illness', 'eviction', 'injured', 'spay/neut', 'own_arrest',
       'for_adopt', 'stray', 'arc', 'vehicletow', 'disaster',
       'sac_dead_on_arrival'], dtype=object)

### Condition

In [150]:
texas_shelters['condition'].unique()

array(['Normal', 'Injured', 'Aged', 'Sick', 'Medical', 'Other',
       'Pregnant', 'Nursing', 'Feral', 'Behavior', 'Neonatal',
       'Med Urgent', 'Med Attn', 'APP WNL', 'APP SICK', 'NORMAL',
       'UNKNOWN', 'APP INJ', 'TREATABLE REHABILITABLE NON-CONTAGIOUS',
       'UNDERAGE', 'CRITICAL', 'DEAD', 'DECEASED', 'FATAL',
       'UNHEALTHY UNTREATABLE NON-CONTAGIOUS',
       'TREATABLE MANAGEABLE NON-CONTAGIOUS',
       'TREATABLE MANAGEABLE CONTAGIOUS', 'HEALTHY',
       'TREATABLE REHABILITABLE CONTAGIOUS'], dtype=object)

In [151]:
texas_shelters['condition'].map(
    {
        'Normal':'wnl', 'APP WNL':'wnl', 'NORMAL':'wnl', 'HEALTHY':'wnl',
        'Injured':'med_attn', 'Sick':'med_attn', 'Medical':'med_attn', 'Med Urgent':'med_attn', 'Med Attn':'med_attn', 'APP SICK':'med_attn', 'APP INJ':'med_attn',
        'TREATABLE REHABILITABLE NON-CONTAGIOUS':'med_attn', 'CRITICAL':'med_attn', 'TREATABLE MANAGEABLE NON-CONTAGIOUS':'med_attn', 
        'TREATABLE MANAGEABLE CONTAGIOUS':'med_attn', 'TREATABLE REHABILITABLE CONTAGIOUS':'med_attn',
        'Aged':'aged',
        'Pregnant':'preg_nursing', 'Nursing':'preg_nursing',
        'Neonatal':'underage', 'UNDERAGE':'underage',
        'Feral':'behavioral', 'Behavior':'behavioral',
        'DEAD':'doa_fatal', 'DECEASED':'doa_fatal', 'FATAL':'doa_fatal', 'UNHEALTHY UNTREATABLE NON-CONTAGIOUS':'doa_fatal', 
        'Other':'other', 'UNKNOWN':'other'
    }
)

0              wnl
1         med_attn
2             aged
3              wnl
4              wnl
            ...   
132613       other
132614         wnl
132615       other
132616         wnl
132617         wnl
Name: condition, Length: 132618, dtype: object

In [152]:
texas_shelters['condition'] = texas_shelters['condition'].map(
    {
        'Normal':'wnl', 'APP WNL':'wnl', 'NORMAL':'wnl', 'HEALTHY':'wnl',
        'Injured':'med_attn', 'Sick':'med_attn', 'Medical':'med_attn', 'Med Urgent':'med_attn', 'Med Attn':'med_attn', 'APP SICK':'med_attn', 'APP INJ':'med_attn',
        'TREATABLE REHABILITABLE NON-CONTAGIOUS':'med_attn', 'CRITICAL':'med_attn', 'TREATABLE MANAGEABLE NON-CONTAGIOUS':'med_attn', 
        'TREATABLE MANAGEABLE CONTAGIOUS':'med_attn', 'TREATABLE REHABILITABLE CONTAGIOUS':'med_attn',
        'Aged':'aged',
        'Pregnant':'preg_nursing', 'Nursing':'preg_nursing',
        'Neonatal':'underage', 'UNDERAGE':'underage',
        'Feral':'behavioral', 'Behavior':'behavioral',
        'DEAD':'doa_fatal', 'DECEASED':'doa_fatal', 'FATAL':'doa_fatal', 'UNHEALTHY UNTREATABLE NON-CONTAGIOUS':'doa_fatal', 
        'Other':'other', 'UNKNOWN':'other'
    }
)

In [153]:
texas_shelters['condition'].unique()

array(['wnl', 'med_attn', 'aged', 'other', 'preg_nursing', 'behavioral',
       'underage', 'doa_fatal'], dtype=object)

### Date_out

In [154]:
texas_shelters['date_out']

0         1.396884e+09
1         1.384603e+09
2         1.384688e+09
3         1.415993e+09
4         1.410799e+09
              ...     
132613    1.576541e+09
132614    1.576541e+09
132615    1.576541e+09
132616    1.576541e+09
132617    1.576541e+09
Name: date_out, Length: 132618, dtype: float64

### Intact_out

In [155]:
texas_shelters['intact_out'].unique()

array([0, 1, nan], dtype=object)

### Outcome

In [156]:
texas_shelters['outcome'].unique()

array(['Transfer', 'Return to Owner', 'Adoption', 'Euthanasia', 'Died',
       'Rto-Adopt', 'Missing', 'Disposal', 'ADOPTION',
       'RETURNED TO OWNER', 'TRANSFER', 'EUTHANIZED', 'FOUND EXP',
       'LOST EXP', 'FOSTER', 'DIED', 'TREATMENT', 'DEAD ON ARRIVAL',
       'MISSING', 'DISPOSAL', 'OTHER'], dtype=object)

In [157]:
texas_shelters.shape

(132618, 17)

In [158]:
texas_shelters['outcome'].map(
    {
        'Transfer':'transfer', 'TRANSFER':'transfer',
        'Return to Owner':'owner_return', 'Rto-Adopt':'owner_return', 'RETURNED TO OWNER':'owner_return',
        'Adoption':'adopted', 'ADOPTION':'adopted',
        'FOSTER':'fostered',
        'Euthanasia':'deceased', 'Died':'deceased', 'Disposal':'deceased', 'EUTHANIZED':'deceased', 'DIED':'deceased', 
        'DEAD ON ARRIVAL':'deceased', 'DISPOSAL':'deceased', 'FOUND EXP':'deceased', 'LOST EXP':'deceased',
        'Missing':'missing', 'MISSING':'missing',
        'TREATMENT':'treatment',
        'OTHER':'other'
    }
)

0             transfer
1         owner_return
2         owner_return
3         owner_return
4         owner_return
              ...     
132613    owner_return
132614    owner_return
132615    owner_return
132616    owner_return
132617    owner_return
Name: outcome, Length: 132618, dtype: object

In [159]:
texas_shelters['outcome'] = texas_shelters['outcome'].map(
    {
        'Transfer':'transfer', 'TRANSFER':'transfer',
        'Return to Owner':'owner_return', 'Rto-Adopt':'owner_return', 'RETURNED TO OWNER':'owner_return',
        'Adoption':'adopted', 'ADOPTION':'adopted',
        'FOSTER':'fostered',
        'Euthanasia':'deceased', 'Died':'deceased', 'Disposal':'deceased', 'EUTHANIZED':'deceased', 'DIED':'deceased', 
        'DEAD ON ARRIVAL':'deceased', 'DISPOSAL':'deceased', 'FOUND EXP':'deceased', 'LOST EXP':'deceased',
        'Missing':'missing', 'MISSING':'missing',
        'TREATMENT':'treatment',
        'OTHER':'other'
    }
)

In [160]:
texas_shelters['outcome'].unique()

array(['transfer', 'owner_return', 'adopted', 'deceased', 'missing',
       'fostered', 'treatment', 'other'], dtype=object)

### Outcome_sub

In [161]:
texas_shelters['outcome_sub'].unique()

array(['Partner', nan, 'Foster', 'Suffering', 'Medical', 'Behavior',
       'In Kennel', 'Aggressive', 'At Vet', 'Rabies Risk', 'Field',
       'Offsite', 'Out State', 'Possible Theft', 'Court/Investigation',
       'In Foster', 'Enroute', 'In Surgery', 'Barn', 'Customer S', 'Prc',
       'Emer', 'Emergency', 'In State', 'WALK IN', 'TRANSPORT',
       'UNDERAGE', 'BEHAVIOR', 'FIELD', 'OTHER', 'MEDICAL-CONTAGIOUS',
       'MEDICAL-NONCONTAGIOUS', 'SURGERY', 'IN KENNEL', 'COMPLETED',
       'BITE', 'WESTMORELD', 'EAC', 'PROMOTION', 'GENERAL', 'HUMANE',
       'SHORT TERM', 'OFFSITE', 'TELEADOPT', 'TREATMENT', 'TRANS-INV',
       'NTCOMPLETE', 'ADOPETS', 'DISPOSAL', 'TO ADOPT', 'HOLD',
       'BY FOSTER', 'IN FOSTER', 'MICROCHIP', 'EVENT', 'INV', 'STOLEN',
       'MEDICAL', 'ESCAPED', 'SBI', 'APPOINT', 'AT VETERINARIAN', 'DOA',
       'DD/AGG', 'ENROUTE', 'ARC', 'REFERRAL', 'IN SURGERY', 'AT HOME',
       'DISASTER', 'SPACE', 'TAG NUMBER', 'SPCA TEXAS', 'DAS OUTREACH',
       'STAFF', 'RE

In [162]:
for i in range(len(texas_shelters['outcome_sub'])):
    if str(texas_shelters['outcome_sub'][i]) != 'nan':
        texas_shelters['outcome_sub'][i] = texas_shelters['outcome_sub'][i].lower().replace(' ', '_')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['outcome_sub'][i] = texas_shelters['outcome_sub'][i].lower().replace(' ', '_')


In [163]:
texas_shelters['outcome_sub'].unique()

array(['partner', nan, 'foster', 'suffering', 'medical', 'behavior',
       'in_kennel', 'aggressive', 'at_vet', 'rabies_risk', 'field',
       'offsite', 'out_state', 'possible_theft', 'court/investigation',
       'in_foster', 'enroute', 'in_surgery', 'barn', 'customer_s', 'prc',
       'emer', 'emergency', 'in_state', 'walk_in', 'transport',
       'underage', 'other', 'medical-contagious', 'medical-noncontagious',
       'surgery', 'completed', 'bite', 'westmoreld', 'eac', 'promotion',
       'general', 'humane', 'short_term', 'teleadopt', 'treatment',
       'trans-inv', 'ntcomplete', 'adopets', 'disposal', 'to_adopt',
       'hold', 'by_foster', 'microchip', 'event', 'inv', 'stolen',
       'escaped', 'sbi', 'appoint', 'at_veterinarian', 'doa', 'dd/agg',
       'arc', 'referral', 'at_home', 'disaster', 'space', 'tag_number',
       'spca_texas', 'das_outreach', 'staff', 'return'], dtype=object)

In [164]:
texas_shelters[['outcome_sub']].isna().sum()

outcome_sub    81459
dtype: int64

In [165]:
texas_shelters.shape

(132618, 17)

### Round age_out and age_in

In [166]:
for i in range(len(texas_shelters['age_in'])):
    if str(texas_shelters['age_in'][i]) != 'nan':
        texas_shelters['age_in'][i] = np.round(texas_shelters['age_in'][i], 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['age_in'][i] = np.round(texas_shelters['age_in'][i], 0)


In [167]:
for i in range(len(texas_shelters['age_out'])):
    if str(texas_shelters['age_out'][i]) != 'nan':
        texas_shelters['age_out'][i] = np.round(texas_shelters['age_out'][i], 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['age_out'][i] = np.round(texas_shelters['age_out'][i], 0)


### Add age column

In [168]:
age_cats = {
    0: 'Baby',
    1: 'Young',
    2: 'Adult', 3: 'Adult', 4: 'Adult', 5: 'Adult', 6: 'Adult', 7: 'Adult', 8: 'Adult',
    9: 'Senior', 10: 'Senior', 11: 'Senior', 12: 'Senior', 13: 'Senior', 14: 'Senior', 15: 'Senior', 16: 'Senior', 
    17: 'Senior', 18: 'Senior', 19: 'Senior', 20: 'Senior', 21: 'Senior', 22: 'Senior', 23: 'Senior', 24: 'Senior', 
    25: 'Senior', 26: 'Senior', 27: 'Senior', 28: 'Senior', 29: 'Senior', 30: 'Senior',
    np.nan: 'Adult' 
}

In [169]:
texas_shelters['age'] = texas_shelters['age_out'].map(age_cats)

In [170]:
texas_shelters.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub,age
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),surrender,,wnl,1396884000.0,10.0,0,transfer,partner,Senior
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),public_assist,,med_attn,1384603000.0,16.0,0,owner_return,,Senior
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1384613000.0,15.0,0,Austin (TX),stray,,aged,1384688000.0,15.0,0,owner_return,,Senior
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1415978000.0,15.0,1,Ih 35 And 41St St in Austin (TX),stray,,wnl,1415993000.0,15.0,1,owner_return,,Senior
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1410780000.0,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),stray,,wnl,1410799000.0,15.0,0,owner_return,,Senior


In [171]:
texas_shelters['age'].value_counts()

Adult     74903
Young     30452
Baby      20612
Senior     6651
Name: age, dtype: int64

In [172]:
texas_shelters['age_out'].value_counts()

1.0     30452
2.0     21501
0.0     20612
3.0     10289
4.0      6221
5.0      5487
6.0      3522
7.0      2973
8.0      2749
10.0     1881
9.0      1635
11.0      863
12.0      849
13.0      555
14.0      361
15.0      266
16.0      128
17.0       55
18.0       31
19.0       20
20.0        5
23.0        1
24.0        1
Name: age_out, dtype: int64

In [173]:
texas_shelters['age'].unique()

array(['Senior', 'Adult', 'Young', 'Baby'], dtype=object)

In [174]:
texas_shelters['age'].isna().sum()

0

# Drop outcome_sub column

In [175]:
texas_shelters.head(2)

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,outcome_sub,age
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),surrender,,wnl,1396884000.0,10.0,0,transfer,partner,Senior
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),public_assist,,med_attn,1384603000.0,16.0,0,owner_return,,Senior


In [176]:
texas_shelters.drop(columns='outcome_sub')

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,age
0,A047759,Dachshund,Tricolor,1080864000.0,1,1.396454e+09,10.0,0,Austin (TX),surrender,,wnl,1.396884e+09,10.0,0,transfer,Senior
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1.384593e+09,16.0,0,12034 Research Blvd in Austin (TX),public_assist,,med_attn,1.384603e+09,16.0,0,owner_return,Senior
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1.384613e+09,15.0,0,Austin (TX),stray,,aged,1.384688e+09,15.0,0,owner_return,Senior
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1.415978e+09,15.0,1,Ih 35 And 41St St in Austin (TX),stray,,wnl,1.415993e+09,15.0,1,owner_return,Senior
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1.410780e+09,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),stray,,wnl,1.410799e+09,15.0,0,owner_return,Senior
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132613,A1091105,CHIHUAHUA SH,,,,1.576541e+09,,,,stray,at_large,other,1.576541e+09,,,owner_return,Adult
132614,A1091106,CHIHUAHUA LH,,,,1.576541e+09,,,,stray,at_large,wnl,1.576541e+09,,,owner_return,Adult
132615,A1091115,FRENCH BULLDOG,,,,1.576541e+09,,,,stray,at_large,other,1.576541e+09,,,owner_return,Adult
132616,A1091120,BULLDOG,,,,1.576541e+09,,,,stray,at_large,wnl,1.576541e+09,,,owner_return,Adult


In [177]:
texas_shelters.drop(columns='outcome_sub', inplace=True)

### Save changes to csv

In [178]:
texas_shelters.to_csv('./datasets/cleaned_data/texas_shelters.csv', index=False)

# Make 'primary color' and 'secondary color' columns

In [179]:
texas_shelters['color']

0            Tricolor
1         Brown/White
2         Black/White
3          Black/Gray
4         Brown/White
             ...     
132613            NaN
132614            NaN
132615            NaN
132616            NaN
132617            NaN
Name: color, Length: 132618, dtype: object

In [180]:
texas_shelters['color'].unique()

array(['Tricolor', 'Brown/White', 'Black/White', 'Black/Gray',
       'White/Black', 'Black/Tan', 'Tan/Black', 'Black', 'Gold/Gold',
       'Sable/White', 'Blue Merle/Tan', 'Blue Merle', 'Brown/Black',
       'Black/Tricolor', 'Gray/White', 'Black/Brown', 'Cream', 'Tan',
       'Red', 'Yellow', 'Brown Merle', 'White', 'White/Brown', 'Tan/Red',
       'Black/Brown Brindle', 'Brown/Tan', 'Brown/Buff', 'Red Merle',
       'Brown', 'Brown/Cream', 'Blue/White', 'Red/White', 'Tan/White',
       'Gold', 'Red/Black', 'Black Brindle', 'Brown/Blue Merle', 'Blue',
       'Cream/White', 'Red/Tan', 'Chocolate', 'Brown Brindle/White',
       'White/Liver', 'Blue/Tan', 'Cream/Tan', 'Red/Brown', 'White/Cream',
       'Chocolate/White', 'White/Brown Merle', 'White/Brown Brindle',
       'Black/Cream', 'Brown Brindle', 'White/Gray', 'Brown Merle/White',
       'White/Tan', 'Yellow/Tan', 'Blue Tick/Brown', 'Gold/White',
       'Blue Merle/White', 'Apricot', 'White/Tricolor', 'Black/Blue Tick',
       'Bu

In [181]:
colors = list(texas_shelters['color'].dropna().unique())

In [182]:
sorted(colors)

['Agouti',
 'Apricot',
 'Apricot/Brown',
 'Apricot/Tricolor',
 'Apricot/White',
 'Black',
 'Black Brindle',
 'Black Brindle/Black',
 'Black Brindle/Blue',
 'Black Brindle/Blue Tick',
 'Black Brindle/Brown',
 'Black Brindle/Brown Brindle',
 'Black Brindle/Tan',
 'Black Brindle/White',
 'Black Smoke',
 'Black Smoke/Black',
 'Black Smoke/Blue Tick',
 'Black Smoke/Brown',
 'Black Smoke/Gray',
 'Black Smoke/White',
 'Black Tiger',
 'Black Tiger/White',
 'Black/Black',
 'Black/Black Brindle',
 'Black/Black Smoke',
 'Black/Blue',
 'Black/Blue Merle',
 'Black/Blue Tick',
 'Black/Brown',
 'Black/Brown Brindle',
 'Black/Brown Merle',
 'Black/Buff',
 'Black/Chocolate',
 'Black/Cream',
 'Black/Fawn',
 'Black/Gold',
 'Black/Gray',
 'Black/Orange',
 'Black/Red',
 'Black/Silver',
 'Black/Tan',
 'Black/Tricolor',
 'Black/White',
 'Black/Yellow',
 'Black/Yellow Brindle',
 'Blue',
 'Blue Cream',
 'Blue Cream/Blue Tiger',
 'Blue Cream/White',
 'Blue Merle',
 'Blue Merle/Black',
 'Blue Merle/Blue Merle',


In [183]:
texas_shelters['color'][83]

'Brown/Blue Merle'

In [184]:
import re

re.split(' |/', texas_shelters['color'][83])

['Brown', 'Blue', 'Merle']

In [185]:
len(re.split(' |/', texas_shelters['color'][83]))

3

In [186]:
re.split(' |/', texas_shelters['color'][83])[0]

'Brown'

In [187]:
len(re.split(' |/', texas_shelters['color'][0]))

1

In [188]:
len(texas_shelters['color'])

132618

In [189]:
texas_shelters.shape

(132618, 17)

In [190]:
primary_colors = []
secondary_colors = []

for i in range(len(texas_shelters['color'])):
    if str(texas_shelters['color'][i])!='nan':
        color_list = re.split(' |/', texas_shelters['color'][i])
        print(i, color_list)
        if len(color_list)==1:
            primary_colors.append(color_list[0])
            secondary_colors.append(color_list[0])
        else:
            primary_colors.append(color_list[0])
            secondary_colors.append(color_list[1])
    else:
        primary_colors.append(np.nan)
        secondary_colors.append(np.nan)
        print(i, 'nan')
    

0 ['Tricolor']
1 ['Brown', 'White']
2 ['Black', 'White']
3 ['Black', 'Gray']
4 ['Brown', 'White']
5 ['White', 'Black']
6 ['Brown', 'White']
7 ['Black', 'Tan']
8 ['Black', 'White']
9 ['Tan', 'Black']
10 ['Tan', 'Black']
11 ['Black']
12 ['Gold', 'Gold']
13 ['Gold', 'Gold']
14 ['Brown', 'White']
15 ['Brown', 'White']
16 ['Sable', 'White']
17 ['Blue', 'Merle', 'Tan']
18 ['Blue', 'Merle']
19 ['Brown', 'Black']
20 ['Black', 'Tricolor']
21 ['Gray', 'White']
22 ['Black', 'Brown']
23 ['Tricolor']
24 ['Cream']
25 ['Tan']
26 ['Red']
27 ['Tan']
28 ['Yellow']
29 ['Brown', 'Merle']
30 ['White']
31 ['White', 'Brown']
32 ['Tan']
33 ['Tricolor']
34 ['Tricolor']
35 ['Black']
36 ['Cream']
37 ['Tan', 'Red']
38 ['Black', 'Brown', 'Brindle']
39 ['Brown', 'Tan']
40 ['Tan']
41 ['White', 'Black']
42 ['Brown', 'Tan']
43 ['Tricolor']
44 ['Black', 'Brown']
45 ['Brown', 'Buff']
46 ['Red', 'Merle']
47 ['Black', 'White']
48 ['Tan']
49 ['Brown']
50 ['Black', 'Tan']
51 ['White', 'Brown']
52 ['Brown', 'Cream']
53 ['Bla

In [191]:
texas_shelters['primary_color'] = primary_colors

In [192]:
texas_shelters['secondary_color'] = secondary_colors

In [193]:
texas_shelters.head()

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,age,primary_color,secondary_color
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),surrender,,wnl,1396884000.0,10.0,0,transfer,Senior,Tricolor,Tricolor
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),public_assist,,med_attn,1384603000.0,16.0,0,owner_return,Senior,Brown,White
2,A141142,Labrador Retriever/Pit Bull,Black/White,896659200.0,0,1384613000.0,15.0,0,Austin (TX),stray,,aged,1384688000.0,15.0,0,owner_return,Senior,Black,White
3,A163459,Miniature Schnauzer Mix,Black/Gray,940291200.0,0,1415978000.0,15.0,1,Ih 35 And 41St St in Austin (TX),stray,,wnl,1415993000.0,15.0,1,owner_return,Senior,Black,Gray
4,A165752,Lhasa Apso Mix,Brown/White,934934400.0,1,1410780000.0,15.0,0,Gatlin Gun Rd And Brodie in Austin (TX),stray,,wnl,1410799000.0,15.0,0,owner_return,Senior,Brown,White


In [194]:
texas_shelters['primary_color']

0         Tricolor
1            Brown
2            Black
3            Black
4            Brown
            ...   
132613         NaN
132614         NaN
132615         NaN
132616         NaN
132617         NaN
Name: primary_color, Length: 132618, dtype: object

In [195]:
texas_shelters['secondary_color']

0         Tricolor
1            White
2            White
3             Gray
4            White
            ...   
132613         NaN
132614         NaN
132615         NaN
132616         NaN
132617         NaN
Name: secondary_color, Length: 132618, dtype: object

In [196]:
texas_shelters.to_csv('./datasets/cleaned_data/texas_shelters.csv', index=False)

## Lowercase color columns

In [197]:
texas_shelters.head(2)

Unnamed: 0,id,breed,color,dob,sex,date_in,age_in,intact_in,location,intake_type,intake_subtype,condition,date_out,age_out,intact_out,outcome,age,primary_color,secondary_color
0,A047759,Dachshund,Tricolor,1080864000.0,1,1396454000.0,10.0,0,Austin (TX),surrender,,wnl,1396884000.0,10.0,0,transfer,Senior,Tricolor,Tricolor
1,A134067,Shetland Sheepdog,Brown/White,876960000.0,1,1384593000.0,16.0,0,12034 Research Blvd in Austin (TX),public_assist,,med_attn,1384603000.0,16.0,0,owner_return,Senior,Brown,White


In [198]:
texas_shelters['primary_color'][5].lower()

'white'

In [199]:
for i in range(len(texas_shelters['primary_color'])):
    if str(texas_shelters['primary_color'][i])!='nan':
        texas_shelters['primary_color'][i] = texas_shelters['primary_color'][i].lower()
    if str(texas_shelters['secondary_color'][i])!='nan':
        texas_shelters['secondary_color'][i] = texas_shelters['secondary_color'][i].lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['primary_color'][i] = texas_shelters['primary_color'][i].lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  texas_shelters['secondary_color'][i] = texas_shelters['secondary_color'][i].lower()


In [200]:
texas_shelters['primary_color'].unique()

array(['tricolor', 'brown', 'black', 'white', 'tan', 'gold', 'sable',
       'blue', 'gray', 'cream', 'red', 'yellow', 'chocolate', 'apricot',
       'buff', 'fawn', 'silver', 'liver', 'orange', 'ruddy', 'agouti',
       'calico', nan], dtype=object)

In [201]:
texas_shelters['secondary_color'].unique()

array(['tricolor', 'white', 'gray', 'black', 'tan', 'gold', 'merle',
       'brown', 'cream', 'red', 'yellow', 'buff', 'brindle', 'blue',
       'chocolate', 'liver', 'tick', 'apricot', 'sable', 'silver',
       'tiger', 'fawn', 'smoke', 'orange', 'pink', 'agouti', 'tortie',
       'calico', nan], dtype=object)

# Export changes to cvs

In [202]:
texas_shelters.to_csv('./datasets/cleaned_data/texas_shelters.csv', index=False)