In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

intakes_file_path = os.path.join("center_data", "Austin_Animal_Center_Intakes.csv")

dog_intakes_df = pd.read_csv(intakes_file_path)\
    .rename(columns = lambda df: df.lower().replace(" ", "_"))\
    .drop("monthyear", axis=1)

dog_intakes_df = dog_intakes_df[dog_intakes_df["animal_type"] == 'Dog']\
    .rename(columns = {
    "intake_type":"type",
    "sex_upon_intake":"sex",
    "age_upon_intake":"age"
    })\
    .assign(is_intake=True)\
    .assign(datetime = lambda df: pd.to_datetime(df["datetime"]))\
    .reset_index(drop=True)

# dogs_intakes_df.to_csv('dog_intakes.csv')
dog_intakes_df.head(5)

Unnamed: 0,animal_id,name,datetime,found_location,type,intake_condition,animal_type,sex,age,breed,color,is_intake
0,A786884,*Brock,2019-01-03 16:19:00,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,True
1,A706918,Belle,2015-07-05 12:59:00,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,True
2,A724273,Runster,2016-04-14 18:43:00,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,True
3,A682524,Rio,2014-06-29 10:38:00,800 Grove Blvd in Austin (TX),Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,True
4,A743852,Odin,2017-02-18 12:46:00,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,2 years,Labrador Retriever Mix,Chocolate,True


In [2]:
outcomes_file_path = os.path.join("center_data", "Austin_Animal_Center_Outcomes.csv")

dog_outcomes_df = pd.read_csv(outcomes_file_path)\
    .rename(columns = lambda df: df.lower().replace(" ", "_"))\
    .drop("monthyear", axis=1)

dog_outcomes_df = dog_outcomes_df[
    (dog_outcomes_df["animal_type"] == 'Dog') & (dog_outcomes_df['outcome_type'] == 'Adoption')]\
    .rename(columns = {
        "outcome_type":"type",
        "sex_upon_outcome":"sex",
        "age_upon_outcome":"age"
        })\
    .assign(is_intake=False)\
    .assign(datetime = lambda df: pd.to_datetime(df["datetime"]))\
    .reset_index(drop=True)    

# dogs_intakes_df.to_csv('dog_intakes.csv')
dog_outcomes_df.head(5)

Unnamed: 0,animal_id,name,datetime,date_of_birth,type,outcome_subtype,animal_type,sex,age,breed,color,is_intake
0,A789027,Lennie,2019-02-17 11:44:00,02/13/2017,Adoption,,Dog,Neutered Male,2 years,Chihuahua Shorthair Mix,Cream,False
1,A720371,Moose,2016-02-13 17:59:00,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff,False
2,A789027,Lennie,2019-03-10 12:25:00,02/13/2017,Adoption,,Dog,Neutered Male,2 years,Chihuahua Shorthair Mix,Cream,False
3,A794494,Zoey,2019-05-14 16:27:00,06/14/2018,Adoption,,Dog,Spayed Female,10 months,Wire Hair Fox Terrier Mix,White/Black,False
4,A764361,Astro,2018-01-03 19:15:00,12/28/2016,Adoption,,Dog,Neutered Male,1 year,Norwich Terrier Mix,Black/Tan,False


In [3]:
dog_concat_df = pd\
    .concat([dog_intakes_df, dog_outcomes_df], sort=False)\
    .sort_values(by=["animal_id", "datetime"])\
     [lambda df: df.duplicated(["animal_id"],keep=False)]\
    .fillna("N/A")
# dog_concat_df[dog_concat_df["animal_id"] == "A047759"]

dog_concat_df.head()

Unnamed: 0,animal_id,name,datetime,found_location,type,intake_condition,animal_type,sex,age,breed,color,is_intake,date_of_birth,outcome_subtype
39209,A006100,Scamp,2014-03-07 14:26:00,8700 Research in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Spinone Italiano Mix,Yellow/White,True,,
1859,A006100,Scamp,2014-12-19 10:21:00,8700 Research Blvd in Austin (TX),Public Assist,Normal,Dog,Neutered Male,7 years,Spinone Italiano Mix,Yellow/White,True,,
8663,A006100,Scamp,2017-12-07 14:07:00,Colony Creek And Hunters Trace in Austin (TX),Stray,Normal,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,True,,
45817,A200922,Carlos,2013-10-03 15:47:00,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,16 years,Dachshund Mix,Black/Tan,True,,
5096,A200922,Carlos,2013-11-22 09:44:00,,Adoption,,Dog,Neutered Male,16 years,Dachshund Mix,Black/Tan,False,10/03/1997,Foster


In [4]:
describe_df = dog_concat_df.describe()
describe_df

Unnamed: 0,animal_id,name,datetime,found_location,type,intake_condition,animal_type,sex,age,breed,color,is_intake,date_of_birth,outcome_subtype
count,68444,68444.0,68444,68444.0,68444,68444,68444,68444,68444,68444,68444,68444,68444.0,68444.0
unique,28509,8561.0,60807,20320.0,5,11,1,6,47,1696,294,2,4677.0,3.0
top,A721033,,2014-02-19 13:51:00,,Adoption,Normal,Dog,Neutered Male,1 year,Pit Bull Mix,Black/White,True,,
freq,33,10827.0,47,29906.0,29906,36504,68444,23004,14926,8118,8605,38538,38538.0,64500.0
first,,,2013-10-01 11:01:00,,,,,,,,,,,
last,,,2020-01-07 17:18:00,,,,,,,,,,,


In [5]:
# dog_concat_df.groupby(dog_concat_df["animal_id"], as_index=False).count()
num_records_df = dog_concat_df["animal_id"]\
    .value_counts()\
    .to_frame()\
    .reset_index()\
    .rename(columns = {
        "index":"animal_id",
        "animal_id":"num_records"
    })

num_records_df

Unnamed: 0,animal_id,num_records
0,A721033,33
1,A754989,16
2,A718223,14
3,A770009,14
4,A737854,13
...,...,...
28504,A761987,2
28505,A806576,2
28506,A798391,2
28507,A799442,2


In [6]:
def qualified(num_intakes, num_outcomes):
    is_qualified = False
    
    if num_intakes == num_outcomes or num_intakes-1 == num_outcomes:
        is_qualified=True
    
    return is_qualified


#     elif is_odd(num_records):
#         if num_records//2+1 == num_intakes:
#             is_qualified = True
#     else:
#         if num_records//2 == num_intakes:
#             is_qualified = True    
    

In [7]:
dog_intake_count_df = dog_concat_df[["animal_id", "is_intake"]]\
    .groupby(by = ["animal_id"], as_index=False).count()

# intake_count_df = dog_qualified_df[dog_qualified_df["is_intake"] == True]["is_intake"]\


dog_intake_count_df
# intake_count_df

Unnamed: 0,animal_id,is_intake
0,A006100,3
1,A200922,2
2,A210457,2
3,A226069,2
4,A245945,2
...,...,...
28504,A811450,2
28505,A811507,2
28506,A811509,2
28507,A811534,2


In [8]:
intakes_agg = dog_concat_df[dog_concat_df["is_intake"]].groupby("animal_id")["animal_id"].count()
outcomes_agg = dog_concat_df[~dog_concat_df["is_intake"]].groupby("animal_id")["animal_id"].count()

In [9]:
validation_agg = pd.merge(
    intakes_agg.to_frame().rename(columns = {"animal_id":"intake_count"}),
    outcomes_agg.to_frame().rename(columns = {"animal_id":"outcome_count"}),
    how="outer",
    left_index=True,
    right_index=True
).dropna().reset_index()

# validation_agg

validation_agg["valid"] = np.vectorize(qualified)(validation_agg["intake_count"], validation_agg["outcome_count"])
matched_stays_df = validation_agg[validation_agg["valid"]]["animal_id"]
matched_stays_df

0        A200922
1        A210457
2        A226069
3        A249087
4        A274546
          ...   
26226    A811450
26227    A811507
26228    A811509
26229    A811534
26230    A811598
Name: animal_id, Length: 25802, dtype: object

In [10]:
working_df = dog_concat_df.loc[dog_concat_df["animal_id"].isin(matched_stays_df.values)].reset_index(drop=True)
# working_df.iloc[51683]["datetime"]-working_df.iloc[51682]["datetime"]
# working_df[working_df["animal_id"] == "A787364"]
working_df.head(5)

Unnamed: 0,animal_id,name,datetime,found_location,type,intake_condition,animal_type,sex,age,breed,color,is_intake,date_of_birth,outcome_subtype
0,A200922,Carlos,2013-10-03 15:47:00,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,16 years,Dachshund Mix,Black/Tan,True,,
1,A200922,Carlos,2013-11-22 09:44:00,,Adoption,,Dog,Neutered Male,16 years,Dachshund Mix,Black/Tan,False,10/03/1997,Foster
2,A210457,Caleb,2016-09-28 12:05:00,4424 S Mopac #412 in Austin (TX),Public Assist,Aged,Dog,Neutered Male,17 years,Chihuahua Shorthair,Tan/Black,True,,
3,A210457,Caleb,2016-10-07 12:34:00,,Adoption,,Dog,Neutered Male,17 years,Chihuahua Shorthair,Tan/Black,False,06/01/1999,Foster
4,A226069,Cedar,2015-10-06 12:29:00,Wheless Ln And Berkman Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,15 years,Labrador Retriever/Beagle,Sable/White,True,,


In [11]:
# avg_stay_df = pd.DataFrame(columns =["animal_id", "stay_length"])

# for index, animal_id in working_df.iterrows():
#     stay = 0
#     if working_df.iloc[index]["animal_id"] == working_df.iloc[index+1]["animal_id"]:
#         stay = animal_id.loc["datetime"] - animal_id.loc["datetime"]       
#     else:
#         pass
#     avg_stay_df

# #     print("index: " + str(index))
# #     print("animal_id: " + str(animal_id))

# avg_stay_df

# index = 0
# if working_df.iloc[index]["animal_id"] == working_df.iloc[index+1]["animal_id"]:
#     print("a ok")

In [12]:
is_pair=False
x=len(working_df)-1
print(x)
stay_len_list=[]
for i in range(len(working_df)-1) :
    x_date=(working_df.loc[i, "datetime"]).normalize()
    x2_date=(working_df.loc[i+1, "datetime"]).normalize()
    this_id=working_df.loc[i, "animal_id"]
    if working_df.loc[i, "animal_id"]==working_df.loc[i+1, "animal_id"]:
        if x2_date.date()>x_date.date():
            stay=x2_date-x_date
            stay_len_list.append(stay)
        else:
            pass
    else:
        pass
print(len(stay_len_list))
print(stay_len_list)

60890
34603
[Timedelta('50 days 00:00:00'), Timedelta('9 days 00:00:00'), Timedelta('23 days 00:00:00'), Timedelta('90 days 00:00:00'), Timedelta('3 days 00:00:00'), Timedelta('147 days 00:00:00'), Timedelta('148 days 00:00:00'), Timedelta('1014 days 00:00:00'), Timedelta('28 days 00:00:00'), Timedelta('90 days 00:00:00'), Timedelta('27 days 00:00:00'), Timedelta('58 days 00:00:00'), Timedelta('39 days 00:00:00'), Timedelta('7 days 00:00:00'), Timedelta('5 days 00:00:00'), Timedelta('2 days 00:00:00'), Timedelta('28 days 00:00:00'), Timedelta('9 days 00:00:00'), Timedelta('10 days 00:00:00'), Timedelta('349 days 00:00:00'), Timedelta('2 days 00:00:00'), Timedelta('16 days 00:00:00'), Timedelta('19 days 00:00:00'), Timedelta('38 days 00:00:00'), Timedelta('6 days 00:00:00'), Timedelta('88 days 00:00:00'), Timedelta('16 days 00:00:00'), Timedelta('202 days 00:00:00'), Timedelta('4 days 00:00:00'), Timedelta('7 days 00:00:00'), Timedelta('123 days 00:00:00'), Timedelta('11 days 00:00:00')

In [13]:
# Color

def split_colors(color,*,primary=True):
    split_char = "/"
    
    color_list = color.split(split_char) if color.find(split_char) >=0 else color

    color_output = None
    if primary:
        color_output = color_list[0] if type(color_list) == list else color_list
    else:
        color_output = color_list[1] if type(color_list) == list else None
    
    return color_output

def convert_color(color, conversion_dict):
    converted_color = None
    for key, value in conversion_dict.items():
        if not color == None:
            if color in value:
                converted_color = key
        else:
            converted_color = None
    return converted_color


def total_color_set(color_series):
    
    primary_secondary = [
        split_colors(color)
        for color 
        in color_series.unique()
    ]

    primary = {pair[0] if type(pair) == list else pair for pair in primary_secondary}
    secondary = {pair[1] for pair in primary_secondary if type(pair) == list}

    return primary|secondary
    
total_colors = total_color_set(working_df["color"])

color_conversion_dict = {
    "Black":[color for color in total_colors if color.startswith("Black")],
    "Brown":[color for color in total_colors if color.startswith("Brown")],
    "Blue":[color for color in total_colors if color.startswith("Blue")],
    "Red":[color for color in total_colors if color.startswith("Red")],
    "Yellow":[color for color in total_colors if color.startswith("Yellow")],
    "Gray":["Gray", "Silver"],
    "Tan":["Tan", "Buff", "Orange", "Gold", "Fawn", "Apricot"],
    "White":["White", "Cream"],
    "Tricolor":["Tricolor", "Calico", "Sable"],
    "Pink": "Pink"
}
color_conversion_dict["Red"].append("Ruddy")
color_conversion_dict["Brown"].extend(["Agouti", "Chocolate", "Liver", "Liver Tick"])
color_conversion_dict

{'Black': ['Black', 'Black Tiger', 'Black Brindle', 'Black Smoke'],
 'Brown': ['Brown Merle',
  'Brown',
  'Brown Tiger',
  'Brown Brindle',
  'Agouti',
  'Chocolate',
  'Liver',
  'Liver Tick'],
 'Blue': ['Blue Smoke',
  'Blue Merle',
  'Blue',
  'Blue Cream',
  'Blue Tiger',
  'Blue Tick'],
 'Red': ['Red Tick', 'Red Merle', 'Red', 'Ruddy'],
 'Yellow': ['Yellow', 'Yellow Brindle'],
 'Gray': ['Gray', 'Silver'],
 'Tan': ['Tan', 'Buff', 'Orange', 'Gold', 'Fawn', 'Apricot'],
 'White': ['White', 'Cream'],
 'Tricolor': ['Tricolor', 'Calico', 'Sable'],
 'Pink': 'Pink'}

In [14]:
working_df["color_primary"] = working_df["color"]\
    .apply(split_colors)\
    .apply(lambda color: convert_color(color, color_conversion_dict))

working_df["color_secondary"] = working_df["color"]\
    .apply(lambda color: split_colors(color, primary=False))\
    .apply(lambda color: convert_color(color, color_conversion_dict))

working_df = working_df.drop("color", axis=1)
# Do not run this cell again or else drop 

In [20]:
working_df.loc[working_df['breed'].str.contains('Labrador Retriever/'), 'breed'] = 'Labrador Retriever Mix'
working_df.loc[working_df['breed'].str.contains('Chihuahua Shorthair/'), 'breed'] = 'Chihuahua Shorthair Mix'
working_df.loc[working_df['breed'].str.contains('Pit Bull/'), 'breed'] = 'Pit Bull Mix'
working_df.loc[working_df['breed'].str.contains('German Shepherd/'), 'breed'] = 'German Shepherd Mix'
working_df.loc[working_df['breed'].str.contains('Australian Cattle Dog/'), 'breed'] = 'Australian Cattle Dog Mix'
working_df.loc[working_df['breed'].str.contains('Dachshund/'), 'breed'] = 'Dachshund Mix'
working_df.loc[working_df['breed'].str.contains('Border Collie/'), 'breed'] = 'Border Collie Mix'
working_df.loc[working_df['breed'].str.contains('Boxer/'), 'breed'] = 'Boxer Mix'
working_df.loc[working_df['breed'].str.contains('Catahoula/'), 'breed'] = 'Catahoula Mix'
working_df.loc[working_df['breed'].str.contains('Miniature Poodle/'), 'breed'] = 'Miniature Poodle Mix'
working_df.loc[working_df['breed'].str.contains('Australian Shepherd/'), 'breed'] = 'Australian Shepherd Mix'
working_df.loc[working_df['breed'].str.contains('Pointer/'), 'breed'] = 'Pointer Mix'
working_df.loc[working_df['breed'].str.contains('Jack Russell Terrier/'), 'breed'] = 'Jack Russell Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Beagle/'), 'breed'] = 'Beagle Mix'
working_df.loc[working_df['breed'].str.contains('Yorkshire Terrier/'), 'breed'] = 'Yorkshire Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Rat Terrier/'), 'breed'] = 'Rat Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Cairn Terrier/'), 'breed'] = 'Cairn Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Staffordshire/'), 'breed'] = 'Staffordshire Mix'
working_df.loc[working_df['breed'].str.contains('Great Pyrenees/'), 'breed'] = 'Great Pyrenees Mix'
working_df.loc[working_df['breed'].str.contains('Miniature Schnauzer/'), 'breed'] = 'Miniature Schnauzer Mix'
working_df.loc[working_df['breed'].str.contains('Siberian Husky/'), 'breed'] = 'Siberian Husky Mix'
working_df.loc[working_df['breed'].str.contains('Chihuahua Longhair/'), 'breed'] = 'Chihuahua Longhair Mix'
working_df.loc[working_df['breed'].str.contains('Anatol Shepherd/'), 'breed'] = 'Anatol Shepherd Mix'
working_df.loc[working_df['breed'].str.contains('Plott Hound/'), 'breed'] = 'Plott Hound Mix'
working_df.loc[working_df['breed'].str.contains('Black Mouth Cur/'), 'breed'] = 'Black Mouth Cur Mix'
working_df.loc[working_df['breed'].str.contains('Rottweiler/'), 'breed'] = 'Rottweiler Mix'
working_df.loc[working_df['breed'].str.contains('Australian Kelpie/'), 'breed'] = 'Australian Kelpie Mix'
working_df.loc[working_df['breed'].str.contains('American Bulldog/'), 'breed'] = 'American Bulldog Mix'
working_df.loc[working_df['breed'].str.contains('Miniature Pinscher/'), 'breed'] = 'Miniature Pinscher Mix'
working_df.loc[working_df['breed'].str.contains('Border Terrier/'), 'breed'] = 'Border Terrier Mix'
working_df.loc[working_df['breed'].str.contains('American Pit Bull Terrier/'), 'breed'] = 'American Pit Bull Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Cardigan Welsh Corgi/'), 'breed'] = 'Cardigan Welsh Corgi Mix'
working_df.loc[working_df['breed'].str.contains('Golden Retriever/'), 'breed'] = 'Golden Retriever Mix'
working_df.loc[working_df['breed'].str.contains('Maltese/'), 'breed'] = 'Maltese Mix'
working_df.loc[working_df['breed'].str.contains('Blue Lacy/'), 'breed'] = 'Blue Lacy Mix'
working_df.loc[working_df['breed'].str.contains('Queensland Heeler/'), 'breed'] = 'Queensland Heeler Mix'
working_df.loc[working_df['breed'].str.contains('American Staffordshire Terrier/'), 'breed'] = 'American Staffordshire Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Basset Hound/'), 'breed'] = 'Basset Hound Mix'
working_df.loc[working_df['breed'].str.contains('Manchester Terrier/'), 'breed'] = 'Manchester Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Black/Tan Hound/'), 'breed'] = 'Black/Tan Hound Mix'
working_df.loc[working_df['breed'].str.contains('Collie Smooth/'), 'breed'] = 'Collie Smooth Mix'
working_df.loc[working_df['breed'].str.contains('Norfolk Terrier/'), 'breed'] = 'Norfolk Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Dachshund Longhair/'), 'breed'] = 'Dachshund Longhair Mix'
working_df.loc[working_df['breed'].str.contains('Redbone Hound/'), 'breed'] = 'Redbone Hound Mix'
working_df.loc[working_df['breed'].str.contains('Dachshund Wirehair/'), 'breed'] = 'Dachshund Wirehair Mix'
working_df.loc[working_df['breed'].str.contains('Doberman Pinsch/'), 'breed'] = 'Doberman Pinsch Mix'
working_df.loc[working_df['breed'].str.contains('Flat Coat Retriever/'), 'breed'] = 'Flat Coat Retriever Mix'
working_df.loc[working_df['breed'].str.contains('Mastiff/'), 'breed'] = 'Mastiff Mix'
working_df.loc[working_df['breed'].str.contains('Shih Tzu/'), 'breed'] = 'Shih Tzu Mix'
working_df.loc[working_df['breed'].str.contains('Chow Chow/'), 'breed'] = 'Chow Chow Mix'
working_df.loc[working_df['breed'].str.contains('Carolina Dog/'), 'breed'] = 'Carolina Dog Mix'
working_df.loc[working_df['breed'].str.contains('Harrier/'), 'breed'] = 'Harrier Mix'
working_df.loc[working_df['breed'].str.contains('Toy Poodle/'), 'breed'] = 'Toy Poodle Mix'
working_df.loc[working_df['breed'].str.contains('Whippet/'), 'breed'] = 'Whippet Mix'
working_df.loc[working_df['breed'].str.contains('Chinese Sharpei/'), 'breed'] = 'Chinese Sharpei Mix'
working_df.loc[working_df['breed'].str.contains('Alaskan Husky/'), 'breed'] = 'Alaskan Husky Mix'
working_df.loc[working_df['breed'].str.contains('Pembroke Welsh Corgi/'), 'breed'] = 'Pembroke Welsh Corgi Mix'
working_df.loc[working_df['breed'].str.contains('Basenji/'), 'breed'] = 'Basenji Mix'
working_df.loc[working_df['breed'].str.contains('Rhod Ridgeback/'), 'breed'] = 'Rhod Ridgeback Mix'
working_df.loc[working_df['breed'].str.contains('Cocker Spaniel/'), 'breed'] = 'Cocker Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Great Dane/'), 'breed'] = 'Great Dane Mix'
working_df.loc[working_df['breed'].str.contains('Soft Coated Wheaten Terrier/'), 'breed'] = 'Soft Coated Wheaten Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Pomeranian/'), 'breed'] = 'Pomeranian Mix'
working_df.loc[working_df['breed'].str.contains('Bruss Griffon/'), 'breed'] = 'Bruss Griffon Mix'
working_df.loc[working_df['breed'].str.contains('Pug/'), 'breed'] = 'Pug Mix'
working_df.loc[working_df['breed'].str.contains('Norwich Terrier/'), 'breed'] = 'Norwich Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Belgian Malinois/'), 'breed'] = 'Belgian Malinois Mix'
working_df.loc[working_df['breed'].str.contains('Wire Hair Fox Terrier/'), 'breed'] = 'Wire Hair Fox Terrier Mix'
working_df.loc[working_df['breed'].str.contains('English Bulldog/'), 'breed'] = 'English Bulldog Mix'
working_df.loc[working_df['breed'].str.contains('Boston Terrier/'), 'breed'] = 'Boston Terrier Mix'
working_df.loc[working_df['breed'].str.contains('German Shorthair Pointer/'), 'breed'] = 'German Shorthair Pointer Mix'
working_df.loc[working_df['breed'].str.contains('Papillon/'), 'breed'] = 'Papillon Mix'
working_df.loc[working_df['breed'].str.contains('Tibetan Spaniel/'), 'breed'] = 'Tibetan Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Smooth Fox Terrier/'), 'breed'] = 'Smooth Fox Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Dalmatian/'), 'breed'] = 'Dalmatian Mix'
working_df.loc[working_df['breed'].str.contains('Pbgv/'), 'breed'] = 'Pbgv Mix'
working_df.loc[working_df['breed'].str.contains('Bull Terrier/'), 'breed'] = 'Bull Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Vizsla/'), 'breed'] = 'Vizsla Mix'
working_df.loc[working_df['breed'].str.contains('Standard Schnauzer/'), 'breed'] = 'Standard Schnauzer Mix'
working_df.loc[working_df['breed'].str.contains('Weimaraner/'), 'breed'] = 'Weimaraner Mix'
working_df.loc[working_df['breed'].str.contains('Scottish Terrier/'), 'breed'] = 'Scottish Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Dutch Shepherd/'), 'breed'] = 'Dutch Shepherd Mix'
working_df.loc[working_df['breed'].str.contains('Pekingese/'), 'breed'] = 'Pekingese Mix'
working_df.loc[working_df['breed'].str.contains('Akita/'), 'breed'] = 'Akita Mix'
working_df.loc[working_df['breed'].str.contains('Pharaoh Hound/'), 'breed'] = 'Pharaoh Hound Mix'
working_df.loc[working_df['breed'].str.contains('Collie Rough/'), 'breed'] = 'Collie Rough Mix'
working_df.loc[working_df['breed'].str.contains('Bichon Frise/'), 'breed'] = 'Bichon Frise Mix'
working_df.loc[working_df['breed'].str.contains('English Coonhound/'), 'breed'] = 'English Coonhound Mix'
working_df.loc[working_df['breed'].str.contains('Italian Greyhound/'), 'breed'] = 'Italian Greyhound Mix'
working_df.loc[working_df['breed'].str.contains('Shiba Inu/'), 'breed'] = 'Shiba Inu Mix'
working_df.loc[working_df['breed'].str.contains('English Pointer/'), 'breed'] = 'English Pointer Mix'
working_df.loc[working_df['breed'].str.contains('Greyhound/'), 'breed'] = 'Greyhound Mix'
working_df.loc[working_df['breed'].str.contains('Lhasa Apso/'), 'breed'] = 'Lhasa Apso Mix'
working_df.loc[working_df['breed'].str.contains('Havanese/'), 'breed'] = 'Havanese Mix'
working_df.loc[working_df['breed'].str.contains('Shetland Sheepdog/'), 'breed'] = 'Shetland Sheepdog Mix'
working_df.loc[working_df['breed'].str.contains('West Highland/'), 'breed'] = 'West Highland Mix'
working_df.loc[working_df['breed'].str.contains('Bluetick Hound/'), 'breed'] = 'Bluetick Hound Mix'
working_df.loc[working_df['breed'].str.contains('Airedale Terrier/'), 'breed'] = 'Airedale Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Finnish Spitz/'), 'breed'] = 'Finnish Spitz Mix'
working_df.loc[working_df['breed'].str.contains('Chesa Bay Retr/'), 'breed'] = 'Chesa Bay Retr Mix'
working_df.loc[working_df['breed'].str.contains('Schnauzer Giant/'), 'breed'] = 'Schnauzer Giant Mix'
working_df.loc[working_df['breed'].str.contains('Bernese Mountain Dog/'), 'breed'] = 'Bernese Mountain Dog Mix'
working_df.loc[working_df['breed'].str.contains('Australian Terrier/'), 'breed'] = 'Australian Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Alaskan Malamute/'), 'breed'] = 'Alaskan Malamute Mix'
working_df.loc[working_df['breed'].str.contains('Parson Russell Terrier/'), 'breed'] = 'Parson Russell Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Toy Fox Terrier/'), 'breed'] = 'Toy Fox Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Irish Terrier/'), 'breed'] = 'Irish Terrier Mix'
working_df.loc[working_df['breed'].str.contains('American Eskimo/'), 'breed'] = 'American Eskimo Mix'
working_df.loc[working_df['breed'].str.contains('Bulldog/'), 'breed'] = 'Bulldog Mix'
working_df.loc[working_df['breed'].str.contains('Leonberger/'), 'breed'] = 'Leonberger Mix'
working_df.loc[working_df['breed'].str.contains('Swedish Vallhund/'), 'breed'] = 'Swedish Vallhund Mix'
working_df.loc[working_df['breed'].str.contains('Treeing Walker Coonhound/'), 'breed'] = 'Treeing Walker Coonhound Mix'
working_df.loc[working_df['breed'].str.contains('Glen Of Imaal/'), 'breed'] = 'Glen Of Imaal Mix'
working_df.loc[working_df['breed'].str.contains('Bloodhound/'), 'breed'] = 'Bloodhound Mix'
working_df.loc[working_df['breed'].str.contains('Bullmastiff/'), 'breed'] = 'Bullmastiff Mix'
working_df.loc[working_df['breed'].str.contains('Tibetan Terrier/'), 'breed'] = 'Tibetan Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Welsh Terrier/'), 'breed'] = 'Welsh Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Cavalier Span/'), 'breed'] = 'Cavalier Span Mix'
working_df.loc[working_df['breed'].str.contains('St. Bernard Smooth Coat/'), 'breed'] = 'St. Bernard Smooth Coat Mix'
working_df.loc[working_df['breed'].str.contains('Landseer/'), 'breed'] = 'Landseer Mix'
working_df.loc[working_df['breed'].str.contains('Dogo Argentino/'), 'breed'] = 'Dogo Argentino Mix'
working_df.loc[working_df['breed'].str.contains('Patterdale Terr/'), 'breed'] = 'Patterdale Terr Mix'
working_df.loc[working_df['breed'].str.contains('Field Spaniel/'), 'breed'] = 'Field Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Feist/'), 'breed'] = 'Feist Mix'
working_df.loc[working_df['breed'].str.contains('Beauceron/'), 'breed'] = 'Beauceron Mix'
working_df.loc[working_df['breed'].str.contains('Schipperke/'), 'breed'] = 'Schipperke Mix'
working_df.loc[working_df['breed'].str.contains('Dogue De Bordeaux/'), 'breed'] = 'Dogue De Bordeaux Mix'
working_df.loc[working_df['breed'].str.contains('Boykin Span/'), 'breed'] = 'Boykin Span Mix'
working_df.loc[working_df['breed'].str.contains('Standard Poodle/'), 'breed'] = 'Standard Poodle Mix'
working_df.loc[working_df['breed'].str.contains('Canaan Dog/'), 'breed'] = 'Canaan Dog Mix'
working_df.loc[working_df['breed'].str.contains('Silky Terrier/'), 'breed'] = 'Silky Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Presa Canario/'), 'breed'] = 'Presa Canario Mix'
working_df.loc[working_df['breed'].str.contains('Treeing Cur/'), 'breed'] = 'Treeing Cur Mix'
working_df.loc[working_df['breed'].str.contains('Podengo Pequeno/'), 'breed'] = 'Podengo Pequeno Mix'
working_df.loc[working_df['breed'].str.contains('German Wirehaired Pointer/'), 'breed'] = 'German Wirehaired Pointer Mix'
working_df.loc[working_df['breed'].str.contains('Newfoundland/'), 'breed'] = 'Newfoundland Mix'
working_df.loc[working_df['breed'].str.contains('St. Bernard Rough Coat/'), 'breed'] = 'St. Bernard Rough Coat Mix'
working_df.loc[working_df['breed'].str.contains('Old English Bulldog/'), 'breed'] = 'Old English Bulldog Mix'
working_df.loc[working_df['breed'].str.contains('Boerboel/'), 'breed'] = 'Boerboel Mix'
working_df.loc[working_df['breed'].str.contains('Irish Wolfhound/'), 'breed'] = 'Irish Wolfhound Mix'
working_df.loc[working_df['breed'].str.contains('Keeshond/'), 'breed'] = 'Keeshond Mix'
working_df.loc[working_df['breed'].str.contains('Kuvasz/'), 'breed'] = 'Kuvasz Mix'
working_df.loc[working_df['breed'].str.contains('Cane Corso/'), 'breed'] = 'Cane Corso Mix'
working_df.loc[working_df['breed'].str.contains('American Foxhound/'), 'breed'] = 'American Foxhound Mix'
working_df.loc[working_df['breed'].str.contains('English Shepherd/'), 'breed'] = 'English Shepherd Mix'
working_df.loc[working_df['breed'].str.contains('English Foxhound/'), 'breed'] = 'English Foxhound Mix'
working_df.loc[working_df['breed'].str.contains('Chinese Crested/'), 'breed'] = 'Chinese Crested Mix'
working_df.loc[working_df['breed'].str.contains('Picardy Sheepdog/'), 'breed'] = 'Picardy Sheepdog Mix'
working_df.loc[working_df['breed'].str.contains('Skye Terrier/'), 'breed'] = 'Skye Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Akbash/'), 'breed'] = 'Akbash Mix'
working_df.loc[working_df['breed'].str.contains('Wirehaired Pointing Griffon/'), 'breed'] = 'Wirehaired Pointing Griffon Mix'
working_df.loc[working_df['breed'].str.contains('Affenpinscher/'), 'breed'] = 'Affenpinscher Mix'
working_df.loc[working_df['breed'].str.contains('Mexican Hairless/'), 'breed'] = 'Mexican Hairless Mix'
working_df.loc[working_df['breed'].str.contains('Bearded Collie/'), 'breed'] = 'Bearded Collie Mix'
working_df.loc[working_df['breed'].str.contains('Greater Swiss Mountain Dog/'), 'breed'] = 'Greater Swiss Mountain Dog Mix'
working_df.loc[working_df['breed'].str.contains('French Bulldog/'), 'breed'] = 'French Bulldog Mix'
working_df.loc[working_df['breed'].str.contains('Nova Scotia Duck Tolling Retriever/'), 'breed'] = 'Nova Scotia Duck Tolling Retriever Mix'
working_df.loc[working_df['breed'].str.contains('Neapolitan Mastiff/'), 'breed'] = 'Neapolitan Mastiff Mix'
working_df.loc[working_df['breed'].str.contains('German Pinscher/'), 'breed'] = 'German Pinscher Mix'
working_df.loc[working_df['breed'].str.contains('English Cocker Spaniel/'), 'breed'] = 'English Cocker Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Treeing Tennesse Brindle/'), 'breed'] = 'Treeing Tennesse Brindle Mix'
working_df.loc[working_df['breed'].str.contains('Ibizan Hound/'), 'breed'] = 'Ibizan Hound Mix'
working_df.loc[working_df['breed'].str.contains('English Setter/'), 'breed'] = 'English Setter Mix'
working_df.loc[working_df['breed'].str.contains('Bull Terrier Miniature/'), 'breed'] = 'Bull Terrier Miniature Mix'
working_df.loc[working_df['breed'].str.contains('Welsh Springer Spaniel/'), 'breed'] = 'Welsh Springer Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Coton De Tulear/'), 'breed'] = 'Coton De Tulear Mix'
working_df.loc[working_df['breed'].str.contains('Brittany/'), 'breed'] = 'Brittany Mix'
working_df.loc[working_df['breed'].str.contains('Dandie Dinmont/'), 'breed'] = 'Dandie Dinmont Mix'
working_df.loc[working_df['breed'].str.contains('Belgian Sheepdog/'), 'breed'] = 'Belgian Sheepdog Mix'
working_df.loc[working_df['breed'].str.contains('Port Water Dog/'), 'breed'] = 'Port Water Dog Mix'
working_df.loc[working_df['breed'].str.contains('Bedlington Terr/'), 'breed'] = 'Bedlington Terr Mix'
working_df.loc[working_df['breed'].str.contains('Gordon Setter/'), 'breed'] = 'Gordon Setter Mix'
working_df.loc[working_df['breed'].str.contains('Eng Toy Spaniel/'), 'breed'] = 'Eng Toy Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Dachshund Stan/'), 'breed'] = 'Dachshund Stan Mix'
working_df.loc[working_df['breed'].str.contains('English Springer Spaniel/'), 'breed'] = 'English Springer Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Clumber Spaniel/'), 'breed'] = 'Clumber Spaniel Mix'
working_df.loc[working_df['breed'].str.contains('Saluki/'), 'breed'] = 'Saluki Mix'
working_df.loc[working_df['breed'].str.contains('Grand Basset Griffon Vendeen/'), 'breed'] = 'Grand Basset Griffon Vendeen Mix'
working_df.loc[working_df['breed'].str.contains('Hovawart/'), 'breed'] = 'Hovawart Mix'
working_df.loc[working_df['breed'].str.contains('Samoyed/'), 'breed'] = 'Samoyed Mix'
working_df.loc[working_df['breed'].str.contains('Jindo/'), 'breed'] = 'Jindo Mix'
working_df.loc[working_df['breed'].str.contains('Lakeland Terrier/'), 'breed'] = 'Lakeland Terrier Mix'
working_df.loc[working_df['breed'].str.contains('Lowchen/'), 'breed'] = 'Lowchen Mix'
working_df.loc[working_df['breed'].str.contains('Norwegian Elkhound/'), 'breed'] = 'Norwegian Elkhound Mix'
working_df.loc[working_df['breed'].str.contains('Otterhound/'), 'breed'] = 'Otterhound Mix'
working_df.loc[working_df['breed'].str.contains('Belgian Tervuren/'), 'breed'] = 'Belgian Tervuren Mix'
working_df.loc[working_df['breed'].str.contains('Sealyham Terr/'), 'breed'] = 'Sealyham Terr Mix'
working_df.loc[working_df['breed'].str.contains('Japanese Chin/'), 'breed'] = 'Japanese Chin Mix'
working_df.loc[working_df['breed'].str.contains('Afghan Hound/'), 'breed'] = 'Afghan Hound Mix'
working_df.loc[working_df['breed'].str.contains('Entlebucher/'), 'breed'] = 'Entlebucher Mix'

In [32]:
mix_df = working_df[working_df['breed'].str.contains('Mix')]\
        .sort_values(by=['animal_id'], ascending=False)
pure_df = working_df[~working_df['breed'].str.contains('Mix')]\
        .sort_values(by=['animal_id'], ascending=False)

In [34]:
#Creates Molly dataframe for testing.
molly = working_df.set_index("animal_id").loc["A754989"]
def find_staymean(df):
#Finds the mean length of stay for a particular animal id if and only if
#the stay has followed by an adoption outcome.
    stay_list=[]
    first_row = df.iloc[0, :]
    if first_row["is_intake"]:
        x_len = len(df)
#The range has to be the length of the dataframe-1 because of the need to add 1 to the current row
#The for loop cannot use iterrows because the result has to refer to the current row and the next one.
        for i in range(0, x_len - 1):
#Sets variables equal to the first and second rows of the dataframe
            x1_row = df.iloc[i, :]
            x2_row = df.iloc[i + 1, :]
#First in pair has to be intake, the next has to be an outcome
            if (x1_row["is_intake"]==True) & (x2_row["is_intake"]==False):
#A pair of rows is valid if they are not both intakes or both outcomes.
                if x1_row["is_intake"] != x2_row["is_intake"]:
#Stay length is the number of days between intake and adoption outcome.
                    stay = (x2_row['datetime']-x1_row["datetime"]).days
                    stay_list.append(stay)
# Calculates and returns average
    average_stay = sum(stay_list)/len(stay_list)
    return average_stay
find_staymean(molly)

7.5

In [35]:
def find_num_stays(df):
#Finds the number of stays for a particular animal id if and only if
#the stay was  followed by an adoption outcome.
    stay_list=[]
    first_row = df.iloc[0, :]
    if first_row["is_intake"]:
        x_len = len(df)
#The range has to be the length of the dataframe-1 because of the need to add 1 to the current row
#The for loop cannot use iterrows because the result has to refer to the current row and the next one.
        for i in range(0, x_len - 1):
#Sets variables equal to the first and second rows of the dataframe
            x1_row = df.iloc[i, :]
            x2_row = df.iloc[i + 1, :]
#First in pair has to be intake, the next has to be an outcome
            if (x1_row["is_intake"]==True) & (x2_row["is_intake"]==False):
#A pair of rows is valid if they are not both intakes or both outcomes.
                if x1_row["is_intake"] != x2_row["is_intake"]:
#Stay length is the number of days between intake and adoption outcome.
                    stay = (x2_row['datetime']-x1_row["datetime"]).days
                    stay_list.append(stay)
# Calculates and returns average
    num_stays=len(stay_list)
    return num_stays
find_num_stays(molly)

8

In [15]:
# # make sure to install these packages before running:
# # pip install pandas
# # pip install sodapy

# import pandas as pd
# from sodapy import Socrata
# import requests

# app_token="3mubvsyw1rybaisquwqluss7zqf6c3nqxu5g4x56odov7ggexx"
# def get_soda_api_data(endpoint, app_token, count=1000, offset=0, return_df=True):
#     params = {'$$app_token': app_token, '$limit': count, '$offset': offset, }
    
#     results = []

#     while True:

#         try:
#             r = requests.get(endpoint, params=params)
#             rcontent = r.json()

#             if rcontent == []:
#                 break

#             results.append(rcontent)
#             offset += count 
#             params['$offset'] = offset

#         except HTTPError as err:

#             if err.response.status_code == '404':
#                 break
#             else:
#                 print(err.response.status_code)
    
#     if return_df:
#         results_df = pd.DataFrame()

#         for i in results:
#             results_df = results_df.append(pd.io.json.json_normalize(i))
        
#         return results_df
    
#     else:
#         return results
    
# endpoint="https://data.austintexas.gov/resource/wter-evkm.json"

# get_soda_api_data(endpoint, app_token)