In [321]:
import numpy as np
import pandas as pd  
import seaborn as sns
from matplotlib import pyplot as plt

In [322]:
data = pd.read_csv('aac_shelter_outcomes.csv')
data.head(5)

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown


In [323]:
data.shape

(78256, 12)

In [324]:
# нам нужно предсказать метки 'Adoption' и 'Transfer' в столбце outcome_type. Поэтому оставим только их 
data = data[data.outcome_type.isin(['Adoption', 'Transfer'])]

In [325]:
data.shape

(56611, 12)

!!!! _profiling_____________________________________________

наблюдения по профайлингу:
1. в категориальном столбце name 31% пропусков. Не думаю что имя влияет на судьбу животного. Удалим столбец. 
2. в категириальном столбце outcome_subtype почти 50% пропусков. Это важный признак, но мы увы придется его тоже удалить. 
3. Переменные color  и  breed имееют огромное количество различных наименований. надо как-то преобразовать. 
4. надо привести столбец age_upon_outcome к однороднову числовому виду. Может быть считать сколько было дней животному, когда оно поступило в приют. 
5. столбец animal_id удаляем. 
6. столбцы datetime и monthyear идентичные
7.  к столбцам animal_type и sex_upon_outcome тоже применим One hot encoding, возможно заранее преобразовав

In [326]:
data = data.drop('animal_id', 1)
data = data.drop('name', 1)
data = data.drop('outcome_subtype', 1)

In [327]:
data.isnull().sum()

age_upon_outcome    2
animal_type         0
breed               0
color               0
date_of_birth       0
datetime            0
monthyear           0
outcome_type        0
sex_upon_outcome    0
dtype: int64

In [328]:
# в столбце age_upon_outcome  есть два пропущенных значения. Заполним их модой.
data = data.apply(lambda x:x.fillna(x.value_counts().index[0]))
data.apply(lambda x: sum(x.isnull()/len(data)))

age_upon_outcome    0.0
animal_type         0.0
breed               0.0
color               0.0
date_of_birth       0.0
datetime            0.0
monthyear           0.0
outcome_type        0.0
sex_upon_outcome    0.0
dtype: float64

In [329]:
#сведем столбц age_upon_outcome к количесвту дней от рождения.
n_of_days = []
for i in data.age_upon_outcome:
    [number, date] = i.split(" ")
    if ('day' in date):
        n_of_days.append(int(number))
    if ('week' in date):
        n_of_days.append((int(number))*7)
    if ('month' in date):
        n_of_days.append((int(number))*30)
    if ('year' in date):
        n_of_days.append((int(number))*365)

In [330]:
n_of_days_df = pd.DataFrame(n_of_days)
n_of_days_df.index = data.index
data['number_of_days_upon_outcome'] = n_of_days_df

In [331]:
data

Unnamed: 0,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,monthyear,outcome_type,sex_upon_outcome,number_of_days_upon_outcome
0,2 weeks,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,Transfer,Intact Male,14
1,1 year,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Transfer,Spayed Female,365
2,1 year,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,Adoption,Neutered Male,365
3,9 years,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Transfer,Neutered Male,3285
5,4 months,Dog,Leonberger Mix,Brown/White,2013-06-03T00:00:00,2013-10-07T13:06:00,2013-10-07T13:06:00,Transfer,Intact Male,120
7,3 years,Dog,Chihuahua Shorthair Mix,Brown,2011-11-23T00:00:00,2014-12-08T15:55:00,2014-12-08T15:55:00,Transfer,Spayed Female,1095
8,1 month,Cat,Domestic Shorthair Mix,Blue Tabby/White,2014-06-16T00:00:00,2014-08-14T18:45:00,2014-08-14T18:45:00,Adoption,Intact Female,30
9,3 months,Cat,Domestic Shorthair Mix,White/Black,2014-03-26T00:00:00,2014-06-29T17:45:00,2014-06-29T17:45:00,Adoption,Spayed Female,90
11,2 years,Dog,Papillon/Border Collie,Black/White,2012-02-28T00:00:00,2014-03-28T14:39:00,2014-03-28T14:39:00,Transfer,Neutered Male,730
12,2 months,Dog,Chihuahua Shorthair/Pomeranian,Black,2014-03-07T00:00:00,2014-05-26T19:10:00,2014-05-26T19:10:00,Adoption,Neutered Male,60


In [332]:
#удалим старую колонку age_upon_outcome
data = data.drop(['age_upon_outcome'], axis=1)

___________________

In [333]:
#следующим шагом преобразуем категориальный признак animal_type
data.animal_type.value_counts()

Dog          29570
Cat          26055
Other          772
Bird           206
Livestock        8
Name: animal_type, dtype: int64

In [334]:
# преобразуем и оставим только 3 котегории: cat, dog, other
data.animal_type.replace(['Bird', 'Livestock'], 'Other', inplace=True)

In [335]:
data.animal_type.value_counts()

Dog      29570
Cat      26055
Other      986
Name: animal_type, dtype: int64

In [336]:
column_names = data.animal_type.unique()

In [337]:
type_animal = data['animal_type'].get_values().reshape(-1, 1)

In [338]:
# One-hot encoding
oh_encoder = preprocessing.OneHotEncoder()
oh_encoder.fit(type_animal)
oh_result = oh_encoder.transform(type_animal).toarray()
oh_result.shape

(56611, 3)

In [339]:
type_animal_df = pd.DataFrame(oh_result, columns = column_names)
type_animal_df.index = data.index
data = pd.concat([data, type_animal_df], axis=1)

In [340]:
#удалим старую колонку animal_type
data = data.drop(['animal_type'], axis=1)

In [341]:
data

Unnamed: 0,breed,color,date_of_birth,datetime,monthyear,outcome_type,sex_upon_outcome,number_of_days_upon_outcome,Cat,Dog,Other
0,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,Transfer,Intact Male,14,1.0,0.0,0.0
1,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Transfer,Spayed Female,365,0.0,1.0,0.0
2,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,Adoption,Neutered Male,365,0.0,1.0,0.0
3,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Transfer,Neutered Male,3285,0.0,1.0,0.0
5,Leonberger Mix,Brown/White,2013-06-03T00:00:00,2013-10-07T13:06:00,2013-10-07T13:06:00,Transfer,Intact Male,120,0.0,1.0,0.0
7,Chihuahua Shorthair Mix,Brown,2011-11-23T00:00:00,2014-12-08T15:55:00,2014-12-08T15:55:00,Transfer,Spayed Female,1095,0.0,1.0,0.0
8,Domestic Shorthair Mix,Blue Tabby/White,2014-06-16T00:00:00,2014-08-14T18:45:00,2014-08-14T18:45:00,Adoption,Intact Female,30,1.0,0.0,0.0
9,Domestic Shorthair Mix,White/Black,2014-03-26T00:00:00,2014-06-29T17:45:00,2014-06-29T17:45:00,Adoption,Spayed Female,90,1.0,0.0,0.0
11,Papillon/Border Collie,Black/White,2012-02-28T00:00:00,2014-03-28T14:39:00,2014-03-28T14:39:00,Transfer,Neutered Male,730,0.0,1.0,0.0
12,Chihuahua Shorthair/Pomeranian,Black,2014-03-07T00:00:00,2014-05-26T19:10:00,2014-05-26T19:10:00,Adoption,Neutered Male,60,0.0,1.0,0.0


___________________

In [342]:
#преобразуем категориальную переменную breed
data.breed.value_counts()

Domestic Shorthair Mix                           20809
Pit Bull Mix                                      3509
Chihuahua Shorthair Mix                           3399
Labrador Retriever Mix                            3258
Domestic Medium Hair Mix                          2049
German Shepherd Mix                               1265
Domestic Longhair Mix                             1027
Siamese Mix                                        857
Australian Cattle Dog Mix                          788
Dachshund Mix                                      581
Border Collie Mix                                  459
Boxer Mix                                          437
Miniature Poodle Mix                               392
Catahoula Mix                                      336
Domestic Shorthair                                 323
Rat Terrier Mix                                    310
Australian Shepherd Mix                            301
Jack Russell Terrier Mix                           296
Yorkshire 

In [343]:
# мы видим что некоторые породы являются миксом пород, а некоторые животные чистокровные. Разделим признак по этому прикципу. Mix - 0, чистокровная - 1
data.breed = data.breed.apply(lambda x : 0 if 'Mix' in x else 1)

In [344]:
data

Unnamed: 0,breed,color,date_of_birth,datetime,monthyear,outcome_type,sex_upon_outcome,number_of_days_upon_outcome,Cat,Dog,Other
0,0,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,Transfer,Intact Male,14,1.0,0.0,0.0
1,0,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Transfer,Spayed Female,365,0.0,1.0,0.0
2,1,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,Adoption,Neutered Male,365,0.0,1.0,0.0
3,0,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Transfer,Neutered Male,3285,0.0,1.0,0.0
5,0,Brown/White,2013-06-03T00:00:00,2013-10-07T13:06:00,2013-10-07T13:06:00,Transfer,Intact Male,120,0.0,1.0,0.0
7,0,Brown,2011-11-23T00:00:00,2014-12-08T15:55:00,2014-12-08T15:55:00,Transfer,Spayed Female,1095,0.0,1.0,0.0
8,0,Blue Tabby/White,2014-06-16T00:00:00,2014-08-14T18:45:00,2014-08-14T18:45:00,Adoption,Intact Female,30,1.0,0.0,0.0
9,0,White/Black,2014-03-26T00:00:00,2014-06-29T17:45:00,2014-06-29T17:45:00,Adoption,Spayed Female,90,1.0,0.0,0.0
11,1,Black/White,2012-02-28T00:00:00,2014-03-28T14:39:00,2014-03-28T14:39:00,Transfer,Neutered Male,730,0.0,1.0,0.0
12,1,Black,2014-03-07T00:00:00,2014-05-26T19:10:00,2014-05-26T19:10:00,Adoption,Neutered Male,60,0.0,1.0,0.0


____________________

In [345]:
# преобразуем переменную color
animal_color = data.color.value_counts()
animal_color

Black/White                 6111
Black                       5141
Brown Tabby                 3975
Brown Tabby/White           2083
Orange Tabby                1914
White                       1846
Tan/White                   1640
Brown/White                 1600
White/Black                 1551
Blue/White                  1467
Tricolor                    1404
Tan                         1366
Brown                       1289
Black/Tan                   1220
Calico                      1212
Tortie                      1210
Blue                        1099
White/Brown                 1057
Blue Tabby                  1024
Orange Tabby/White           967
Black/Brown                  905
Brown Brindle/White          898
White/Tan                    807
Torbie                       764
Brown/Black                  684
Red                          593
Red/White                    549
Brown Brindle                494
Blue Tabby/White             489
Cream Tabby                  458
          

 в этой категориальной переменной слишком много признаков. Можно заметить, что некоторые животные однотонные, а некоторые имееют разноцветный окрас. Разделим колонку по этому признаку, присвоив значения 1 и 0

In [346]:
data['multicolor'] = data['color'].apply(lambda x : 1 if '/' in x else 0)

In [347]:
data = data.drop(['color'], axis=1)

In [348]:
data.head()

Unnamed: 0,breed,date_of_birth,datetime,monthyear,outcome_type,sex_upon_outcome,number_of_days_upon_outcome,Cat,Dog,Other,multicolor
0,0,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,Transfer,Intact Male,14,1.0,0.0,0.0,0
1,0,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Transfer,Spayed Female,365,0.0,1.0,0.0,1
2,1,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,Adoption,Neutered Male,365,0.0,1.0,0.0,1
3,0,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Transfer,Neutered Male,3285,0.0,1.0,0.0,0
5,0,2013-06-03T00:00:00,2013-10-07T13:06:00,2013-10-07T13:06:00,Transfer,Intact Male,120,0.0,1.0,0.0,1


_____________________________

In [349]:
# избавимся от переменных времени.
data = data.drop(['date_of_birth'], axis=1) 
# мы и так представляем сколько лет животному, потому что значем сколько ему было лет на момент попадания в приют


In [350]:
data = data.drop(['monthyear'], axis=1)

In [351]:
data = data.drop(['datetime'], axis=1)

__________________________

In [352]:
# преобразуем признак sex_upon_outcome
data.sex_upon_outcome.value_counts()

Neutered Male    20732
Spayed Female    19949
Intact Female     6874
Intact Male       6295
Unknown           2761
Name: sex_upon_outcome, dtype: int64

разобьем этот признак на 2. род и стерилизованное животное или нет

судя по терменологии Neutered=Spayed
я бы очень хотела значнеия unknown разбить 50 на 50. половина Intact, половина Neutered. Половина Male, половина Female.  Но не понимаю как это сделать. Буда рада советам. А пока, варварски присвою всем значениям Unknown значение Spayed Female

In [353]:
data.sex_upon_outcome.replace('Unknown', 'Spayed Female')

0          Intact Male
1        Spayed Female
2        Neutered Male
3        Neutered Male
5          Intact Male
7        Spayed Female
8        Intact Female
9        Spayed Female
11       Neutered Male
12       Neutered Male
15       Neutered Male
16       Neutered Male
17         Intact Male
18       Spayed Female
19       Spayed Female
20       Spayed Female
21       Neutered Male
22       Spayed Female
23       Neutered Male
24       Neutered Male
25       Spayed Female
26       Intact Female
27       Spayed Female
28       Intact Female
29       Intact Female
30       Spayed Female
31       Spayed Female
32       Spayed Female
33       Spayed Female
34       Neutered Male
             ...      
78214    Spayed Female
78215    Neutered Male
78217    Spayed Female
78218    Spayed Female
78223    Spayed Female
78224    Neutered Male
78225    Neutered Male
78227    Neutered Male
78228    Spayed Female
78229    Spayed Female
78230    Intact Female
78231    Intact Female
78232      

In [360]:
data

Unnamed: 0,breed,outcome_type,sex_upon_outcome,number_of_days_upon_outcome,Cat,Dog,Other,multicolor
0,0,Transfer,Intact Male,14,1.0,0.0,0.0,0
1,0,Transfer,Spayed Female,365,0.0,1.0,0.0,1
2,1,Adoption,Neutered Male,365,0.0,1.0,0.0,1
3,0,Transfer,Neutered Male,3285,0.0,1.0,0.0,0
5,0,Transfer,Intact Male,120,0.0,1.0,0.0,1
7,0,Transfer,Spayed Female,1095,0.0,1.0,0.0,0
8,0,Adoption,Intact Female,30,1.0,0.0,0.0,1
9,0,Adoption,Spayed Female,90,1.0,0.0,0.0,1
11,1,Transfer,Neutered Male,730,0.0,1.0,0.0,1
12,1,Adoption,Neutered Male,60,0.0,1.0,0.0,0


In [362]:
a = data['sex_upon_outcome'].str. split(" ")
a

0          [Intact, Male]
1        [Spayed, Female]
2        [Neutered, Male]
3        [Neutered, Male]
5          [Intact, Male]
7        [Spayed, Female]
8        [Intact, Female]
9        [Spayed, Female]
11       [Neutered, Male]
12       [Neutered, Male]
15       [Neutered, Male]
16       [Neutered, Male]
17         [Intact, Male]
18       [Spayed, Female]
19       [Spayed, Female]
20              [Unknown]
21       [Neutered, Male]
22       [Spayed, Female]
23       [Neutered, Male]
24       [Neutered, Male]
25       [Spayed, Female]
26       [Intact, Female]
27       [Spayed, Female]
28       [Intact, Female]
29       [Intact, Female]
30              [Unknown]
31       [Spayed, Female]
32       [Spayed, Female]
33       [Spayed, Female]
34       [Neutered, Male]
               ...       
78214    [Spayed, Female]
78215    [Neutered, Male]
78217    [Spayed, Female]
78218    [Spayed, Female]
78223    [Spayed, Female]
78224    [Neutered, Male]
78225    [Neutered, Male]
78227    [Ne

In [367]:
data['sex'] = data['sex_upon_outcome'].apply(lambda x : 1 if 'Male' in x else 0)
data['intact'] = data['sex_upon_outcome'].apply(lambda x : 1 if 'Intact' in x else 0)

In [369]:
data = data.drop(['sex_upon_outcome'], axis=1)

In [370]:
data

Unnamed: 0,breed,outcome_type,number_of_days_upon_outcome,Cat,Dog,Other,multicolor,sex,intact
0,0,Transfer,14,1.0,0.0,0.0,0,1,1
1,0,Transfer,365,0.0,1.0,0.0,1,0,0
2,1,Adoption,365,0.0,1.0,0.0,1,1,0
3,0,Transfer,3285,0.0,1.0,0.0,0,1,0
5,0,Transfer,120,0.0,1.0,0.0,1,1,1
7,0,Transfer,1095,0.0,1.0,0.0,0,0,0
8,0,Adoption,30,1.0,0.0,0.0,1,0,1
9,0,Adoption,90,1.0,0.0,0.0,1,0,0
11,1,Transfer,730,0.0,1.0,0.0,1,1,0
12,1,Adoption,60,0.0,1.0,0.0,0,1,0


In [371]:
# дата фрейм готов. Используем классификатор "случайный лес"

In [372]:
X = data.drop('outcome_type', axis=1)
y = data['outcome_type']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [373]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                            random_state=0)
rf_model = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test,y_pred))
y_prob = rf_model.predict_proba(X_test)
print(log_loss(y_test, y_prob))

0.800582884394595
0.5389940216502876
