## PetFinder.my Adoption Prediction

In this competition you will predict the speed at which a pet is adopted, based on the pet's listing on PetFinder. Sometimes a profile represents a group of pets.

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import iplot,init_notebook_mode 
from plotly.tools import make_subplots
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, train_test_split

In [None]:
# Let pandas print up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [None]:
# Initialise plotly's offline notebook mode before any iplot() call below.
init_notebook_mode(connected=True)

In [None]:
# Training listings; this is the only table that carries the AdoptionSpeed label.
train_data  = pd.read_csv("../input/train/train.csv")

In [None]:
# Test listings to predict on (no AdoptionSpeed column is read from here).
test_data = pd.read_csv("../input/test/test.csv")

In [None]:
# Breed lookup table; the cells below read its BreedID, BreedName and Type columns.
breed_labels = pd.read_csv("../input/breed_labels.csv")

In [None]:
# BreedID -> BreedName lookup built by pairing the two columns row by row.
breed_names = dict(zip(breed_labels["BreedID"], breed_labels["BreedName"]))

In [None]:
# BreedID -> Type lookup built by pairing the two columns row by row.
breed_types = dict(zip(breed_labels["BreedID"], breed_labels["Type"]))

In [None]:
# Code 0 is given an explicit "NA" entry in both breed lookups so that
# translating a 0 code does not fail with a KeyError.
for breed_lookup in (breed_names, breed_types):
    breed_lookup[0] = "NA"

In [None]:
# Color lookup table; the next cell reads its ColorID and ColorName columns.
color_labels  = pd.read_csv("../input/color_labels.csv")

In [None]:
# ColorID -> ColorName lookup built by pairing the two columns row by row.
color_names = dict(zip(color_labels["ColorID"], color_labels["ColorName"]))

In [None]:
# Code 0 is given an explicit "NA" entry so translating a 0 color code works.
color_names.update({0: "NA"})

In [None]:
# State lookup table; the next cell reads its StateID and StateName columns.
state_labels = pd.read_csv("../input/state_labels.csv")

In [None]:
# StateID -> StateName lookup built by pairing the two columns row by row.
state_names = dict(zip(state_labels["StateID"], state_labels["StateName"]))

In [None]:
# Display the column names of the training table (notebook cell output).
train_data.columns

### Columns Description
- PetID - Unique hash ID of pet profile
- AdoptionSpeed - Categorical speed of adoption. Lower is faster. This is the value to predict.
- Type - Type of animal (1 = Dog, 2 = Cat)
- Name - Name of pet (Empty if not named)
- Age - Age of pet when listed, in months
- Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
- Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
- Gender - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
- Color1 - Color 1 of pet (Refer to ColorLabels dictionary)
- Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
- Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
- MaturitySize - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
- FurLength - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)
- Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
- Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
- Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
- Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
- Quantity - Number of pets represented in profile
- Fee - Adoption fee (0 = Free)
- State - State location in Malaysia (Refer to StateLabels dictionary)
- RescuerID - Unique hash ID of rescuer
- VideoAmt - Total uploaded videos for this pet
- PhotoAmt - Total uploaded photos for this pet
- Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese.

We will represent the categorical fields with the category name instead of a number, for better readability.

In [None]:
# Swap the numeric breed codes in the training data for breed names.
# Indexing the dict directly means an unknown code raises KeyError.
for breed_column in ("Breed1", "Breed2"):
    train_data[breed_column] = train_data[breed_column].apply(lambda code: breed_names[code])

In [None]:
# Swap the numeric breed codes in the test data for breed names.
# Indexing the dict directly means an unknown code raises KeyError.
for breed_column in ("Breed1", "Breed2"):
    test_data[breed_column] = test_data[breed_column].apply(lambda code: breed_names[code])

In [None]:
# Swap the numeric color codes in the training data for color names.
for color_column in ("Color1", "Color2", "Color3"):
    train_data[color_column] = train_data[color_column].apply(lambda code: color_names[code])

In [None]:
# Swap the numeric color codes in the test data for color names.
for color_column in ("Color1", "Color2", "Color3"):
    test_data[color_column] = test_data[color_column].apply(lambda code: color_names[code])

In [None]:
# Swap the numeric state codes in the training data for state names.
state_codes = train_data["State"]
train_data["State"] = state_codes.apply(lambda code: state_names[code])

In [None]:
# Swap the numeric state codes in the test data for state names.
state_codes = test_data["State"]
test_data["State"] = state_codes.apply(lambda code: state_names[code])

In [None]:
# Type code 1 becomes "Dog"; every other value becomes "Cat".
train_data["Type"] = train_data["Type"].apply(lambda code: "Cat" if code != 1 else "Dog")

In [None]:
# Type code 1 becomes "Dog"; every other value becomes "Cat".
test_data["Type"] = test_data["Type"].apply(lambda code: "Cat" if code != 1 else "Dog")

In [None]:
# Shared code -> label mapping for the three yes/no/unsure columns.
yes_no_dict = {
    1: "Yes",
    2: "No",
    3: "Not Sure",
}
for care_column in ("Vaccinated", "Dewormed", "Sterilized"):
    train_data[care_column] = train_data[care_column].apply(lambda code: yes_no_dict[code])

In [None]:
# Apply the same yes/no/unsure labels to the test data.
for care_column in ("Vaccinated", "Dewormed", "Sterilized"):
    test_data[care_column] = test_data[care_column].apply(lambda code: yes_no_dict[code])

In [None]:
# Gender code -> label; "Mixed" is used for group profiles.
gender_dict = {
    1: "Male",
    2: "Female",
    3: "Mixed",
}
train_data["Gender"] = train_data["Gender"].apply(lambda code: gender_dict[code])

In [None]:
# Apply the gender labels to the test data.
gender_codes = test_data["Gender"]
test_data["Gender"] = gender_codes.apply(lambda code: gender_dict[code])

In [None]:
# Health code -> label.
health_dict = {
    0: "Not Specified",
    1: "Healthy",
    2: "Minor Injury",
    3: "Serious Injury",
}
train_data["Health"] = train_data["Health"].apply(lambda code: health_dict[code])

In [None]:
# Apply the health labels to the test data.
health_codes = test_data["Health"]
test_data["Health"] = health_codes.apply(lambda code: health_dict[code])

In [None]:
# Maturity size code -> label.
size_dict = {
    0: "Not Specified",
    1: "Small",
    2: "Medium",
    3: "Large",
    4: "Extra Large",
}
train_data["MaturitySize"] = train_data["MaturitySize"].apply(lambda code: size_dict[code])

In [None]:
# Apply the maturity size labels to the test data.
size_codes = test_data["MaturitySize"]
test_data["MaturitySize"] = size_codes.apply(lambda code: size_dict[code])

In [None]:
# Fur length code -> label.
fur_dict = {
    0: "Not Specified",
    1: "Short",
    2: "Medium",
    3: "Long",
}
train_data["FurLength"] = train_data["FurLength"].apply(lambda code: fur_dict[code])

In [None]:
# Apply the fur length labels to the test data.
fur_codes = test_data["FurLength"]
test_data["FurLength"] = fur_codes.apply(lambda code: fur_dict[code])

### Unique values for each column

Number of unique values for each column, including numerical ones.

In [None]:
# Per-column count of distinct values in train and test. AdoptionSpeed is
# reported as 0 for test because the label is not looked up there.
column_names = list(train_data.columns)
unique_in_train = [train_data[name].unique().shape[0] for name in column_names]
unique_in_test = [
    test_data[name].unique().shape[0] if name != "AdoptionSpeed" else 0
    for name in column_names
]
pd.DataFrame(
    {
        "Columns Name": column_names,
        "Number of unique values (train)": unique_in_train,
        "Number of unique values (test)": unique_in_test,
    }
)

### Analysis of Individual Columns

It's of paramount importance to see the distribution of values for each column, and we will do that analysis in this section. We will generally use bar charts for categorical features and histograms for numerical features. For numerical features, outliers more than 3 standard deviations above the mean are removed. For some categorical fields, categories with very few examples are not shown.

In [None]:
# Side-by-side bar charts of the Type counts in the train and test sets.
counts = dict(train_data["Type"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Type"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Type")
iplot(fig)

**Insight:** The number of cats is higher in the test data. If this field turns out to be important, it may be an issue. Keep in mind that this field affects other important fields too, like age, as cats and dogs grow at different speeds and have different life spans.

In [None]:
# Overlaid Age histograms (1-month bins) for train and test. Values at or
# above mean + 3 standard deviations are dropped, per data set, before plotting.
train_age = train_data["Age"]
train_age_cutoff = train_age.mean() + 3 * train_age.std()
trace_train = go.Histogram(
    x=list(train_age[train_age < train_age_cutoff]),
    xbins=dict(size=1),
    opacity=0.75,
    name="Training Data",
)

test_age = test_data["Age"]
test_age_cutoff = test_age.mean() + 3 * test_age.std()
trace_test = go.Histogram(
    x=list(test_age[test_age < test_age_cutoff]),
    xbins=dict(size=1),
    opacity=0.75,
    name="Testing Data",
)

layout = go.Layout(
    title="Number of pets by Age",
    barmode="overlay",
    xaxis=dict(title="Age (Months)"),
)
fig = go.Figure(data=[trace_train, trace_test], layout=layout)
iplot(fig)

**Insight:** Most of the pets are less than a year old, with 1 and 2 months being the most populated. For older pets, most ages appear to be converted from years and are multiples of 12 instead of the actual age in months. Outliers are removed.

In [None]:
# Minimum category count for a breed to be shown in the breed bar charts
# (the training charts use three times this value).
VALUE_THRESHOLD = 10

In [None]:
# Stacked bar charts of Breed1 counts. Breed names are cut to 15 characters
# for the axis labels, and rare breeds are hidden (train uses a 3x threshold).
train_min_count = 3 * VALUE_THRESHOLD
counts = dict(train_data["Breed1"].value_counts())
counts = {breed[:15]: n for breed, n in counts.items() if n >= train_min_count}
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Breed1"].value_counts())
counts = {breed[:15]: n for breed, n in counts.items() if n >= VALUE_THRESHOLD}
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(
    title="Number of Pets by Breed 1",
    height=800,
    margin=go.layout.Margin(b=150),
)
iplot(fig)

**Insight:** Mixed breed is the most populous. Even the second and third, _Domestic Short Hair_ and _Domestic Medium Hair_, look more like some kind of mixed breed than a pure breed. This data might be more subjective than expected. Categories with fewer than _VALUE_THRESHOLD_ examples (3 × _VALUE_THRESHOLD_ for the training data) are not shown, for better visualization.

In [None]:
# Stacked bar charts of Breed2 counts. Breed names are cut to 15 characters
# for the axis labels, and rare breeds are hidden (train uses a 3x threshold).
train_min_count = 3 * VALUE_THRESHOLD
counts = dict(train_data["Breed2"].value_counts())
counts = {breed[:15]: n for breed, n in counts.items() if n >= train_min_count}
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Breed2"].value_counts())
counts = {breed[:15]: n for breed, n in counts.items() if n >= VALUE_THRESHOLD}
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by Breed 2", height=800)
iplot(fig)

**Insight:** Most pets have no second breed available. This field should not be too important. To me it seems that when they cannot decide in which category a particular animal belongs, they put both.

In [None]:
# Stacked bar charts of Color1 counts for train (top) and test (bottom).
counts = dict(train_data["Color1"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Color1"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by color 1", height=800)
iplot(fig)

**Insight:** Black is most popular color, followed by Brown.

In [None]:
# Stacked bar charts of Color2 counts for train (top) and test (bottom).
counts = dict(train_data["Color2"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Color2"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by color 2", height=800)
iplot(fig)

**Insight:** Here it makes sense to have another field, as pets, especially mixed breeds, will have multiple shades.

In [None]:
# Stacked bar charts of Color3 counts for train (top) and test (bottom).
counts = dict(train_data["Color3"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Color3"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by color 3", height=800)
iplot(fig)

**Insight:** Third color seems somewhat unnecessary.

In [None]:
# Side-by-side bar charts of MaturitySize counts for train and test.
counts = dict(train_data["MaturitySize"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["MaturitySize"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Size", height=600)
iplot(fig)

**Insight:** Medium and small pets are dominant.

In [None]:
# Side-by-side bar charts of FurLength counts for train and test.
counts = dict(train_data["FurLength"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["FurLength"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Fur Length", height=600)
iplot(fig)

In [None]:
# Side-by-side bar charts of Vaccinated counts for train and test.
counts = dict(train_data["Vaccinated"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Vaccinated"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Vaccination", height=600)
iplot(fig)

**Insight:** A lot of pets are not vaccinated. This might reduce their chances of adoption, as the adopter has to bear that expense.

In [None]:
# Side-by-side bar charts of Dewormed counts for train and test.
counts = dict(train_data["Dewormed"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Dewormed"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Deworming", height=600)
iplot(fig)

**Insight:** Deworming is more common, however the same logic applies here.

In [None]:
# Side-by-side bar charts of Health counts for train and test.
counts = dict(train_data["Health"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["Health"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Health", height=600)
iplot(fig)

**Insight:** Pets with minor injuries can have reduced chance of adoption. Seriously injured pets are very rare and can be ignored.

In [None]:
# Stacked bar charts of State counts for train (top) and test (bottom).
counts = dict(train_data["State"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["State"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by State", height=800)
iplot(fig)

**Insight:** Will this be a factor? Maybe one region is more pet friendly compared to another. We can only know from the feature importance later in this notebook.

In [None]:
# Side-by-side bar charts: listings with a zero fee vs. a non-zero fee.
train_fee = train_data["Fee"]
counts = {
    "No Fees (Zero)": (train_fee == 0).sum(),
    "With Fees (Non-Zero)": (train_fee != 0).sum(),
}
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

test_fee = test_data["Fee"]
counts = {
    "No Fees (Zero)": (test_fee == 0).sum(),
    "With Fees (Non-Zero)": (test_fee != 0).sum(),
}
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=1, cols=2)
for col, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, 1, col)

fig["layout"].update(title="Number of Pets by Fees", height=600)
iplot(fig)

**Insight:** The majority are free adoptions, without any fees. However, the ones with fees might be more desirable, vaccinated, etc.

In [None]:
# Overlaid histograms (bins of 10) of the non-zero fees. Fees at or above
# mean + 3 standard deviations (computed over all fees, zeros included) are dropped.
train_fee = train_data["Fee"]
train_fee_cutoff = train_fee.mean() + 3 * train_fee.std()
train_fee_shown = train_fee[(train_fee < train_fee_cutoff) & (train_fee > 0)]
trace_train = go.Histogram(
    x=list(train_fee_shown),
    opacity=0.75,
    xbins=dict(size=10),
    name="Training Data",
)

test_fee = test_data["Fee"]
test_fee_cutoff = test_fee.mean() + 3 * test_fee.std()
test_fee_shown = test_fee[(test_fee < test_fee_cutoff) & (test_fee > 0)]
trace_test = go.Histogram(
    x=list(test_fee_shown),
    opacity=0.75,
    xbins=dict(size=10),
    name="Testing Data",
)

layout = go.Layout(
    title="Number of pets by Fee (Non-Zero)",
    barmode="overlay",
    xaxis=dict(title="Fees"),
)
fig = go.Figure(data=[trace_train, trace_test], layout=layout)
iplot(fig)

**Insight:** Fees are generally in multiple of 50. Outliers are removed for better visualization.

In [None]:
# Stacked bar charts of VideoAmt counts for train (top) and test (bottom).
counts = dict(train_data["VideoAmt"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["VideoAmt"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by Videos", height=800)
iplot(fig)

**Insight:** Most pets do not have a video.

In [None]:
# Stacked bar charts of PhotoAmt counts for train (top) and test (bottom).
counts = dict(train_data["PhotoAmt"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

counts = dict(test_data["PhotoAmt"].value_counts())
trace_test = go.Bar(name="Testing Data", x=list(counts), y=list(counts.values()))

fig = make_subplots(rows=2, cols=1)
for row, trace in enumerate((trace_train, trace_test), start=1):
    fig.append_trace(trace, row, 1)

fig["layout"].update(title="Number of Pets by Photos", height=800)
iplot(fig)

**Insight:** Most pets have 1 to 5 photos.

In [None]:
# Blank out missing descriptions so len() works on every row, then store
# the description length in characters.
missing_description = train_data["Description"].isnull()
train_data.loc[missing_description, "Description"] = ""
train_data["Descrpition_Length"] = train_data["Description"].apply(len)

In [None]:
# Blank out missing descriptions so len() works on every row, then store
# the description length in characters.
missing_description = test_data["Description"].isnull()
test_data.loc[missing_description, "Description"] = ""
test_data["Descrpition_Length"] = test_data["Description"].apply(len)

In [None]:
# Overlaid histograms (bins of 50 characters) of description length. Lengths
# at or above mean + 3 standard deviations are dropped, per data set.
train_length = train_data["Descrpition_Length"]
train_length_cutoff = train_length.mean() + 3 * train_length.std()
trace_train = go.Histogram(
    x=list(train_length[train_length < train_length_cutoff]),
    xbins=dict(size=50),
    opacity=0.75,
    name="Training Data",
)

test_length = test_data["Descrpition_Length"]
test_length_cutoff = test_length.mean() + 3 * test_length.std()
trace_test = go.Histogram(
    x=list(test_length[test_length < test_length_cutoff]),
    xbins=dict(size=50),
    opacity=0.75,
    name="Testing Data",
)

layout = go.Layout(
    title="Number of pets by Decription Length",
    barmode="overlay",
    xaxis=dict(title="Length in Characters"),
)
fig = go.Figure(data=[trace_train, trace_test], layout=layout)
iplot(fig)

**Insight:** Most pets have a description in the range of 0-300 characters, with the middle section most populated. This is a short description, comparable to a tweet. Outliers are removed.

In [None]:
# Bar chart of the label distribution (only the training data has labels).
counts = dict(train_data["AdoptionSpeed"].value_counts())
trace_train = go.Bar(name="Training Data", x=list(counts), y=list(counts.values()))

fig = go.Figure(
    data=[trace_train],
    layout=go.Layout(title="Number of Pets by Adoption Speed"),
)
layout = fig.layout if False else go.Layout(title="Number of Pets by Adoption Speed")

iplot(fig)

### LightGBM and Feature Importance 

In [None]:
# Reload the raw training data: the exploration cells above replaced the
# numeric category codes with label strings and added a helper column.
train_data = pd.read_csv("../input/train/train.csv")

In [None]:
# Display the freshly reloaded training table (notebook cell output).
train_data

In [None]:
# Number of cross-validation models; the per-fold loops below iterate range(FOLDS).
FOLDS = 5

In [None]:
# Columns passed to LightGBM as categorical_feature when fitting.
# NOTE(review): "Gender" is also a coded column according to the column
# description above but is not listed here — confirm whether that is intended.
catagorical_features = ["Type", "Breed1", "Breed2", "Color1", "Color2", "Color3", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health", "State"]

In [None]:
# Columns excluded from the model inputs (free text and identifier columns).
non_features  = ["Name", "Description", "PetID", "RescuerID"]

In [None]:
# Name of the target column to predict.
label = "AdoptionSpeed"

In [None]:
# Stratified K-fold splitter used for cross-validation.
# shuffle=True is needed for random_state to have any effect: without it,
# older scikit-learn releases silently ignore the seed and newer releases
# raise a ValueError. The number of splits comes from FOLDS so that it stays
# in sync with the per-fold loops further down, which iterate range(FOLDS).
kfold = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=22)

In [None]:
# Description length in characters; anything that is not a string
# (e.g. a missing value) counts as length 0.
train_data["DescriptionLength"] = train_data["Description"].apply(
    lambda text: 0 if not isinstance(text, str) else len(text)
)

In [None]:
# Model inputs: every column except the excluded ones and the label.
excluded_columns = set(non_features + [label])
features = [name for name in train_data.columns if name not in excluded_columns]
features

In [None]:
def clip(x):
    """Clamp a single value to the AdoptionSpeed range [0, 4].

    Values below 0 become 0, values above 4 become 4, and anything else is
    returned unchanged.
    """
    lowest, highest = 0, 4
    if x < lowest:
        return lowest
    return highest if x > highest else x

In [None]:
# Element-wise version of clip() for use on prediction arrays.
vclip = np.vectorize(clip)

In [None]:
# Cross-validated training: fit one LGBMRegressor per fold, score it on the
# held-out fold with quadratic-weighted kappa, and keep the model.
#   models      - fitted model per fold
#   predictions - rounded and clipped held-out predictions per fold
#   ids         - PetIDs of the held-out rows per fold (aligned with predictions)
#   result      - kappa score per fold
# NOTE(review): `silent` and fit(early_stopping_rounds=...) are tied to the
# installed LightGBM version — confirm they are still accepted before upgrading.
models = []
predictions = []
ids = []
result = []
for train_indices, val_indices in kfold.split(X=np.arange(train_data.shape[0]), y=train_data["AdoptionSpeed"]):
    train_fold = train_data.iloc[train_indices]
    val_fold = train_data.iloc[val_indices]

    # Record the IDs of the rows that are actually predicted in this fold so
    # ids[k] lines up with predictions[k] (previously the training IDs were stored).
    ids.append(val_fold["PetID"])

    model = LGBMRegressor(colsample_bytree=0.9, subsample=0.9, n_estimators=1000, random_state=22, silent=True)
    model.fit(
        X=train_fold[features],
        y=train_fold[label],
        categorical_feature=catagorical_features,
        eval_set=(val_fold[features], val_fold[label]),
        early_stopping_rounds=10,
    )

    # The regressor outputs a continuous value; round it and clamp it to the
    # valid 0..4 label range before scoring.
    pred = vclip(np.round(model.predict(val_fold[features])))
    kappa = cohen_kappa_score(y1=pred, y2=np.array(val_fold[label]), weights="quadratic")
    print("Kappa score is ", kappa)

    predictions.append(pred)
    result.append(kappa)
    models.append(model)

In [None]:
# Mean cross-validation kappa over all folds. `result` holds one score per
# fold, whereas `kappa` is only the scalar left over from the last fold.
np.mean(result)

In [None]:
# Bar chart of the feature importances averaged over the fold models.
fold_importances = [models[fold].feature_importances_ for fold in range(FOLDS)]
mean_importance = np.mean(fold_importances, axis=0)
trace = go.Bar(name="Training Data", x=features, y=list(mean_importance))

layout = go.Layout(title="Feature Importance")
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [None]:
# Best (early-stopped) iteration of each fold model; keep the mean in
# `iterations` and display the per-fold values as the cell output.
best_iterations = [models[fold].best_iteration_ for fold in range(FOLDS)]
iterations = np.mean(best_iterations)
best_iterations

In [None]:
# Reload the raw test data: the exploration cells above replaced the numeric
# category codes with label strings.
test_data = pd.read_csv("../input/test/test.csv")

In [None]:
# Description length in characters; anything that is not a string
# (e.g. a missing value) counts as length 0.
test_data["DescriptionLength"] = test_data["Description"].apply(
    lambda text: 0 if not isinstance(text, str) else len(text)
)

In [None]:
# Predict the test set with every fold model; each prediction is rounded
# and clamped to the 0..4 label range.
test_inputs = test_data[features]
predictions = [
    vclip(np.round(models[fold].predict(test_inputs)))
    for fold in range(FOLDS)
]

In [None]:
# Stack the per-model predictions into one integer array (one row per model);
# integer values are required by np.bincount in the voting step below.
predictions = np.array(predictions, dtype=np.int64)

In [None]:
# Display every model's prediction for the first test row (notebook cell output).
predictions[:, 0]

In [None]:
# Majority vote across the fold models for each test row; np.argmax picks
# the smallest label when several labels tie.
final_pred = [
    np.argmax(np.bincount(predictions[:, row]))
    for row in range(predictions.shape[1])
]

In [None]:
# Two-column submission table: pet identifier and the voted AdoptionSpeed.
submission_columns = {
    "PetID": list(test_data["PetID"]),
    "AdoptionSpeed": final_pred,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission file without the DataFrame index column.
# `index` is documented as a boolean, so pass False explicitly instead of
# relying on None being falsy.
submission.to_csv("submission.csv", index=False)