Inspired by Exploratory Analysis done by Andrew Lukyanenko
https://www.kaggle.com/artgor/exploration-of-data-step-by-step
    
I will explore the PetFinder data and implement a LightGBM classifier.

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import time
import warnings
import gc
gc.collect()
import os
from six.moves import urllib
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('seaborn')
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler

In [None]:
#Add All the Models Libraries

# Scalers
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from PIL import Image
from wordcloud import WordCloud
from tqdm import tqdm_notebook
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import lightgbm as lgb

from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from scipy.stats import reciprocal, uniform

from sklearn.model_selection import StratifiedKFold, RepeatedKFold

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Common data processors
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from scipy import sparse

In [None]:
# Load the training set, the test set and the sample submission file.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train/train.csv',
        '../input/test/test.csv',
        '../input/test/sample_submission.csv',
    )
)

In [None]:
# Dimensions of the training set as (rows, columns).
train.shape

In [None]:
# Dimensions of the test set as (rows, columns).
test.shape

**Exploratory Data Analysis for Pet Finder**

In [None]:
# Preview the first rows, leaving out the Description column.
train.drop(columns='Description').head()

In [None]:
# Breed and colour ids are converted to the pandas categorical dtype
# in both the training and the test frame.
categoryVariableList = ['Breed1', 'Breed2', 'Color1', 'Color2', 'Color3']
for frame in (train, test):
    for column in categoryVariableList:
        frame[column] = frame[column].astype("category")

In [None]:
# Print the column dtypes and non-null counts of the training set.
train.info()

In [None]:
# Summary statistics of the training set.
train.describe()

Target: Adoption speed

* 0 - Pet was adopted on the same day as it was listed.
* 1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
* 2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
* 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
* 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

Because the target variable takes five discrete values, this is a multi-class classification problem.

In [None]:
# Load the lookup tables for breeds, colours and states.
breeds, colors, states = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/breed_labels.csv',
        '../input/color_labels.csv',
        '../input/state_labels.csv',
    )
)

In [None]:
# Duplicate the breed id so the same lookup table can be joined on either
# the Breed1 or the Breed2 column.
breeds['Breed2'] = breeds['BreedID']
# Column names are assigned by position (id, type, name, duplicated id).
breeds.columns = ['Breed1', 'Type', 'BreedName', 'Breed2']
breeds = breeds.drop('Type', axis=1)

# Add a row for id 0 so that it maps to the label 'NA'.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas versions.
na_breed_row = pd.DataFrame([{'Breed1': 0, 'BreedName': 'NA', 'Breed2': 0}])
breeds = pd.concat([breeds, na_breed_row], ignore_index=True)

categoryVariableList = ['Breed1', 'Breed2']
for var in categoryVariableList:
    breeds[var] = breeds[var].astype("category")

breeds.tail(5)

In [None]:
# Duplicate the colour id twice so the same lookup table can be joined on
# Color1, Color2 or Color3.
colors['Color2'] = colors['ColorID']
colors['Color3'] = colors['ColorID']
# Column names are assigned by position (id, name, duplicated ids).
colors.columns = ['Color1', 'ColorName', 'Color2', 'Color3']

# Add a row for id 0 so that it maps to the label 'NA'.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas versions.
na_color_row = pd.DataFrame([{'Color1': 0, 'ColorName': 'NA', 'Color2': 0, 'Color3': 0}])
colors = pd.concat([colors, na_color_row], ignore_index=True)

categoryVariableList = ['Color1', 'Color2', 'Color3']
for var in categoryVariableList:
    colors[var] = colors[var].astype("category")

colors.tail(5)

In [None]:
# Rename the state lookup columns (by position) so the id column is
# called 'State' and the label column 'StateName', then preview the table.
state_column_names = ['State', 'StateName']
states.columns = state_column_names
states.head()

In [None]:
# Training-set dimensions as (rows, columns).
train.shape

In [None]:
# Merge the state, colour and breed labels into the train set and the test set.
def _attach_label_names(frame):
    """Return `frame` left-joined with the state, colour and breed label tables.

    The joins run in a fixed order: State, Color1, Color2, Color3, Breed1,
    Breed2. The label column has the same name in every colour (and breed)
    join, so pandas disambiguates clashing names with its default merge
    suffixes.
    """
    merged = pd.merge(frame, states, on='State', how='left')
    for color_key in ('Color1', 'Color2', 'Color3'):
        merged = pd.merge(merged, colors[[color_key, 'ColorName']], on=color_key, how='left')
    for breed_key in ('Breed1', 'Breed2'):
        merged = pd.merge(merged, breeds[[breed_key, 'BreedName']], on=breed_key, how='left')
    return merged


train = _attach_label_names(train)
test = _attach_label_names(test)

In [None]:
# Test-set dimensions as (rows, columns).
test.shape

In [None]:
# Training-set dimensions as (rows, columns).
train.shape

In [None]:
# Drop the raw id columns together with the free-text description and the
# rescuer id.
dropped_columns = [
    'Breed1', 'Breed2',
    'Color1', 'Color2', 'Color3',
    'State', 'Description', 'RescuerID',
]
train = train.drop(columns=dropped_columns)

# The remaining columns are renamed by position, so this list must match
# the column order of the frame exactly.
train.columns = [
    'Type', 'Name', 'Age', 'Gender', 'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee',
    'VideoAmt', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
    'StateName', 'ColorName_1', 'ColorName_2', 'ColorName_3',
    'Breed1', 'Breed2',
]
train.head(10)

**Exploration A : Explore the Adoption Speed**

In [None]:
# Bar chart of how many pets fall into each AdoptionSpeed class, with the
# share of each class written above its bar.
plt.figure(figsize=(14, 6))
g = sns.countplot(x='AdoptionSpeed', data=train)
plt.title('Adoption speed classes rates')
ax = g.axes  # annotate axis = seaborn axis
total_pets = train.shape[0]
for p in ax.patches:
    # Bar height is a raw count; scale by 100 so the number printed next to
    # the '%' sign really is a percentage (previously the bare fraction,
    # e.g. 0.28, was labelled as "0.28%").
    share_pct = p.get_height() * 100 / total_pets
    ax.annotate(
        f"{share_pct:.2f}%",
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', fontsize=11, color='gray', rotation=0,
        xytext=(0, 10), textcoords='offset points',
    )

**Exploration B: Explore Fur Length vs Adoption Speed**

**Step A** : Identify the Fur Length of Cat and Dogs {The Totals}

In [None]:
# Label each listing as 'Dog' when Type == 1 and 'Cat' otherwise.
train['Category'] = np.where(train['Type'] == 1, 'Dog', 'Cat')

# Count listings per (FurLength, Category); the count lands in the 'Type' column.
fur_groups = train.groupby(['FurLength', 'Category'], sort=True)
Aggregated_1 = fur_groups['Type'].count().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=Aggregated_1, x='FurLength', y='Type', hue='Category')
plt.title('Identify the Fur Length of Cat and Dogs Adopted');

In [None]:
# Build a join key from fur length and category (FurLength as text followed
# by the Category label) and rename the count column to 'Totals'.
fur_key = Aggregated_1['FurLength'].astype(str) + Aggregated_1['Category']
Aggregated_1 = Aggregated_1.assign(Furcategory=fur_key).rename(columns={'Type': 'Totals'})
Aggregated_1

**Step B**: Get the Total Adoptions of Cat and Dogs based on Fur Length

In [None]:
# Label each listing as 'Dog' when Type == 1 and 'Cat' otherwise.
train['Category'] = train['Type'].apply(lambda x: 'Dog' if x == 1 else 'Cat')

# Count listings per (FurLength, Category, AdoptionSpeed); the count lands
# in the 'Type' column.
Aggregated_2 = pd.DataFrame(train.groupby(['FurLength','Category','AdoptionSpeed'],sort=True)['Type'].count()).reset_index()

# sns.catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (one would only leave an empty extra figure
# in the output).
sns.catplot(col = 'Category', y='Type', x= 'FurLength' ,hue = 'AdoptionSpeed',data=Aggregated_2,kind="bar")

In [None]:
# Build a join key from fur length and category (FurLength as text followed
# by the Category label) and rename the count column to 'Adoptions'.
fur_key = Aggregated_2['FurLength'].astype(str) + Aggregated_2['Category']
Aggregated_2 = Aggregated_2.assign(Furcategory=fur_key).rename(columns={'Type': 'Adoptions'})
Aggregated_2.head()

In [None]:
# Attach the per-(fur length, category) totals to the per-speed counts.
fur_totals = Aggregated_1[['Furcategory', 'Totals']]
Aggregated = Aggregated_2.merge(fur_totals, how='left', on='Furcategory')

In [None]:
# Share of each (fur length, category) group that falls into each adoption speed.
Aggregated['AdoptionRate'] = Aggregated['Adoptions'].div(Aggregated['Totals'])

In [None]:
# Preview the first five rows of the aggregated table.
Aggregated.head(5)

**Step C: Now we get the final insight, the adoption rate, i.e. Adoption Rate = Adoptions / Totals**

In [None]:
# One panel per Category: AdoptionRate by FurLength, coloured by AdoptionSpeed.
sns.catplot(
    data=Aggregated,
    kind="bar",
    col='Category',
    x='FurLength',
    y='AdoptionRate',
    hue='AdoptionSpeed',
)

The chart suggests that cats and dogs with shorter fur are more likely to be adopted.

**Exploration C: Are names important in adoption? Let's see.**

First, let's look at the most common names.

In [None]:
# Side-by-side word clouds of the pet names, one panel for cats and one for dogs.
# The two panels are created up front with plt.subplots(1, 2). The previous
# version created a single full-figure Axes and then overlaid it with
# plt.subplot(1, 2, k); recent matplotlib releases no longer remove the
# overlapped Axes, which left an empty frame behind the word clouds.
fig, axes = plt.subplots(1, 2, figsize=(16, 12))

panels = (
    ('Cat', 'Top cat names'),
    ('Dog', 'Top dog names'),
)
for ax, (category, panel_title) in zip(axes, panels):
    # Missing names are replaced by empty strings so they can be joined.
    names = train.loc[train['Category'] == category, 'Name'].fillna('')
    wordcloud = WordCloud(max_font_size=None, background_color='white',
                          width=1200, height=1000).generate(' '.join(names.values))
    ax.imshow(wordcloud)
    ax.set_title(panel_title)
    ax.axis("off")

plt.show()

In [None]:
# Replace missing names with the placeholder 'Unnamed' and flag those rows
# with No_name = 1 (0 otherwise), in both the train and the test frame.
for frame in (train, test):
    frame['Name'] = frame['Name'].fillna('Unnamed')
    frame['No_name'] = 0
    frame.loc[frame['Name'] == 'Unnamed', 'No_name'] = 1

# Percentage of rows flagged as unnamed in each frame.
train_unnamed_rate = train['No_name'].sum() * 100 / train['No_name'].shape[0]
test_unnamed_rate = test['No_name'].sum() * 100 / test['No_name'].shape[0]
print(f"Rate of unnamed pets in train data: {train_unnamed_rate:.4f}%.")
print(f"Rate of unnamed pets in test data: {test_unnamed_rate:.4f}%.")

# For plotting, the train flag is turned into a readable label
# (the test frame keeps its 0/1 flag).
train['No_name'] = train['No_name'].map({1: 'No Name', 0: 'Already Named'})

# Count listings per (AdoptionSpeed, No_name); the count lands in the
# 'Category' column.
name_groups = train.groupby(["AdoptionSpeed", "No_name"], sort=True)
Aggregated = name_groups["Category"].count().reset_index()

sns.barplot(data=Aggregated, x='AdoptionSpeed', y='Category', hue='No_name')
plt.title('Identify The Dogs and Cats with no names');

The graph above indicates that having a name plays a significant role in adoption.

**Exploration D:  Identify the Age of Cat and Dogs Adopted**

In [None]:
# Histograms of pet age for train and test: raw values on the left,
# log1p-transformed values on the right.
# The two panels are created up front and passed to pandas via `ax=` instead
# of overlaying a full-figure Axes with plt.subplot(1, 2, k).
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(16, 6))

# `kind` is passed by keyword: Series.plot rejects positional arguments
# (TypeError) in pandas 1.0 and later.
train['Age'].plot(kind='hist', label='train', ax=ax_raw)
test['Age'].plot(kind='hist', label='test', ax=ax_raw)
ax_raw.set_title('Distribution of pets age')
ax_raw.legend()

np.log1p(train['Age']).plot(kind='hist', label='train', ax=ax_log)
np.log1p(test['Age']).plot(kind='hist', label='test', ax=ax_log)
ax_log.set_title('Distribution of pets age (log)')
ax_log.legend();