In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import pandas_profiling

In [2]:
train_df = pd.read_csv('Dataset/train.csv')
soln_df = pd.read_csv('Dataset/test.csv')

In [3]:
train_df.drop(['pet_id'], axis = 1, inplace = True)
main_test = soln_df.drop(['pet_id'], axis = 1)

In [4]:
def pre_process(df):
    df.condition.fillna(3, inplace=True)
    df['time_to_shelter'] = (pd.to_datetime(df.listing_date) - pd.to_datetime(df.issue_date)).dt.days
    df.drop(['issue_date','listing_date'], axis = 1, inplace=True)
    
    top_colors = [
                     'Black',
                     'White',
                     'Brown',
                     'Brown Tabby',
                     'Tan',
                     'Blue',
                     'Orange Tabby',
                     'Red',
                     'Brown Brindle',
                     'Tricolor',
                     'Blue Tabby',
                     'Tortie',
                     'Calico',
                     'Gray',
                     'Chocolate',
                     'Torbie',
                     'Cream Tabby',
                     'Sable',
                     'Cream',
                     'Fawn',
                     'Yellow',
                     'Buff',
                     'Lynx Point',
                     'Blue Merle'
                ]
    
    def reduce_color(color):
        if color in top_colors:
            return color
        return 'Other'
    
    df['color_mod'] = df.color_type.apply(reduce_color)
    df.drop(['color_type'], axis = 1, inplace=True)
    df.drop(['length(m)','height(cm)'], axis = 1, inplace= True)
    
    return df

In [5]:
train_df = pre_process(train_df)
main_test = pre_process(main_test)

In [6]:
train_y1 = train_df.loc[:,['breed_category']]
train_y2 = train_df.loc[:,['pet_category']]

In [7]:
train_X = train_df.drop(['breed_category','pet_category'], axis=1)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [9]:
ct = ColumnTransformer([("color", OneHotEncoder(sparse=False, drop='first'), [4])], remainder = 'passthrough')
ct.fit(train_X)

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('color',
                                 OneHotEncoder(categories='auto', drop='first',
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='error',
                                               sparse=False),
                                 [4])],
                  verbose=False)

In [10]:
train_X = ct.transform(train_X)
main_test = ct.transform(main_test)

In [11]:
print(train_X.shape)
print(main_test.shape)

(18834, 28)
(8072, 28)


In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

sc = scaler.fit(train_X)

train_X = sc.transform(train_X)

main_test = sc.transform(main_test)

In [13]:
from sklearn.model_selection import train_test_split

X_train_breed, X_test_breed, y_train_breed, y_test_breed = train_test_split(train_X, train_y1, test_size = 0.2)
X_train_pet, X_test_pet, y_train_pet, y_test_pet = train_test_split(train_X, train_y2, test_size = 0.2)

In [14]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler()
X_train_breed, y_train_breed = os.fit_resample(X_train_breed, y_train_breed)
X_train_pet, y_train_pet = os.fit_resample(X_train_pet, y_train_pet)

Using TensorFlow backend.


In [16]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix 

In [28]:
params = {
        'C': [0.5, 0.7, 1, 1.5],
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4, 5, 7]
        }
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True)
random_search = RandomizedSearchCV(SVC(), param_distributions=params, cv=skf.split(X_train_breed, y_train_breed), verbose=3)
random_search.fit(X_train_breed, y_train_breed.breed_category.ravel())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] kernel=rbf, degree=3, C=0.7 .....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ......... kernel=rbf, degree=3, C=0.7, score=0.918, total=   6.7s
[CV] kernel=rbf, degree=3, C=0.7 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s remaining:    0.0s


[CV] ......... kernel=rbf, degree=3, C=0.7, score=0.927, total=   6.8s
[CV] kernel=rbf, degree=3, C=0.7 .....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.5s remaining:    0.0s


[CV] ......... kernel=rbf, degree=3, C=0.7, score=0.924, total=   6.7s
[CV] kernel=rbf, degree=3, C=0.7 .....................................
[CV] ......... kernel=rbf, degree=3, C=0.7, score=0.925, total=   6.4s
[CV] kernel=rbf, degree=3, C=0.7 .....................................
[CV] ......... kernel=rbf, degree=3, C=0.7, score=0.922, total=   6.4s
[CV] kernel=sigmoid, degree=4, C=0.5 .................................
[CV] ..... kernel=sigmoid, degree=4, C=0.5, score=0.807, total=   7.0s
[CV] kernel=sigmoid, degree=4, C=0.5 .................................
[CV] ..... kernel=sigmoid, degree=4, C=0.5, score=0.808, total=   7.4s
[CV] kernel=sigmoid, degree=4, C=0.5 .................................
[CV] ..... kernel=sigmoid, degree=4, C=0.5, score=0.824, total=   8.1s
[CV] kernel=sigmoid, degree=4, C=0.5 .................................
[CV] ..... kernel=sigmoid, degree=4, C=0.5, score=0.811, total=   7.3s
[CV] kernel=sigmoid, degree=4, C=0.5 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.4min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fad5f676050>,
                   error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': [0.5, 0.7, 1, 1.5],
                                        'degree': [2, 3, 4, 5, 7],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, s

In [29]:
random_search.best_params_

{'kernel': 'linear', 'degree': 7, 'C': 0.7}

In [30]:
params = {
        'C': [0.5, 0.7, 1, 1.5],
        'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4, 5, 7]
        }
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True)
random_search = RandomizedSearchCV(SVC(), param_distributions=params, cv=skf.split(X_train_pet, y_train_pet), verbose=3)
random_search.fit(X_train_pet, y_train_pet.pet_category.ravel())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] kernel=sigmoid, degree=7, C=0.7 .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... kernel=sigmoid, degree=7, C=0.7, score=0.696, total=  19.9s
[CV] kernel=sigmoid, degree=7, C=0.7 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.9s remaining:    0.0s


[CV] ..... kernel=sigmoid, degree=7, C=0.7, score=0.693, total=  22.4s
[CV] kernel=sigmoid, degree=7, C=0.7 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   42.3s remaining:    0.0s


[CV] ..... kernel=sigmoid, degree=7, C=0.7, score=0.702, total=  23.0s
[CV] kernel=sigmoid, degree=7, C=0.7 .................................
[CV] ..... kernel=sigmoid, degree=7, C=0.7, score=0.691, total=  20.6s
[CV] kernel=sigmoid, degree=7, C=0.7 .................................
[CV] ..... kernel=sigmoid, degree=7, C=0.7, score=0.702, total=  26.3s
[CV] kernel=rbf, degree=5, C=0.5 .....................................
[CV] ......... kernel=rbf, degree=5, C=0.5, score=0.805, total=  18.7s
[CV] kernel=rbf, degree=5, C=0.5 .....................................
[CV] ......... kernel=rbf, degree=5, C=0.5, score=0.811, total=  18.8s
[CV] kernel=rbf, degree=5, C=0.5 .....................................
[CV] ......... kernel=rbf, degree=5, C=0.5, score=0.807, total=  18.4s
[CV] kernel=rbf, degree=5, C=0.5 .....................................
[CV] ......... kernel=rbf, degree=5, C=0.5, score=0.808, total=  19.1s
[CV] kernel=rbf, degree=5, C=0.5 .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 16.5min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7fad61dfd350>,
                   error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='scale', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'C': [0.5, 0.7, 1, 1.5],
                                        'degree': [2, 3, 4, 5, 7],
                                        'kernel': ['linear', 'poly', 'rbf',
                                                   'sigmoid']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, s

In [31]:
random_search.best_params_

{'kernel': 'rbf', 'degree': 7, 'C': 1.5}

In [32]:
breed_model = SVC(kernel = 'linear', degree = 7, C = 0.7)

breed_model.fit(X_train_breed, y_train_breed)

breed_test_preds = breed_model.predict(X_test_breed) 

confusion_matrix(y_test_breed, breed_test_preds) 

  y = column_or_1d(y, warn=True)


array([[1786,    1,    0],
       [ 398, 1284,    0],
       [   0,    0,  298]])

In [33]:
pet_model = SVC(kernel = 'rbf', degree = 7, C = 1.5)

pet_model.fit(X_train_pet, y_train_pet)

pet_test_preds = pet_model.predict(X_test_pet) 

confusion_matrix(y_test_pet, pet_test_preds) 

  y = column_or_1d(y, warn=True)


array([[  10,    3,    1,    4],
       [  46, 1175,  220,   16],
       [  79,  173, 1830,   16],
       [  33,    3,   20,  138]])

In [34]:
breed_preds = breed_model.predict(main_test)

In [35]:
pet_preds = pet_model.predict(main_test)

In [36]:
submission = pd.DataFrame()

In [37]:
submission['pet_id'] = soln_df.pet_id

In [38]:
submission['breed_category'] = breed_preds

In [39]:
submission['pet_category'] = pet_preds

In [40]:
submission.breed_category = submission.breed_category.astype('int64')

In [41]:
submission

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1,2
1,ANSL_76663,0,1
2,ANSL_58259,0,2
3,ANSL_67171,0,2
4,ANSL_72871,0,2
...,...,...,...
8067,ANSL_66809,0,0
8068,ANSL_59041,1,2
8069,ANSL_60034,1,2
8070,ANSL_58066,2,4


In [42]:
submission.to_csv('submission_5.csv',index=False)