In [None]:
!pip install ppscore
!pip install catboost
!pip install scikit-optimize

In [None]:
import random 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
train = pd.read_csv('../input/flower-type-prediction-machine-hack/Train.csv')


In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Load the test set that predictions are later generated for.
test = pd.read_csv('../input/flower-type-prediction-machine-hack/Test.csv')


In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of distinct values in each training column.
train.nunique()

In [None]:
# Number of distinct values in each test column.
test.nunique()

In [None]:
# Summary statistics for the training columns.
train.describe()

In [None]:
# Summary statistics for the test columns, for comparison with the training set.
test.describe()

In [None]:
# Pairwise correlation matrix of the training frame (pandas' default method),
# drawn as an annotated heatmap on an explicitly created 15x10 axes.
corr = train.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr, ax=ax, cmap='YlGnBu', annot=True, linewidths=0.5);

In [None]:
# Pairwise relationship plots for the columns of the training frame.
# `sns.pairplot` builds its own figure/grid, so no `plt.figure(...)` call
# precedes it: that would only add an extra, empty figure to the cell output
# and would not affect the size of the pair plot.
sns.pairplot(train);

In [None]:
# Row count for each value of the target column 'Class'.
train['Class'].value_counts()

In [None]:
# Bar chart of the number of training rows per value of 'Class'.
sns.countplot(data=train, x='Class', palette='GnBu_d');

### The target classes are highly imbalanced.

In [None]:
# One bar per column of the training frame (the frame is passed whole, in
# wide form), drawn on an explicitly created 10x8 axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=train, palette='BrBG', ax=ax);

In [None]:
# Box plot of every column of the training frame on one shared 20x15 axes.
fig, ax = plt.subplots(figsize=(20, 15))
sns.boxplot(data=train, ax=ax);

In [None]:
# Separate the features from the target.
#   X: every column except 'Class', as a NumPy array
#   y: the 'Class' column, kept as a pandas Series
# `drop` already returns a new frame, so `train` itself is left untouched.
X = train.drop(columns=['Class']).values
y = train['Class']

In [None]:
# Convert the test frame to an independent NumPy array.
# `np.array` accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; the previous `test.copy().values` raised AttributeError on a
# second run because `test` had already been rebound to an ndarray.
test = np.array(test)

In [None]:
# Sanity check: shapes of the full frame, the feature matrix and the target,
# printed one per line.
print(train.shape, X.shape, y.shape, sep='\n')

In [None]:
# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
# `stratify=y` keeps the class proportions the same in both splits; with a
# purely random split of imbalanced classes, rare classes can end up
# under-represented or missing in the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=101, stratify=y
)

In [None]:
from imblearn.over_sampling import SMOTE

# Balance the classes of the TRAINING split with synthetic minority samples.
oversampler = SMOTE(random_state = 110)

# `fit_resample` is the supported method name; the old `fit_sample` alias has
# been removed from imbalanced-learn and raises AttributeError on current versions.
smote_train, smote_train_target = oversampler.fit_resample(X_train, y_train)

# The hold-out split is deliberately NOT oversampled: scoring on synthetic
# rows would no longer measure performance on real data. The `smote_test*`
# names are kept as plain aliases so the cells below keep working unchanged.
smote_test, smote_test_target = X_test, y_test

In [None]:
# Feature scaling: the scaler's statistics are learned from `smote_train`
# only and then applied unchanged to every other matrix.
sc = StandardScaler().fit(smote_train)

# NOTE: X_train / X_test are rebound here to the scaled versions of
# smote_train / smote_test, replacing the arrays from the split cell.
X_train = sc.transform(smote_train)
X_test = sc.transform(smote_test)
X_whole = sc.transform(X.copy())   # all labelled rows, scaled
test_v = sc.transform(test)        # test-set rows, scaled

In [None]:
from sklearn.metrics import log_loss

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Baseline LightGBM classifier with fixed hyper-parameters, fitted on
# `X_train` / `smote_train_target` and scored by log loss on `X_test`.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
# with that left at its default this value is presumably inert — confirm.
clf = LGBMClassifier(
    colsample_bytree=0.727364034739382,
    min_child_samples=238,
    min_child_weight=0.1,
    num_leaves=44,
    reg_alpha=0.1,
    reg_lambda=0.1,
    subsample=0.20152591992997815,
).fit(X_train, smote_train_target)

# Per-class probabilities for the evaluation rows, then their log loss.
clf_probs = clf.predict_proba(X_test)
score = log_loss(smote_test_target, clf_probs)

In [None]:
# Importing GridSearch and RandomSearch

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [None]:

# Search space for the randomised hyper-parameter search.
# scipy distributions are sampled from; plain lists are chosen from uniformly.
params = dict(
    num_leaves=sp_randint(6, 50),                      # integers in [6, 50)
    min_child_samples=sp_randint(100, 500),            # integers in [100, 500)
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    subsample=sp_uniform(loc=0.2, scale=0.8),          # uniform on [0.2, 1.0]
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),   # uniform on [0.4, 1.0]
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
)

In [None]:
# Randomised hyper-parameter search for the LightGBM classifier `clf`
# over the `params` search space.
folds = 20        # number of stratified cross-validation folds
param_comb = 10   # parameter settings sampled -> folds * param_comb model fits

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1231)

random_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=param_comb,
    # Rank candidates by the same metric the notebook evaluates with
    # (log loss); without `scoring` the search falls back to the
    # classifier's default score, i.e. accuracy.
    scoring='neg_log_loss',
    n_jobs=4,
    # Pass the splitter itself rather than `skf.split(...)`: the latter is a
    # one-shot generator, whereas the splitter can be reused. The folds are
    # the same because the splitter's random_state is fixed.
    cv=skf,
    verbose=3,
    random_state=1231,
)

# NOTE(review): X_train already contains SMOTE-generated rows, so synthetic
# neighbours of a validation row can sit in the training folds — CV scores are
# presumably optimistic; confirm whether resampling should happen per fold.
random_search.fit(X_train, smote_train_target)

In [None]:
# Mean cross-validated score of the best sampled parameter setting.
random_search.best_score_

In [None]:
# Parameter setting that achieved the best cross-validated score.
random_search.best_params_

In [None]:
# Estimator configured with the best parameter setting found by the search.
random_search.best_estimator_

In [None]:
# from catboost import CatBoostClassifier 

In [None]:
# clf = CatBoostClassifier()
# clf.fit(X_train, smote_train_target)
# clf_probs = clf.predict_proba(X_test)
# score = log_loss(smote_test_target, clf_probs)

In [None]:
# Log loss of the baseline LightGBM model, computed in the cell that fitted `clf`.
print(score)

In [None]:
# Load the sample submission file; its row index is reused when building the submission.
submission_format = pd.read_csv('../input/flower-type-prediction-machine-hack/sample_submission.csv')

In [None]:
# Keep the first rows of the sample submission (an assignment, so nothing is displayed).
# NOTE(review): `sub` is not referenced by any later cell shown here — confirm it is still needed.
sub = submission_format.head()

In [None]:
# Class-probability predictions for the scaled test-set rows.
# NOTE(review): this uses the baseline `clf`, not `random_search.best_estimator_` — confirm that is intended.
clf_probs = clf.predict_proba(test_v)

In [None]:
# Wrap the probability matrix in a DataFrame that reuses the sample submission's row index.
# NOTE(review): column labels default to positions 0..n-1; presumably they should match
# the sample submission's headers / the order of `clf.classes_` — confirm.
my_submission = pd.DataFrame(clf_probs, index=submission_format.index)

In [None]:
# Write the predictions to `submission.csv` in the working directory.
# NOTE(review): `to_csv` writes the index as an extra first column by default —
# confirm the expected submission format wants it.
my_submission.to_csv('submission.csv')
print('Exported')