In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin, clone
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, SelectKBest, RFE, SelectFromModel
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
import warnings
# NOTE(review): with no category/module filter this silences *every* warning for the
# rest of the session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reduce the size of your train and test data to model more easily
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes, in place.

    Integer columns (int16/int32/int64) are cast to the smallest signed integer
    type whose range contains the column's min and max (bounds inclusive, so a
    column whose max is exactly 127 still fits ``int8``).  ``float64`` columns are
    cast to ``float32`` when their min and max lie inside the float32 range;
    note that this cast keeps the range but loses precision.  ``float16`` and
    ``float32`` columns are left untouched so the function never widens a column.
    Non-numeric columns are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-chaining convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest -> widest so the first match is the smallest fitting type.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: the limits themselves are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            # NaN min/max (all-missing column) fails both comparisons, so such a
            # column is simply left as float64.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte starting footprint cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Load the Pokemon table from the Kaggle input directory
pokemon_csv_path = '/kaggle/input/pokemon/pokemon.csv'
df = pd.read_csv(pokemon_csv_path)

**Exploratory Data Analysis**

In [None]:
# Preview the first three rows of the frame
df.iloc[:3]

In [None]:
# Is there at least one missing value anywhere in the frame?
df.isna().to_numpy().any()

In [None]:
# Collect the names of the columns that hold at least one missing value
column_has_null = df.isnull().any()
cols_missing_val = column_has_null.index[column_has_null].tolist()
print(cols_missing_val)

In [None]:
# Number of missing entries in each affected column:
missing_counts = df[cols_missing_val].isnull().sum()
for col, n_missing in missing_counts.items():
    print("{} : {:d}".format(col, n_missing))

In [None]:
# Visualise where the missing entries sit: one heatmap column per affected feature
missing_mask = df[cols_missing_val].isnull()
sns.heatmap(missing_mask, yticklabels=False, cbar=False)

In [None]:
# Hence genderless pokemons can be assigned '-1' (sentinel for a missing male percentage).
# The result is assigned back to the column because `np.int` no longer exists in
# NumPy >= 1.24 and `fillna(inplace=True)` on a column selection is not guaranteed
# to modify `df` itself (it does not under pandas Copy-on-Write).
df['percentage_male'] = df['percentage_male'].fillna(-1)

In [None]:
# Distinct values present in the secondary-type column (missing entries included)
pd.unique(df['type2'])

In [None]:
# Replace missing type2 entries with a dedicated placeholder category.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selection, which is not guaranteed to write through to `df`.
df['type2'] = df['type2'].fillna('hormann')

In [None]:
# Replace the missing height/weight values with 0.
# `np.int` was removed from NumPy (1.24), so a plain Python int is used, and the
# filled columns are assigned back rather than relying on a chained in-place
# fillna that may operate on a temporary copy.
for size_col in ('height_m', 'weight_kg'):
    df[size_col] = df[size_col].fillna(0)

In [None]:
# Memory footprint of the frame (index included), reported in MB
mem = df.memory_usage(index=True).sum()
mem_mb = mem / 1024**2
print("Memory consumed by training set  :   {} MB".format(mem_mb))

In [None]:
# Re-check for remaining missing values after the fills above
df.isna().to_numpy().any()

In [None]:
# Downcast numeric columns to narrower dtypes; the helper mutates the frame it is
# given and returns it, and with verbose=True it prints the memory figures.
df = reduce_mem_usage(df, verbose=True)

In [None]:
# Number of distinct (non-missing) values in the 'classfication' column
# NOTE(review): the column name is spelled this way here; presumably it matches the CSV header — verify.
df['classfication'].dropna().unique().size

In [None]:
# Inspect the per-column dtypes after the memory-reduction step
df.dtypes

In [None]:
# Convert float columns to integers only when the cast is lossless.
# A blanket astype('int') truncates every fractional value (e.g. 0.5 -> 0, 1.7 -> 1),
# silently discarding information, and raises on NaN/inf. Columns that contain
# fractional or non-finite values are therefore left as floats.
# Iterate over a snapshot of the column names so the frame is not modified while
# `df.items()` is being consumed.
for label in list(df.columns):
    content = df[label]
    if not pd.api.types.is_float_dtype(content):
        continue
    is_whole_valued = np.isfinite(content).all() and (content == content.round()).all()
    if is_whole_valued:
        df[label] = content.astype('int')

In [None]:
# Inspect the dtypes again after the float -> int conversion above
df.dtypes

In [None]:
# Re-type every non-numeric column as pandas 'category', then show the dtypes
non_numeric_cols = [
    col_name for col_name in df.columns
    if not pd.api.types.is_numeric_dtype(df[col_name])
]
for col_name in non_numeric_cols:
    df[col_name] = df[col_name].astype('category')
df.dtypes

In [None]:
# Replace each categorical column by its integer category codes, shifted by +1.
# pandas encodes a missing value as code -1, so after the shift it becomes 0.
# `pd.api.types.is_categorical_dtype` is deprecated (pandas 2.1); the supported
# check is an isinstance test against `pd.CategoricalDtype`.
# Iterate over a snapshot of the column names so the frame is not modified while
# it is being iterated.
for label in list(df.columns):
    content = df[label]
    if isinstance(content.dtype, pd.CategoricalDtype):
        df[label] = content.cat.codes + 1

In [None]:
# Final dtype check after the categorical columns were replaced by their codes
df.dtypes

In [None]:
# Separate the target column from the feature matrix
target_col = 'is_legendary'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
## For modelling :
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Modelling tools :
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score,RandomizedSearchCV,GridSearchCV 

In [None]:
# Hold out 30% of the rows for testing and fit a random forest on the rest.
# Both the split and the forest are seeded so the reported score is reproducible
# across "Restart & Run All"; stratify=y keeps the class ratio of the target the
# same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
model_a = RandomForestClassifier(random_state=42)
model_a.fit(X_train, y_train)
# Mean accuracy on the held-out rows
model_a.score(X_test, y_test)

In [None]:
# Fit a gradient-boosting classifier on the same training split for comparison.
# Seeded so that repeated runs give the same model and score.
model_b = GradientBoostingClassifier(random_state=42)
model_b.fit(X_train, y_train)
# Mean accuracy on the held-out rows
model_b.score(X_test, y_test)

In [None]:
# Lets check the cross val score
# cross_val_score refits the estimator on each of the 10 folds of the full data.
# (The unused `predict_proba` call that used to precede this was dropped: its
# result was never read and `y_preds` is recomputed in the metrics cell.)
cvm = cross_val_score(model_a, X, y, cv=10)
np.mean(cvm)

In [None]:
# Classification metrics on the held-out split :
y_preds = model_a.predict(X_test)

# Evaluate each scorer against the same (truth, prediction) pair
accuracy, recall, precision = (
    metric(y_test, y_preds)
    for metric in (accuracy_score, recall_score, precision_score)
)
accuracy, recall, precision

In [None]:
## Lets get the legendary predictions :
# Predict on every row of X and put the predictions next to the true labels
y_preds = model_a.predict(X)
Pokemon = pd.DataFrame({
    'Default values': y,
    'Predictions': y_preds,
})

In [None]:
# Display the actual-vs-predicted table built in the previous cell
Pokemon

In [None]:
# Compare the actual and the predicted legendary flags row by row.
# Axes.stackplot takes the x coordinates first, then one y-series per layer, and
# its palette through `colors=` (`color=` is not a stackplot parameter). The
# original call passed the actual labels as x, so only one series was drawn
# although two colours were supplied.
fig, axes = plt.subplots()
axes.stackplot(
    Pokemon.index,
    Pokemon['Default values'],
    Pokemon['Predictions'],
    colors=['red', 'blue'],
    labels=['Default values', 'Predictions'],
)
axes.set_xlabel('Row index')
axes.set_ylabel('is_legendary (stacked)')
axes.legend(loc='upper left');