In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from sklearn.datasets import load_iris

In [3]:
df = sns.load_dataset("iris")

In [4]:
df = df.sample(frac=1)

In [5]:
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [6]:
col = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

### Checking Skew

In [7]:
for i in col:
    print(f'{i} is {df[i].skew()}')

sepal_length is 0.31491095663697266
sepal_width is 0.31896566471359983
petal_length is -0.27488417975101115
petal_width is -0.10296674764898105


In [8]:
def odiqr(df):
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    low = q1 - (1.5 * iqr)
    high = q3 + (1.5 * iqr)
    m = df.mean()
    df = df.apply(lambda x : m if x < low else (m if x > high else x ) )
    return df

In [9]:
def odmsd(df):
    m = round(df.mean(),2)
    s = round(df.std(),2)
    low = round(m-(3*s),2)
    high = round(m+(3*s),2)
    ft1 = df[df<low]
    ft2 = df[df>high]
    df = df.map(lambda x : low if x < low else (high if x > high else x ))

In [10]:
for i in col:
    if df[i].skew() >= 0.5:
        odmsd(df[i])
    else:
        df[i] = odiqr(df[i])

### After Skew

In [11]:
for i in col:
    print(f'{i} is {df[i].skew()}')

sepal_length is 0.31491095663697266
sepal_width is 0.129811306317394
petal_length is -0.27488417975101115
petal_width is -0.10296674764898105


## Encoding With Label Encoder Of df["species"]

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
le = LabelEncoder()

In [15]:
df = pd.concat([df,pd.DataFrame(le.fit_transform(df["species"]),columns=["SPECIES"])],axis=1).drop("species",axis=1)

In [16]:
le.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [17]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,SPECIES
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,2
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,0
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


In [18]:
X = df.drop("SPECIES",axis=1)

In [19]:
X.sample()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
117,7.7,3.8,6.7,2.2


In [20]:
y = df["SPECIES"]

In [21]:
y.sample()

80    1
Name: SPECIES, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=.20)

In [24]:
X.shape , Xtrain.shape , Xtest.shape

((150, 4), (120, 4), (30, 4))

In [25]:
y.shape , ytrain.shape , ytest.shape

((150,), (120,), (30,))

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [27]:
rfc = RandomForestClassifier()

In [28]:
rfc.fit(Xtrain,ytrain)

RandomForestClassifier()

In [29]:
rfc.score(Xtest,ytest)

0.2

### With Feature Scaling

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
ss= StandardScaler()

In [32]:
ss.fit(Xtrain)

StandardScaler()

In [33]:
Xtrain_ss = ss.transform(Xtrain)

In [34]:
Xtest_ss = ss.transform(Xtest)

In [35]:
rfc1 = RandomForestClassifier()

In [36]:
rfc1.fit(Xtrain_ss,ytrain)

RandomForestClassifier()

In [37]:
rfc1.score(Xtest_ss,ytest)

0.23333333333333334

### With  XGBOOST

In [38]:
from xgboost import XGBClassifier

In [45]:
xg = XGBClassifier(use_label_encoder=False,objective=typing.Union)

NameError: name 'typing' is not defined

In [42]:
xg.fit(Xtrain,ytrain)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [None]:
xg.score(Xtest,ytest)

### With Feature Scaling

In [None]:
xg1 = XGBClassifier(use_label_encoder=False)

In [None]:
xg1.fit(Xtrain_ss,ytrain)

In [None]:
xg1.score(Xtest_ss,ytest)