In [1]:
# 1. Dots dataset: preprocessing, feature selection and random-forest pipeline

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns

In [2]:
# Load seaborn's example 'dots' dataset into a DataFrame.
df = sns.load_dataset('dots')

In [3]:
# Preview the first rows to see the columns and sample values.
df.head()

Unnamed: 0,align,choice,time,coherence,firing_rate
0,dots,T1,-80,0.0,33.189967
1,dots,T1,-80,3.2,31.691726
2,dots,T1,-80,6.4,34.27984
3,dots,T1,-80,12.8,32.631874
4,dots,T1,-80,25.6,35.060487


In [4]:
# Count missing values per column.
df.isna().sum()

align          0
choice         0
time           0
coherence      0
firing_rate    0
dtype: int64

In [5]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848 entries, 0 to 847
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   align        848 non-null    object 
 1   choice       848 non-null    object 
 2   time         848 non-null    int64  
 3   coherence    848 non-null    float64
 4   firing_rate  848 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 33.3+ KB


In [6]:
# The target is the `choice` column; every remaining column is a feature.
y = df['choice']
X = df.drop(columns=['choice'])

In [7]:
# Column groups handed to the numerical and categorical preprocessing branches.
numerical_cols = [
    'time',
    'coherence',
    'firing_rate',
]
categorical_cols = [
    'align',
]

In [8]:
# Feature-engineering automation.
# Numerical pipeline: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode the categories into numerical indicator columns.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # missing values
        # handle_unknown='ignore' encodes a category that was absent from the
        # training split as all zeros instead of raising at transform/predict time
        # (the default, handle_unknown='error', would abort the whole prediction).
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [9]:
# Send each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ],
)

In [10]:
# End-to-end model: preprocess -> keep the 4 best-scoring features (f_classif) -> random forest.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selector', SelectKBest(f_classif, k=4)),
        ('model', RandomForestClassifier(random_state=42)),
    ],
)

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [12]:
# Fit the preprocessing, feature-selection and classifier steps on the training split only.
pipeline.fit(X_train, y_train)

In [13]:
# Predict the `choice` label for the held-out rows and display the predictions.
y_pred = pipeline.predict(X_test)
y_pred

array(['T1', 'T2', 'T1', 'T1', 'T1', 'T1', 'T1', 'T2', 'T1', 'T2', 'T2',
       'T1', 'T1', 'T1', 'T1', 'T2', 'T1', 'T2', 'T2', 'T1', 'T2', 'T2',
       'T1', 'T1', 'T1', 'T1', 'T1', 'T2', 'T1', 'T2', 'T1', 'T1', 'T1',
       'T2', 'T1', 'T2', 'T1', 'T2', 'T2', 'T2', 'T2', 'T2', 'T2', 'T2',
       'T1', 'T1', 'T2', 'T1', 'T2', 'T2', 'T1', 'T1', 'T2', 'T2', 'T2',
       'T1', 'T1', 'T2', 'T2', 'T2', 'T1', 'T2', 'T2', 'T1', 'T1', 'T1',
       'T2', 'T2', 'T1', 'T2', 'T2', 'T2', 'T2', 'T1', 'T2', 'T2', 'T1',
       'T1', 'T2', 'T1', 'T1', 'T2', 'T2', 'T2', 'T2', 'T2', 'T2', 'T1',
       'T2', 'T1', 'T2', 'T1', 'T1', 'T2', 'T2', 'T2', 'T2', 'T1', 'T2',
       'T1', 'T2', 'T2', 'T1', 'T2', 'T2', 'T1', 'T2', 'T2', 'T2', 'T2',
       'T2', 'T2', 'T1', 'T1', 'T1', 'T1', 'T2', 'T2', 'T1', 'T1', 'T1',
       'T1', 'T2', 'T2', 'T1', 'T2', 'T1', 'T2', 'T2', 'T2', 'T1', 'T1',
       'T2', 'T1', 'T2', 'T2', 'T2', 'T2', 'T2', 'T1', 'T1', 'T2', 'T1',
       'T1', 'T2', 'T1', 'T2', 'T1', 'T1', 'T1', 'T

In [14]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.803921568627451

In [15]:
# 2. Iris dataset: hard-voting ensemble classifier

# NOTE: this rebinds `df`, replacing the 'dots' frame loaded earlier in the notebook.
df = sns.load_dataset('iris')
# Preview the first rows.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [16]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [17]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [18]:
# Count missing values per column.
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [19]:
# List the distinct class labels before encoding them.
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [20]:
# Encode the species names as integer class labels.
#
# A plain `.map(dict)` turns every value that is not a dictionary key into NaN,
# so re-running this cell (when the column already holds 1/2/3) would wipe all
# labels. Falling back to the current value makes the cell safe to run again.
species_codes = {
    'setosa': 1,
    'versicolor': 2,
    'virginica': 3,
}
df['species'] = df['species'].map(lambda label: species_codes.get(label, label))

In [21]:
# Inspect the species column after the encoding step.
df.species

0      1
1      1
2      1
3      1
4      1
      ..
145    3
146    3
147    3
148    3
149    3
Name: species, Length: 150, dtype: int64

In [22]:
# Features are every column except the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [23]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [24]:
# Hard-voting ensemble of a random forest and a logistic regression,
# wrapped in a single-step Pipeline.
voting_pipeline = Pipeline([
    ('voting', VotingClassifier(
        estimators=[
            # Seeded (same seed as the forest in the first pipeline) so that
            # repeated runs of the notebook give the same predictions and accuracy.
            ('rfc', RandomForestClassifier(random_state=42)),
            ('lr', LogisticRegression()),
        ],
        voting='hard',
    ))
])

In [25]:
# Fit the voting ensemble on the training split.
voting_pipeline.fit(X_train, y_train)

In [26]:
# Predict the encoded species for the held-out rows and display the predictions.
y_pred = voting_pipeline.predict(X_test)
y_pred

array([3, 2, 1, 3, 1, 3, 1, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 2, 1, 1, 3, 2,
       1, 1, 3, 1, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 1, 3, 2, 2, 3, 1, 3, 1,
       1], dtype=int64)

In [27]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9777777777777777