In [5]:
import seaborn as sns
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load data

In [6]:
# Fetch seaborn's "iris" example dataset as a DataFrame.
df_iris = sns.load_dataset(name='iris')

In [7]:
# Preview the first five rows of the loaded frame.
df_iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# Print the column dtypes and non-null counts of the loaded frame.
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
# Snapshot the untouched dataset to disk; the row index is not written.
df_iris.to_csv(
    path_or_buf='../data/raw_data/df_iris_raw.csv',
    index=False,
)

# Transform data 

In [10]:
# Eyeball five randomly drawn rows. No random_state is passed, so the rows
# shown change from run to run.
df_iris.sample(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
97,6.2,2.9,4.3,1.3,versicolor
62,6.0,2.2,4.0,1.0,versicolor
17,5.1,3.5,1.4,0.3,setosa
144,6.7,3.3,5.7,2.5,virginica
84,5.4,3.0,4.5,1.5,versicolor


In [11]:
# Hold out 10% of the rows as a "prod" set; the remaining 90% is the "dev"
# set used for training and evaluation below. The seed is fixed so the
# partition is the same on every run.
df_iris_dev, df_iris_prod = train_test_split(
    df_iris,
    test_size=0.1,
    random_state=42,
)

In [12]:
# (rows, columns) of the dev and prod partitions, in that order.
tuple(part.shape for part in (df_iris_dev, df_iris_prod))

((135, 5), (15, 5))

In [13]:
# Write both partitions to disk without the row index.
df_iris_dev.to_csv(
    path_or_buf='../data/raw_data/df_iris_dev_raw.csv',
    index=False,
)
df_iris_prod.to_csv(
    path_or_buf='../data/raw_data/df_iris_prod_raw.csv',
    index=False,
)

In [14]:
# Separate the feature columns from the `species` target, then carve the dev
# set into an 80/20 train/test partition with a fixed seed.
y = df_iris_dev['species']
X = df_iris_dev.drop(columns='species')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# (rows, columns) of the training and test feature matrices, in that order.
tuple(part.shape for part in (X_train, X_test))

((108, 4), (27, 4))

In [16]:
# Preview the first five training rows (features only, original row labels).
X_train.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
108,6.7,2.5,5.8,1.8
99,5.7,2.8,4.1,1.3
89,5.5,2.5,4.0,1.3
59,5.2,2.7,3.9,1.4
119,6.0,2.2,5.0,1.5


In [17]:
# Disabled draft: the whole cell is one bare triple-quoted string, so none of
# the statements inside execute; running the cell only echoes the text back.
# NOTE(review): before reviving this, confirm against the scikit-learn docs:
#   - LabelEncoder is documented for target labels (its fit/transform take
#     only `y`), so it presumably cannot serve as a feature-pipeline step.
#   - The `select_dtypes` results are DataFrames taken from the full
#     `df_iris` (which includes the `species` target), not lists of feature
#     column names; verify what ColumnTransformer should receive here.
"""
categorical_columns = df_iris.select_dtypes('object')
numerical_columns = df_iris.select_dtypes(['float64','int64'])
#date_columns = df_iris.select_dtypes(include="datetime64[ns]")
print(f"Numerical columns are: {list(numerical_columns)}\nCategorical columns are: {list(categorical_columns)}")

pipeline_for_categorical = Pipeline([
    ("Impute", SimpleImputer(strategy="most_frequent")),
    ("Label Encode", LabelEncoder())
])

pipeline_for_numerical = Pipeline([
    ("Impute",SimpleImputer(strategy="mean")),
    ("Standard Scale", StandardScaler())
])

preprocessing_pipeline = ColumnTransformer([
    ("Categorical", pipeline_for_categorical, categorical_columns),
    ("Numerical", pipeline_for_numerical, numerical_columns)
], remainder='passthrough')

model = RandomForestClassifier(n_estimators=100, random_state=42)

full_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', model)
])
"""

'\ncategorical_columns = df_iris.select_dtypes(\'object\')\nnumerical_columns = df_iris.select_dtypes([\'float64\',\'int64\'])\n#date_columns = df_iris.select_dtypes(include="datetime64[ns]")\nprint(f"Numerical columns are: {list(numerical_columns)}\nCategorical columns are: {list(categorical_columns)}")\n\npipeline_for_categorical = Pipeline([\n    ("Impute", SimpleImputer(strategy="most_frequent")),\n    ("Label Encode", LabelEncoder())\n])\n\npipeline_for_numerical = Pipeline([\n    ("Impute",SimpleImputer(strategy="mean")),\n    ("Standard Scale", StandardScaler())\n])\n\npreprocessing_pipeline = ColumnTransformer([\n    ("Categorical", pipeline_for_categorical, categorical_columns),\n    ("Numerical", pipeline_for_numerical, numerical_columns)\n], remainder=\'passthrough\')\n\nmodel = RandomForestClassifier(n_estimators=100, random_state=42)\n\nfull_pipeline = Pipeline([\n    (\'preprocessing\', preprocessing_pipeline),\n    (\'classifier\', model)\n])\n'

In [20]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits so the test set never influences
# the fit.
ss = StandardScaler().fit(X_train)
X_train_transformed = ss.transform(X_train)
X_test_transformed = ss.transform(X_test)

In [21]:
# Map species names to integer codes. The encoder is fitted on the training
# labels and then reused for the test labels.
le = LabelEncoder().fit(y_train)
y_train_transformed = le.transform(y_train)
y_test_transformed = le.transform(y_test)

In [22]:
# Random forest with 100 trees; random_state is fixed so repeated fits on the
# same data give the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [23]:
# Train on the scaled features and integer-encoded labels.
model.fit(X=X_train_transformed, y=y_train_transformed)

In [24]:
# Predict integer class codes for the scaled test features.
y_pred = model.predict(X=X_test_transformed)

In [25]:
# Compare the predicted codes with the encoded test labels and show the
# resulting accuracy.
accuracy = accuracy_score(
    y_true=y_test_transformed,
    y_pred=y_pred,
)
accuracy

0.9629629629629629

In [26]:
# The classifier was fitted on StandardScaler-transformed features and
# LabelEncoder-encoded targets, so the pickled model alone cannot score raw
# measurements or return species names. Persist the fitted scaler (`ss`) and
# label encoder (`le`) next to it so whoever loads the model can reproduce
# the exact preprocessing used during training.
joblib.dump(ss, "../models/evaluated_dev_scaler.pkl")
joblib.dump(le, "../models/evaluated_dev_label_encoder.pkl")
# Keep the model dump last and unchanged: its path is what existing loaders
# read, and its return value is what this cell displays.
joblib.dump(model, "../models/evaluated_dev_model.pkl")

['../models/evaluated_dev_model.pkl']

In [27]:
# Show the predicted integer class codes for the test split.
y_pred

array([2, 0, 1, 0, 0, 1, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 0, 1,
       0, 0, 2, 2, 2])

In [28]:
# Translate the integer predictions back into species names with the fitted
# label encoder.
le.inverse_transform(y=y_pred)

array(['virginica', 'setosa', 'versicolor', 'setosa', 'setosa',
       'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'setosa',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'setosa',
       'setosa', 'versicolor', 'setosa', 'setosa', 'virginica',
       'virginica', 'virginica'], dtype=object)

In [29]:
# Inspect the first row (position 0) of the held-out prod set.
df_iris_prod.iloc[0]

sepal_length           6.1
sepal_width            2.8
petal_length           4.7
petal_width            1.2
species         versicolor
Name: 73, dtype: object

In [30]:
# Inspect the row at position 5 of the held-out prod set.
df_iris_prod.iloc[5]

sepal_length       5.4
sepal_width        3.4
petal_length       1.5
petal_width        0.4
species         setosa
Name: 31, dtype: object

In [31]:
# One observation expressed as scalar feature values, keyed by the feature
# column names in their original order.
data = dict(
    sepal_length=6.1,
    sepal_width=2.8,
    petal_length=4.7,
    petal_width=1.2,
)

In [32]:
# A dict of scalars has to be wrapped in a list so pandas treats it as a
# single record and builds a one-row frame.
pd.DataFrame(data=[data])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,6.1,2.8,4.7,1.2


In [33]:
# Same observation, but with every value wrapped in a one-element list.
data = dict(
    sepal_length=[6.1],
    sepal_width=[2.8],
    petal_length=[4.7],
    petal_width=[1.2],
)
# Collect the per-feature lists in the dict's insertion order.
features = list(data.values())

In [34]:
# Show the collected values: one single-element list per feature.
features

[[6.1], [2.8], [4.7], [1.2]]

In [35]:
# Show the list-valued observation dict.
data

{'sepal_length': [6.1],
 'sepal_width': [2.8],
 'petal_length': [4.7],
 'petal_width': [1.2]}

In [36]:
# With list-valued entries the dict can be handed to pandas directly; each
# key becomes a column and the single list element becomes the one row.
pd.DataFrame(data=data)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,6.1,2.8,4.7,1.2


In [37]:
import numpy as np

In [38]:
# `features` is four one-element lists; flatten them into a single row so
# the array has one sample with four feature values.
np.array(features).reshape((1, -1))

array([[6.1, 2.8, 4.7, 1.2]])