In [31]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [15]:
iris = load_iris()

In [16]:
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

In [17]:
df['target_names'] = [iris.target_names[i] for i in iris.target]

In [18]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [6]:
def column_cleaner(col, bad_characters="()*&^@#$%"):
    for ch in bad_characters:
        col = col.replace(ch, "")
    col = col.replace(" ", "_")
    col = col.lower()
    return col

In [10]:
df.rename(mapper=column_cleaner, axis=1, inplace=True)
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [11]:
df['pl_pw_ratio'] = df['petal_length_cm']/df['petal_width_cm']
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,target_names,pl_pw_ratio
0,5.1,3.5,1.4,0.2,0,setosa,7.0
1,4.9,3.0,1.4,0.2,0,setosa,7.0
2,4.7,3.2,1.3,0.2,0,setosa,6.5
3,4.6,3.1,1.5,0.2,0,setosa,7.5
4,5.0,3.6,1.4,0.2,0,setosa,7.0


In [12]:
df['sepal_diff'] = df['sepal_length_cm'] - df['sepal_width_cm']
df['abs_sepal_diff'] = df['sepal_diff'].abs()
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,target_names,pl_pw_ratio,sepal_diff,abs_sepal_diff
0,5.1,3.5,1.4,0.2,0,setosa,7.0,1.6,1.6
1,4.9,3.0,1.4,0.2,0,setosa,7.0,1.9,1.9
2,4.7,3.2,1.3,0.2,0,setosa,6.5,1.5,1.5
3,4.6,3.1,1.5,0.2,0,setosa,7.5,1.5,1.5
4,5.0,3.6,1.4,0.2,0,setosa,7.0,1.4,1.4


In [13]:
df['pw_pl_sum'] = df['petal_length_cm'] + df['petal_width_cm']
df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,target_names,pl_pw_ratio,sepal_diff,abs_sepal_diff,pw_pl_sum
0,5.1,3.5,1.4,0.2,0,setosa,7.0,1.6,1.6,1.6
1,4.9,3.0,1.4,0.2,0,setosa,7.0,1.9,1.9,1.6
2,4.7,3.2,1.3,0.2,0,setosa,6.5,1.5,1.5,1.5
3,4.6,3.1,1.5,0.2,0,setosa,7.5,1.5,1.5,1.7
4,5.0,3.6,1.4,0.2,0,setosa,7.0,1.4,1.4,1.6


In [28]:
import data_handling.pipeline as pl
from importlib import reload
reload(pl)

<module 'data_handling.pipeline' from '/Users/rafael/MyProjects/PyCharm/project00/data_handling/pipeline.py'>

In [29]:
pl.clean_data(df=df, write_file=True, filepath="./data/df_cleaned.csv")

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,target,target_names,pl_pw_ratio,sepal_diff,abs_sepal_diff,pw_pl_sum
0,5.1,3.5,1.4,0.2,0,setosa,7.000000,1.6,1.6,1.6
1,4.9,3.0,1.4,0.2,0,setosa,7.000000,1.9,1.9,1.6
2,4.7,3.2,1.3,0.2,0,setosa,6.500000,1.5,1.5,1.5
3,4.6,3.1,1.5,0.2,0,setosa,7.500000,1.5,1.5,1.7
4,5.0,3.6,1.4,0.2,0,setosa,7.000000,1.4,1.4,1.6
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica,2.260870,3.7,3.7,7.5
146,6.3,2.5,5.0,1.9,2,virginica,2.631579,3.8,3.8,6.9
147,6.5,3.0,5.2,2.0,2,virginica,2.600000,3.5,3.5,7.2
148,6.2,3.4,5.4,2.3,2,virginica,2.347826,2.8,2.8,7.7


In [20]:
import os

In [22]:
filepath = "./data/df_cleaned.csv"

In [24]:
path, filename = os.path.split(filepath)

In [25]:
os.mkdir(path)

# let's build a model

In [30]:
clf = RandomForestClassifier()

In [32]:
df_train, df_test = train_test_split(df, test_size=0.30)

In [33]:
x_train = df_train.drop(columns=[i for i in df.columns if 'target' in i])
y_train = df_train['target_names']

In [34]:
clf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
x_test = df_test.drop(columns=[i for i in df.columns if 'target' in i])
y_test = df_test['target_names']

In [37]:
clf.score(x_test, y_test), clf.score(x_train, y_train)

(0.9777777777777777, 1.0)

In [38]:
import pickle

In [39]:
pkl_filename = "./models/rf_bc.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)