In [1]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
# Load pandas
import pandas as pd
# Load numpy
import numpy as np
# 设置随机数生成器的种子（seed），以确保每次运行代码时产生的随机数是相同的。
np.random.seed(0)

In [2]:
# Create an object called iris with the iris data
iris = load_iris()
# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
# Add a new column with the species names, this is what we are going to try to pred
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
# Load pandas
import pandas as pd
# Load numpy
import numpy as np
# 设置随机数生成器的种子（seed），以确保每次运行代码时产生的随机数是相同的。
np.random.seed(0)
# 读取 CSV 文件
df = pd.read_csv('iris.data')

# 如果目标列没有命名为 'species'，手动重命名列
df.columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)','species']
df['species'] = pd.Categorical(df['species'])
# 再次查看数据，确保正确加载
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [9]:
# 新建一列，为每一行随机生成一个介于 0 和 1 之间的数字，如果该值小于或等于 .75，
# 则将该单元格的值设置为 True，否则设置为 false。这是一种快速随机分配训练数据和测试数
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,4.9,3.0,1.4,0.2,Iris-setosa,True
1,4.7,3.2,1.3,0.2,Iris-setosa,True
2,4.6,3.1,1.5,0.2,Iris-setosa,True
3,5.0,3.6,1.4,0.2,Iris-setosa,True
4,5.4,3.9,1.7,0.4,Iris-setosa,True


In [10]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 118
Number of observations in the test data: 31


In [11]:
# Create a list of the feature column's names
features = df.columns[:4]
# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [12]:
y = pd.factorize(train['species'])[0]
# View target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [14]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier( random_state=0)
# Train the Classifier to take the training features and learn how they relate
clf.fit(train[features], y)

In [15]:
# Apply the Classifier we trained to the test data (which, remember, it has never s
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2], dtype=int64)

In [16]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.95, 0.05, 0.  ],
       [0.93, 0.07, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [17]:
# Create actual english names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test[features])]
# View the PREDICTED species for the first five observations
preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [18]:
# View the ACTUAL species for the first five observations
test['species'].head()

7     Iris-setosa
8     Iris-setosa
10    Iris-setosa
13    Iris-setosa
17    Iris-setosa
Name: species, dtype: category
Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [19]:
# Create confusion matrix,使用 pd.crosstab 生成一个交叉表，用于比较实际类别和模型预测
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predict Species'])

Predict Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,13,0,0
Iris-versicolor,0,7,0
Iris-virginica,0,1,10


In [20]:
# View a list of the features and their importance scores
list(zip(train[features], clf.feature_importances_))

[('sepal length (cm)', 0.09109686125700395),
 ('sepal width (cm)', 0.028838866756288495),
 ('petal length (cm)', 0.43146687384413385),
 ('petal width (cm)', 0.44859739814257377)]