In [21]:
#KNN算法

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data,iris.target, random_state=6)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

estimator = KNeighborsClassifier(n_neighbors=3)
estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)

[0 2 0 0 2 1 1 0 2 1 2 1 2 2 1 1 2 1 1 0 0 2 0 0 1 1 1 2 0 1 0 1 0 0 1 2 1
 2]
[ True  True  True  True  True  True False  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True  True]
0.9210526315789473


In [22]:
#模型选择与调优

from sklearn.model_selection import GridSearchCV

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data,iris.target, random_state=6)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

estimator = KNeighborsClassifier()

estimator = GridSearchCV(estimator, param_grid={"n_neighbors":[1,3,5,7,9,11]},cv=10)

estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)

print(estimator.best_params_)
print(estimator.best_score_)
print(estimator.best_estimator_)
print(estimator.cv_results_)

[0 2 0 0 2 1 2 0 2 1 2 1 2 2 1 1 2 1 1 0 0 2 0 0 1 1 1 2 0 1 0 1 0 0 1 2 1
 2]
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True False  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True False  True
  True  True]
0.9473684210526315
{'n_neighbors': 11}
0.9734848484848484
KNeighborsClassifier(n_neighbors=11)
{'mean_fit_time': array([0.00170128, 0.00250113, 0.00456886, 0.00289838, 0.0024008 ,
       0.00230052]), 'std_fit_time': array([0.00078129, 0.00257959, 0.00644505, 0.00281052, 0.00237537,
       0.00228414]), 'mean_score_time': array([0.01509995, 0.01429994, 0.00580242, 0.00630162, 0.00610094,
       0.00539999]), 'std_score_time': array([0.02245772, 0.02026098, 0.00340189, 0.00313597, 0.00350797,
       0.00372111]), 'param_n_neighbors': masked_array(data=[1, 3, 5, 7, 9, 11],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object)

In [23]:
#朴素贝叶斯算法

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

x_train, x_test, y_train, y_test = train_test_split(news.data, news.target)

transfer = TfidfVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

estimator = MultinomialNB()
estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)

[ 0 12 14 ...  7  5  2]
[ True False  True ...  True  True False]
0.8590831918505942


In [24]:
#决策树

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data,iris.target, random_state=22)

estimator = DecisionTreeClassifier(criterion='entropy')
estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)

[0 2 1 2 1 1 1 1 1 0 2 1 2 2 0 2 1 1 1 1 0 2 0 1 2 0 2 2 2 1 0 0 1 1 1 0 0
 0]
[ True  True  True  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True False  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True]
0.9210526315789473


In [25]:
from sklearn.tree import export_graphviz
export_graphviz(estimator,out_file='iris_tree.dot',feature_names=iris.feature_names)

In [26]:
#泰坦尼克号
import pandas as pd

titanic = pd.read_csv(r'titanic.csv')
titanic.head()

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


In [27]:
x = titanic[["Pclass","Sex","Age"]]
y = titanic["2urvived"]

x["Age"].fillna(x["Age"].mean(),inplace=True)

x = x.to_dict(orient="recorda")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

x_train, x_test, y_train,y_test = train_test_split(x,y,random_state=22)

transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

estimator = DecisionTreeClassifier(criterion='entropy',max_depth=8)
estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)

from sklearn.tree import export_graphviz
export_graphviz(estimator,out_file='titanic_tree.dot',feature_names=transfer.get_feature_names_out())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["Age"].fillna(x["Age"].mean(),inplace=True)
  x = x.to_dict(orient="recorda")


[1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 1
 0 1 0 0 1 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1
 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
827      True
882      True
1206     True
1071     True
607     False
        ...  
1246     True
1279     True
575      True
100      True
195     False
Name: 2urvived, Length: 328, dtype: bool
0.7774390243902439


In [29]:
#随机森林
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

titanic = pd.read_csv(r'titanic.csv')
x = titanic[["Pclass","Sex","Age"]]
y = titanic["2urvived"]

x["Age"].fillna(x["Age"].mean(),inplace=True)

x = x.to_dict(orient="recorda")

x_train, x_test, y_train,y_test = train_test_split(x,y,random_state=22)

transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

estimator = RandomForestClassifier()

estimator = GridSearchCV(estimator, param_grid={"n_estimators":[120,200,300,500,800,1200],"max_depth":[5,8,15,25,30]},cv=3)

estimator.fit(x_train,y_train)

y_predict = estimator.predict(x_test)
print(y_predict)
print(y_test == y_predict)

score = estimator.score(x_test,y_test)
print(score)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["Age"].fillna(x["Age"].mean(),inplace=True)
  x = x.to_dict(orient="recorda")


[1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1
 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
827      True
882      True
1206     True
1071     True
607     False
        ...  
1246     True
1279     True
575      True
100      True
195     False
Name: 2urvived, Length: 328, dtype: bool
0.7896341463414634
