In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor, export_graphviz, export_text
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from IPython.display import Image
from sklearn import metrics
from IPython.core.display import HTML

In [None]:
# Location of the source CSV.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
DATA_PATH = r'C:\Users\Дарья\Downloads\Cancer_Data.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Distinct labels present in the target column.
data['diagnosis'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis labels with integer codes, overwriting the column in place.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])
data

In [None]:
# Distinct values of the target column at this point in the notebook.
data['diagnosis'].unique()

In [None]:
# Correlation matrix of every column, drawn as an annotated heatmap.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='GnBu', ax=ax)

In [None]:
# Rank features by the absolute value of their correlation with the target
# and keep only those above the 0.3 threshold.
# The header previously mentioned "wine quality"; the target here is `diagnosis`.
print('Признаки, имеющие максимальную по модулю корреляцию с диагнозом')
best_params = (
    data.corr()['diagnosis']
    .drop('diagnosis')  # remove the target's self-correlation by label, not by position
    .abs()
    .sort_values(ascending=False)
)
best_params = best_params[best_params > 0.3]
best_params

In [None]:
# Features removed from the shortlist by hand.
# NOTE: not idempotent — running this cell twice raises KeyError because the labels are already gone.
excluded_features = ['texture_mean', 'smoothness_mean', 'symmetry_mean']
best_params = best_params.drop(labels=excluded_features)

In [None]:
# Pairwise correlations among the shortlisted features only.
selected_corr = data[best_params.index].corr()
plt.figure(figsize=(8, 4))
sns.heatmap(selected_corr, vmin=-1, vmax=1, cmap='GnBu', annot=True)
plt.show()

In [None]:
# Correlation of each shortlisted feature with the target, shown as a one-column heatmap.
plt.figure(figsize=(6, 3))
columns_with_target = list(best_params.index) + ['diagnosis']
target_corr = data[columns_with_target].corr()['diagnosis']
# Sort descending and skip the first entry (the target's correlation with itself).
target_corr = target_corr.sort_values(ascending=False)[1:]
sns.heatmap(pd.DataFrame(target_corr), vmin=-1, vmax=1, cmap='GnBu', annot=True)
plt.show()

In [None]:
# Feature matrix: every column except the target.
# NOTE(review): the correlation-based shortlist in best_params is not applied here,
# so X keeps all remaining columns — confirm this is intended.
X = data.drop(columns='diagnosis')
# Target vector.
y = data['diagnosis']

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

In [None]:
def print_metrics(y_test, y_pred):
    """Print R^2, MSE and MAE of the predictions against the true values.

    Args:
        y_test: ground-truth target values.
        y_pred: predicted values, passed to each metric together with ``y_test``.
    """
    scorers = (
        ("R^2", r2_score),
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_test, y_pred)}")

In [None]:
# Baseline: ordinary least squares on the unscaled training features.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)


In [None]:
# Test-set metrics for the linear regression predictions.
print_metrics(y_test, y_pred_linear)

In [None]:
# Degree-3 polynomial feature expansion followed by ordinary least squares.
poly_model = PolynomialFeatures(degree=3)
x_train_poly = poly_model.fit_transform(X_train)
# Reuse the transformer fitted on the training split; it must not be re-fitted on test data.
x_test_poly = poly_model.transform(X_test)
# NOTE: this rebinds `linear_model`, so any model previously stored under that name is replaced.
linear_model = LinearRegression()
linear_model.fit(x_train_poly, y_train)
y_pred_poly = linear_model.predict(x_test_poly)

In [None]:
# Test-set metrics for the polynomial regression predictions.
print_metrics(y_test, y_pred_poly)

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same transform is applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
feature_names = X_train.columns
x_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=feature_names)
x_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
x_train_scaled.describe()

In [None]:
# Candidate C values: a fine grid from 0.1 in steps of 0.1 up to (not including) 2,
# then integer steps from 2 up to (not including) 15.
fine_grid = np.arange(0.1, 2, 0.1)
coarse_grid = np.arange(2, 15, 1)
params = {'C': np.concatenate([fine_grid, coarse_grid])}

# 10-fold cross-validated search over a linear-kernel SVR, scored by R^2.
svm_model = SVR(kernel='linear')
grid_cv = GridSearchCV(estimator=svm_model, param_grid=params, cv=10, n_jobs=-1, scoring='r2')
grid_cv.fit(x_train_scaled, y_train)
print(grid_cv.best_params_)

In [None]:
# Use the SVR chosen by the grid search. GridSearchCV refits the best configuration
# on the data passed to fit() (refit=True is the default), so best_estimator_ is ready to predict.
# The previous code replaced it with SVC(kernel='linear', C=14): SVC is never imported
# (NameError) and the hard-coded C discarded the search result.
best_svm_model = grid_cv.best_estimator_
y_pred_svm = best_svm_model.predict(x_test_scaled)


In [None]:
# Test-set metrics for the support-vector model predictions.
print_metrics(y_test, y_pred_svm)

In [None]:

# Tune the minimum leaf size of a regression tree with 5-fold CV, scored by negated MAE.
# The search runs on the unscaled split `X_train`; the lowercase `x_train` used before
# is not defined anywhere in the notebook.
# NOTE: `params` and `grid_cv` are rebound here, replacing any earlier search stored under those names.
params = {'min_samples_leaf': range(3, 30)}
tree = DecisionTreeRegressor(random_state=3)
grid_cv = GridSearchCV(estimator=tree, cv=5, param_grid=params, n_jobs=-1, scoring='neg_mean_absolute_error')
grid_cv.fit(X_train, y_train)
print(grid_cv.best_params_)

In [None]:

# Take the best tree from the grid search, fit it on the unscaled training split
# and predict the test split. `X_train` / `X_test` are the names produced by
# train_test_split; the lowercase `x_train` / `x_test` are not defined.
best_tree = grid_cv.best_estimator_
best_tree.fit(X_train, y_train)
y_pred_tree = best_tree.predict(X_test)


In [None]:
# Test-set metrics for the decision tree predictions.
print_metrics(y_test, y_pred_tree)

In [None]:
# Pair every training column with the importance the fitted tree assigned to it.
# Column names come from `X_train`; the lowercase `x_train` is not defined.
importances = pd.DataFrame(
    data=zip(X_train.columns, best_tree.feature_importances_),
    columns=['Признак', 'Важность'],
)
print('Важность признаков в дереве решений\n')
# List features from most to least important, rounded to 3 decimals.
for row in importances.sort_values(by='Важность', ascending=False).values:
    print(f'{row[0]}: {round(row[1], 3)}')

In [None]:
# Horizontal bar chart of the tree's feature importances, sorted in descending order.
ranked_importances = importances.sort_values(by='Важность', ascending=False)
plt.figure(figsize=(12, 4))
sns.barplot(data=ranked_importances, y='Признак', x='Важность', orient='h')
plt.title('Важность признаков в дереве решений')
plt.show()

In [None]:

# Recap: print the test-set metrics of every model, one heading per model.
model_predictions = [
    ('Линейная регрессия', y_pred_linear),
    ('\nПолиномиальная регрессия', y_pred_poly),
    ('\nМетод опорных векторов', y_pred_svm),
    ('\nДерево решений', y_pred_tree),
]
for heading, predictions in model_predictions:
    print(heading)
    print_metrics(y_test, predictions)