In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA

from sklearn.feature_selection import VarianceThreshold, SelectFromModel, RFECV, SequentialFeatureSelector

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score

from sklearn.datasets import make_classification, load_wine, load_breast_cancer, load_diabetes

ModuleNotFoundError: No module named 'sklearn.impute'

In [None]:
plt.style.use('seaborn-darkgrid')


In [None]:
def plot_scatter(x, y, auto_scaled=True, title=None, clusters=None):
    plt.figure(figsize=(4, 4))
    plt.scatter(x, y)
    
    if not auto_scaled:
        plt.axis('square')
    
    plt.grid(True)
    plt.title(title)
    plt.show()
   
    
def return_X_y(data, target_column):
    return data.drop(target_column, axis=1), data[target_column]

In [None]:
housing_data = pd.read_csv('Melbourne_housing_FULL.csv')
# prepare dataset for price regression
housing_data = housing_data[~housing_data['Price'].isnull()]

In [None]:
housing_data.isnull().mean() # housing_data.isnull().sum() to get absolute numbers


In [None]:
threshold = 0.3
housing_data_dropped = housing_data[housing_data.columns[housing_data.isnull().mean() < threshold]]
housing_data_dropped = housing_data_dropped.dropna(axis=0, how='any') # params is optinal here (matching defaults)
print(f'Original dataset shape (rows, cols): {housing_data.shape}')
print(f'Dataset shape (rows, cols) after dropna: {housing_data_dropped.shape}')

In [None]:
# const imputing
housing_data_const = housing_data.fillna(value=0)

# mean imputing
housing_data_mean = housing_data.fillna(housing_data.mean())

In [None]:
wine_sklearn = load_wine(as_frame=True)
wine_data, wine_labels = wine_sklearn['data'], wine_sklearn['target']
wine_data

In [None]:
scaler = StandardScaler()
wine_data_scaled = scaler.fit_transform(wine_data)
wine_data_scaled

In [None]:
pca = PCA(n_components=2)

wine_data_pca = pca.fit_transform(wine_data)
wine_data_scaled_pca = pca.fit_transform(wine_data_scaled)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(18, 10))

for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax1.scatter(wine_data_pca[wine_labels == l, 0], wine_data_pca[wine_labels == l, 1], 
                color=c, label=f'class {l}', alpha=0.5, marker=m)

for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):
    ax2.scatter(wine_data_scaled_pca[wine_labels == l, 0], wine_data_scaled_pca[wine_labels == l, 1], 
                color=c, label=f'class {l}', alpha=0.5, marker=m)
    
ax1.set_title('Dataset after PCA')
ax2.set_title('Standardized dataset after PCA')

for ax in (ax1, ax2):
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit_transform(wine_data)

In [None]:
mu, sigma = 5, 1
lognorm_data = np.random.lognormal(mu, sigma, 1000)

In [None]:
plt.figure(figsize=(16,8))
sns.histplot(lognorm_data, stat='probability')
plt.show()

In [None]:
plt.figure(figsize=(16,8))
sns.histplot(np.log(lognorm_data), stat='probability')
plt.show()

In [None]:
plt.figure(figsize=(16,8))
sns.histplot(housing_data['Price'], stat='probability')
plt.show()

In [None]:
X = [['male', 'US', 'Safari'], ['female', 'Europe', 'Firefox'], ['male', 'Europe', 'Opera']]
pd.DataFrame(X, columns=['gender', 'place', 'browser'])

In [None]:
encoder = OrdinalEncoder()
ordinal_encoded_X = encoder.fit_transform(X)

In [None]:
encoder = OneHotEncoder()
ohe_encoded_X = encoder.fit_transform(X).toarray()

In [None]:
pd.DataFrame(ohe_encoded_X, columns=encoder.get_feature_names())


In [None]:
cancer_sklearn = load_breast_cancer(as_frame=True)
cancer_data, cancer_labels = cancer_sklearn['data'], cancer_sklearn['target']
cancer_data_scaled = StandardScaler().fit_transform(cancer_data)
cancer_data

In [None]:
X_generated, y_generated = make_classification(n_samples=1000, n_features=25, n_informative=3,
                                                         n_redundant=2, n_repeated=0)
X_generated.shape

In [None]:
print(VarianceThreshold(0.9).fit_transform(X_generated).shape)
print(VarianceThreshold(1).fit_transform(X_generated).shape)
print(VarianceThreshold(1.1).fit_transform(X_generated).shape)

In [None]:
selection_model = RandomForestClassifier(random_state=42)
selector = SelectFromModel(selection_model).fit(cancer_data, cancer_labels)
cancer_data_pruned = selector.transform(cancer_data)
print(cancer_data.columns[selector.get_support()])
print(f'Original shape: {cancer_data.shape}')
print(f'Shape after selection: {cancer_data_pruned.shape}')

In [None]:
main_model = LogisticRegression(solver='liblinear', penalty='l1')
pipe_baseline = make_pipeline(StandardScaler(), main_model)
pipe_selection = make_pipeline(StandardScaler(), SelectFromModel(selection_model), main_model) # fix to select only once

print('Result on original data: {:f}'.format(cross_val_score(pipe_baseline, cancer_data, cancer_labels, 
                      scoring='accuracy', cv=5).mean()))

print('Result after selection {:f}'.format(cross_val_score(pipe_selection, cancer_data, cancer_labels, 
                      scoring='accuracy', cv=5).mean()))

In [None]:
min_features_to_select = 1 
rfecv = RFECV(estimator=main_model, step=1, cv=KFold(3), 
              scoring='accuracy', min_features_to_select=min_features_to_select)
rfecv.fit(cancer_data_scaled, cancer_labels)

print("Optimal number of features : %d" % rfecv.n_features_)

In [None]:
plt.figure(figsize=(16,8))
plt.plot(range(min_features_to_select,
               len(rfecv.grid_scores_) + min_features_to_select),
         rfecv.grid_scores_)
plt.show()

In [None]:
selector = SequentialFeatureSelector(main_model, scoring='accuracy', n_jobs=-1).fit(cancer_data_scaled, cancer_labels)
cancer_data_scaled_pruned = selector.transform(cancer_data_scaled)

print(cancer_data.columns[selector.get_support()])
print(f'Original shape: {cancer_data.shape}')
print(f'Shape after selection: {cancer_data_pruned.shape}\n')

print('Result on original data: {:f}'.format(cross_val_score(main_model, cancer_data_scaled, 
                                                           cancer_labels, scoring='accuracy', cv=5).mean()))

print('Result after selection {:f}'.format(cross_val_score(main_model, cancer_data_scaled_pruned, 
                                                        cancer_labels, scoring='accuracy', cv=5).mean()))

In [None]:
#Homework

In [None]:
#part1

In [None]:
def ownStandardScaler(df):

    own=df.iloc[:,:].round(decimals=12)
    own_mean=df.iloc[:,:].mean().round(decimals=12)
    sigma=(((((own-own_mean)**2).sum())/(df.shape[0]))**0.5).round(decimals=12)
    own_scaled=((own-own_mean)/sigma).round(decimals=12)
    print('\n'"own_scaled wine_data_scaled: ",np.allclose(own_scaled, wine_data_scaled))
    return own_scaled

ownStandardScaler(wine_data)

In [None]:
#part2

In [None]:
feature_0 = np.random.randn(1000) * 10   
feature_1 = np.concatenate([np.random.randn(500), np.random.randn(500) + 5])
data = np.column_stack([feature_0, feature_1])
data

In [None]:
plot_scatter(data[:, 0], data[:, 1], auto_scaled=True, title='Data (different axes units!)')


In [None]:
plot_scatter(data[:, 0], data[:, 1], auto_scaled=False , title='Data (equal axes units!)')


In [None]:
data_x=pd.DataFrame(data).round(decimals=12)
data_x_min=data_x.min()
data_x_max=data_x.max()
data_x_norm=(data_x-data_x_min)/(data_x_max-data_x_min)
data_x_norm=np.asarray(data_x_norm)
print("data_x_norm = ",'\n',data_x_norm)
plot_scatter(data_x_norm[:,0],data_x_norm[:,1], auto_scaled=False, title='Data (equal axes units!)')

In [None]:
data_x=pd.DataFrame(data, columns=['feature_0','feature_1']).round(decimals=12)
data_x_mean=data_x.iloc[:,:].mean().round(decimals=12)
sigma=(((((data_x-data_x_mean)**2).sum())/(data_x.shape[0]))**0.5).round(decimals=12)
data_x_scaled=((data_x-data_x_mean)/sigma).round(decimals=12)
data_x_scaled=np.asarray(data_x_scaled)
plot_scatter(data_x_scaled[:,0],data_x_scaled[:,1], auto_scaled=False, title='Data (equal axes units!)')

In [None]:
#run algorithm (with k=2, k - number of clusters/classes) on unscaled data

In [None]:
def plot_scatter_own(x,y,title=None):

    plt.figure(figsize=(8, 4))
    clusters = Kmean.fit_predict(np.array([x, y]).T)
    plt.title(title)
    plt.scatter(x, y,c=clusters,cmap='bwr',marker='.')

In [None]:
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=2)
Kmean.fit(data)
data_x=data[:,0]
data_y=data[:,1]
c_data=Kmean.cluster_centers_
plot_scatter_own(data_x,data_y,title='k=2 on unscaled data')
plt.scatter(c_data[0][0], c_data[0][1], c='y', marker='s')
plt.scatter(c_data[1][0], c_data[1][1], c='b', marker='s')
plt.show()

In [None]:
#run algorithm (with k=2) on scaled data

In [None]:
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=2)
Kmean.fit(data_x_scaled)
data_x=data_x_scaled[:,0]
data_y=data_x_scaled[:,1]

c_data=Kmean.cluster_centers_
plot_scatter_own(data_x,data_y,title='k=2 on scaled data')
plt.scatter(c_data[0][0], c_data[0][1], c='y', marker='s')
plt.scatter(c_data[1][0], c_data[1][1], c='g', marker='s')
plt.show()

In [None]:
from sklearn.cluster import KMeans
Kmean = KMeans(n_clusters=2)
Kmean.fit(data_x_norm)
data_x=data_x_norm[:,0]
data_y=data_x_norm[:,1]

c_data=Kmean.cluster_centers_
plot_scatter_my(data_x,data_y,title='k=2 on minmax data')
plt.scatter(c_data[0][0], c_data[0][1], c='g', marker='s')
plt.scatter(c_data[1][0], c_data[1][1], c='y', marker='s')
plt.show()

In [None]:
#part3

In [None]:
wine_train, wine_val, wine_labels_train, wine_labels_val = train_test_split(wine_data, wine_labels, 
                                                                            test_size=0.3, random_state=42)

In [None]:
main_model = LogisticRegression(solver='liblinear', penalty='l1')
pipe_baseline = make_pipeline(main_model)

print('original data: {:f}'.format(cross_val_score(pipe_baseline, wine_val, wine_labels_val, 
                      scoring='accuracy', cv=5).mean()))

In [None]:
main_model = LogisticRegression(solver='liblinear', penalty='l1')
pipe_selection = make_pipeline(StandardScaler(), SelectFromModel(selection_model), main_model) 

print('StandardScaler + FeatureSelection {:f}'.format(cross_val_score(pipe_selection, wine_val, wine_labels_val, 
                      scoring='accuracy', cv=5).mean()))

In [None]:
Результат при применении StandardScaler улучшился с 0,94 до 0,98
Если добавить FeatureSelection то рехультат ухудшился до 0,96
Вывод: FeatureSelection не всегда приводит к улучшению работы модели