# Imports

In [36]:
import pandas as pd 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import datasets 
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import time

# Params

In [37]:
# Reproducibility: one seed shared by the split, the models and the transformers.
RANDOM_STATE = 42

# Share of the dataset held out for evaluation.
TEST_SIZE = 0.2

# Model hyper-parameters.
LOGISTIC_REGRESSION_MAX_ITERATIONS = 10_000
RANDOM_FOREST_CLASSIFIER_ESTIMATORS = 100

# Dimensionality-reduction settings: value passed as PCA(n_components=...)
# and the number of output dimensions for t-SNE.
PCA_DISPERSION = 0.9
TSNE_DIMENSIONS = 2

# Dataset

In [38]:
# Load the digits dataset bundled with scikit-learn.
# NOTE: despite its name, `df` is whatever load_digits() returns (accessed via
# the .data / .target attributes), not necessarily a pandas DataFrame.
df = datasets.load_digits()

# Feature matrix and class labels used by every experiment below.
X, y = df.data, df.target

# Models

In [39]:
# Classifiers under comparison, keyed by the name printed in the results.
# Every estimator receives the same seed so that runs are repeatable.
models = dict(
    SVC=SVC(random_state=RANDOM_STATE),
    DecisionTreeClassifier=DecisionTreeClassifier(random_state=RANDOM_STATE),
    LogisticRegression=LogisticRegression(
        max_iter=LOGISTIC_REGRESSION_MAX_ITERATIONS,
        random_state=RANDOM_STATE,
    ),
    RandomForestClassifier=RandomForestClassifier(
        n_estimators=RANDOM_FOREST_CLASSIFIER_ESTIMATORS,
        random_state=RANDOM_STATE,
    ),
)

In [40]:
# Dimensionality-reduction algorithms under comparison, keyed by display name.
transformers = dict(
    PCA=PCA(n_components=PCA_DISPERSION, random_state=RANDOM_STATE),
    TSNE=TSNE(n_components=TSNE_DIMENSIONS, random_state=RANDOM_STATE),
)

# Train

## Без уменьшения размерности

In [41]:
# Stratified hold-out split of the original (non-reduced) features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Baseline: fit every model on the raw features and keep its test predictions.
predicts = {}
for model_name, model in models.items():
    fitted_model = model.fit(X_train, y_train)
    predicts[model_name] = fitted_model.predict(X_test)

## С уменьшением размерности

In [42]:
# Accumulators filled by the PCA and TSNE cells below:
#   processing_time[transformer_name]                   -> seconds spent on the reduction
#   transformed_predicts[transformer_name][model_name]  -> test-set predictions
processing_time, transformed_predicts = {}, {}

### PCA

In [43]:
# Split with the same parameters as the baseline experiment.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

pca = transformers["PCA"]
transformed_predicts["PCA"] = {}

# Time the reduction itself: fit on the training part only, then project
# both the training and the test part with the fitted components.
start_time = time.time()
pca.fit(X_train)
X_train_transformed = pca.transform(X_train)
X_test_transformed = pca.transform(X_test)
processing_time["PCA"] = time.time() - start_time

# Refit every model on the reduced features and store its test predictions.
for model_name, model in models.items():
    fitted_model = model.fit(X_train_transformed, y_train)
    transformed_predicts["PCA"][model_name] = fitted_model.predict(X_test_transformed)

### TSNE

In [44]:
transformed_predicts["TSNE"] = {}

# scikit-learn's TSNE exposes only fit_transform(), so the embedding has to be
# computed for the whole dataset at once and split afterwards. This means the
# test samples take part in building the embedding.
#
# The embedding is stored under its own name instead of overwriting `X`:
# rebinding `X` would make this cell (and every earlier cell) operate on the
# 2-D embedding when re-run, silently changing all results.
start_time = time.time()
X_tsne = transformers["TSNE"].fit_transform(X)
processing_time["TSNE"] = time.time() - start_time

# Same split parameters as the other experiments, so y_test stays comparable.
X_train_tsne, X_test_tsne, y_train, y_test = train_test_split(
    X_tsne,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Refit every model on the embedded features and store its test predictions.
for model_name, model in models.items():
    model.fit(X_train_tsne, y_train)
    transformed_predicts["TSNE"][model_name] = model.predict(X_test_tsne)

# Результаты

## Score без снижения размерности:

In [45]:
# Accuracy of each model trained on the original (non-reduced) features.
for model_name, predict in predicts.items():
    baseline_score = accuracy_score(y_test, predict)
    print(f"Модель {model_name} score: ", baseline_score, "\n")

Модель SVC score:  0.9916666666666667 

Модель DecisionTreeClassifier score:  0.825 

Модель LogisticRegression score:  0.9611111111111111 

Модель RandomForestClassifier score:  0.9611111111111111 



## Score со снижением размерности

In [46]:
# Accuracy of each model per dimensionality-reduction method.
# The inner dict is bound to `model_predicts`, not `predicts`: reusing the name
# `predicts` would overwrite the baseline predictions dict and break the
# baseline-score cell when it is re-run.
for transformer_name, model_predicts in transformed_predicts.items():
    print(f"**********{transformer_name}**********")
    for model_name, predict in model_predicts.items():
        print(f"Модель {model_name} score: ", accuracy_score(y_true=y_test, y_pred=predict), "\n")
    print("************************")


**********PCA**********
Модель SVC score:  0.9888888888888889 

Модель DecisionTreeClassifier score:  0.8472222222222222 

Модель LogisticRegression score:  0.9333333333333333 

Модель RandomForestClassifier score:  0.9722222222222222 

************************
**********TSNE**********
Модель SVC score:  0.9638888888888889 

Модель DecisionTreeClassifier score:  0.9722222222222222 

Модель LogisticRegression score:  0.9083333333333333 

Модель RandomForestClassifier score:  0.9805555555555555 

************************


# Время работы и выводы

Время, потребовавшееся для сокращения размерности исходного набора данных:

In [47]:
# Report how long each dimensionality-reduction step took.
# The loop variable must not be named `time`: that would rebind the imported
# `time` module to a float, and any later `time.time()` call (e.g. when the
# PCA/TSNE cells are re-run) would raise AttributeError.
for transformer_name, elapsed_seconds in processing_time.items():
    print(f"{transformer_name}: {elapsed_seconds} секунд")

PCA: 0.02181553840637207 секунд
TSNE: 3.5311460494995117 секунд


Число главных компонент, которое алгоритм PCA оставил для сохранения 90% дисперсии:

In [48]:
# Number of components the fitted PCA transformer ended up keeping.
fitted_pca = transformers["PCA"]
print(fitted_pca.n_components_)

21


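В результате работы можно увидеть, что снижение размерности хорошо сказалось на алгоритмах случайного леса и дерева решений, причём наиболее явные изменения наблюдаются для дерева решений, особенно в случае TSNE. При этом стоит учитывать, что TSNE строил представление по всему набору данных (включая тестовую часть), поэтому его оценки могут быть завышены.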

Для остальных моделей результаты стали хуже.

В целом, если сравнивать по результатам, лучше всего отработал SVC без уменьшения размерности.