In [29]:
import openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

dataset = openml.datasets.get_dataset(10)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute, dataset_format="dataframe"
)

y = LabelEncoder().fit_transform(y)

X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
X_train

Unnamed: 0,lymphatics,block_of_affere,bl_of_lymph_c,bl_of_lymph_s,by_pass,extravasates,regeneration_of,early_uptake_in,lym_nodes_dimin,lym_nodes_enlar,changes_in_lym,defect_in_node,changes_in_node,changes_in_stru,special_forms,dislocation_of,exclusion_of_no,no_of_nodes_in
132,arched,no,no,no,no,no,no,no,1,2,oval,lacunar,lacunar,faint,no,no,no,1
13,deformed,no,no,no,no,no,no,no,1,2,oval,lac_central,lac_margin,diluted,no,yes,yes,1
130,deformed,yes,no,no,no,no,no,yes,1,2,oval,lacunar,lacunar,diluted,no,no,no,1
16,displaced,no,no,no,no,no,no,yes,1,4,oval,lacunar,lac_central,stripped,vesicles,yes,yes,7
71,displaced,yes,no,no,no,no,no,yes,1,2,oval,lacunar,lacunar,drop_like,chalices,no,no,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31,displaced,no,no,no,yes,yes,no,yes,1,3,round,lac_margin,lac_central,diluted,vesicles,yes,yes,4
124,displaced,yes,no,no,yes,yes,no,yes,1,3,round,lac_central,lac_margin,coarse,vesicles,yes,yes,2
25,arched,yes,no,no,no,yes,no,yes,1,3,round,lac_margin,lac_margin,faint,vesicles,yes,yes,4
53,deformed,no,no,no,yes,yes,yes,no,3,1,bean,lac_central,lacunar,diluted,vesicles,no,yes,4


In [31]:
y_train # labelencoded

array([2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1,
       2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 3, 2,
       2, 1, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 0, 1, 2, 1,
       1, 2, 2, 2, 1, 1, 3, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 1, 0,
       1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 1, 0, 1])

In [89]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
import numpy as np

def lymph_pipeline(X, clf):
  cat_cols = X.select_dtypes(include=["category"]).columns.tolist()
  cat_pipe = make_pipeline(OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), SimpleImputer(strategy="most_frequent"))
  num_pipe = make_pipeline(SimpleImputer(strategy="mean"))
  ct = make_column_transformer((cat_pipe, cat_cols), remainder=num_pipe)
  return make_pipeline(ct, clf)

In [233]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

clfs = [
  ("LinearSVC", LinearSVC(C=0.4, max_iter=1000)), 
  ("LogisticRegression", LogisticRegression(C=0.5)), 
  ("SGDClassifier", SGDClassifier(alpha=0.1, max_iter=1500))]

for clf_name, clf in clfs:
  lp = lymph_pipeline(X, clf)
  cv = cross_validate(lp, X_train, y_train, cv=2, return_train_score=True)
  print(clf_name)
  print(f"Training Score: {cv['train_score']}")
  print(f"Test Score: {cv['test_score']}\n")

LinearSVC
Training Score: [0.93220339 0.98305085]
Test Score: [0.74576271 0.79661017]

LogisticRegression
Training Score: [0.89830508 0.93220339]
Test Score: [0.77966102 0.74576271]

SGDClassifier
Training Score: [0.81355932 0.86440678]
Test Score: [0.77966102 0.84745763]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [217]:
lp = lymph_pipeline(X_train, SGDClassifier(alpha=0.1, max_iter=1500))
lp.fit(X_train, y_train)
test_score = lp.score(X_test, y_test)
print(test_score)

0.8666666666666667
