In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('folders\dataset\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
from scipy import stats
df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

In [13]:
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr, ttest_ind

In [17]:
corrs = {}
columns = df.columns.tolist()
for col_a, col_b in itertools.combinations(columns, 2):
    corrs[col_a + '__' + col_b] = pearsonr(df.loc[:, col_a], df.loc[:, col_b])
result = pd.DataFrame.from_dict(corrs, orient='index')
result.columns = ['stat', 'p-value']
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, Pregnancies__Glucose to Age__Outcome
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stat     36 non-null     float64
 1   p-value  36 non-null     float64
dtypes: float64(2)
memory usage: 864.0+ bytes


In [18]:
# если значение p < 0,05, мы приходим к выводу, что существует статистически значимая связь между этими двумя переменными.
# if result['p-value'] >= 0.05:
res = result[result['p-value'] < 0.05]
res.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27 entries, Pregnancies__Glucose to Age__Outcome
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   stat     27 non-null     float64
 1   p-value  27 non-null     float64
dtypes: float64(2)
memory usage: 648.0+ bytes


In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[: , :-1],df.iloc[: , -1], train_size=0.8, random_state=123)

X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [12]:
def objective(trial):
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    tol = trial.suggest_float("tol", 0.0001, 0.01, log=True)
    C = trial.suggest_float("C", 1.0, 10.0, log=True)
    intercept = trial.suggest_categorical("fit_intercept", [True, False])
    solver = trial.suggest_categorical("solver", ["liblinear", "saga"])

    ## Create Model
    classifier = LogisticRegression(penalty=penalty,
                                    tol=tol,
                                    C=C,
                                    fit_intercept=intercept,
                                    solver=solver,
                                    multi_class="auto",
                                   )
    ## Fit Model
    classifier.fit(X_train, Y_train)

    return classifier.score(X_test, Y_test)

In [None]:
%%time

study3 = optuna.create_study(study_name="LogisticRegression", direction="maximize")
study3.optimize(objective, n_trials=8)

In [23]:
print("Best Params : {}".format(study3.best_params))

print("\nBest Accuracy : {}".format(study3.best_value))

Best Params : {'penalty': 'l1', 'tol': 0.009849725351692103, 'C': 6.272253330656255, 'fit_intercept': True, 'solver': 'liblinear'}

Best Accuracy : 0.7987012987012987


In [28]:
classifier = LogisticRegression(**study3.best_params, multi_class="auto")

classifier.fit(X_train, Y_train)

print("Logistic Regression Accuracy on Train Dataset : {}".format(classifier.score(X_train, Y_train)))
print("Logistic Regression Accuracy on Test  Dataset : {}".format(classifier.score(X_test, Y_test)))

Logistic Regression Accuracy on Train Dataset : 0.7768729641693811
Logistic Regression Accuracy on Test  Dataset : 0.8051948051948052


In [None]:
%%time

param_grid = {
              "penalty": ["l1", "l2"],
              "C" : np.linspace(1, 10.0, 25),
              "fit_intercept": [True, False],
              "tol": np.linspace(0.0001, 0.01,10),
              "solver": ["liblinear", "saga"]
             }

grid = RandomizedSearchCV(LogisticRegression(multi_class="auto", max_iter=1000), param_grid, cv=5, n_iter=25, random_state=123)

grid.fit(X_train, Y_train)

grid.best_params_

In [32]:
classifier = LogisticRegression(**grid.best_params_, multi_class="auto")

classifier.fit(X_train, Y_train)

print("Logistic Regression Accuracy on Train Dataset : {}".format(classifier.score(X_train, Y_train)))
print("Logistic Regression Accuracy on Test  Dataset : {}".format(classifier.score(X_test, Y_test)))

Logistic Regression Accuracy on Train Dataset : 0.7719869706840391
Logistic Regression Accuracy on Test  Dataset : 0.8051948051948052
