# Algorithms

In [24]:
import pandas as pd
df=pd.read_csv('clinical_mastitis_cows_version1.csv')

In [54]:
df=pd.read_csv('clinical_mastitis_cows_version1.csv')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Cow_ID                     6600 non-null   object
 1   Day                        6600 non-null   int64 
 2   Breed                      6600 non-null   object
 3   Months after giving birth  6600 non-null   int64 
 4   Previous_Mastits_status    6600 non-null   int64 
 5   IUFL                       6600 non-null   int64 
 6   EUFL                       6600 non-null   int64 
 7   IUFR                       6600 non-null   int64 
 8   EUFR                       6600 non-null   int64 
 9   IURL                       6600 non-null   int64 
 10  EURL                       6600 non-null   int64 
 11  IURR                       6600 non-null   int64 
 12  EURR                       6600 non-null   int64 
 13  Temperature                6600 non-null   int64 
 14  Hardness

In [37]:
df=pd.read_csv('retail_store_sales.csv')

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12575 entries, 0 to 12574
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    12575 non-null  object 
 1   Customer ID       12575 non-null  object 
 2   Category          12575 non-null  object 
 3   Item              11362 non-null  object 
 4   Price Per Unit    11966 non-null  float64
 5   Quantity          11971 non-null  float64
 6   Total Spent       11971 non-null  float64
 7   Payment Method    12575 non-null  object 
 8   Location          12575 non-null  object 
 9   Transaction Date  12575 non-null  object 
 10  Discount Applied  8376 non-null   object 
dtypes: float64(3), object(8)
memory usage: 1.1+ MB


In [39]:
df.nunique()

Transaction ID      12575
Customer ID            25
Category                8
Item                  200
Price Per Unit         25
Quantity               10
Total Spent           227
Payment Method          3
Location                2
Transaction Date     1114
Discount Applied        2
dtype: int64

In [40]:
df.head()

Unnamed: 0,Transaction ID,Customer ID,Category,Item,Price Per Unit,Quantity,Total Spent,Payment Method,Location,Transaction Date,Discount Applied
0,TXN_6867343,CUST_09,Patisserie,Item_10_PAT,18.5,10.0,185.0,Digital Wallet,Online,2024-04-08,True
1,TXN_3731986,CUST_22,Milk Products,Item_17_MILK,29.0,9.0,261.0,Digital Wallet,Online,2023-07-23,True
2,TXN_9303719,CUST_02,Butchers,Item_12_BUT,21.5,2.0,43.0,Credit Card,Online,2022-10-05,False
3,TXN_9458126,CUST_06,Beverages,Item_16_BEV,27.5,9.0,247.5,Credit Card,Online,2022-05-07,
4,TXN_4575373,CUST_05,Food,Item_6_FOOD,12.5,7.0,87.5,Digital Wallet,Online,2022-10-02,False


In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
encoder=LabelEncoder()

In [None]:
# 1. Drop
df.drop(columns=['Transaction ID','Customer ID'], inplace=True)

# 2. Encoding
for col in df.columns:
    if df[col].dtype=='object':
        if df[col].nunique()<=4:
            dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
            df = pd.concat([df.drop(columns=col), dummies], axis=1)
        else:    
            df[col] = encoder.fit_transform(df[col])
            

# 3. Scaling
num_col = df.select_dtypes(include=['int64','float64']).columns
scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])


In [41]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

encoder = LabelEncoder()
scaler = MinMaxScaler()

# 1️⃣ Drop columns we do not need
# 'Transaction ID' and 'Customer ID' are identifiers and not useful for modeling
df.drop(columns=['Transaction ID','Customer ID'], inplace=True)

# 2️⃣ Handle missing values first (before encoding)
for col in df.columns:
    if df[col].dtype == 'object':
        # Categorical columns → fill missing values with mode
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        # Numeric columns → fill missing values based on skewness
        skew = df[col].skew()
        if skew > 0.5 or skew < -0.5:  # highly skewed → use median
            df[col].fillna(df[col].median(), inplace=True)
        else:  # roughly symmetric → use mean
            df[col].fillna(df[col].mean(), inplace=True)

# 3️⃣ Encode categorical columns
num_cols = []  # list to store numeric columns for scaling later
for col in df.columns:
    if df[col].dtype == 'object':
        if df[col].nunique() <= 4:
            # Few unique categories → create dummy variables (0/1)
            dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
            df = pd.concat([df.drop(columns=col), dummies], axis=1)
            # Add dummy columns to numeric list
            num_cols.extend(dummies.columns.tolist())
        else:
            # Many categories → label encode
            df[col] = encoder.fit_transform(df[col])
            num_cols.append(col)
    else:
        # Numeric columns → add to list
        num_cols.append(col)

# 4️⃣ Scale all numeric columns
# MinMaxScaler scales values to [0,1]
df[num_cols] = scaler.fit_transform(df[num_cols])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values 

In [None]:
df.drop(columns=['Transaction ID','Customer ID'],inplace=True)
for col in df.columns:
    if df[col].dtype=='object':
        if df[col].nunique()<=4:
            dummies=pd.get_dummies(df[col], prefix=col,dtype=int)
            df=pd.concat([df.drop(columns=col),dummies],axis=1)
        else:    
            df[col]=encoder.fit_transform(df[col])                                             # correct


IndentationError: unexpected indent (3189121770.py, line 1)

In [None]:
num_col = df.select_dtypes(include=['int64','float64']).columns

scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])

In [None]:
for col in df.columns:
    if col=='Transaction ID':
        df.drop(['Transaction ID'],axis=1,inplace=True)
        continue
    if df[col].dtype=='object':
        if df[col].nunique()<=4:
            dummies=pd.get_dummies(df[col], prefix=col,dtype=int)
            df=pd.concat([df.drop(columns=col),dummies],axis=1)
        else:    
            df[col]=encoder.fit_transform(df[col])                                                      #Incorrect 

In [32]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler

In [155]:
df.drop(columns=['Transaction Date'], inplace=True)

num_col = df.select_dtypes(include=['int64','float64']).columns

scaler = MinMaxScaler()
df[num_col] = scaler.fit_transform(df[num_col])


In [42]:
df.head()

Unnamed: 0,Category,Item,Price Per Unit,Quantity,Total Spent,Transaction Date,Discount Applied,Payment Method_Cash,Payment Method_Credit Card,Payment Method_Digital Wallet,Location_In-store,Location_Online
0,1.0,0.035176,0.375,1.0,0.444444,0.743935,1.0,0.0,0.0,1.0,0.0,1.0
1,0.857143,0.311558,0.666667,0.888889,0.632099,0.510332,1.0,0.0,0.0,1.0,0.0,1.0
2,0.142857,0.085427,0.458333,0.111111,0.093827,0.248877,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.241206,0.625,0.888889,0.598765,0.113208,1.0,0.0,1.0,0.0,0.0,1.0
4,0.571429,0.864322,0.208333,0.666667,0.203704,0.246181,0.0,0.0,0.0,1.0,0.0,1.0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12575 entries, 0 to 12574
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Category                       12575 non-null  float64
 1   Item                           12575 non-null  float64
 2   Price Per Unit                 12575 non-null  float64
 3   Quantity                       12575 non-null  float64
 4   Total Spent                    12575 non-null  float64
 5   Transaction Date               12575 non-null  float64
 6   Discount Applied               12575 non-null  float64
 7   Payment Method_Cash            12575 non-null  float64
 8   Payment Method_Credit Card     12575 non-null  float64
 9   Payment Method_Digital Wallet  12575 non-null  float64
 10  Location_In-store              12575 non-null  float64
 11  Location_Online                12575 non-null  float64
dtypes: float64(12)
memory usage: 1.2 MB


In [44]:
from sklearn.linear_model import LinearRegression,LogisticRegression

In [45]:
lr=LinearRegression()

In [46]:
lr

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [47]:
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor

In [48]:
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12575 entries, 0 to 12574
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Category                       12575 non-null  float64
 1   Item                           12575 non-null  float64
 2   Price Per Unit                 12575 non-null  float64
 3   Quantity                       12575 non-null  float64
 4   Total Spent                    12575 non-null  float64
 5   Transaction Date               12575 non-null  float64
 6   Discount Applied               12575 non-null  float64
 7   Payment Method_Cash            12575 non-null  float64
 8   Payment Method_Credit Card     12575 non-null  float64
 9   Payment Method_Digital Wallet  12575 non-null  float64
 10  Location_In-store              12575 non-null  float64
 11  Location_Online                12575 non-null  float64
dtypes: float64(12)
memory usage: 1.2 MB


In [None]:
x-----inputlar 
y-----outputlar

In [60]:
num_col=df.select_dtypes(include=['int64']).columns

In [61]:
num_col

Index(['Day', 'Months after giving birth', 'Previous_Mastits_status', 'IUFL',
       'EUFL', 'IUFR', 'EUFR', 'IURL', 'EURL', 'IURR', 'EURR', 'Temperature',
       'Hardness', 'Pain', 'Milk_visibility', 'class1'],
      dtype='object')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Day                        6600 non-null   int64
 1   Months after giving birth  6600 non-null   int64
 2   Previous_Mastits_status    6600 non-null   int64
 3   IUFL                       6600 non-null   int64
 4   EUFL                       6600 non-null   int64
 5   IUFR                       6600 non-null   int64
 6   EUFR                       6600 non-null   int64
 7   IURL                       6600 non-null   int64
 8   EURL                       6600 non-null   int64
 9   IURR                       6600 non-null   int64
 10  EURR                       6600 non-null   int64
 11  Temperature                6600 non-null   int64
 12  Hardness                   6600 non-null   int64
 13  Pain                       6600 non-null   int64
 14  Milk_visibility         

In [58]:
df.drop(['Cow_ID','Breed'],axis=1,inplace=True)

In [62]:
df.head()

Unnamed: 0,Day,Months after giving birth,Previous_Mastits_status,IUFL,EUFL,IUFR,EUFR,IURL,EURL,IURR,EURR,Temperature,Hardness,Pain,Milk_visibility,class1
0,1,1,0,150,180,150,180,150,181,150,181,43,0,0,0,0
1,2,1,0,152,180,152,185,151,180,152,181,43,0,0,0,0
2,3,1,0,152,182,153,186,151,186,153,183,43,0,0,0,0
3,4,1,0,155,183,155,189,155,182,155,186,43,0,0,0,0
4,5,1,0,150,186,150,181,150,185,150,188,43,0,0,0,0


In [63]:
from sklearn.preprocessing import MinMaxScaler

In [64]:
scaler=MinMaxScaler()

In [65]:
scaler

0,1,2
,"feature_range  feature_range: tuple (min, max), default=(0, 1) Desired range of transformed data.","(0, ...)"
,"copy  copy: bool, default=True Set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array).",True
,"clip  clip: bool, default=False Set to True to clip transformed values of held-out data to provided `feature_range`. Since this parameter will clip values, `inverse_transform` may not be able to restore the original data. .. note::  Setting `clip=True` does not prevent feature drift (a distribution  shift between training and test data). The transformed values are clipped  to the `feature_range`, which helps avoid unintended behavior in models  sensitive to out-of-range inputs (e.g. linear models). Use with care,  as clipping can distort the distribution of test data. .. versionadded:: 0.24",False


In [67]:
for col in df.columns:
    df[col]=scaler.fit_transform(df[[col]])

In [68]:
df.head()

Unnamed: 0,Day,Months after giving birth,Previous_Mastits_status,IUFL,EUFL,IUFR,EUFR,IURL,EURL,IURR,EURR,Temperature,Hardness,Pain,Milk_visibility,class1
0,0.0,0.0,0.0,0.104167,0.127273,0.093897,0.119835,0.101523,0.134884,0.149485,0.143541,0.558824,0.0,0.0,0.0,0.0
1,0.2,0.0,0.0,0.114583,0.127273,0.103286,0.140496,0.106599,0.130233,0.159794,0.143541,0.558824,0.0,0.0,0.0,0.0
2,0.4,0.0,0.0,0.114583,0.136364,0.107981,0.144628,0.106599,0.15814,0.164948,0.15311,0.558824,0.0,0.0,0.0,0.0
3,0.6,0.0,0.0,0.130208,0.140909,0.117371,0.157025,0.126904,0.139535,0.175258,0.167464,0.558824,0.0,0.0,0.0,0.0
4,0.8,0.0,0.0,0.104167,0.154545,0.093897,0.123967,0.101523,0.153488,0.149485,0.177033,0.558824,0.0,0.0,0.0,0.0


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Day                        6600 non-null   float64
 1   Months after giving birth  6600 non-null   float64
 2   Previous_Mastits_status    6600 non-null   float64
 3   IUFL                       6600 non-null   float64
 4   EUFL                       6600 non-null   float64
 5   IUFR                       6600 non-null   float64
 6   EUFR                       6600 non-null   float64
 7   IURL                       6600 non-null   float64
 8   EURL                       6600 non-null   float64
 9   IURR                       6600 non-null   float64
 10  EURR                       6600 non-null   float64
 11  Temperature                6600 non-null   float64
 12  Hardness                   6600 non-null   float64
 13  Pain                       6600 non-null   float

In [None]:
x----inputlar
y----outputlar

In [71]:
num_col=df.select_dtypes(include=['float64']).columns

In [72]:
num_col

Index(['Day', 'Months after giving birth', 'Previous_Mastits_status', 'IUFL',
       'EUFL', 'IUFR', 'EUFR', 'IURL', 'EURL', 'IURR', 'EURR', 'Temperature',
       'Hardness', 'Pain', 'Milk_visibility', 'class1'],
      dtype='object')

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6600 entries, 0 to 6599
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Day                        6600 non-null   float64
 1   Months after giving birth  6600 non-null   float64
 2   Previous_Mastits_status    6600 non-null   float64
 3   IUFL                       6600 non-null   float64
 4   EUFL                       6600 non-null   float64
 5   IUFR                       6600 non-null   float64
 6   EUFR                       6600 non-null   float64
 7   IURL                       6600 non-null   float64
 8   EURL                       6600 non-null   float64
 9   IURR                       6600 non-null   float64
 10  EURR                       6600 non-null   float64
 11  Temperature                6600 non-null   float64
 12  Hardness                   6600 non-null   float64
 13  Pain                       6600 non-null   float

In [74]:
x=df.drop('class1',axis=1)

In [76]:
y=df['class1'].astype(int)

In [77]:
y

0       0
1       0
2       0
3       0
4       0
       ..
6595    0
6596    0
6597    0
6598    0
6599    0
Name: class1, Length: 6600, dtype: int64

In [82]:
x

Unnamed: 0,Day,Months after giving birth,Previous_Mastits_status,IUFL,EUFL,IUFR,EUFR,IURL,EURL,IURR,EURR,Temperature,Hardness,Pain,Milk_visibility
0,0.0,0.0,0.0,0.104167,0.127273,0.093897,0.119835,0.101523,0.134884,0.149485,0.143541,0.558824,0.0,0.0,0.0
1,0.2,0.0,0.0,0.114583,0.127273,0.103286,0.140496,0.106599,0.130233,0.159794,0.143541,0.558824,0.0,0.0,0.0
2,0.4,0.0,0.0,0.114583,0.136364,0.107981,0.144628,0.106599,0.158140,0.164948,0.153110,0.558824,0.0,0.0,0.0
3,0.6,0.0,0.0,0.130208,0.140909,0.117371,0.157025,0.126904,0.139535,0.175258,0.167464,0.558824,0.0,0.0,0.0
4,0.8,0.0,0.0,0.104167,0.154545,0.093897,0.123967,0.101523,0.153488,0.149485,0.177033,0.558824,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6595,0.2,0.8,0.0,0.140625,0.168182,0.145540,0.181818,0.142132,0.176744,0.175258,0.186603,0.558824,0.0,0.0,0.0
6596,0.4,0.8,0.0,0.135417,0.181818,0.150235,0.169421,0.147208,0.200000,0.190722,0.215311,0.558824,0.0,0.0,0.0
6597,0.6,0.8,0.0,0.135417,0.190909,0.112676,0.169421,0.162437,0.200000,0.195876,0.167464,0.558824,0.0,0.0,0.0
6598,0.8,0.8,0.0,0.114583,0.195455,0.117371,0.169421,0.121827,0.167442,0.170103,0.177033,0.558824,0.0,0.0,0.0


In [83]:
from sklearn.model_selection import train_test_split

In [85]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

In [86]:
x_train.shape

(4620, 15)

In [87]:
x_test.shape

(1980, 15)

In [88]:
y_train.shape

(4620,)

In [89]:
y_test.shape

(1980,)

In [90]:
from sklearn.linear_model import LogisticRegression

In [91]:
log_reg=LogisticRegression()

In [92]:
log_reg

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [93]:
log_reg.fit(x_train,y_train)

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [94]:
y_pred=log_reg.predict(x_test)

In [95]:
y_pred[1:10]

array([1, 0, 0, 0, 1, 0, 0, 1, 0])

# Evaluation

In [97]:
from sklearn.metrics import accuracy_score

In [98]:
score=accuracy_score(y_test,y_pred)

In [101]:
print('Model accuracy:',score)

Model accuracy: 1.0
