In [3]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import roc_curve, roc_auc_score,RocCurveDisplay
import matplotlib.pyplot as plt 

In [4]:
glass = pd.read_csv('../Cases/Glass_Identification/Glass.csv')
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,building_windows_float_processed
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,building_windows_float_processed
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,headlamps
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,headlamps
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,headlamps
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,headlamps


In [5]:
x , y = glass.drop('Type', axis=1), glass['Type']

In [6]:
glass.value_counts(glass['Type'])

Type
building_windows_non_float_processed    76
building_windows_float_processed        70
headlamps                               29
vehicle_windows_float_processed         17
containers                              13
tableware                                9
Name: count, dtype: int64

In [7]:
# ohe = OneHotEncoder(sparse_output = False, drop = 'first').set_output(transform = 'pandas')
# col_trnf = ColumnTransformer([('OHE', ohe, make_column_selector(dtype_include=object))],
#                              remainder = 'passthrough',
#                              verbose_feature_names_out=False).set_output(transform='pandas')


le = LabelEncoder()
glass['Type'] = le.fit_transform(glass['Type'])
glass

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,0
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,3
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,3
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,3
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,3


In [8]:
x_train, x_test, y_train ,y_test = train_test_split(x,y,random_state=25, test_size=0.3, stratify=y)
warnings.filterwarnings("ignore")

In [9]:
import warnings
warnings.filterwarnings('ignore')

solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
penalties = ['l1', 'l2', 'elasticnet', None]
Cs = np.linspace(0.001, 15, 20)

scores =[]

for s in tqdm(solvers):
    for p in penalties:
        for c in Cs:
            try:
                ls = LogisticRegression(solver=s, penalty=p, C=c)
                ls.fit(x_train, y_train)
                y_pred = ls.predict(x_test)
                y_pred_proba = ls.predict_proba(x_test)
                scores.append([s, p, c, accuracy_score(y_test, y_pred)])
            except:
                continue

df_acc_scores = pd.DataFrame(scores, columns=['solver', 'penalty', 'C', 'accuracy_score'])
df_acc_scores.sort_values('accuracy_score', ascending=False)

100%|██████████| 6/6 [00:14<00:00,  2.46s/it]


Unnamed: 0,solver,penalty,C,accuracy_score
81,newton-cg,l2,0.790421,0.692308
121,newton-cholesky,l2,0.790421,0.692308
42,liblinear,l1,1.579842,0.676923
1,lbfgs,l2,0.790421,0.661538
83,newton-cg,l2,2.369263,0.646154
...,...,...,...,...
40,liblinear,l1,0.001000,0.353846
160,sag,l2,0.001000,0.353846
120,newton-cholesky,l2,0.001000,0.353846
200,saga,l1,0.001000,0.353846


Best Model

## Inferencing on unlabelled data

In [10]:
best_model = LogisticRegression(solver ='newton-cholesky', penalty='l2', C = 0.7904 )
best_model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.7904
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'newton-cholesky'
,max_iter,100


In [11]:
tst = pd.read_csv('../Cases/Glass_Identification/tst_Glass.csv')
pred_class = best_model.predict(tst)
print(pred_class)
# tst['predicted_category'] = le.inverse_transform(pred_class)
tst['predicted_category'] = (pred_class)
tst

['headlamps' 'tableware' 'building_windows_float_processed' 'containers'
 'building_windows_non_float_processed' 'headlamps']


Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,predicted_category
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0,headlamps
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44,tableware
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22,building_windows_float_processed
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1,containers
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001,building_windows_non_float_processed
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89,headlamps
