# Multinomial Logistic Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, make_scorer, confusion_matrix

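Possible values of the target (the best-performing algorithm): SA, RR, RVNS, SD, or None. None is assigned when three or more algorithms tie for best.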

- With the 'liblinear' solver, the problem is decomposed into a "one-vs-rest" scheme: a separate binary classifier is trained for each class.
- The 'saga' solver is the better choice for sparse (L1-penalised) multinomial regression.

Evaluation metrics (see the [scikit-learn documentation on precision, recall and F-measures](https://scikit-learn.org/stable/modules/model_evaluation.html#precision-recall-f-measure-metrics)):

- Note that if all labels are included, “micro”-averaging in a multiclass setting will produce precision, recall and F1 that are all identical to accuracy.



In [2]:
# Per-instance performance summary of the algorithms.
# NOTE(review): read_pickle executes arbitrary code on load -- only use with a trusted file.
resultsSummary = pd.read_pickle('./Data/Performance Data/resultsSummary.pkl')

# Feature table; 'puzzles' and 'source' are dropped so they are not used as model inputs
# (presumably identifier/provenance columns -- confirm against the CSV).
features_raw = pd.read_csv('./Data/features.csv')
features = features_raw.drop(columns=['puzzles', 'source'])

In [3]:
# Summary statistics of success rate (SR) and mean cost-time (meanCT) per algorithm.
sr_summary_cols = ['SA_SR', 'RR_SR', 'RVNS_SR', 'SD_SR']
ct_summary_cols = ['SA_meanCT', 'RR_meanCT', 'RVNS_meanCT', 'SD_meanCT']
resultsSummary[sr_summary_cols + ct_summary_cols].describe()

Unnamed: 0,SA_SR,RR_SR,RVNS_SR,SD_SR,SA_meanCT,RR_meanCT,RVNS_meanCT,SD_meanCT
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.08815,0.08595,0.0621,0.05815,2.201569,2.084878,2.679376,2.775323
std,0.204289,0.20162,0.184411,0.179815,0.491848,0.45856,0.611188,0.628236
min,0.0,0.0,0.0,0.0,0.059631,0.01963,0.0083,0.002653
25%,0.0,0.0,0.0,0.0,2.185729,2.081912,2.481429,2.563323
50%,0.0,0.0,0.0,0.0,2.274711,2.160078,2.726923,2.814924
75%,0.1,0.1,0.05,0.05,2.39013,2.253337,2.980403,3.114227
max,1.0,1.0,1.0,1.0,3.462554,3.209921,4.15569,4.619463


In [4]:
# Feature names, in the same order as the columns of the design matrix.
feat_all = features.columns.tolist()

# Standardise every feature before fitting the penalised models.
scaler = StandardScaler()
X_full = scaler.fit_transform(features.to_numpy())

Actual best algorithm in terms of success rate

In [5]:
# Success-rate columns of the candidate algorithms. Order matters: idxmax
# returns the first column among tied maxima.
sr_cols = ['SA_SR', 'RR_SR', 'RVNS_SR', 'SD_SR']
sr_scores = resultsSummary[sr_cols]

Y = pd.DataFrame()

# actual best algorithm in terms of SR
Y['SR_max'] = sr_scores.max(axis='columns')

# Number of algorithms attaining the row maximum (1 means a unique winner).
# Computed for every row of resultsSummary, so the cell no longer assumes
# exactly 1000 instances.
Y['SR_ties_count'] = sr_scores.eq(Y['SR_max'], axis='index').sum(axis='columns')
Y['SR_best'] = sr_scores.idxmax(axis='columns')

# Handling ties: when three or more algorithms share the maximum, no single
# algorithm is declared best and the label becomes the string 'None'.
# Two-way ties keep the idxmax winner (first column in sr_cols order).
Y['SR_best'] = Y['SR_best'].where(Y['SR_ties_count'] <= 2, 'None')

In [6]:
# Class balance of the success-rate target ('None' marks ties of three or more algorithms).
Y['SR_best'].value_counts()

None       406
SA_SR      275
RR_SR      197
RVNS_SR     70
SD_SR       52
Name: SR_best, dtype: int64

Actual best algorithm in terms of mean cost-time (lowest value is best)

In [7]:
# Mean cost-time columns of the candidate algorithms. Order matters: idxmin
# returns the first column among exactly tied minima.
ct_cols = ['SA_meanCT', 'RR_meanCT', 'RVNS_meanCT', 'SD_meanCT']
ct_scores = resultsSummary[ct_cols]

# Two cost-times closer than this are counted as tied.
# Same value as the original literal 10e-4 (i.e. 1e-3, not 1e-4).
CT_TIE_TOL = 1e-3

# actual best (lowest) mean cost-time per instance
Y['CT_min'] = ct_scores.min(axis='columns')

# Number of algorithms within CT_TIE_TOL of the row minimum. Computed for
# every row of resultsSummary, so the cell no longer assumes exactly 1000
# instances.
Y['CT_ties_count'] = (
    ct_scores.sub(Y['CT_min'], axis='index').abs().lt(CT_TIE_TOL).sum(axis='columns')
)
Y['CT_best'] = ct_scores.idxmin(axis='columns')

# Handling ties: when three or more algorithms are (near-)tied, no single
# algorithm is declared best and the label becomes the string 'None'.
Y['CT_best'] = Y['CT_best'].where(Y['CT_ties_count'] <= 2, 'None')

In [8]:
# Class balance of the mean cost-time target.
Y['CT_best'].value_counts()

RR_meanCT      783
SA_meanCT      172
RVNS_meanCT     30
SD_meanCT       15
Name: CT_best, dtype: int64

Tuning and training multinomial logistic regression model.

In [9]:
def LogRegModelMul(X,Y,Yhat, mod,avg,sol = 'saga',maxit=5000):
    """Tune, fit and cross-validate an L1-penalised logistic regression for one target.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Its columns are paired positionally with the
        module-level ``feat_all`` list when reporting selected features.
    Y : pandas.DataFrame
        Frame holding the target columns; ``Y[mod]`` is the label vector.
    Yhat : pandas.DataFrame
        Modified in place: ``Yhat[mod]`` receives the predictions of the
        final model on ``X`` (in-sample predictions).
    mod : str
        Name of the target column in ``Y`` (and of the column written to ``Yhat``).
    avg : str
        ``average`` argument forwarded to precision, recall and F1.
    sol : str, default 'saga'
        Solver passed to both scikit-learn estimators.
    maxit : int, default 5000
        ``max_iter`` passed to both scikit-learn estimators.

    Returns
    -------
    C_best : float
        First entry of ``C_`` from the fitted ``LogisticRegressionCV``.
    avgScores : dict
        Mean 10-fold cross-validated accuracy, precision, recall and F1.
    selFeats : dict
        Feature name -> coefficient, for the non-zero entries of ``coef_[0]``.
    """
    y = Y[mod]

    # finding best C for regularisation
    lgc = LogisticRegressionCV(Cs=20,penalty='l1',solver=sol, max_iter=maxit,
        cv=StratifiedKFold(5,shuffle=True,random_state=111),random_state=111).fit(X,y)
    C_best = lgc.C_[0]

    # Final model, refitted on all of X with the selected C.
    lg = LogisticRegression(penalty='l1',C=C_best,solver=sol,random_state=111,max_iter=maxit).fit(X,y)

    metrics = {'accuracy':make_scorer(accuracy_score),
        'precision': make_scorer(precision_score,zero_division=1,average=avg),
        'recall':make_scorer(recall_score,zero_division=1,average=avg),
        'f1':make_scorer(f1_score,zero_division=1,average=avg)}
    # Evaluate on the X passed in (previously the global X_full was used here,
    # silently ignoring the function's own argument).
    # NOTE(review): C_best was tuned on the same data, so these scores are
    # likely optimistic; nested CV would be needed for an unbiased estimate.
    scores = cross_validate(lg, X, y,scoring=metrics,cv=10)
    avgScores = {m: np.mean(scores['test_'+m]) for m in metrics}

    # NOTE(review): only the first row of coef_ is inspected. For a multiclass
    # fit coef_ has one row per class, so features that are non-zero only for
    # other classes are not reported -- confirm this is intended.
    # Pairing with feat_all replaces the hard-coded feature count of 54.
    first_class_coefs = lg.coef_[0]
    selFeats = {name: coef for name, coef in zip(feat_all, first_class_coefs) if coef != 0}

    Yhat[mod] = lg.predict(X)

    return C_best, avgScores, selFeats


In [10]:
# Containers filled by the training loop, one entry/column per target name.
Yhat = pd.DataFrame()                      # predicted labels
ScoresSel, CsSel, FeatsSel = {}, {}, {}    # CV scores, selected C, selected features

In [11]:
# Fit one model per target and store its selected C, CV scores and selected features.
for mod in ('SR_best', 'CT_best'):
    print(mod)
    best_C, cv_scores, kept_feats = LogRegModelMul(
        X_full, Y, Yhat, mod, avg='weighted', sol='saga', maxit=5000
    )
    CsSel[mod], ScoresSel[mod], FeatsSel[mod] = best_C, cv_scores, kept_feats


SR_best
CT_best


In [12]:
# One row per target, one column per cross-validated metric, expressed in percent.
score_rows = list(ScoresSel.values())
target_names = list(ScoresSel.keys())
ScoresSelTable = pd.DataFrame(score_rows, index=target_names).mul(100)

# Leading column: number of features with a reported non-zero coefficient.
n_selected = [len(sel) for sel in FeatsSel.values()]
ScoresSelTable.insert(0, 'feat', n_selected)

In [13]:
# Show metrics as rows and targets as columns.
ScoresSelTable.transpose()

Unnamed: 0,SR_best,CT_best
feat,10.0,0.0
accuracy,50.7,78.3
precision,62.881735,83.011
recall,50.7,78.3
f1,37.812224,68.771238


In [15]:
# Distribution of the labels predicted by the SR model on the data it was fitted on.
Yhat['SR_best'].value_counts()

None     658
SA_SR    338
RR_SR      4
Name: SR_best, dtype: int64

C and features selected

In [16]:
# Regularisation strength C selected by LogisticRegressionCV for each target.
CsSel

{'SR_best': 0.08858667904100823, 'CT_best': 0.0001}

In [14]:
# Features reported with a non-zero coefficient in the SR model (taken from coef_[0] only).
FeatsSel['SR_best'].keys()

dict_keys(['fixedDig_min', 'fixedDig_entropy', 'counts_naked1', 'counts_naked2', 'counts_CV', 'counts_min', 'value_min', 'LPslack_entropy', 'GCP_nDeg_mean', 'GCP_nDeg_std'])

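A very small C (i.e. very strong L1 regularisation) was selected for the CT model, so no feature has a non-zero coefficient. This is consistent with the model simply predicting the majority class: its accuracy (78.3%) equals the share of RR_meanCT in the target (783 of 1000).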

In [17]:
# Features reported with a non-zero coefficient in the CT model; an empty dict means none.
FeatsSel['CT_best']

{}