In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tqdm.notebook import tqdm
from sklearn.manifold import TSNE
import seaborn as sns

In [4]:
from preprocessing import Process
from Constants import random_state
from feature_selection import select

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
df = pd.read_csv('data/train.csv')
X_test = pd.read_csv('data/test.csv')

In [7]:
df = df.dropna().reset_index(drop=True)
#indices whre Sex is unknown
index_to_drop = [i for i, x in enumerate(df['SexuponOutcome'].apply(lambda s: s=='Unknown')) if x] 
df = df.drop(index_to_drop).reset_index(drop=True)

X_train, y_train = df.drop(['Outcome'], axis=1), df['Outcome'] 
process = Process()
X_train = process.fit_transform(X_train)

# deal with unknown in Sex
X_test['SexuponOutcome'] = X_test['SexuponOutcome'].apply(lambda s: ('Neutered Male' if s == 'Unknown' else s))
X_test = process.transform(X_test)

del df

In [8]:
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(X_train, y_train, test_size=0.33,
                                                                          stratify=y_train,
                                                                          random_state=random_state) 

# LogisticRegression

In [27]:
clf = LogisticRegression(random_state=random_state)
params = {'C': np.logspace(-3, 3, num=10),
          'penalty': ['l1', 'l2', 'none'],
          'max_iter': np.logspace(2, 3, num=10).astype(int),
          'class_weight': ['balanced', None]}

gs = RandomizedSearchCV(clf, random_state=random_state, param_distributions=params, scoring='f1_macro', n_iter=20,
                        n_jobs=-1, verbose=1)
search = gs.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.0min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
cv_results = cross_validate(search.best_estimator_, X_train, y_train, scoring='f1_macro')
cv_results['test_score'].mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.3315282620204568

In [32]:
search.best_params_

{'penalty': 'none',
 'max_iter': 464,
 'class_weight': None,
 'C': 0.004641588833612777}

In [36]:
clf = LogisticRegression(random_state=random_state, **search.best_params_)
clf.fit(X_train_train, y_train_train)
y_pred = clf.predict(X_train_val)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
print(classification_report(y_train_val, y_pred))
print(pd.DataFrame(confusion_matrix(y_train_val, y_pred)))

              precision    recall  f1-score   support

           0       0.61      0.84      0.71      2093
           1       0.58      0.32      0.42      1029
           2       0.48      0.39      0.43      1069
           3       0.32      0.09      0.14       168
           4       0.00      0.00      0.00        17

    accuracy                           0.58      4376
   macro avg       0.40      0.33      0.34      4376
weighted avg       0.56      0.58      0.55      4376

      0    1    2   3  4
0  1757  106  220   6  4
1   522  334  162  10  1
2   545   89  415  16  4
3    57   33   63  15  0
4     6   10    1   0  0


In [194]:
clf = LogisticRegression(random_state=random_state, class_weight='balanced')
cv_results = cross_validate(clf, X_train, y_train, scoring='f1_macro')
cv_results['test_score'].mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.3208383391500714

In [40]:
clf = LogisticRegression(random_state=random_state, class_weight='balanced', max_iter=500)
clf.fit(X_train_train, y_train_train)
y_pred = clf.predict(X_train_val)

In [41]:
print(classification_report(y_train_val, y_pred))
print(pd.DataFrame(confusion_matrix(y_train_val, y_pred)))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67      2093
           1       0.47      0.18      0.26      1029
           2       0.46      0.41      0.43      1069
           3       0.14      0.54      0.22       168
           4       0.02      0.47      0.05        17

    accuracy                           0.48      4376
   macro avg       0.35      0.45      0.33      4376
weighted avg       0.55      0.48      0.50      4376

      0    1    2    3    4
0  1387  114  328  162  102
1   352  187  159  168  163
2   281   85  436  232   35
3    23    9   32   91   13
4     3    4    0    2    8


In [42]:
pd.DataFrame(confusion_matrix(y_train_val, y_pred))

Unnamed: 0,0,1,2,3,4
0,1387,114,328,162,102
1,352,187,159,168,163
2,281,85,436,232,35
3,23,9,32,91,13
4,3,4,0,2,8


# Feature selection

In [9]:
feat = select(X_train, y_train)



In [10]:
X_train_selected_feat = X_train[feat]
X_train_train_selected_feat = X_train_train[feat]
X_train_val_selected_feat = X_train_val[feat]

In [11]:
clf = LogisticRegression(random_state=random_state, max_iter=500)
cv_results = cross_validate(clf, X_train_selected_feat, y_train, scoring='f1_macro')
cv_results['test_score'].mean()

0.3269655254983165

# SVM

In [11]:
params = {'C': np.logspace(-3, 3, num=5)}

score = None
best_C = None

process_bar = tqdm(total=len(params['C']))
for C in params['C']:
    clf = SVC(random_state=random_state, class_weight='balanced', C=C)
    clf.fit(X_train_train_selected_feat, y_train_train)
    y_pred = clf.predict(X_train_val_selected_feat)
    res = f1_score(y_train_val, y_pred, average='macro')
    if (score is None or res > score):
        score = res
        best_C = C
    process_bar.update(1)
print("score: ", score)
print(best_C)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

score:  0.3502475356825919
31.622776601683793


In [12]:
clf = SVC(random_state=random_state, C=31, class_weight='balanced')
clf.fit(X_train_train_selected_feat, y_train_train)
y_pred = clf.predict(X_train_val_selected_feat)

In [13]:
print(classification_report(y_train_val, y_pred))
print(pd.DataFrame(confusion_matrix(y_train_val, y_pred)))

              precision    recall  f1-score   support

           0       0.68      0.64      0.66      2093
           1       0.48      0.30      0.37      1029
           2       0.44      0.46      0.45      1069
           3       0.13      0.40      0.20       168
           4       0.03      0.29      0.06        17

    accuracy                           0.51      4376
   macro avg       0.35      0.42      0.35      4376
weighted avg       0.55      0.51      0.52      4376

      0    1    2    3   4
0  1330  177  377  135  74
1   324  313  213  126  53
2   261  122  495  178  13
3    31   28   39   68   2
4     1    6    2    3   5


# KNN

In [17]:
clf = KNeighborsClassifier()
clf.fit(X_train_train_selected_feat, y_train_train)
y_pred = clf.predict(X_train_val_selected_feat)

In [18]:
print(classification_report(y_train_val, y_pred))
print(pd.DataFrame(confusion_matrix(y_train_val, y_pred)))

              precision    recall  f1-score   support

           0       0.59      0.80      0.68      2093
           1       0.44      0.34      0.38      1029
           2       0.44      0.31      0.36      1069
           3       0.17      0.03      0.05       168
           4       0.00      0.00      0.00        17

    accuracy                           0.54      4376
   macro avg       0.33      0.29      0.30      4376
weighted avg       0.50      0.54      0.51      4376

      0    1    2   3  4
0  1670  209  211   3  0
1   523  348  152   6  0
2   549  177  328  15  0
3    65   50   48   5  0
4     4   10    3   0  0


  _warn_prf(average, modifier, msg_start, len(result))
