In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, KFold
# Show which files are present in the matrix input directory
# (the .npz files loaded further down with sparse.load_npz).
print(os.listdir("../input/matrix20000npz"))

['X_test_matrix_20000.npz', 'X_train_matrix_20000.npz']


In [2]:
# Columns of the training frame that are thresholded into identity flags.
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

# Additional label columns; not referenced by the other cells shown here.
AUX_COLUMNS = [
    'target',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
]

In [3]:
# Read the competition train and test CSV files.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Binarise at 0.5: a value of at least 0.5 becomes 1, anything else 0.
# y_train is 1-D (one entry per row); train_y_identity is 2-D with one
# column per entry of identity_columns.
y_train = (train['target'] >= 0.5).values.astype(int)
train_y_identity = (train[identity_columns].values >= 0.5).astype(int)

In [4]:
# Collapse the per-identity flags into a single 0/1 label per row.
# The flags are recomputed from `train` instead of reading the previous value
# of `train_y_identity`, so re-running this cell gives the same result rather
# than failing on `axis=1` once the variable has already become 1-D.
# NOTE(review): `> 1` labels a row positive only when at least TWO identity
# columns are >= 0.5. If the intent is "mentions any identity" this should be
# `>= 1` — confirm before changing, as it alters the training target.
train_y_identity = np.where((train[identity_columns].values >= 0.5).sum(axis=1) > 1, 1, 0)

In [5]:
# Load the stored sparse feature matrices for the train and test sets.
X_train_matrix = sparse.load_npz("../input/matrix20000npz/X_train_matrix_20000.npz")
# The test features come from their own file. Loading the train file here
# (as was done before) makes the "test" predictions a copy of train predictions.
X_test_matrix = sparse.load_npz("../input/matrix20000npz/X_test_matrix_20000.npz")

In [6]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression trained on features rescaled by a log-count ratio.

    ``fit`` computes, for every feature column, the log of the ratio between
    the add-one-smoothed feature mass of the rows labelled 1 and of the rows
    labelled 0. Every sample is multiplied element-wise by that ratio before
    being passed to ``LogisticRegression``; the same scaling is applied at
    prediction time. Labels are expected to be 0/1 because the ratio is
    computed for exactly those two values.

    Parameters
    ----------
    C : float, default=1.0
        Forwarded to ``LogisticRegression(C=...)``.
    dual : bool, default=False
        Forwarded to ``LogisticRegression(dual=...)``.
    n_jobs : int, default=1
        Forwarded to ``LogisticRegression(n_jobs=...)``.

    Attributes
    ----------
    _r : scipy.sparse.csr_matrix
        Per-feature log-count ratio (a single row), set by ``fit``.
    _clf : LogisticRegression
        The fitted underlying classifier, set by ``fit``.
    classes_ : ndarray
        Class labels as reported by the underlying classifier.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def _scale(self, x):
        """Return ``x`` as CSR with every row multiplied by the fitted ratio."""
        # Coercing to CSR first lets dense arrays be used as well as sparse
        # matrices; a plain ndarray has no ``.multiply`` method.
        return sparse.csr_matrix(x).multiply(self._r)

    def predict(self, x):
        """Return the predicted class label for each row of ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(self._scale(x))

    def predict_proba(self, x):
        """Return class probabilities for each row of ``x`` (one column per class)."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(self._scale(x))

    def fit(self, x, y):
        """Compute the log-count ratio from ``x``/``y`` and fit the classifier.

        Parameters
        ----------
        x : sparse matrix or array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) with 0/1 labels

        Returns
        -------
        self
        """
        x, y = check_X_y(x, y, accept_sparse=True)
        # check_X_y lets dense input through; normalise to CSR so that the
        # boolean row selection and column sums below work for both kinds.
        x = sparse.csr_matrix(x)

        def smoothed_mass(label):
            # Column sums over the rows of this class, with add-one smoothing
            # on both the sums and the row count.
            mask = y == label
            return (x[mask].sum(0) + 1) / (mask.sum() + 1)

        self._r = sparse.csr_matrix(np.log(smoothed_mass(1) / smoothed_mass(0)))
        self._clf = LogisticRegression(
            C=self.C, dual=self.dual, n_jobs=self.n_jobs
        ).fit(self._scale(x), y)
        # Expose the label set the way scikit-learn classifiers conventionally do.
        self.classes_ = self._clf.classes_
        return self

In [7]:
# Cross-validation settings.
seed = 123
n_splits = 10

# Prediction buffers, zero-initialised with one float slot per row.
train_preds = np.zeros(y_train.shape[0])
test_preds = np.zeros(test.shape[0])

In [8]:
# Contiguous (unshuffled) K-fold split of the training rows.
# `random_state` is intentionally not passed: it has no effect when
# shuffle=False, and current scikit-learn raises a ValueError for that
# combination. The resulting folds are the same as before. To get shuffled
# folds instead, use shuffle=True together with random_state=seed.
splits = list(KFold(n_splits=n_splits, shuffle=False).split(X_train_matrix, train_y_identity))

In [9]:
# Out-of-fold predictions: for every fold in `splits`, fit a fresh model on
# the training indices and store the positive-class probability for the
# held-out indices.
for fold, (train_idx, valid_idx) in enumerate(splits):
    NbSvm = NbSvmClassifier(C=0.5, dual=False, n_jobs=-1).fit(
        X_train_matrix[train_idx],
        train_y_identity[train_idx],
    )
    train_preds[valid_idx] = NbSvm.predict_proba(X_train_matrix[valid_idx])[:, 1]

In [10]:
# Refit on all training rows, then keep the positive-class probability
# (column 1 of predict_proba) for every test row.
NbSvm = NbSvmClassifier(C=0.5, dual=False, n_jobs=-1).fit(X_train_matrix, train_y_identity)
test_preds = NbSvm.predict_proba(X_test_matrix)[:, 1]

In [11]:
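# Evaluation is disabled: `train_y_identity_eval` and `prediction` are not
# defined anywhere in this notebook, so these lines cannot run as written.
#print(roc_auc_score(train_y_identity_eval, prediction))
#print(accuracy_score(np.where(train_y_identity_eval>=0.5,1,0), np.where(prediction>=0.5,1,0)))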

In [12]:
# Write each prediction vector to its own single-column CSV, without the index.
for preds, out_path in (
    (train_preds, 'train_identity.csv'),
    (test_preds, 'test_identity.csv'),
):
    pd.DataFrame(preds).to_csv(out_path, index=False)