In [48]:
import warnings
warnings.filterwarnings("ignore")


In [49]:
import numpy as np
import pandas as pd

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import base
from sklearn.model_selection import train_test_split

Загружаем данные из train.csv и test.csv

In [51]:
# Read both CSV files, using the first column of each as the row index.
train = pd.read_csv('train.csv', index_col=0)
test_data = pd.read_csv('test.csv', index_col=0).values

# Feature matrix: the first 400 columns of the training table, as an array.
train_data = train.iloc[:, 0:400].values

# Target vector: the 'Category' column of the training table, as an array.
Category = train['Category'].values

In [52]:
# Seed NumPy's global RNG: train_test_split is called without random_state,
# so this seed is what makes the shuffle repeatable.
np.random.seed(0)

# Split the loaded feature matrix `train_data` (no `data` variable is defined
# in the cells above) into a training part and a 2000-row hold-out part.
X_train, X_test, Y_train, Y_test = train_test_split(train_data, Category, test_size=2000)

# Display the shapes of the four resulting arrays.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((8000, 400), (8000,), (2000, 400), (2000,))

Используем простейший метод ансамблирования из семинаров. Несмотря на все его минусы, он в итоге дает результат лучше, чем просто логистическая регрессия.

In [53]:
class DummyEnsemble(object):
    """Averaging ensemble built from clones of a single base classifier.

    Every member is a clone of ``base_estimator``. All members are fitted on
    the same ``(X, Y)`` and their ``predict_proba`` outputs are averaged.

    Parameters
    ----------
    base_estimator : estimator or None, default=None
        Classifier exposing ``fit`` and ``predict_proba``. When ``None``, a
        ``DecisionTreeClassifier(max_depth=1)`` is used.
    n_estimators : int, default=10
        Number of clones kept in the ensemble.

    Attributes
    ----------
    b : list
        The cloned member estimators.
    classes_ : ndarray
        Sorted unique labels passed to ``fit``; available only after fitting.
    """

    def __init__(self, base_estimator=None, n_estimators=10):
        self.n_estimators = n_estimators
        # Compare with None explicitly: truth-testing an object calls its
        # __len__ when it defines one, which is unrelated to the question
        # "was an estimator supplied?".
        if base_estimator is None:
            base_estimator = DecisionTreeClassifier(max_depth=1)
        self.base_estimator = base_estimator
        self.b = [base.clone(self.base_estimator) for _ in range(self.n_estimators)]

    def predict(self, X):
        """Return the label with the highest averaged probability for each row of X."""
        probas = self.predict_proba(X)
        best_columns = np.argmax(probas, axis=1)
        classes = getattr(self, 'classes_', None)
        if classes is None:
            # fit() was not called on this object (e.g. members were fitted
            # elsewhere): fall back to returning the column indices.
            return best_columns
        # Translate probability-column indices into the actual class labels,
        # so labels that are not exactly 0..K-1 are predicted correctly.
        return classes[best_columns]

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        ``deep`` is accepted for compatibility with the scikit-learn
        estimator API and is ignored.
        """
        return {'n_estimators': self.n_estimators,
                'base_estimator': self.base_estimator}

    def fit(self, X, Y):
        """Fit every member on the same (X, Y) and return ``self``."""
        for member in self.b:
            member.fit(X, Y)
        # scikit-learn classifiers order predict_proba columns by the sorted
        # unique labels; keep that ordering so predict() can map indices back.
        self.classes_ = np.unique(Y)
        return self

    def predict_proba(self, X):
        """Return the element-wise mean of the members' ``predict_proba(X)``."""
        return np.mean([member.predict_proba(X) for member in self.b], axis=0)

Обучаем модель

In [54]:
# Ensemble of logistic regressions, scored by accuracy with repeated
# stratified 4-fold cross-validation (2 repeats) on the full training data.
model = DummyEnsemble(LogisticRegression())
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=4, random_state=1)
n_scores = cross_val_score(
    model,
    train_data,
    Category,
    cv=cv,
    scoring='accuracy',
    error_score='raise',  # surface fitting errors instead of recording a score
    n_jobs=-1,
)
# Report the mean accuracy over all folds and repeats.
print('Score: %.3f' % n_scores.mean())


Score: 0.159


In [55]:
# Build a fresh ensemble of logistic regressions and fit it on the training
# part of the split, with the labels cast to integers.
model = DummyEnsemble(base_estimator=LogisticRegression())
model.fit(X=X_train, Y=Y_train.astype(int))

Записываем в файл

In [56]:
# Predict integer labels for the test set. A dedicated name is used so the
# training labels stored in `Category` are not overwritten, which keeps the
# earlier cells re-runnable after this one.
test_predictions = model.predict(test_data).astype(int)

# One sequential id per prediction, derived from the number of predictions
# rather than a hard-coded 1000, so any test-set size produces a valid frame.
Id = np.arange(len(test_predictions))

# Assemble the two-column answer table and write it without the row index.
model_answer = {'Id': Id, 'Category': test_predictions}
df = pd.DataFrame(data=model_answer)
df.to_csv('model_answer.csv', index=False)