In [195]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

def read_data(filename, labels, types, sep=None):
    data = dict()
    for label in labels:
        data[label] = []
    
    with open(filename) as fin:
        for line in fin:
            keys = []
            if sep is None:
                keys = line.split()
            else:
                keys = line.split(sep)
            
            for key, label, ttype in zip(keys, labels, types):
                data[label].append(ttype(key))
            
    data['size'] = len(data[labels[0]])
    return data

In [196]:
prefix = '../../data/linear_contest1/'
raw_data = read_data(prefix + 'linear_train.txt', ['words', 'y'], [str, int], sep=', ')

# Генерируем признаки и обрабатываем данные
    Хеши по модулю
    Хеш последних трех символов (логично, что окончание влияет)
    *В одном ли регистре написана (если с большой буквы, то может и фамилия)

In [197]:
def get_by_indices(data, indices):
    new_data = dict()
    new_data['size'] = len(indices)
    
    for key, feature in data.items():
        if key == 'size':
            continue
        new_data[key] = np.array(data[key])[list(indices)]
    
    return new_data


def sample(data, frac, random_state):  # data must have key 'size'
    np.random.seed(random_state)
    indices = set()
    N = int(data['size'] * frac)
    
    while len(indices) < N:
        indices.add(np.random.randint(0, data['size']))
    
    return get_by_indices(data, indices)

In [198]:
frac = .1

train_sample = sample(raw_data, frac=frac, random_state=501)

train_sample['size']

10140

In [289]:
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import coo_matrix

def gen_features(sample, ngram_range=(1, 4), n_features=2 ** 20):
    hashes = HashingVectorizer(ngram_range=ngram_range, 
                               analyzer='char', 
                               n_features=n_features).fit_transform(sample['words'])
    new_sample = dict(sample)
    new_sample['features'] = hashes
    return new_sample

In [279]:
gen_features(train_sample)

{'features': <10140x1048576 sparse matrix of type '<class 'numpy.float64'>'
 	with 212631 stored elements in Compressed Sparse Row format>,
 'size': 10140,
 'words': array(['Аалтонен', 'Катон', 'шпиля', ..., 'прегрешениями', 'католиков',
        'ПРЕДАНИЯМИ'], 
       dtype='<U33'),
 'y': array([1, 1, 0, ..., 0, 0, 0])}

In [276]:
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

def roc_auc(y, y_pred):
    fpr, tpr, _ = metrics.roc_curve(y_pred, y)
    return metrics.auc(fpr, tpr)


def cross_validation(model, data, n_splits=5, **kwargs):
    score = 0.
    for train_indices, test_indices in KFold(n_splits=n_splits).split(data['y']):
        train = get_by_indices(data, train_indices)
        test = get_by_indices(data, test_indices)
        train = gen_features(train, **kwargs)
        test = gen_features(test, **kwargs)
        model.fit(train['features'], train['y'])
        score += roc_auc(model.predict_proba(test['features'])[:, 1], test['y'])
    
    return score / n_splits

In [300]:
from sklearn.linear_model import LogisticRegression

# print(cross_validation(LogisticRegression(random_state=501, n_jobs=3, C=3.8, dual=True), 
#                        raw_data,
#                        ngram_range=(1, 4)))

print(cross_validation(LogisticRegression(random_state=501, n_jobs=3, C=5.0, dual=True), 
                       raw_data,
                       ngram_range=(1, 4),
                       n_features=262144))

0.844635776661


0.8458

In [None]:
from sklearn.linear_model import LogisticRegression

for C in np.arange(3., 5.01, 0.2):
    print(C, cross_validation(LogisticRegression(random_state=501, n_jobs=3, C=C, dual=True), 
                              raw_data,
                              ngram_range=(1, 4),
                              n_features=262144))

3.0 0.844921861779
3.2 0.844989852439
3.4 0.845033025448
3.6 0.845045724419


In [296]:
for n_features in [2 ** i for i in range(15, 23)]:
    print(n_features, cross_validation(LogisticRegression(random_state=501, n_jobs=3, C=3.8), 
                              train_sample,
                              ngram_range=(1, 4),
                              n_features=n_features))

32768 0.802397299792
65536 0.803444985897
131072 0.80392836637
262144 0.804428782149
524288 0.803693258327
1048576 0.803800561423
2097152 0.804000592551
4194304 0.804250318339


In [281]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

model = LogisticRegression()
clf = GridSearchCV(model, 
                   {
                       'random_state': (501,), 
                       'C': np.arange(1, 6, 0.1),
                       'penalty': ('l2',)
                   }, 
                   n_jobs=3,
                   scoring=make_scorer(roc_auc),
                   verbose=1)

data = gen_features(train_sample, ngram_range=(1, 4))
clf.fit(data['features'], data['y'])

print(*sorted(list(zip(clf.cv_results_['mean_test_score'], clf.cv_results_['params'])), 
              key=lambda x: x[0])[-5:], sep='\n')

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   29.2s
[Parallel(n_jobs=3)]: Done 150 out of 150 | elapsed:  2.1min finished


(0.81800827719452041, {'C': 2.0000000000000009, 'penalty': 'l2', 'random_state': 501})
(0.8183958393179217, {'C': 1.6000000000000005, 'penalty': 'l2', 'random_state': 501})
(0.82574306968175415, {'C': 1.3000000000000003, 'penalty': 'l2', 'random_state': 501})
(0.82749149731093952, {'C': 1.5000000000000004, 'penalty': 'l2', 'random_state': 501})
(0.84174974507011713, {'C': 1.4000000000000004, 'penalty': 'l2', 'random_state': 501})


In [231]:
print(*sorted(list(zip(clf.cv_results_['mean_test_score'], clf.cv_results_['params'])), 
              key=lambda x: x[0])[-5:], sep='\n')

(0.76156287049395088, {'C': 5.3000000000000043, 'penalty': 'l2', 'random_state': 501})
(0.76215475423129719, {'C': 5.5000000000000036, 'penalty': 'l2', 'random_state': 501})
(0.76316474013881208, {'C': 5.4000000000000039, 'penalty': 'l2', 'random_state': 501})
(0.76431691384592904, {'C': 1.0, 'penalty': 'l2', 'random_state': 501})
(0.76761605604853467, {'C': 1.1000000000000001, 'penalty': 'l2', 'random_state': 501})


# Для отправки

In [233]:
prefix = '../../data/linear_contest1/'
raw_data = read_data(prefix + 'linear_train.txt', ['words', 'y'], [str, int], sep=', ')
raw_test = read_data(prefix + 'linear_test.txt', ['words'], [str])
sample_submission = pd.read_csv(prefix + 'linear_ans_example.txt')

In [256]:
%%time
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# M = 25
# M = 15
# model = RandomForestClassifier(n_estimators=150, n_jobs=3, random_state=501)
# model = SVC(probability=True, random_state=501)
C = 3.8
ngram_range = (1, 4)
n_features = 2 ** 20
model = LogisticRegression(random_state=501, n_jobs=3, C=5.3, dual=True)
data = gen_features(raw_data, ngram_range=ngram_range, n_features=n_features)
print(model.fit(data['features'], data['y']))

LogisticRegression(C=5.3, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=3,
          penalty='l2', random_state=501, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
CPU times: user 6.67 s, sys: 16.7 ms, total: 6.68 s
Wall time: 6.78 s


In [260]:
test = gen_features(raw_test, ngram_range=ngram_range, n_features=n_features)
sample_submission['Answer'] = model.predict_proba(test['features'])[:, 1]

In [259]:
sample_submission.to_csv("submission.tsv", sep=',', index=False)