In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

def read_data(filename, labels, types, sep=None):
    data = dict()
    for label in labels:
        data[label] = []
    
    with open(filename) as fin:
        for line in fin:
            keys = []
            if sep is None:
                keys = line.split()
            else:
                keys = line.split(sep)
            
            for key, label, ttype in zip(keys, labels, types):
                data[label].append(ttype(key))
            
    data['size'] = len(data[labels[0]])
    return data

In [4]:
prefix = '../../data/linear_contest1/'
raw_data = read_data(prefix + 'linear_train.txt', ['words', 'y'], [str, int], sep=', ')

# Генерируем признаки и обрабатываем данные
    Хеши по модулю
    Хеш последних трех символов (логично, что окончание влияет)
    *В одном ли регистре написана (если с большой буквы, то может и фамилия)

In [5]:
def get_by_indices(data, indices):
    new_data = dict()
    new_data['size'] = len(indices)
    
    for key, feature in data.items():
        if key == 'size':
            continue
        new_data[key] = np.array(data[key])[list(indices)]
    
    return new_data


def sample(data, frac, random_state):  # data must have key 'size'
    np.random.seed(random_state)
    indices = set()
    N = int(data['size'] * frac)
    
    while len(indices) < N:
        indices.add(np.random.randint(0, data['size']))
    
    return get_by_indices(data, indices)

In [6]:
frac = .1

train_sample = sample(raw_data, frac=frac, random_state=501)

train_sample['size']

10140

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix

def gen_features(sample, M=10):
    N = sample['size']
    np_hash = np.vectorize(hash)
    
    # 0. Hashes by module M, first M features
    hashes0 = np_hash(sample['words']) % M
    
    # 1. Hashes of last three characters, feature M
    hashes1 = [hash(word[-3:]) for word in sample['words']]
    
    features = coo_matrix(([1] * N + hashes1, (list(range(N)) * 2, list(hashes0) + [M] * N)), shape=(N, M + 1))
    sample['features'] = features

In [8]:
# gen_features(train_sample)
train_sample

{'size': 10140,
 'words': array(['Аалтонен', 'Катон', 'шпиля', ..., 'прегрешениями', 'католиков',
        'ПРЕДАНИЯМИ'], 
       dtype='<U33'),
 'y': array([1, 1, 0, ..., 0, 0, 0])}

In [9]:
from sklearn import metrics
from sklearn.model_selection import KFold

def roc_auc(y, y_pred):
    fpr, tpr, _ = metrics.roc_curve(y_pred, y)
    return metrics.auc(fpr, tpr)


def cross_validation(model, data, M=10, n_splits=5):
    score = 0.
    for train_indices, test_indices in KFold(n_splits=n_splits).split(data['y']):
        train = get_by_indices(data, train_indices)
        test = get_by_indices(data, test_indices)
        gen_features(train, M)
        gen_features(test, M)
        model.fit(train['features'], train['y'])
        score += roc_auc(model.predict_proba(test['features'])[:, 1], test['y'])
    
    return score / n_splits

In [15]:
from sklearn.ensemble import RandomForestClassifier

for n in range(10, 101, 10):
    print(n, cross_validation(RandomForestClassifier(n_estimators=n, n_jobs=3, random_state=501), train_sample))

10 0.599314141025
20 0.603835039564
30 0.607399395756
40 0.612593450988
50 0.613140561982
60 0.613616461546
70 0.614443102129
80 0.61766127595
90 0.617861933024
100 0.619682536284


In [14]:
from sklearn.ensemble import RandomForestClassifier

for n in range(100, 301, 50):
    print(n, cross_validation(RandomForestClassifier(n_estimators=n, n_jobs=3, random_state=501), train_sample))

100 0.614704810181


KeyboardInterrupt: 

In [13]:
for M in range(5, 56, 2):
    print(M, cross_validation(RandomForestClassifier(n_estimators=150, 
                                                     n_jobs=3, 
                                                     random_state=501), 
                              train_sample, M))

NameError: name 'RandomForestClassifier' is not defined

In [2]:
from sklearn.linear_model import LogisticRegression

LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
from sklearn.linear_model import LogisticRegression

for M in range(100, 201, 10):
    print(M, cross_validation(LogisticRegression(random_state=501, n_jobs=3), train_sample, M))

100 0.500409097572
110 0.500409097572
120 0.500409097572
130 0.500409097572
140 0.500409097572
150 0.500409097572
160 0.500409097572
170 0.500409097572
180 0.500409097572
190 0.500409097572
200 0.500409097572


# Для отправки

In [11]:
raw_test = read_data(prefix + 'linear_test.txt', ['words'], [str])
sample_submission = pd.read_csv(prefix + 'linear_ans_example.txt')

In [12]:
%%time
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

M = 25
# M = 15
# model = RandomForestClassifier(n_estimators=150, n_jobs=3, random_state=501)
model = SVC(probability=True, random_state=501)
gen_features(raw_data, M)
model.fit(raw_data['features'], raw_data['y'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=501, shrinking=True,
  tol=0.001, verbose=False)

In [13]:
gen_features(raw_test, M)
sample_submission['Answer'] = model.predict_proba(raw_test['features'])[:, 1]

In [14]:
sample_submission.head()

Unnamed: 0,Id,Answer
0,0,0.083132
1,1,0.105191
2,2,0.093699
3,3,0.073659
4,4,0.083132


In [15]:
sample_submission.to_csv("submission.tsv", sep=',', index=False)