In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

def read_data(filename, labels=(), types=(), sep=None):
    """Read a delimited text file into a dict of column lists.

    The first line of the file is always treated as a header and skipped;
    it supplies the column names only when ``labels`` is empty.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    labels : sequence of str, optional
        Column names. If empty, they are taken from the header line.
    types : sequence of callables, optional
        One converter per column, applied to each raw string field.
        Defaults to ``str`` for every column.
    sep : str or None, optional
        Field separator handed to ``str.split``; ``None`` splits on runs
        of whitespace.

    Returns
    -------
    dict
        ``data[label]`` is the list of converted values of that column,
        ``data['last']`` holds, per row, the fields beyond the named
        columns, and ``data['size']`` is the length of the first column
        (0 when there are no columns at all).
    """
    data = dict()
    data['last'] = []

    with open(filename) as fin:
        lines = fin.readlines()

    if not labels:
        # str.split(None) already means "split on whitespace", so a single
        # call covers both the default and the explicit-separator case.
        labels = lines[0].strip().split(sep) if lines else []

    if not types:
        types = [str] * len(labels)

    for label in labels:
        data[label] = []

    for line in lines[1:]:
        stripped = line.strip()
        if not stripped:
            # A blank line (typically a trailing newline) carries no record;
            # processing it would desynchronise 'last' from the columns or
            # feed an empty string to the converters.
            continue

        keys = stripped.split(sep)
        for key, label, ttype in zip(keys, labels, types):
            data[label].append(ttype(key))
        data['last'].append(keys[len(labels):])

    data['size'] = len(data[labels[0]]) if labels else 0
    return data

In [6]:
def lsp(s1, s2):
    """Return the length of the longest prefix shared by ``s1`` and ``s2``."""
    for position, (left, right) in enumerate(zip(s1, s2)):
        if left != right:
            return position

    # No mismatch within the overlap: the shorter one is a full prefix.
    return min(len(s1), len(s2))

def init(data):
    """Derive the lemmatisation targets from the 'X' and 'y' columns, in place.

    Each ``y`` is expected to look like ``'<form>+<part>'``. For every
    (x, y) pair the following columns are added to ``data``:

    * ``'form'``         -- the text of ``y`` before the last '+'
    * ``'part'``         -- the text of ``y`` after the last '+'
    * ``'lsp'``          -- length of the prefix shared by ``form`` and ``x``
    * ``'x_ending_len'`` -- number of characters of ``x`` after that prefix
    * ``'y_ending'``     -- the part of ``form`` after that prefix

    Raises
    ------
    ValueError
        If some ``y`` contains no '+' at all.
    """
    forms, parts, shared_lens, x_ending_lens, y_endings = [], [], [], [], []

    for x, y in zip(data['X'], data['y']):
        # Split on the LAST '+' only, so a '+' inside the form itself does
        # not break the two-way unpacking.
        form, part = y.rsplit('+', 1)
        shared = lsp(form, x)

        forms.append(form)
        parts.append(part)
        shared_lens.append(shared)
        x_ending_lens.append(len(x) - shared)
        y_endings.append(form[shared:])

    # Assigned only after the loop succeeded, so a malformed row does not
    # leave half-filled columns behind.
    data['form'] = forms
    data['part'] = parts
    data['lsp'] = shared_lens
    data['x_ending_len'] = x_ending_lens
    data['y_ending'] = y_endings

In [7]:
# Directory holding the contest data, relative to this notebook.
prefix = '../../data/linear_contest2/'
# Training set: comma-separated, column names taken from the header line.
raw_data = read_data(prefix + 'task2_lemmas_train', sep=',')

# Adds the derived 'form', 'part', 'lsp', 'x_ending_len' and 'y_ending'
# columns to raw_data in place.
init(raw_data)

In [8]:
def _as_array(values):
    """Convert ``values`` to an ndarray, keeping ragged rows as objects."""
    try:
        return np.array(values)
    except ValueError:
        # NumPy >= 1.24 refuses to build a ragged array implicitly (rows of
        # different lengths, as can happen in the 'last' column); fall back
        # to a 1-D object array holding each row untouched.
        ragged = np.empty(len(values), dtype=object)
        for position, item in enumerate(values):
            ragged[position] = item
        return ragged


def get_by_indices(data, indices):
    """Return a new data dict restricted to the rows listed in ``indices``.

    Parameters
    ----------
    data : dict
        Column name -> sequence of per-row values. A ``'size'`` entry, if
        present, is not treated as a column.
    indices : iterable of int
        Row positions to keep (a set is accepted; its iteration order
        determines the row order of the result).

    Returns
    -------
    dict
        Every column as a NumPy array indexed by ``indices``, plus
        ``'size'`` set to the number of selected rows.
    """
    # Materialise once instead of once per column.
    positions = list(indices)

    new_data = dict()
    new_data['size'] = len(positions)

    for key, feature in data.items():
        if key == 'size':
            continue
        new_data[key] = _as_array(feature)[positions]

    return new_data


def sample(data, frac, random_state):  # data must have key 'size'
    """Draw a random subset of rows, without replacement.

    Parameters
    ----------
    data : dict
        Data dict with a ``'size'`` entry giving the number of rows.
    frac : float
        Fraction of the rows to keep; ``int(size * frac)`` rows are drawn.
    random_state : int
        Seed for NumPy's global random generator (reseeded on every call).

    Returns
    -------
    dict
        The selected rows, as produced by ``get_by_indices``.

    Raises
    ------
    ValueError
        If more rows are requested than exist (``frac`` > 1).
    """
    total = data['size']
    N = int(total * frac)
    if N > total:
        # Distinct indices are collected in a set, which can never grow past
        # `total`; without this guard the loop below would spin forever.
        raise ValueError(
            'cannot sample %d distinct rows out of %d (frac=%r)' % (N, total, frac)
        )

    np.random.seed(random_state)
    indices = set()

    # Rejection sampling: keep drawing until N distinct positions are found.
    while len(indices) < N:
        indices.add(np.random.randint(0, total))

    return get_by_indices(data, indices)

In [51]:
# Fraction of the training rows kept for quick experiments.
frac = .05

# Fixed seed, so the same subset is drawn on every run.
train_sample = sample(raw_data, frac=frac, random_state=501)

# Number of rows in the subset.
train_sample['size']

5932

In [52]:
# Number of distinct 'x_ending_len' values in the subset.
len(set(train_sample['x_ending_len']))

11

# Определение части речи

In [53]:
from sklearn.feature_extraction.text import HashingVectorizer
from scipy.sparse import coo_matrix

def gen_features(sample, ngram_range=(1, 4), n_features=2 ** 20):
    """Return a copy of ``sample`` with a hashed n-gram 'features' entry.

    The strings in ``sample['X']`` are vectorised with a
    ``HashingVectorizer`` using the ``'char_wb'`` analyzer and L2 row
    normalisation. ``sample`` itself is left unmodified.

    Parameters
    ----------
    sample : dict
        Data dict containing an ``'X'`` column of strings.
    ngram_range : tuple of (int, int)
        Minimum and maximum n-gram length.
    n_features : int
        Number of columns of the hashed feature matrix.
    """
    vectorizer = HashingVectorizer(
        analyzer='char_wb',
        ngram_range=ngram_range,
        n_features=n_features,
        norm='l2',
    )
    enriched = dict(sample)
    enriched['features'] = vectorizer.fit_transform(sample['X'])
    return enriched

In [54]:
# Sanity check: display every column of the subset together with the hashed
# 'features' matrix (the result is not stored).
gen_features(train_sample)

{'Id': array(['1', '32771', '10', ..., '65503', '32743', '32757'], 
       dtype='<U6'),
 'X': array(['vergognerete', 'rivangheremmo', 'computando', ..., 'spazzolerà',
        'pranzato', 'vacasser'], 
       dtype='<U26'),
 'features': <5932x1048576 sparse matrix of type '<class 'numpy.float64'>'
 	with 234628 stored elements in Compressed Sparse Row format>,
 'form': array(['vergognare', 'rivangare', 'computare', ..., 'spazzolare',
        'pranzare', 'vacare'], 
       dtype='<U26'),
 'last': array([[], [], [], ..., [], [], []], dtype=object),
 'lsp': array([7, 6, 7, ..., 7, 6, 4]),
 'part': array(['V', 'V', 'V', ..., 'V', 'V', 'V'], 
       dtype='<U1'),
 'size': 5932,
 'x_ending_len': array([5, 7, 3, ..., 3, 2, 4]),
 'y': array(['vergognare+V', 'rivangare+V', 'computare+V', ..., 'spazzolare+V',
        'pranzare+V', 'vacare+V'], 
       dtype='<U28'),
 'y_ending': array(['are', 'are', 're', ..., 'are', 're', 're'], 
       dtype='<U11')}

In [55]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

def cross_validation(model, data, n_splits=5, X_name='features', y_name='y', **kwargs):
    """Return the mean K-fold accuracy of ``model`` on ``data``.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    data : dict
        Data dict as accepted by ``get_by_indices`` / ``gen_features``.
    n_splits : int
        Number of folds.
    X_name : str
        Key of the feature matrix produced by ``gen_features``.
    y_name : str
        Key of the target column.
    **kwargs
        Forwarded to ``gen_features`` (e.g. ``ngram_range``, ``n_features``).

    Returns
    -------
    float
        Accuracy averaged over the folds.
    """
    fold_scores = []
    for train_indices, test_indices in KFold(n_splits=n_splits).split(data[y_name]):
        # Features are generated per fold, after the split.
        train = gen_features(get_by_indices(data, train_indices), **kwargs)
        test = gen_features(get_by_indices(data, test_indices), **kwargs)

        model.fit(train[X_name], train[y_name])
        predicted = model.predict(test[X_name])
        # scikit-learn's convention is (y_true, y_pred).
        fold_scores.append(accuracy_score(test[y_name], predicted))

    return sum(fold_scores) / n_splits

In [67]:
%%time
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression
# 0.9024 (score noted for this experiment)
# Alternative model that was tried, kept for reference:
# print(cross_validation(LogisticRegression(random_state=501, C=20, dual=True),

# 5-fold CV accuracy for the 'part' target with character n-grams of length 1-6.
print(cross_validation(PassiveAggressiveClassifier(random_state=501, C=20., n_iter=15), 
                       train_sample,
                       X_name='features',
                       y_name='part',
                       ngram_range=(1, 6),
                       n_features=2 ** 20))

0.90239525722
CPU times: user 3.86 s, sys: 83.3 ms, total: 3.94 s
Wall time: 4.02 s


In [72]:
%%time
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

# 0.8356 (score noted for this experiment)
# Alternative model that was tried, kept for reference:
# print(cross_validation(LogisticRegression(random_state=501, C=20, dual=True), 
# 5-fold CV accuracy for the 'x_ending_len' target, treated as a class label.
print(cross_validation(PassiveAggressiveClassifier(random_state=501, C=20., n_iter=26), 
                       train_sample,
                       X_name='features',
                       y_name='x_ending_len',
                       ngram_range=(1, 6),
                       n_features=2 ** 20))

0.835640461378
CPU times: user 8.69 s, sys: 367 ms, total: 9.05 s
Wall time: 9.09 s


In [71]:
# Sweep n_iter over 20, 22, ..., 30 for the 'x_ending_len' target and print
# each value next to its 5-fold CV accuracy.
for n in range(20, 31, 2):
    print(n, cross_validation(PassiveAggressiveClassifier(random_state=501, C=20., n_iter=n), 
                              train_sample,
                              X_name='features',
                              y_name='x_ending_len',
                              ngram_range=(1, 6),
                              n_features=2 ** 20))

20 0.834122896869
22 0.833279868616
24 0.833111660754
26 0.835640461378
28 0.834628799061
30 0.833786054943


In [75]:
%%time
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

# 0.8445 (score noted for this experiment)
# Alternative model that was tried, kept for reference:
# print(cross_validation(LogisticRegression(random_state=501, C=20, dual=True), 
# 5-fold CV accuracy for the 'y_ending' target (the lemma ending, as a class label).
print(cross_validation(PassiveAggressiveClassifier(random_state=501, C=20., n_iter=15), 
                       train_sample,
                       X_name='features',
                       y_name='y_ending',
                       ngram_range=(1, 6),
                       n_features=2 ** 20))

0.844573804751
CPU times: user 18.4 s, sys: 1.36 s, total: 19.8 s
Wall time: 19.8 s


In [74]:
# Sweep n_iter over 10, 12, ..., 20 for the 'y_ending' target and print each
# value next to its 5-fold CV accuracy.
for n in range(10, 21, 2):
    print(n, cross_validation(PassiveAggressiveClassifier(random_state=501, C=20., n_iter=n), 
                              train_sample,
                              X_name='features',
                              y_name='y_ending',
                              ngram_range=(1, 6),
                              n_features=2 ** 20))

10 0.842550764252
12 0.844067902559
14 0.845079564876
16 0.844236962825
18 0.843899410562
20 0.84288803238


In [62]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.metrics import make_scorer

# Grid-search n_iter (5, 15, ..., 45) for the 'part' classifier, with C and
# the random seed held fixed.
model = PassiveAggressiveClassifier()
clf = GridSearchCV(model, 
                   {
                       'random_state': (501,), 
                       'C': (20,),
                       'n_iter': range(5, 46, 10)
                   }, 
                   n_jobs=3,
                   verbose=1
                  )

# Here the features are hashed once for the whole subset, before the search
# splits it into folds.
data = gen_features(train_sample, ngram_range=(1, 6))
clf.fit(data['features'], data['part'])

# One (mean test score, params) pair per line, in ascending score order,
# so the best configuration is printed last.
print(*sorted(list(zip(clf.cv_results_['mean_test_score'], clf.cv_results_['params'])), 
              key=lambda x: x[0]), sep='\n')

Fitting 3 folds for each of 5 candidates, totalling 15 fits
(0.89801078894133513, {'C': 20, 'n_iter': 15, 'random_state': 501})
(0.89817936614969651, {'C': 20, 'n_iter': 25, 'random_state': 501})
(0.89817936614969651, {'C': 20, 'n_iter': 35, 'random_state': 501})
(0.89817936614969651, {'C': 20, 'n_iter': 45, 'random_state': 501})
(0.90053944706675659, {'C': 20, 'n_iter': 5, 'random_state': 501})


[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:    3.6s finished


In [76]:
from sklearn.linear_model import PassiveAggressiveClassifier, LogisticRegression

def predict(train, test, model, X_name, y_name, ngram_range=(1, 6), n_features=2 ** 20):
    """Fit ``model`` on ``train`` and return its predictions for ``test``.

    Both data dicts are passed through ``gen_features`` with the same
    settings before fitting and predicting.

    Parameters
    ----------
    train, test : dict
        Data dicts containing an ``'X'`` column; ``train`` must also hold
        the ``y_name`` column.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``.
    X_name : str
        Key of the feature matrix produced by ``gen_features``.
    y_name : str
        Key of the target column in ``train``.
    ngram_range : tuple of (int, int), optional
        N-gram lengths forwarded to ``gen_features``.
    n_features : int, optional
        Hashed feature count forwarded to ``gen_features``.

    Returns
    -------
    The value returned by ``model.predict`` for the test features.
    """
    # One shared settings dict guarantees train and test are hashed alike.
    feature_options = dict(ngram_range=ngram_range, n_features=n_features)
    train = gen_features(train, **feature_options)
    test = gen_features(test, **feature_options)

    model.fit(train[X_name], train[y_name])
    return model.predict(test[X_name])


def predict_part(train, test):
    """Fit on ``train`` and predict the 'part' column for ``test``.

    Uses a passive-aggressive classifier (C=20, n_iter=15, seed 501) on the
    'features' matrix.
    """
    # NOTE(review): later scikit-learn releases replaced `n_iter` with
    # `max_iter` -- confirm against the installed version.
    classifier = PassiveAggressiveClassifier(C=20., n_iter=15, random_state=501)
    return predict(train, test, classifier, X_name='features', y_name='part')


def predict_x_ending_len(train, test):
    """Fit on ``train`` and predict the 'x_ending_len' column for ``test``.

    Uses a passive-aggressive classifier (C=20, n_iter=26, seed 501) on the
    'features' matrix; the lengths are treated as class labels.
    """
    # NOTE(review): later scikit-learn releases replaced `n_iter` with
    # `max_iter` -- confirm against the installed version.
    classifier = PassiveAggressiveClassifier(C=20., n_iter=26, random_state=501)
    return predict(train, test, classifier, X_name='features', y_name='x_ending_len')


def predict_y_ending(train, test):
    """Fit on ``train`` and predict the 'y_ending' column for ``test``.

    Uses a passive-aggressive classifier (C=20, n_iter=15, seed 501) on the
    'features' matrix; each distinct ending string is one class.
    """
    # NOTE(review): later scikit-learn releases replaced `n_iter` with
    # `max_iter` -- confirm against the installed version.
    classifier = PassiveAggressiveClassifier(C=20., n_iter=15, random_state=501)
    return predict(train, test, classifier, X_name='features', y_name='y_ending')

# Для отправки

In [77]:
# Common path prefix of the contest files. This rebinds `prefix` to a longer
# value than the one used when the training data was first loaded.
prefix = '../../data/linear_contest2/task2_lemmas_'
# Reload the full training set and re-derive its target columns.
raw_data = read_data(prefix + 'train', sep=',')
init(raw_data)

# Test set: read the same way, but init() is not applied to it.
raw_test = read_data(prefix + 'test', sep=',')

# Submission template; its 'Category' column is overwritten further down.
sample_submission = pd.read_csv(prefix + 'sample_submission')

In [79]:
# Fit one classifier per target on the full training set and store its
# predictions for the test set; each call is timed separately.
%time raw_test['y_ending'] = predict_y_ending(raw_data, raw_test)
%time raw_test['x_ending_len'] = predict_x_ending_len(raw_data, raw_test)
%time raw_test['part'] = predict_part(raw_data, raw_test)

CPU times: user 2min 3s, sys: 2.14 s, total: 2min 5s
Wall time: 2min 6s
CPU times: user 40.5 s, sys: 1.83 s, total: 42.3 s
Wall time: 43.2 s
CPU times: user 16.2 s, sys: 363 ms, total: 16.5 s
Wall time: 16.6 s


In [80]:
# Rebuild each answer from the three predictions: drop the predicted number of
# trailing characters from the word, append the predicted lemma ending, then
# the '+<part>' tag.
raw_test['y'] = []

for x, x_ending_len, y_ending, part in zip(raw_test['X'], 
                                           raw_test['x_ending_len'], 
                                           raw_test['y_ending'], 
                                           raw_test['part']):
    # The length is a classifier output and may exceed len(x); clamp at 0 so
    # the slice bound never turns negative (which would keep characters
    # instead of yielding an empty stem).
    raw_test['y'].append(x[:max(len(x) - x_ending_len, 0)] + y_ending + '+' + part)

sample_submission['Category'] = raw_test['y']

In [81]:
# First ten assembled answers, for a visual sanity check.
raw_test['y'][:10]

['gettonare+V',
 'incidentale+A',
 'involtare+V',
 'lievo+N',
 'comunistizzare+V',
 'vidimare+V',
 'imbrodre+V',
 'strillare+V',
 'cifrare+V',
 'compassare+V']

In [82]:
# Predicted ending lengths for the same ten rows.
raw_test['x_ending_len'][:10]

array([1, 1, 3, 1, 3, 6, 2, 0, 3, 4])

In [83]:
# Display the submission frame after 'Category' has been filled in.
sample_submission

Unnamed: 0,Id,Category
0,1,gettonare+V
1,2,incidentale+A
2,3,involtare+V
3,4,lievo+N
4,5,comunistizzare+V
5,6,vidimare+V
6,7,imbrodre+V
7,8,strillare+V
8,9,cifrare+V
9,10,compassare+V


In [85]:
# Write the submission without the DataFrame index.
# NOTE(review): the content is comma-separated although the file name ends in
# ".tsv" -- confirm which format the submission is expected in.
sample_submission.to_csv("submission.tsv", sep=',', index=False)