In [1]:
# -*- coding: UTF-8 -*-

In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Read data.
linear_train = pd.read_csv('data/linear_train.txt', header=None).dropna()
linear_ans_example = pd.read_csv('data/linear_ans_example.txt').dropna()
linear_test = pd.read_csv('data/linear_test.txt', header=None).dropna()

In [4]:
full_x = linear_train[0]
full_y = linear_train[1]

In [5]:
def to_last_n_letters(array, n):
    return [word[-(n*2):] for word in array]

def append_hash_back(array):
    return [word + "#" for word in array]

def append_dollar_front(array):
    return ["$" + word for word in array]

def append_front_back(array):
    return ["$" + word + "#" for word in array]

In [6]:
def encrypt_word(string, length):
    return ' '.join(string[i:i+length] for i in range(0,len(string) - length,2))

def encrypt_array_lengths(words_array, lengths):
    return np.array([ ' '.join([encrypt_word(string, length) for length in lengths]) 
                     for string in words_array ])

def isCapitalized(word):
    capitals = ['А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О',
           'П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я']
    return int(word[0:2] in capitals and not (word[2:4] in capitals))

def vowel_count(word):
    vowels = ['А','Е','Ё','И','О','У','Ы','Э','Ю','Я',
              'а','е','ё','и','о','у','ы','э','ю','я']
    retval = 0
    for i in range (len(word)/2):
        if word[i*2 : (i*2)+2] in vowels:
            retval+=1
    return retval

def consonant_count(word):
    consonants = ['Б','В','Г','Д','Ж','З','Й','К','Л','М','Н','П','Р','С','Т','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ь',
                  'б','в','г','д','ж','з','й','к','л','м','н','п','р','с','т','ф','х','ц','ч','ш','щ','ъ','ь',]
    retval = 0
    for i in range (len(word)/2):
        if word[i*2 : (i*2)+2] in consonants:
            retval+=1
    return retval
    

def cvect_fit_transform(count_vect, x, n_gram_count):
    return count_vect.fit_transform(encrypt_array_lengths(x, n_gram_count*2))

def cvect_transform(count_vect, x, n_gram_count):
    return count_vect.transform(encrypt_array_lengths(x, n_gram_count*2))

In [7]:
def add_feature(functor,surnames_train, surnames_test, x_train, x_test):
    first_capital_train = np.array([functor(word) for word in surnames_train]).reshape([-1,1])
    x_train = hstack((x_train, coo_matrix(first_capital_train)))
    
    first_capital_test = np.array([functor(word) for word in surnames_test]).reshape([-1,1])
    x_test = hstack((x_test, coo_matrix(first_capital_test)))
    return (x_train, x_test)

In [8]:
def write_to_csv(y, csv_name):
    try :
        os.mkdir("results")
    except:
        pass
    output = pd.DataFrame(data=y, columns=['Answer'])
    output.index.name = 'Id'
    output.to_csv(path_or_buf = './results/' + csv_name, index=True)

In [34]:
def get_result(clf, x_train_l, y_train_l, x_test_l):
    count_vect = CountVectorizer(ngram_range=(1,1), decode_error='ignore', lowercase=True)
    
    x_train_new0 = count_vect.fit_transform(encrypt_array_lengths(x_train_l, np.array([3,4])*2))
    x_test_new0 = count_vect.transform(encrypt_array_lengths(x_test_l, np.array([3,4])*2))
    
    x_train_new0, x_test_new0 = add_feature(isCapitalized, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(vowel_count, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(consonant_count, x_train_l, x_test_l, x_train_new0, x_test_new0)

    x_train_new1 = count_vect.fit_transform(to_last_n_letters(x_train_l, 3))
    x_test_new1 = count_vect.transform(to_last_n_letters(x_test_l, 3))
                                  
    x_train_new0, x_test_new0 = (hstack((x_train_new0, x_train_new1)), hstack((x_test_new0, x_test_new1)))
                                  
    clf = clf.fit(x_train_new0, y_train_l)
    return clf.predict(x_test_new0)
    
    

In [35]:
x_train, x_test, y_train, y_test = train_test_split(full_x, full_y, train_size=0.6)

In [36]:
%%time
print(roc_auc_score(get_result(LogisticRegression(), append_front_back(x_train), y_train, append_front_back(x_test)), 
                     y_test))

0.833395014867
CPU times: user 8.02 s, sys: 132 ms, total: 8.15 s
Wall time: 5.57 s


In [37]:
%%time
print(roc_auc_score(get_result(MultinomialNB(), append_hash_back(x_train), y_train, append_hash_back(x_test)), 
                     y_test))

0.718507513867
CPU times: user 5.05 s, sys: 84 ms, total: 5.14 s
Wall time: 5.04 s


In [38]:
%%time
result = get_result(LogisticRegression(), linear_train[0], linear_train[1], linear_test[0])

CPU times: user 21.6 s, sys: 336 ms, total: 22 s
Wall time: 14.8 s


In [39]:
# write_to_csv(result, "result0.csv")

In [40]:
# Количество гласных/согласных - даёт небольшой буст ~ 0.5%
# Окончания - добавление проверки сверху (поверх hash & dollar) ничего не даёт, кажется
# Извлекать фичи SelectFromModel
# Add hash to the end / beginning

In [41]:
# Добавление символа в начало улучшило результат на 2% при 3,4 граммах.

In [44]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def find_parameters(clf, x_train_l, y_train_l, x_test_l, y_test_l):
    count_vect = CountVectorizer(ngram_range=(1,1), decode_error='ignore', lowercase=True)
    
    x_train_new0 = count_vect.fit_transform(encrypt_array_lengths(x_train_l, np.array([3,4])*2))
    x_test_new0 = count_vect.transform(encrypt_array_lengths(x_test_l, np.array([3,4])*2))
    
    x_train_new0, x_test_new0 = add_feature(isCapitalized, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(vowel_count, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(consonant_count, x_train_l, x_test_l, x_train_new0, x_test_new0)

    x_train_new1 = count_vect.fit_transform(to_last_n_letters(x_train_l, 3))
    x_test_new1 = count_vect.transform(to_last_n_letters(x_test_l, 3))
                                  
    x_train_new0, x_test_new0 = (hstack((x_train_new0, x_train_new1)), hstack((x_test_new0, x_test_new1)))
                               
    clf_config = {           
                'penalty' : ['l1', 'l2'], 
                'dual' : [False],
                'max_iter' : np.arange(500,1501,500),
                'tol' : [1e-4, 1e-5, 1e-6],
                'C': [1, 10]
              }
    
    scorer = make_scorer(score_func=roc_auc_score)
    
    grid_search = GridSearchCV(estimator=clf, scoring=scorer, param_grid=clf_config, cv=5)
    grid_search.fit(x_train_new0, y_train_l)
        
    print("Best parameters set found on development set:")
    print()
    print(grid_search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test_l, grid_search.predict(x_test_new0)
    print(classification_report(y_true, y_pred))
    print()
        

In [None]:
%%time
find_parameters(LogisticRegression(), x_train, y_train, x_test, y_test)