In [54]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

from scipy.sparse import coo_matrix, hstack

In [32]:
# Read data.
linear_train = pd.read_csv('data/linear_train.txt', header=None, names=['word', 'label']).dropna()
linear_ans_example = pd.read_csv('data/linear_ans_example.txt').dropna()
linear_test = pd.read_csv('data/linear_test.txt', header=None, names=['word']).dropna()

In [3]:
def to_last_n_letters(array, n):
    return [word[-(n*2):] for word in array]

def append_hash_back(array):
    return [word + "#" for word in array]

def append_dollar_front(array):
    return ["$" + word for word in array]

def append_front_back(array):
    return ["$" + word + "#" for word in array]

In [74]:
# Слово с заглавной буквы
def isCapitalized(word):
    capitals = ['А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О',
                'П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я']
    if len(word) == 1:
        return int(word[0] in capitals)
    else:
        return int(word[0] in capitals and not (word[1] in capitals))

# Количество гласных в слове
def vowel_count(word):
    vowels = ['А','Е','Ё','И','О','У','Ы','Э','Ю','Я',
              'а','е','ё','и','о','у','ы','э','ю','я']
    retval = 0
    for c in word:
        if c in vowels:
            retval+=1
    return retval

# Количество согласных в слове
def consonant_count(word):
    consonants = ['Б','В','Г','Д','Ж','З','Й','К','Л','М','Н','П','Р','С','Т','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ь',
                  'б','в','г','д','ж','з','й','к','л','м','н','п','р','с','т','ф','х','ц','ч','ш','щ','ъ','ь',]
    retval = 0
    for c in word:
        if c in consonants:
            retval+=1
    return retval

# Количество парных букв в слове
def count_doubles(word):
    l = [let for let in word.lower()]
    return len([(x,y) for x,y in zip(l, l[1:]) if x == y])

In [75]:
a = "hello, world, bblaaqqq"
l = [let for let in a.lower()]
z = zip(l, l[1:])
print([(x,y) for x,y in zip(l, l[1:]) if x == y])

[('l', 'l'), ('b', 'b'), ('a', 'a'), ('q', 'q'), ('q', 'q')]


In [76]:
def append_feature(functor, surnames, x_transformed):
    new_feature = np.array([functor(word) for word in surnames]).reshape([-1,1])
    x_transformed = hstack((x_transformed, coo_matrix(new_feature)))
    return x_transformed

In [77]:
def write_to_csv(y, csv_name):
    try :
        os.mkdir("results")
    except:
        pass
    output = pd.DataFrame(data=y, columns=['Answer'])
    output.index.name = 'Id'
    output.to_csv(path_or_buf = './results/' + csv_name, index=True)

Добавим фичи

In [148]:
def add_features(dataset):
    dataset['length'] = dataset['word'].apply(lambda word: len(word))
    dataset['capitalized'] = dataset['word'].apply(lambda word: isCapitalized(word))
    dataset['vowel_count'] = dataset['word'].apply(lambda word: vowel_count(word))
    dataset['consonant_count'] = dataset['word'].apply(lambda word: consonant_count(word))
    dataset['doubles'] = dataset['word'].apply(lambda word: count_doubles(word))
    return dataset

add_features(linear_train).head()

Unnamed: 0,word,label,length,capitalized,vowel_count,consonant_count,doubles
0,Аалтонен,1,8,1,4,4,1
1,Аар,0,3,1,2,1,1
2,Аарон,0,5,1,3,2,1
3,ААРОН,0,5,0,3,2,1
4,Аарона,0,6,1,4,2,1


In [80]:
linear_train.head()

Unnamed: 0,word,label,length,capitalized,vowel_count,consonant_count,doubles
0,Аалтонен,1,8,1,4,4,1
1,Аар,0,3,1,2,1,1
2,Аарон,0,5,1,3,2,1
3,ААРОН,0,5,0,3,2,1
4,Аарона,0,6,1,4,2,1


In [81]:
clf = LogisticRegression()

## cross_val

In [87]:
needed_cols = linear_train.columns.drop(['word', 'label'])

In [91]:
linear_train[needed_cols].head()

Unnamed: 0,length,capitalized,vowel_count,consonant_count,doubles
0,8,1,4,4,1
1,3,1,2,1,1
2,5,1,3,2,1
3,5,0,3,2,1
4,6,1,4,2,1


In [102]:
xtrain, xcv, ytrain, ycv = train_test_split(linear_train[needed_cols], linear_train['label'], test_size = 0.1)

In [104]:
xtrain, xcv = map(lambda x: pd.DataFrame(x, columns=needed_cols), [xtrain, xcv])
ytrain, ycv = map(lambda x: pd.DataFrame(x, columns=['label']), [ytrain, ycv])

In [141]:
prediction = LogisticRegression().fit(xtrain, ytrain).predict_proba(xcv)

  y = column_or_1d(y, warn=True)


In [142]:
roc_auc_score(ycv, prediction[:,1])

0.80736589300773665

In [144]:
predictor = LogisticRegression().fit(linear_train[needed_cols], linear_train['label'])

In [160]:
prediction = predictor.predict_proba(add_features(linear_test)[needed_cols])

In [162]:
write_to_csv(prediction[:,1], "no_vectorizer.csv")

In [8]:
transformer = CountVectorizer(min_df=1, 
                              ngram_range=(3, 8), 
                              analyzer='char_wb', 
                              binary=True)

matrix = transformer.fit_transform(x_train)

classifier = LogisticRegression(penalty='l2', 
                                C=3)

In [9]:
%%time
scores = cross_val_score(classifier, matrix, y_train, scoring=make_scorer(roc_auc_score), cv=3, n_jobs=-1)

CPU times: user 296 ms, sys: 60 ms, total: 356 ms
Wall time: 13 s


In [10]:
print(scores)

[ 0.58349104  0.54144344  0.58926707]


In [14]:
xtrain, xtest, ytrain, ytest = train_test_split(x_train, y_train)

In [19]:
transformer = CountVectorizer(min_df=1, 
                              ngram_range=(3, 8), 
                              analyzer='char_wb', 
                              binary=True)

transformer.fit(x_train)
matrix = transformer.transform(xtrain)

classifier = LogisticRegression(penalty='l2', 
                                C=3)

In [20]:
roc_auc_score(ytest, classifier.fit(matrix, ytrain).predict(transformer.transform(xtest)))

0.7476184595220452