In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

from scipy.sparse import coo_matrix, hstack

In [2]:
# Read data.
linear_train = pd.read_csv('data/linear_train.txt', header=None, names=['word', 'label']).dropna()
linear_ans_example = pd.read_csv('data/linear_ans_example.txt').dropna()
linear_test = pd.read_csv('data/linear_test.txt', header=None, names=['word']).dropna()

In [51]:
def to_last_n_letters(array, n):
    return [word[-(n*2):] for word in array]

In [4]:
# Слово с заглавной буквы
def isCapitalized(word):
    capitals = ['А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О',
                'П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я']
    if len(word) == 1:
        return int(word[0] in capitals)
    else:
        return int(word[0] in capitals and not (word[1] in capitals))

# Количество гласных в слове
def vowel_count(word):
    vowels = ['А','Е','Ё','И','О','У','Ы','Э','Ю','Я',
              'а','е','ё','и','о','у','ы','э','ю','я']
    retval = 0
    for c in word:
        if c in vowels:
            retval+=1
    return retval

# Количество согласных в слове
def consonant_count(word):
    consonants = ['Б','В','Г','Д','Ж','З','Й','К','Л','М','Н','П','Р','С','Т','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ь',
                  'б','в','г','д','ж','з','й','к','л','м','н','п','р','с','т','ф','х','ц','ч','ш','щ','ъ','ь',]
    retval = 0
    for c in word:
        if c in consonants:
            retval+=1
    return retval

# Количество парных букв в слове
def count_doubles(word):
    l = [let for let in word.lower()]
    return len([(x,y) for x,y in zip(l, l[1:]) if x == y])

In [5]:
a = "hello, world, bblaaqqq"
l = [let for let in a.lower()]
z = zip(l, l[1:])
print([(x,y) for x,y in zip(l, l[1:]) if x == y])

[('l', 'l'), ('b', 'b'), ('a', 'a'), ('q', 'q'), ('q', 'q')]


In [6]:
def append_feature(functor, surnames, x_transformed):
    new_feature = np.array([functor(word) for word in surnames]).reshape([-1,1])
    x_transformed = hstack((x_transformed, coo_matrix(new_feature)))
    return x_transformed

In [7]:
def write_to_csv(y, csv_name):
    try :
        os.mkdir("results")
    except:
        pass
    output = pd.DataFrame(data=y, columns=['Answer'])
    output.index.name = 'Id'
    output.to_csv(path_or_buf = './results/' + csv_name, index=True)

Добавим фичи

In [8]:
def add_features(dataset):
    dataset['length'] = dataset['word'].apply(lambda word: len(word))
    dataset['capitalized'] = dataset['word'].apply(lambda word: isCapitalized(word))
    dataset['vowel_count'] = dataset['word'].apply(lambda word: vowel_count(word))
    dataset['consonant_count'] = dataset['word'].apply(lambda word: consonant_count(word))
    dataset['doubles'] = dataset['word'].apply(lambda word: count_doubles(word))
    return dataset

add_features(linear_train).head()

Unnamed: 0,word,label,length,capitalized,vowel_count,consonant_count,doubles
0,Аалтонен,1,8,1,4,4,1
1,Аар,0,3,1,2,1,1
2,Аарон,0,5,1,3,2,1
3,ААРОН,0,5,0,3,2,1
4,Аарона,0,6,1,4,2,1


In [9]:
linear_train.head()

Unnamed: 0,word,label,length,capitalized,vowel_count,consonant_count,doubles
0,Аалтонен,1,8,1,4,4,1
1,Аар,0,3,1,2,1,1
2,Аарон,0,5,1,3,2,1
3,ААРОН,0,5,0,3,2,1
4,Аарона,0,6,1,4,2,1


In [10]:
clf = LogisticRegression()

## Making cross validation : just features

In [11]:
needed_cols = linear_train.columns.drop(['word', 'label'])

In [12]:
linear_train[needed_cols].head()

Unnamed: 0,length,capitalized,vowel_count,consonant_count,doubles
0,8,1,4,4,1
1,3,1,2,1,1
2,5,1,3,2,1
3,5,0,3,2,1
4,6,1,4,2,1


In [13]:
xtrain, xcv, ytrain, ycv = train_test_split(linear_train[needed_cols], linear_train['label'], test_size = 0.1)

In [14]:
xtrain, xcv = map(lambda x: pd.DataFrame(x, columns=needed_cols), [xtrain, xcv])
ytrain, ycv = map(lambda x: pd.DataFrame(x, columns=['label']), [ytrain, ycv])

In [15]:
prediction = LogisticRegression().fit(xtrain, ytrain).predict_proba(xcv)

  y = column_or_1d(y, warn=True)


In [16]:
roc_auc_score(ycv, prediction[:,1])

0.81439825406624611

## Making a submission : just features

In [17]:
predictor = LogisticRegression().fit(linear_train[needed_cols], linear_train['label'])

In [18]:
prediction = predictor.predict_proba(add_features(linear_test)[needed_cols])

In [19]:
write_to_csv(prediction[:,1], "no_vectorizer.csv")

## Let's use here only ngrams : going on cross validation

In [20]:
transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=0.87)

In [21]:
matrix = transformer.fit_transform(linear_train['word'])

In [22]:
xtrain, xcv, ytrain, ycv = train_test_split(matrix, linear_train['label'], test_size=0.1)

In [23]:
prediction = LogisticRegression().fit(xtrain, ytrain).predict_proba(xcv)

In [24]:
roc_auc_score(ycv, prediction[:,1])

0.91814756622641369

#### Let's try te search nice max_df:

In [25]:
# search max_df
for max_df in np.arange(0.8,0.96, 0.02):
    transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=max_df)
    matrix = transformer.fit_transform(linear_train['word'])
    xtrain, xcv, ytrain, ycv = train_test_split(matrix, linear_train['label'], test_size=0.1)
    prediction = LogisticRegression().fit(xtrain, ytrain).predict_proba(xcv)
    print("for max_df={} ".format(max_df) + str(roc_auc_score(ycv, prediction[:,1])))
    

for max_df=0.8 0.919442668651
for max_df=0.8200000000000001 0.921038728164
for max_df=0.8400000000000001 0.911088475091
for max_df=0.8600000000000001 0.921764380743
for max_df=0.8800000000000001 0.926029594542
for max_df=0.9000000000000001 0.923748990813
for max_df=0.9200000000000002 0.918440292586
for max_df=0.9400000000000002 0.919446437058


## Here try to mix features and ngrams

In [53]:
def append_features_to_sparse_matrix(feature_columns, sparse_matrix):
    if len(feature_columns) != sparse_matrix.shape[0]:
        raise "Wrong sizes!"
    return hstack((sparse_matrix, coo_matrix(feature_columns)))

In [65]:
transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=0.84)

Try to add specific feature : denote a beginings and endings of words - let's append smth to the end and to the beg.

In [66]:
linear_train['appended_word'] = linear_train['word'].apply(lambda x : "#" + x + "$")
linear_train.head()

Unnamed: 0,word,label,length,capitalized,vowel_count,consonant_count,doubles,appended_word
0,Аалтонен,1,8,1,4,4,1,#Аалтонен$
1,Аар,0,3,1,2,1,1,#Аар$
2,Аарон,0,5,1,3,2,1,#Аарон$
3,ААРОН,0,5,0,3,2,1,#ААРОН$
4,Аарона,0,6,1,4,2,1,#Аарона$


In [67]:
matrix = transformer.fit_transform(linear_train['appended_word'])

In [68]:
matrix = append_features_to_sparse_matrix(linear_train[needed_cols], matrix)

#### And now try to make predictions again

In [69]:
xtrain, xcv, ytrain, ycv = train_test_split(matrix, linear_train['label'], test_size=0.1)

In [70]:
prediction = LogisticRegression().fit(xtrain, ytrain).predict_proba(xcv)

In [71]:
roc_auc_score(ycv, prediction[:,1])

0.92343890741254298

## Make mixed submission : features + ngrams

In [72]:
transformer = CountVectorizer(ngram_range=(2, 8), analyzer='char_wb', binary=True, lowercase=True, max_df=0.84)

In [73]:
matrix = transformer.fit_transform(linear_train['appended_word'])

In [74]:
matrix = append_features_to_sparse_matrix(linear_train[needed_cols], matrix)

In [75]:
predictor = LogisticRegression().fit(matrix, linear_train['label'])

In [76]:
featured_test_df = add_features(linear_test)
featured_test_df.head()

Unnamed: 0,word,length,capitalized,vowel_count,consonant_count,doubles
0,Аалто,5,1,3,2,1
1,ААР,3,0,2,1,1
2,Аара,4,1,3,1,1
3,Ааре,4,1,3,1,1
4,Аарон,5,1,3,2,1


In [77]:
featured_test_df['appended_word'] = featured_test_df['word'].apply(lambda x :"#" + x + "$")

In [79]:
test_matrix = append_features_to_sparse_matrix(featured_test_df[needed_cols], 
                                              transformer.transform(featured_test_df['appended_word']))

In [80]:
test_matrix

<188920x920629 sparse matrix of type '<class 'numpy.int64'>'
	with 10525996 stored elements in COOrdinate format>

In [81]:
prediction = predictor.predict_proba(test_matrix)

In [82]:
prediction[:,1]

array([  4.01638509e-01,   4.26550738e-02,   1.51354958e-01, ...,
         1.23344206e-02,   1.39387891e-04,   1.25489703e-04])

In [83]:
write_to_csv(prediction[:,1], "mixed_features_vectorizer_v1_appended.csv")

## The end