In [1]:
# -*- coding: UTF-8 -*-

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from scipy.sparse import coo_matrix, hstack

In [3]:
# Read data.
linear_train = pd.read_csv('data/linear_train.txt', header=None).dropna()
linear_ans_example = pd.read_csv('data/linear_ans_example.txt').dropna()
linear_test = pd.read_csv('data/linear_test.txt', header=None).dropna()

In [4]:
linear_ans_example.tail()

Unnamed: 0,Id,Answer
188915,188915,0.0
188916,188916,0.0
188917,188917,0.0
188918,188918,0.0
188919,188919,0.0


In [5]:
full_x = linear_train[0]
full_y = linear_train[1]

In [6]:
def encrypt(words_array, length):
    return np.array([' '.join(string[i:i+length] for i in range(0,len(string) - length,2)) 
                     for string in words_array])

def isCapitalized(word):
    capitals = ['А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О',
           'П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я']
    return int(word[0:2] in capitals and not (word[2:4] in capitals))

count_vect = CountVectorizer(ngram_range=(1,1), decode_error='ignore', max_features=8000)

def cvect_fit_transform(x, n_gram_count):
    return count_vect.fit_transform(encrypt(x, n_gram_count*2))

def cvect_transform(x, n_gram_count):
    return count_vect.transform(encrypt(x, n_gram_count*2))

In [7]:
def add_feature(functor,surnames_train, surnames_test, x_train, x_test):
    first_capital_train = np.array([functor(word) for word in surnames_train]).reshape([-1,1])
    x_train = hstack((x_train, coo_matrix(first_capital_train)))
    
    first_capital_test = np.array([functor(word) for word in surnames_test]).reshape([-1,1])
    x_test = hstack((x_test, coo_matrix(first_capital_test)))
    return (x_train, x_test)

In [8]:
def write_to_csv(y, csv_name):
    try :
        os.mkdir("results")
    except:
        pass
    output = pd.DataFrame(data=y, columns=['Answer'])
    output.index.name = 'Id'
    output.to_csv(path_or_buf = './results/' + csv_name, index=True)

In [20]:
def get_result(clf, x_train, y_train, x_test):
    x_train_new = cvect_fit_transform(x_train, 3)
    x_test_new = cvect_transform(x_test, 3)
    x_train_new, x_test_new = add_feature(isCapitalized, x_train, x_test, x_train_new, x_test_new)
    clf = clf.fit(x_train_new, y_train)    
    return clf.predict(x_test_new)

In [10]:
x_train, x_test, y_train, y_test = train_test_split(full_x, full_y, train_size=0.75)

In [11]:
result = get_result(LinearSVC(), x_train, y_train, x_test)

In [12]:
print(accuracy_score(result, y_test))

0.926988008836


In [13]:
result = get_result(MultinomialNB(), x_train, y_train, x_test)

In [14]:
print(accuracy_score(result, y_test))

0.896181760808


In [19]:
result = get_result(LinearSVC(), linear_train[0], linear_train[1], linear_test[0])

In [21]:
write_to_csv(result, "third_result.csv")