In [23]:
# -*- coding: UTF-8 -*-

In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfTransformer

In [25]:
# Read data.
linear_train = pd.read_csv('data/linear_train.txt', header=None).dropna()
linear_ans_example = pd.read_csv('data/linear_ans_example.txt').dropna()
linear_test = pd.read_csv('data/linear_test.txt', header=None).dropna()

In [26]:
full_x = linear_train[0]
full_y = linear_train[1]

In [27]:
def to_last_n_letters(array, n):
    return [word[-(n*2):] for word in array]

def append_hash_back(array):
    return [word + "#" for word in array]

def append_dollar_front(array):
    return ["$" + word for word in array]

def append_front_back(array):
    return ["$" + word + "#" for word in array]

In [28]:
def isCapitalized(word):
    capitals = ['А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О',
           'П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я']
    return int(word[0:2] in capitals and not (word[2:4] in capitals))

def vowel_count(word):
    vowels = ['А','Е','Ё','И','О','У','Ы','Э','Ю','Я',
              'а','е','ё','и','о','у','ы','э','ю','я']
    retval = 0
    for i in range (len(word)/2):
        if word[i*2 : (i*2)+2] in vowels:
            retval+=1
    return retval

def consonant_count(word):
    consonants = ['Б','В','Г','Д','Ж','З','Й','К','Л','М','Н','П','Р','С','Т','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ь',
                  'б','в','г','д','ж','з','й','к','л','м','н','п','р','с','т','ф','х','ц','ч','ш','щ','ъ','ь',]
    retval = 0
    for i in range (len(word)/2):
        if word[i*2 : (i*2)+2] in consonants:
            retval+=1
    return retval

In [29]:
def add_feature(functor,surnames_train, surnames_test, x_train, x_test):
    first_capital_train = np.array([functor(word) for word in surnames_train]).reshape([-1,1])
    x_train = hstack((x_train, coo_matrix(first_capital_train)))
    
    first_capital_test = np.array([functor(word) for word in surnames_test]).reshape([-1,1])
    x_test = hstack((x_test, coo_matrix(first_capital_test)))
    return (x_train, x_test)

In [30]:
def write_to_csv(y, csv_name):
    try :
        os.mkdir("results")
    except:
        pass
    output = pd.DataFrame(data=y, columns=['Answer'])
    output.index.name = 'Id'
    output.to_csv(path_or_buf = './results/' + csv_name, index=True)

In [64]:
def get_result(clf, x_train_l, y_train_l, x_test_l):
    count_vect = CountVectorizer(ngram_range=(4*2,8*2), decode_error='ignore', lowercase=True, analyzer='char')
        
    x_train_new0 = count_vect.fit_transform(x_train_l)
    x_test_new0 = count_vect.transform(x_test_l)
    
    print(count_vect.get_feature_names()[0])
    
    
    x_train_new0, x_test_new0 = add_feature(isCapitalized, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(vowel_count, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(consonant_count, x_train_l, x_test_l, x_train_new0, x_test_new0)

    clf = clf.fit(x_train_new0, y_train_l)
    return clf.predict(x_test_new0)

In [65]:
%%time
x_train, x_test, y_train, y_test = train_test_split(full_x, full_y, train_size=0.7)

CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 66.5 ms


In [66]:
%%time
print(roc_auc_score(get_result(LinearSVC(), append_front_back(x_train), y_train, append_front_back(x_test)), 
                     y_test))

 «газпро
0.697433151456
CPU times: user 21.6 s, sys: 64 ms, total: 21.6 s
Wall time: 21.5 s


In [67]:
count_vect = CountVectorizer(ngram_range=(4*2,8*2), decode_error='ignore', lowercase=True, analyzer='char')

x_train_new0 = count_vect.fit_transform(x_train)
print(count_vect.get_feature_names()[0])

 «газпро


In [43]:
print(roc_auc_score(get_result(LinearSVC(), append_front_back(x_train), y_train, append_front_back(x_test)), 
                     y_test))

0.719151770351


In [37]:
%%time
print(roc_auc_score(get_result(MultinomialNB(), append_hash_back(x_train), y_train, append_hash_back(x_test)), 
                     y_test))

0.600625377976
CPU times: user 4.91 s, sys: 60 ms, total: 4.97 s
Wall time: 4.83 s


In [37]:
# %%time
# result = get_result(LogisticRegression(), linear_train[0], linear_train[1], linear_test[0])

In [38]:
# write_to_csv(result, "result0.csv")

In [39]:
# Количество гласных/согласных - даёт небольшой буст ~ 0.5%
# Окончания - добавление проверки сверху (поверх hash & dollar) ничего не даёт, кажется
# Извлекать фичи SelectFromModel
# Add hash to the end / beginning

In [40]:
# Добавление символа в начало улучшило результат на 2% при 3,4 граммах.

In [41]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

def find_parameters(clf, x_train_l, y_train_l, x_test_l, y_test_l):
    count_vect = CountVectorizer(ngram_range=(1,1), decode_error='ignore', lowercase=True)
    
    x_train_new0 = count_vect.fit_transform(encrypt_array_lengths(x_train_l, np.array([3,4])*2))
    x_test_new0 = count_vect.transform(encrypt_array_lengths(x_test_l, np.array([3,4])*2))
    
    x_train_new0, x_test_new0 = add_feature(isCapitalized, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(vowel_count, x_train_l, x_test_l, x_train_new0, x_test_new0)
    x_train_new0, x_test_new0 = add_feature(consonant_count, x_train_l, x_test_l, x_train_new0, x_test_new0)

    x_train_new1 = count_vect.fit_transform(to_last_n_letters(x_train_l, 3))
    x_test_new1 = count_vect.transform(to_last_n_letters(x_test_l, 3))
                                  
    x_train_new0, x_test_new0 = (hstack((x_train_new0, x_train_new1)), hstack((x_test_new0, x_test_new1)))
                               
    clf_config = {           
                'penalty' : ['l1', 'l2'], 
                'dual' : [False],
                'max_iter' : np.arange(500,1501,500),
                'tol' : [1e-4, 1e-5, 1e-6],
                'C': [1, 10],
                'n_jobs': [-1],
              }
    
    scorer = make_scorer(score_func=roc_auc_score)
    
    grid_search = GridSearchCV(estimator=clf, scoring=scorer, param_grid=clf_config, cv=5, verbose=100)
    grid_search.fit(x_train_new0, y_train_l)
        
    print("Best parameters set found on development set:")
    print()
    print(grid_search.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test_l, grid_search.predict(x_test_new0)
    print(classification_report(y_true, y_pred))
    print()
        

In [42]:
#%%time
#find_parameters(LogisticRegression(), append_front_back(x_train), y_train, append_front_back(x_test), y_test)

In [43]:
from sklearn.linear_model import LogisticRegression #  0.844
from sklearn.linear_model import RidgeClassifier # 0.8577
from sklearn.linear_model import RidgeClassifierCV # memory error
from sklearn.linear_model import SGDClassifier # 0.869
from sklearn.linear_model import PassiveAggressiveClassifier # 0.694

from sklearn.naive_bayes import MultinomialNB # 0.6825

from sklearn.ensemble import RandomForestClassifier # 0.80
from sklearn.ensemble import BaggingClassifier # 0.761583612896
from sklearn.ensemble import AdaBoostClassifier # 0.780406567954
from sklearn.ensemble import ExtraTreesClassifier # 0.790956715791 / 0.79249149196
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [44]:
# default : 0.781904988147 (n_estimators=10)
# n_estimators = 20 : 0.79831297856
# max_depth = 10 : 0.948359739662
# max_depth = 13 : 
# max_depth = 15 : 0.94 / 0.84
# max_depth = 17 : 0.91
# max_depth = 20 : 0.92 / 0.94 / 0.94 / 0.946 / 0.946
# max_depth = 25 : 0.915 / 0.910 / 0.941 / 0.910 / 0.924
# max_depth = 30 : 0.934 / 0.884 / 0.911


x_train, x_test, y_train, y_test = train_test_split(full_x, full_y, train_size=0.75)

In [45]:
%%time
#print(roc_auc_score(get_result(RidgeClassifier(), 
#                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.05 µs


In [42]:
%%time
print(roc_auc_score(get_result(LogisticRegression(), 
                              append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

0.672819215384
CPU times: user 12.9 s, sys: 180 ms, total: 13 s
Wall time: 7.26 s


In [47]:
%%time
#print(roc_auc_score(get_result(RandomForestClassifier(max_depth=20), 
#                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [48]:
# %%time
# print(roc_auc_score(get_result(BaggingClassifier(), 
#                                append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

In [49]:
%%time
#print(roc_auc_score(get_result(AdaBoostClassifier(), 
#                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [50]:
%%time
#print(roc_auc_score(get_result(ExtraTreesClassifier(), 
#                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


In [51]:
%%time
#print(roc_auc_score(get_result(SGDClassifier(), 
#                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [52]:
%%time
print(roc_auc_score(get_result(LogisticRegression(), 
                               append_front_back(x_train), y_train, append_front_back(x_test)), y_test))

0.887144398176
CPU times: user 16.7 s, sys: 308 ms, total: 17 s
Wall time: 9.45 s


In [53]:
%%time
result = get_result(LogisticRegression(), 
                               append_front_back(full_x), full_y, append_front_back(linear_test[0]))

CPU times: user 27.9 s, sys: 488 ms, total: 28.4 s
Wall time: 20.3 s


In [54]:
write_to_csv(result, "LogisticRegressionClassifier4-8Ngrams.csv")

In [2]:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']

In [4]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories, shuffle=True, random_state=42)

In [22]:
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> count_vect = CountVectorizer(ngram_range=(3,8), analyzer='char')
>>> X_train_counts = count_vect.fit_transform(twenty_train.data)
>>> X_train_counts.shape
print(count_vect.get_feature_names()[8001])

109 178 23 ftp volvis92
