In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from stop_words import get_stop_words
from xgboost import XGBModel

import pandas as pd
import numpy as np

In [2]:
russian_stop_words = get_stop_words('russian')

In [3]:
#data reading and spliting on test and train
data = pd.read_csv('data/text_BoW_task.csv', encoding='utf-8')



X_train, X_test, y_train, y_test = train_test_split(data['text_stem'],
                                                    data['is_blocked'],
                                                    test_size=0.20,
                                                    random_state=42)

X_train = X_train.astype(unicode);X_test = X_test.astype(unicode)

In [4]:
#creating the bag of word:
vectorizer = CountVectorizer()#stop_words=russian_stop_words)
train_data_features = vectorizer.fit_transform(X_train)
test_data_features = vectorizer.transform(X_test)

In [5]:
train_data_features

<21605x54881 sparse matrix of type '<type 'numpy.int64'>'
	with 643008 stored elements in Compressed Sparse Row format>

In [None]:
#!/usr/bin/python
import xgboost as xgb
##
#  this script demonstrate how to fit generalized linear model in xgboost
#  basically, we are using linear model, instead of tree for our boosters
##
dtrain = xgb.DMatrix(train_data_features)
#dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
         'alpha': 0.0001, 'lambda': 1 }

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
# param['eta'] = 1

##
# the rest of settings are the same
##
watchlist  = [(dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))

In [5]:
#LogisticRegression training on vectorised features
clf1 = LogisticRegression()
clf1.fit(train_data_features, y_train)
y_pred_1 = clf1.predict(test_data_features)

In [6]:
LogRegression_auc = accuracy_score(y_test, y_pred_1)

LogRegression_accuracy = accuracy_score(y_test, y_pred_1)
print LogRegression_accuracy

0.935024065161


In [7]:
#SVM training on vectorised features
clf2 = LinearSVC()
clf2.fit(train_data_features, y_train)
y_pred_2 = clf2.predict(test_data_features)

In [8]:
SVM_auc = accuracy_score(y_test, y_pred_2)

In [9]:
#creating the bag of word with TF-IDF:
vectorizer_Tfidf = TfidfVectorizer()
train_data_features_tfidf = vectorizer_Tfidf.fit_transform(X_train)
test_data_features_tfidf = vectorizer_Tfidf.transform(X_test)

In [10]:
#LogisticRegression training on vectorised features with TF-IDF transform
clf1_tfidf = LogisticRegression()
clf1_tfidf.fit(train_data_features_tfidf, y_train)
y_pred_1_tfidf = clf1_tfidf.predict(test_data_features_tfidf)

In [11]:
LogRegression_tfidf_auc = accuracy_score(y_test, y_pred_1_tfidf)

In [12]:
#SVM training on vectorised features with TF-IDF transform
clf2_tfidf = LinearSVC()
clf2_tfidf.fit(train_data_features_tfidf, y_train)
y_pred_2_tfidf = clf2.predict(test_data_features_tfidf)

In [13]:
SVM_tfidf_auc = accuracy_score(y_test, y_pred_2_tfidf)

In [14]:
print 'LogRegression_auc: ', LogRegression_auc
print 'SVM_auc: ', SVM_auc
print 'LogRegression_tfidf_auc: ', LogRegression_tfidf_auc
print 'SVM_tfidf_auc: ', SVM_tfidf_auc

LogRegression_auc:  0.935024065161
SVM_auc:  0.92484265087
LogRegression_tfidf_auc:  0.930025916327
SVM_tfidf_auc:  0.91577193632
