In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from stop_words import get_stop_words

import pandas as pd
import numpy as np

In [2]:
russian_stop_words = get_stop_words('russian')

In [3]:
#data reading and spliting on test and train
data = pd.read_csv('./text_BoW_task.csv', encoding='utf-8')

X_train, X_test, y_train, y_test = train_test_split(data['text_stem'],
                                                    data['is_blocked'], 
                                                    test_size=0.20, 
                                                    random_state=42)

X_train = X_train.astype(unicode);X_test = X_test.astype(unicode)

In [4]:
#creating the bag of word:
vectorizer = CountVectorizer(max_features = 9500,stop_words=russian_stop_words)
train_data_features = vectorizer.fit_transform(X_train).toarray()
test_data_features = vectorizer.transform(X_test).toarray()

In [5]:
#LogisticRegression training on vectorised features
clf1 = LogisticRegression()
clf1.fit(train_data_features, y_train)
y_pred_1 = clf1.predict(test_data_features)

In [6]:
LogRegression_auc = roc_auc_score(y_test, y_pred_1)

In [7]:
#SVM training on vectorised features
clf2 = LinearSVC()
clf2.fit(train_data_features, y_train)
y_pred_2 = clf2.predict(test_data_features)

In [8]:
SVM_auc = roc_auc_score(y_test, y_pred_2)

In [9]:
#creating the bag of word with TF-IDF:
vectorizer_Tfidf = TfidfVectorizer(max_features = 9500,stop_words=russian_stop_words)
train_data_features_tfidf = vectorizer_Tfidf.fit_transform(X_train).toarray()
test_data_features_tfidf = vectorizer_Tfidf.transform(X_test).toarray()

In [11]:
#LogisticRegression training on vectorised features with TF-IDF transform
clf1_tfidf = LogisticRegression()
clf1_tfidf.fit(train_data_features_tfidf, y_train)
y_pred_1_tfidf = clf1_tfidf.predict(test_data_features_tfidf)

In [12]:
LogRegression_tfidf_auc = roc_auc_score(y_test, y_pred_1_tfidf)

In [13]:
#SVM training on vectorised features with TF-IDF transform
clf2_tfidf = LinearSVC()
clf2_tfidf.fit(train_data_features_tfidf, y_train)
y_pred_2_tfidf = clf2.predict(test_data_features_tfidf)

In [14]:
SVM_tfidf_auc = roc_auc_score(y_test, y_pred_2_tfidf)

In [15]:
print 'LogRegression_auc: ', LogRegression_auc
print 'SVM_auc: ', SVM_auc
print 'LogRegression_tfidf_auc: ', LogRegression_tfidf_auc
print 'SVM_tfidf_auc: ', SVM_tfidf_auc

LogRegression_auc:  0.932659755419
SVM_auc:  0.92142664833
LogRegression_tfidf_auc:  0.932867164119
SVM_tfidf_auc:  0.904097809172
