# **Описание**

In [1]:
from __future__ import division

import base64
import csv
import gzip
import zlib

from collections import namedtuple

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Default progress interval: the trace helpers log once per this many items.
TRACE_NUM = 1000
import logging
# NOTE(review): presumably reloaded so that basicConfig below takes effect even
# if logging was already configured earlier in the session - confirm.
reload(logging)
# Emit INFO-and-above records as "HH:MM:SS LEVEL:message".
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

def trace(items_num, trace_num=TRACE_NUM):
    """Log a progress line when ``items_num`` is a multiple of ``trace_num``.

    :param items_num: number (index) of the item that has just been processed.
    :param trace_num: logging interval; a message is emitted only when
        ``items_num % trace_num == 0``.
    """
    if items_num % trace_num != 0:
        return
    message = "Complete items %05d" % items_num
    logging.info(message)
        
def trace_worker(items_num, worker_id, trace_num=TRACE_NUM):
    """Log a per-worker progress line every ``trace_num`` items.

    :param items_num: number (index) of the item that has just been processed.
    :param worker_id: integer identifier of the worker, included in the message.
    :param trace_num: logging interval; a message is emitted only when
        ``items_num % trace_num == 0``.
    """
    if items_num % trace_num != 0:
        return
    message = "Complete items %05d in worker_id %d" % (items_num, worker_id)
    logging.info(message)

### Утилиты

#### Декораторы

In [3]:
def to_utf8(text):
    """Return ``text`` encoded as UTF-8 bytes if it is a text (unicode) string.

    Any other value (byte strings included) is returned unchanged.

    :param text: a unicode string, a byte string, or any other object.
    :returns: UTF-8 encoded bytes for unicode input, otherwise ``text`` itself.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check does not depend on the Python-2-only ``unicode`` builtin.
    if isinstance(text, type(u'')):
        return text.encode('utf8')
    return text

def convert2unicode(f):
    """Decorator: make sure the single ``text`` argument of ``f`` is unicode.

    Byte strings are decoded as UTF-8 before ``f`` is called; values that are
    already unicode are passed through untouched.

    :param f: a callable taking one text argument.
    :returns: a wrapper with the same name and docstring as ``f``.
    :raises UnicodeDecodeError: from the wrapper, if a byte string is not
        valid UTF-8.
    """
    from functools import wraps

    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')

    # wraps() keeps f's name and docstring, which makes tracebacks and
    # introspection of decorated functions readable.
    @wraps(f)
    def tmp(text):
        if not isinstance(text, text_type):
            text = text.decode('utf8')
        return f(text)
    return tmp

def convert2lower(f):
    """Decorator: lower-case the single ``text`` argument before calling ``f``.

    :param f: a callable taking one text argument.
    :returns: a wrapper with the same name and docstring as ``f`` that calls
        ``f(text.lower())``.
    """
    from functools import wraps

    # wraps() keeps f's name and docstring, which makes tracebacks and
    # introspection of decorated functions readable.
    @wraps(f)
    def tmp(text):
        return f(text.lower())
    return tmp

#P.S. Декораторы могут усложнять отладку, так что от них вполне можно отказаться и воспользоваться copy-paste

### Извлечение текста из html

#### Извлечение текста при помощи встроенных модулей

In [4]:
from HTMLParser import HTMLParser
import re

###Извлечение текста из title можно вписать сюда

class TextHTMLParser(HTMLParser):
    """HTML parser that accumulates the visible text of a document.

    Text chunks are whitespace-normalised and collected in order; ``<p>`` and
    ``<br>`` tags insert line breaks. The ``_title`` / ``_in_title`` fields are
    initialised (and ``_in_title`` is raised on ``<title>``), but nothing in
    this class reads them or fills ``_title``.
    """

    # Runs of spaces, tabs and line breaks collapse into a single space.
    _WHITESPACE_RUN = re.compile('[ \t\r\n]+')
    # Separator appended to the collected text when a start tag is seen.
    _START_TAG_BREAKS = {'p': '\n\n', 'br': '\n'}

    def __init__(self):
        HTMLParser.__init__(self)
        self._text = []
        self._title = ""
        self._in_title = False

    def handle_data(self, data):
        """Store a non-empty text chunk, normalised and followed by a space."""
        stripped = data.strip()
        if not stripped:
            return
        self._text.append(self._WHITESPACE_RUN.sub(' ', stripped) + ' ')

    def handle_starttag(self, tag, attrs):
        """Insert line breaks for <p>/<br>; flag that a <title> was opened."""
        if tag == 'title':
            self._in_title = True
            return
        separator = self._START_TAG_BREAKS.get(tag)
        if separator is not None:
            self._text.append(separator)

    def handle_startendtag(self, tag, attrs):
        """Insert a blank line for a self-closing <br/>; ignore other tags."""
        if tag != 'br':
            return
        self._text.append('\n\n')

    def text(self):
        """Return everything collected so far as one stripped string."""
        return ''.join(self._text).strip()

@convert2unicode
def html2text_parser(text):
    """Extract the text content of an HTML document with ``TextHTMLParser``.

    :param text: HTML markup; byte strings are decoded as UTF-8 by the
        ``convert2unicode`` decorator.
    :returns: the text collected by the parser, as returned by its ``text()``.
    """
    parser = TextHTMLParser()
    parser.feed(text)
    # close() forces the parser to process whatever it is still buffering
    # (e.g. trailing text or an unfinished construct at the end of the input);
    # without it that tail never reaches handle_data.
    parser.close()
    return parser.text()

In [5]:
# Default HTML-to-text converter; html2word uses it as its ``to_text`` default.
html2text = html2text_parser

#### Методы для токенизации текста

In [6]:
@convert2unicode
@convert2lower
def easy_tokenizer(text):
    """Yield lower-cased words: maximal runs of alphanumeric characters.

    Decorator order matters: the outer ``convert2unicode`` decodes byte input
    first, so the inner ``convert2lower`` lower-cases unicode text. With the
    opposite order ``lower()`` runs on raw bytes and leaves non-ASCII letters
    (e.g. Cyrillic) untouched.

    :param text: unicode text or a UTF-8 encoded byte string.
    :returns: a generator of unicode words, in order of appearance.
    """
    chars = []
    for symbol in text:
        if symbol.isalnum():
            chars.append(symbol)
        elif chars:
            # A non-alphanumeric character ends the current word.
            yield u''.join(chars)
            chars = []
    if chars:
        # Flush the word that runs up to the end of the text.
        yield u''.join(chars)

# Module-level cache of normal forms, filled lazily by pymorphy_tokenizer.
PYMORPHY_CACHE = {}
# Shared pymorphy2 analyzer; stays None until get_lemmatizer() creates it.
MORPH = None
def get_lemmatizer():
    """Return the shared ``pymorphy2.MorphAnalyzer``, creating it on first use.

    The analyzer is stored in the module-level ``MORPH`` variable and reused
    by later calls.
    """
    global MORPH
    # Imported here rather than at module level so that installing pymorphy2
    # is not required unless lemmatization is actually used.
    import pymorphy2
    if MORPH is not None:
        return MORPH
    MORPH = pymorphy2.MorphAnalyzer()
    return MORPH

@convert2unicode
@convert2lower
def pymorphy_tokenizer(text):
    """Yield the normal form (lemma) of every word found by ``easy_tokenizer``.

    Lemmas are memoised in the module-level ``PYMORPHY_CACHE``. The cache is
    keyed by the word itself rather than by ``hash(word)``: two different
    words can share a hash value, and a hash-keyed cache would then return
    the lemma of the wrong word.

    Decorator order: byte input is decoded first (outer ``convert2unicode``)
    and lower-cased afterwards, so non-ASCII letters are lower-cased too.

    :param text: unicode text or a UTF-8 encoded byte string.
    :returns: a generator of lemmas, one per token, in order of appearance.
    """
    for word in easy_tokenizer(text):
        if word not in PYMORPHY_CACHE:
            # The first parse returned by the analyzer is used as the lemma source.
            PYMORPHY_CACHE[word] = get_lemmatizer().parse(word)[0].normal_form
        yield PYMORPHY_CACHE[word]

#### Основная функция, которая вызывается для преобразования html в список слов

In [7]:
def html2word(raw_html, to_text=html2text, tokenizer=easy_tokenizer):
    """Convert raw HTML into words: extract text, lower-case it, tokenize it.

    :param raw_html: the HTML document to process.
    :param to_text: callable turning HTML into plain text.
    :param tokenizer: callable turning lower-cased text into words.
    :returns: whatever ``tokenizer`` returns for the lower-cased text.
    """
    plain_text = to_text(raw_html)
    lowered = plain_text.lower()
    return tokenizer(lowered)

In [8]:
# A document with computed features: id, spam label, URL and feature vector.
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'features'])
# A loaded document before feature extraction: id, spam label, URL and the
# base64-decoded page content.
PreDocItem = namedtuple('PreDocItem', ['doc_id', 'is_spam', 'url', 'html_data'])

def load_csv(input_file_name):
    """Load documents from a tab-separated data file.

    The first line of the file is treated as a header and skipped. Every
    following line must contain at least four tab-separated fields: integer
    document id, integer mark (spam label), URL and the base64-encoded page.
    The decoded page content is kept in memory inside each ``PreDocItem``.

    Progress is logged through ``trace``; after the last row one final
    message with the last row index is logged. Nothing extra is logged for a
    file that has no data rows.

    :param input_file_name: path to the data file; names ending in ``gz``
        are opened with ``gzip``.
    :returns: list of ``PreDocItem`` in file order (empty if there are no rows).
    """
    opener = gzip.open if input_file_name.endswith('gz') else open
    predocs = []
    with opener(input_file_name) as input_file:
        input_file.readline()  # skip the header line

        last_index = None
        for last_index, line in enumerate(input_file):
            trace(last_index)
            parts = line.strip().split('\t')
            url_id = int(parts[0])
            mark = int(parts[1])
            url = parts[2]
            html_data = base64.b64decode(parts[3])
            predocs.append(PreDocItem(url_id, mark, url, html_data))
        # Report the final index only if at least one row was read; otherwise
        # the loop variable was never assigned.
        if last_index is not None:
            trace(last_index, 1)
    return predocs

In [9]:
%%time

# Path to the tab-separated training data (absolute, machine-specific).
TRAIN_DATA_FILE  = '/home/vanyadeg/dev/ifmo-infosearch-hw/lab-spam/data/kaggle_train_data_tab.csv'

# load_csv builds and returns a list; list() here only makes a shallow copy.
train_predocs = list(load_csv(TRAIN_DATA_FILE))

18:28:33 INFO:Complete items 00000
18:28:33 INFO:Complete items 01000
18:28:33 INFO:Complete items 02000
18:28:34 INFO:Complete items 03000
18:28:34 INFO:Complete items 04000
18:28:34 INFO:Complete items 05000
18:28:35 INFO:Complete items 06000
18:28:35 INFO:Complete items 07000
18:28:35 INFO:Complete items 07043


CPU times: user 2.03 s, sys: 234 ms, total: 2.27 s
Wall time: 2.26 s


In [10]:
%%time

# Path to the tab-separated test data (absolute, machine-specific).
TEST_DATA_FILE  = '/home/vanyadeg/dev/ifmo-infosearch-hw/lab-spam/data/kaggle_test_data_tab.csv'

# Loaded with the same reader as the training data.
test_predocs = load_csv(TEST_DATA_FILE)

18:28:35 INFO:Complete items 00000
18:28:35 INFO:Complete items 01000
18:28:36 INFO:Complete items 02000
18:28:36 INFO:Complete items 03000
18:28:37 INFO:Complete items 04000
18:28:37 INFO:Complete items 05000
18:28:37 INFO:Complete items 06000
18:28:38 INFO:Complete items 07000
18:28:38 INFO:Complete items 08000
18:28:38 INFO:Complete items 09000
18:28:39 INFO:Complete items 10000
18:28:39 INFO:Complete items 11000
18:28:39 INFO:Complete items 12000
18:28:40 INFO:Complete items 13000
18:28:40 INFO:Complete items 14000
18:28:40 INFO:Complete items 15000
18:28:41 INFO:Complete items 16000
18:28:41 INFO:Complete items 16038


CPU times: user 5.37 s, sys: 581 ms, total: 5.95 s
Wall time: 5.96 s


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def count_avg(words):
    """Return the average length of the items in ``words``.

    Accepts any iterable, including one-shot generators, because the items
    are counted while iterating instead of calling ``len(words)``.

    :param words: iterable of sized items (e.g. strings).
    :returns: the mean item length as a float; ``0.0`` for an empty iterable.
    """
    total_len = 0
    count = 0
    for word in words:
        total_len += len(word)
        count += 1
    if count == 0:
        # No words: report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    # float() keeps true division even without ``from __future__ import division``.
    return float(total_len) / count

def count_in_tag(html_data, pattern):
    """Count the words inside every fragment of ``html_data`` matching ``pattern``.

    Each match returned by ``re.findall`` is passed through ``html2word`` and
    the numbers of produced words are summed.

    :param html_data: the HTML document to search.
    :param pattern: regular expression selecting the fragments of interest.
    :returns: total number of words over all matches (0 if nothing matches).
    """
    return sum(
        len(list(html2word(fragment)))
        for fragment in re.findall(pattern, html_data)
    )
    
def calc_features(predocs):
    """Yield a ``DocItem`` with TF-IDF features for every item of ``predocs``.

    A single ``TfidfVectorizer(min_df=0.04)`` is fitted on the ``html_data``
    of all given documents; the resulting matrix is converted to a dense
    array and each row becomes the ``features`` of the corresponding item.

    :param predocs: sequence of ``PreDocItem``-like objects.
    :returns: a generator of ``DocItem`` in the same order as ``predocs``.
    """
    tfidf = TfidfVectorizer(min_df=0.04)
    pages = [predoc.html_data for predoc in predocs]
    matrix = tfidf.fit_transform(pages).toarray()
    for predoc, row in zip(predocs, matrix):
        yield DocItem(predoc.doc_id, predoc.is_spam, predoc.url, row)

In [12]:
# Remember where the training part ends so the combined list can be split back.
train_size = len(train_predocs)
# Not used by the visible code below; kept for reference.
test_size = len(test_predocs)

# Features are computed for train and test documents in one call, so both
# parts share the same fitted vectorizer.
docs = list(calc_features(train_predocs + test_predocs))

In [13]:
# Split the combined feature list back into its training and test parts.
train_docs = docs[0 : train_size]
test_docs = docs[train_size:]
# Sanity check: sizes of both parts and the length of one feature vector.
print len(train_docs)
print len(test_docs)
print len(docs[0].features)

7044
16039
2981


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

class Classifier:
    """Thin wrapper around an ``AdaBoostClassifier`` that works on doc items.

    Items are expected to expose ``doc_id``, ``is_spam``, ``url`` and
    ``features`` attributes.
    """

    def __init__(self):
        self.clf = AdaBoostClassifier(n_estimators=500, learning_rate=0.8)

    def predict(self, doc):
        """Return the predicted label for a single document."""
        labels = self.clf.predict([doc.features])
        return labels[0]

    def predict_all(self, docs):
        """Predict every document, logging progress through ``trace``.

        :returns: list of ``(doc_id, is_spam, url, prediction)`` tuples.
        """
        results = []
        for index, item in enumerate(docs):
            trace(index)
            label = self.predict(item)
            results.append((item.doc_id, item.is_spam, item.url, label))
        return results

    def train(self, docs):
        """Fit the wrapped classifier on the features and labels of ``docs``."""
        feature_rows = [item.features for item in docs]
        labels = [item.is_spam for item in docs]
        self.clf.fit(feature_rows, labels)

In [15]:
import numpy as np

In [16]:
# Hold-out split of the training documents: first 80% for fitting,
# remaining 20% for evaluation.
train = train_docs[:int(len(train_docs)*0.8)]
test = train_docs[int(len(train_docs)*0.8):]

In [17]:
# Feature matrices and label vectors for the hold-out split.
X_train = np.array([d.features for d in train])
y_train = np.array([d.is_spam for d in train])
X_test = np.array([d.features for d in test])
y_test = np.array([d.is_spam for d in test])

In [18]:
# Full mode: rebinds the arrays built from the hold-out split, so the model is
# fitted on all training documents and applied to the real test documents.
# NOTE(review): y_test here is whatever mark column the test file carries -
# presumably not real labels; confirm before using it for evaluation.

X_train = np.array([d.features for d in train_docs])
y_train = np.array([d.is_spam for d in train_docs])
X_test = np.array([d.features for d in test_docs])
y_test = np.array([d.is_spam for d in test_docs])

In [27]:
# Booster parameters handed to xgb.train.
# NOTE(review): with 'multi:softprob' and num_class=2 the predictions are
# expected to be one score row per document; they are reduced with argmax
# further down - confirm against the xgboost documentation.
param = {
    'max_depth': 5,  
    'eta': 0.1,  
    'silent': 1, 
    'objective': 'multi:softprob', 
    'num_class': 2} 
# Passed as the third positional argument of xgb.train.
num_round = 600

In [28]:
import xgboost as xgb

# Wrap the numpy arrays into xgboost's DMatrix containers.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [29]:
# Train the booster on dtrain with the parameters and round count set above.
bst = xgb.train(param, dtrain, num_round)

In [30]:
# Write a dump of the trained model to dump.raw.txt in the working directory.
bst.dump_model('dump.raw.txt')

In [31]:
# Raw booster output for the test matrix; reduced to class ids with argmax below.
preds = bst.predict(dtest)

In [32]:
import numpy as np
# For each prediction row take the index of its largest value as the class.
best_preds = np.asarray([np.argmax(line) for line in preds])


In [33]:
# Write the submission file: a header row followed by one "Id,Prediction" row
# per test document. The file is opened in binary mode for the csv writer.
with open('my_submission_xgb7_004_5_500.csv' , 'wb') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    predictions = best_preds
    # Sanity check: number of predictions about to be written.
    print len(predictions)
    # Pair each test document id with its prediction, relying on both
    # sequences being in the same order as test_docs.
    for doc_id, res in zip([doc.doc_id for doc in test_docs], predictions):
        writer.writerow([doc_id, int(res)])

16039
