# Подготовка данных для обучения

Цель: написать генератор, принимающий батч твиттов из твиттер-датасета, и выдающий по этому батчу предобработанные, векторизованные данные с отобранными признаками. Необходимо использовать различные методы решения каждой из задач.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import re

from tqdm import tqdm

import ssl
ssl._create_default_https_context = ssl._create_stdlib_context

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('twitter_samples')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import twitter_samples
import gensim

[nltk_data] Downloading package wordnet to /Users/Dmitry/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Dmitry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/Dmitry/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


### Данные

Для ускорения исследования, скачаем весь датасет, и будем собирать батчи из него.

In [2]:
df = pd.read_csv('twitter_dataset.csv', low_memory=False)
df.dropna(inplace=True)

In [3]:
df.shape

(399424, 4)

In [4]:
def stream_generator(batch_size=10):
    
    # df = pd.read_csv('twitter_dataset.csv')
    for i in range(0, df.shape[0], batch_size):
        stream_batch = df.iloc[i : min(i + batch_size, df.shape[0])]
        yield stream_batch['content'].tolist(), stream_batch['novel'].tolist()

In [5]:
for batch in stream_generator():
    print(batch)
    break

(['Tune in 11:30 ET tomorrow for a live webcast of Families USA Presidential Forum on health care: http://presidentialforums.health08.org/', "Iowans, there's a good chance there's a Biden near you today on a cool 14 F day: http://blog.joebiden.com/?p=1625", 'Met with Judge Sotomayor today. Very impressive, experienced judge who is also very down to earth. Will do great work on Supreme Court.', 'PHOTOS: On June 12, Michael Bennet met with members of the Mile High Youth Corps.  http://tinyurl.com/kl8bvn', 'PHOTOS: Women for Bennet held a fundraiser for Michael with State Treasurer Cary Kennedy on Sunday.  http://tinyurl.com/la22ye', 'VIDEO: State Treasurer Cary Kennedy praises Michael Bennet at Women for Bennet event on June 14.  http://tinyurl.com/lts3m3', 'VIDEO: Michael Bennet Speaks at Women for Bennet event on June 14.  http://tinyurl.com/lejavb', "It's great to be back in Colorado with my family for the weekend.", 'PHOTOS: Tour with Sen. Mark Udall of Ascent Solar Technologies in T

### Предобработка

Будем удалять по возможности все теги(@) и ссылки. Некоторые стоп-слова также удалим. Применим в одном варианте лемматизацию, в другом - стемминг.

In [6]:
class Preprocessor:
    
    def __init__(self):
        self.methods = {
            'lemm' : self.lemmatization,
            'stem' : self.stemming
        }
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.swords = set( stopwords.words("english") )
     
    def preprocess1(self, content_batch, standard='lemm'):
        # stream_batch = ([content], [novel]), content = list(string), novel = list
    
        preprocessed_batch = []
        for doc in content_batch:
            doc = doc.lower()
            doc = self.delete_tags(doc)
            doc = self.delete_links(doc)
            doc = self.delete_garbage(doc)
            tokens = self.get_tokens(doc)
            tokens = self.methods[standard](tokens)
            tokens = self.delete_stop_words(tokens)
            preprocessed_batch.append( ' '.join(tokens) )
            
        return preprocessed_batch
    
    
    def delete_tags(self, doc):
        doc = re.sub(r'^@[\w]*', ' ', doc) 
        doc = re.sub(r'\s@[\w]*', ' ', doc)
        return doc
    
    
    def delete_links(self, doc):
        doc = re.sub(r'http\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'https\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'ftp\:\/\/[\w\-&\./?=\+;@#%]*', ' ', doc)
        doc = re.sub(r'www\.[\w\-&\./?=\+;@#%]*', ' ', doc)
        return doc
    
    
    def delete_garbage(self, doc):
        doc = re.sub(r'\s+', ' ', doc)
        doc = re.sub(r"[^a-zA-Z0-9\s\']*", '', doc) # TODO: посмотреть что оставляет CountVectorizer
        return doc
    
    
    def delete_stop_words(self, tokens): # TODO: create own sw list
        # new_tokens = []
        return list( filter(lambda sword: sword not in self.swords, tokens) )
#         for sword in swords:
#             if sword not in stop_words:
#                 new_tokens.append(sword)
#         return new_tokens
    
    
    def get_tokens(self, doc):
        return list(map(lambda token: token.lower(), doc.split()))
    
    
    def lemmatization(self, tokens):
        return list(map(lambda token: self.lemmatizer.lemmatize(token), tokens))
    
    
    def stemming(self, tokens):
        return list( map(lambda token: self.stemmer.stem(token), tokens) )

In [7]:
pp = Preprocessor()

doc = """Hello there, @dmkalash, @lom and others! I've written my pp, so i want to | test > it. 
    see my https://www.site.com website."""

In [8]:
pp.delete_tags(doc)

"Hello there, ,  and others! I've written my pp, so i want to | test > it. \n    see my https://www.site.com website."

In [9]:
pp.delete_links(doc)

"Hello there, @dmkalash, @lom and others! I've written my pp, so i want to | test > it. \n    see my   website."

In [10]:
pp.delete_garbage(doc)

"Hello there dmkalash lom and others I've written my pp so i want to  test  it see my httpswwwsitecom website"

In [11]:
tokens = pp.get_tokens(doc)
tokens

['hello',
 'there,',
 '@dmkalash,',
 '@lom',
 'and',
 'others!',
 "i've",
 'written',
 'my',
 'pp,',
 'so',
 'i',
 'want',
 'to',
 '|',
 'test',
 '>',
 'it.',
 'see',
 'my',
 'https://www.site.com',
 'website.']

In [12]:
lemm_tokens = pp.lemmatization(tokens)
lemm_tokens

['hello',
 'there,',
 '@dmkalash,',
 '@lom',
 'and',
 'others!',
 "i've",
 'written',
 'my',
 'pp,',
 'so',
 'i',
 'want',
 'to',
 '|',
 'test',
 '>',
 'it.',
 'see',
 'my',
 'https://www.site.com',
 'website.']

In [13]:
pp.stemming(tokens)

['hello',
 'there,',
 '@dmkalash,',
 '@lom',
 'and',
 'others!',
 "i'v",
 'written',
 'my',
 'pp,',
 'so',
 'i',
 'want',
 'to',
 '|',
 'test',
 '>',
 'it.',
 'see',
 'my',
 'https://www.site.com',
 'website.']

In [14]:
pp.delete_stop_words(tokens)

['hello',
 'there,',
 '@dmkalash,',
 '@lom',
 'others!',
 "i've",
 'written',
 'pp,',
 'want',
 '|',
 'test',
 '>',
 'it.',
 'see',
 'https://www.site.com',
 'website.']

Проверяем всё вместе:

In [15]:
content = ["""Hello there, @dmkalash, @lom and others! I've written my pp, so i want to | test > it. 
    see my https://www.site.com website.""",
          """Hello there, @dmkalash, @lom and others! I've written my pp, so i want to | test > it. 
    see my https://www.site.com website."""]
pp.preprocess1(content)

["hello others i've written pp want test see website",
 "hello others i've written pp want test see website"]

In [16]:
# import spacy
# nlp = spacy.load('en', disable=['parser', 'ner'])

# sentence = "The striped bats are hanging on their feet for best"

# doc = nlp(sentence)

# " ".join([token.lemma_ for token in doc])

### Векторизация

Скачаем твиттер-корпус, и создадим из него фиксированный словарь для векторизации.

In [17]:
twitter_samples.fileids()
vocab_corpus = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] + 
             [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")] +
             [(t, "neg") for t in twitter_samples.strings("tweets.20150430-223406.json")]
            )

In [18]:
print( len(vocab_corpus) )
print( vocab_corpus[0] )

30000
('#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', 'pos')


In [19]:
vocab_corpus = list( map( lambda pair: pair[0], vocab_corpus ) )
print( len(vocab_corpus) )
print( vocab_corpus[0] )

30000
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


Проверим как работает DocToVec.

In [20]:
tokens_train = ['hello', 'darkness', 'my', 'old', 'friend']
tokens_test = ['hello', 'world', 'and', 'others', 'friend']

train_corpus = [gensim.models.doc2vec.TaggedDocument(tokens_train, [1]),
                gensim.models.doc2vec.TaggedDocument(tokens_train, [2])]
print(train_corpus[:2])

[TaggedDocument(words=['hello', 'darkness', 'my', 'old', 'friend'], tags=[1]), TaggedDocument(words=['hello', 'darkness', 'my', 'old', 'friend'], tags=[2])]


In [21]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=3, min_count=1, epochs=1)

model.build_vocab(train_corpus)

In [22]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.03465455  0.13094941 -0.03809212]


Класс, в котором будут реализованы основные методы векторизации.

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class Vectorizer():
    
    def __init__(self, method, train_corpus):
        self.methods = {
            'one-hot' : self.one_hot_vectorizer,
            'count' : self.count_vectorizer,
            'tf-idf' : self.tfidf_vectorizer,
            #'n-gramms' : self.n_gramms_vectorizer,
            'doc-to-vec' : self.doc_to_vec_vectorizer
        }
        if method not in self.methods:
            raise Exception('Wrong method: {}'.format(method))
        
        self.method = method
        self.model = None
        self.train_corpus = Preprocessor().preprocess1( train_corpus )
        
    
    def vectorize(self, batch, **args):
        return self.methods[self.method](batch, **args)
        
    
    def one_hot_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = CountVectorizer(binary=True, **args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
    
    
    def count_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = CountVectorizer(**args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
    
    
    def tfidf_vectorizer(self, batch, **args):
        
        if self.model is None:
            self.model = TfidfVectorizer(**args)
            self.model.fit(self.train_corpus)
            
        return self.model.transform(batch)
        
        
    def doc_to_vec_vectorizer(self, batch, **args):
        
        def extract_tokens(train = False):
            for i, doc in enumerate(batch):
                tokens = Preprocessor().get_tokens(doc)
                if train:
                    yield gensim.models.doc2vec.TaggedDocument(tokens, [i])    
                else:
                    yield tokens
                    
        
        if self.model is None:
            vocab = list( extract_tokens(train=True) )
            self.model = gensim.models.doc2vec.Doc2Vec(min_count=1)
            # self.model.build_vocab(train_corpus, update = True)
            self.model.build_vocab(train_corpus)
            self.model.train(vocab, total_examples=model.corpus_count, epochs=1, **args)
        
        return np.array( list( map( lambda token: self.model.infer_vector(token), extract_tokens() ) ) )
    
"""
One-Hot
CountVectorizer 
TF-IDF
DocToVec
n-грамм для 2 и 3
...

[0,1,0] -> svd
[0,0,0,0,1,0,1] -> svd

"""
print() 




Проверяем работоспособность:

In [24]:
docs = ['hello darkness my old friend', 'i have come to told with you again']

In [25]:
Vectorizer('one-hot', vocab_corpus).vectorize(docs)

<2x19748 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [26]:
Vectorizer('count', vocab_corpus).vectorize(docs)

<2x19748 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [27]:
Vectorizer('tf-idf', vocab_corpus).vectorize(docs)

<2x19748 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [28]:
Vectorizer('doc-to-vec', vocab_corpus).vectorize(docs).shape

(2, 100)

### Собираем вместе

In [29]:
pp = Preprocessor()
shapes = []
for batch, target in tqdm( stream_generator(10000) ):
    batch = pp.preprocess1(batch)
    v1 = Vectorizer('tf-idf', vocab_corpus)
    vectors = v1.vectorize(batch)
    shapes.append(vectors.shape)

print(len(shapes))

40it [05:56,  8.92s/it]

40



