In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): none of the visible cells read from or write to /content/drive —
# confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!pip install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |███████                         | 10kB 12.9MB/s eta 0:00:01[K     |██████████████▏                 | 20kB 2.0MB/s eta 0:00:01[K     |█████████████████████▎          | 30kB 2.8MB/s eta 0:00:01[K     |████████████████████████████▍   | 40kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 1.8MB/s 
[?25hCollecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 9.9MB/s 

In [0]:
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords 
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from math import log
from gensim.models.keyedvectors import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import joblib



import logging
# Send INFO-and-above records to preprocessing.log, one timestamped line each.
logging.basicConfig(
    level=logging.INFO,
    filename='preprocessing.log',
    datefmt='%a, %d %b %Y %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

In [1]:
def open_data(path="quora_question_pairs_rus.csv", n_rows=100,
              out_path='preprocessed_data.csv'):
    """Load the raw question pairs, preprocess ``question1`` and save the result.

    Args:
        path: CSV file to read; its 'Unnamed: 0' column is used as the index.
        n_rows: number of leading rows to read (``None`` reads the whole file).
        out_path: where the preprocessed frame is written, index included.

    Returns:
        pandas.DataFrame without the 'question2' and 'is_duplicate' columns and
        with 'question1' replaced by ``preproc(question1)``.

    NOTE(review): ``preproc`` is not defined in the visible part of the
    notebook; it has to be defined before this function is called.
    """
    # nrows stops parsing after the requested rows instead of loading the
    # full file and slicing it afterwards.
    data = pd.read_csv(path, index_col='Unnamed: 0', nrows=n_rows)

    # drop() returns a new frame, so the column assignment below does not
    # write into a slice of another frame.
    data = data.drop(['question2', 'is_duplicate'], axis=1)

    data['question1'] = data['question1'].apply(preproc)
    data.to_csv(out_path, index=True)
    return data

In [3]:
def tf_idf_indexing(d):
    """Build a TF-IDF document-term index and persist it with its vectorizer.

    Args:
        d: iterable of document strings, one string per document.

    Returns:
        pandas.DataFrame with one row per document and one column per
        vocabulary term, holding the TF-IDF weights.

    Side effects:
        Writes the matrix to 'tf_idf_index.csv' (row index omitted) and dumps
        the fitted vectorizer to 'tf_idf_vectorizer.pkl' with joblib.
    """
    vec = TfidfVectorizer()
    # fit() only returns the fitted vectorizer, which has no toarray();
    # fit_transform() learns the vocabulary and returns the sparse matrix.
    X = vec.fit_transform(d)
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); use whichever this installation provides.
    get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
    df_tfidf = pd.DataFrame(X.toarray(), columns=get_names())
    df_tfidf.to_csv('tf_idf_index.csv', index=False)

    # Keep the fitted vectorizer so new text can be projected into the same space.
    joblib.dump(vec, 'tf_idf_vectorizer.pkl')
    return df_tfidf

In [4]:
def bm25_indexing(d, k=2, b=0.75):
    """Build an Okapi BM25 document-term index and save it to 'bm25_index.csv'.

    For every document ``doc`` and vocabulary term ``t`` the stored weight is

        idf(t) * tf(t, doc) * (k + 1) / (tf(t, doc) + k * (1 - b + b * len(doc) / avg_len))

    where ``tf`` is the term count divided by the document length,
    ``idf(t) = log((N - n_t + 0.5) / (n_t + 0.5))``, ``N`` is the number of
    documents and ``n_t`` the number of documents containing ``t``.
    Note that ``idf`` is negative for terms found in more than half of the
    documents.

    Args:
        d: sized collection of document strings (``len(d)`` is used as ``N``).
        k: term-frequency saturation parameter.
        b: document-length normalisation parameter.

    Returns:
        pandas.DataFrame with one row per document and one column per term.

    Side effects:
        Writes the table to 'bm25_index.csv' without the row index.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(d)
    # Newer scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); use whichever this installation provides.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    term_freq_counts = pd.DataFrame(counts.toarray(), columns=get_names())

    # Document lengths live in their own Series: storing them in a 'sum'
    # column would overwrite the counts of a vocabulary token named "sum".
    doc_lengths = term_freq_counts.sum(axis=1)
    # Empty documents give 0/0 = NaN; treat their term frequencies as 0.
    tf_table = term_freq_counts.div(doc_lengths, axis=0).fillna(0)

    # Document frequency: in how many documents each term occurs at least once.
    N = len(d)
    doc_freq = (term_freq_counts > 0).sum(axis=0)
    idf = np.log((N - doc_freq + 0.5) / (doc_freq + 0.5))

    # k * (1 - b + b * len(doc) / avg_len), one value per document.
    length_ratio = doc_lengths.div(doc_lengths.mean())
    coefficient = length_ratio.mul(b).add(1 - b).mul(k)

    numerator = tf_table.mul(k + 1)
    # The length coefficient is ADDED to tf; multiplying instead would cancel
    # tf out of the ratio and make the score independent of term frequency.
    denominator = tf_table.add(coefficient, axis=0)
    tf_factor = numerator.div(denominator).fillna(0)

    # idf is indexed by term, so it is aligned with the columns.
    bm25_table = tf_factor.mul(idf, axis=1).fillna(0)
    bm25_table.to_csv('bm25_index.csv', index=False)
    return bm25_table

def getting_fasttext(filepath):
    """Load the keyed vectors stored at ``filepath`` and return them."""
    return KeyedVectors.load(filepath)


In [5]:
def sent_vectorizer(sent, model):
    """Average the word vectors of a whitespace-separated sentence.

    Args:
        sent: sentence as a string of whitespace-separated lemmas. Any
            non-string value (``None``, a float NaN, ...) is treated as an
            empty sentence.
        model: object exposing ``vector_size``, item access by word and a
            vocabulary container (``key_to_index`` or ``vocab``).

    Returns:
        1-D numpy array of length ``model.vector_size``. Words missing from
        the vocabulary contribute a zero vector to the mean; a sentence
        without tokens yields an all-zero vector.
    """
    if not isinstance(sent, str):
        return np.zeros((model.vector_size,))

    lemmas = sent.split()
    if not lemmas:
        # The mean over zero rows would be a vector of NaN (plus a RuntimeWarning).
        return np.zeros((model.vector_size,))

    # gensim 4 replaced the ``vocab`` dict with ``key_to_index``; support both.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    lemmas_vectors = np.zeros((len(lemmas), model.vector_size))
    for idx, lemma in enumerate(lemmas):
        if lemma in vocab:
            lemmas_vectors[idx] = model[lemma]
    return lemmas_vectors.mean(axis=0)

def fasttext_indexing(d, model=None, model_path='fasttext/model.model'):
    """Build a sentence-vector index for ``d.question1`` and save it as CSV.

    Args:
        d: pandas.DataFrame with a 'question1' column of sentences.
        model: already loaded embedding model to use. When omitted, the model
            is loaded from ``model_path`` with ``getting_fasttext``.
        model_path: location of the saved model, used only if ``model`` is None.

    Returns:
        pandas.DataFrame indexed like ``d``, one row per sentence and one
        column per embedding dimension.

    Side effects:
        Writes the table to 'fasttext_index.csv' without the row index.
    """
    if model is None:
        model = getting_fasttext(model_path)
    # One vector per row, in row order. A dict keyed by index label would
    # silently merge rows that share a label.
    vectors = [sent_vectorizer(sent, model) for sent in d['question1']]
    data = pd.DataFrame(vectors, index=d.index)
    data.to_csv('fasttext_index.csv', index=False)
    return data

In [0]:
def main():
    """Run the whole indexing pipeline: preprocessing, TF-IDF, BM25, fastText.

    Every stage persists its own result to disk, so the in-memory copies are
    released right after the stage has been logged. Exceptions are not
    propagated: they are written to the log, with traceback, together with
    the name of the stage that raised them.
    """
    stage = 'preprocessing the raw data'
    try:
        # open_data is the loader defined in this notebook.
        raw_df = open_data()
        logging.info('made preprocessed dataframe')
        del raw_df

        stage = 'loading the preprocessed data'
        # NOTE(review): preproc_opening is not defined in the visible part of
        # the notebook; it has to exist before main() is called.
        preproc_df = preproc_opening()
        questions = list(preproc_df.question1)

        stage = 'building the tf-idf index'
        tf_idf_index = tf_idf_indexing(questions)
        logging.info('made tf-idf dataframe')
        del tf_idf_index

        stage = 'building the bm25 index'
        bm25_index = bm25_indexing(questions)
        logging.info('made bm25 dataframe')
        del bm25_index

        stage = 'building the fasttext index'
        fasttext_index = fasttext_indexing(preproc_df)
        logging.info('made fasttext dataframe')
        del fasttext_index

    except Exception as e:
        # logging.exception appends the traceback to the record.
        logging.exception('%r while %s', e, stage)


if __name__ == "__main__":
    main()

In [14]:
!wget 'https://www.dropbox.com/s/jaa5y82qzul6byn/quora_question_pairs_rus.csv'

--2019-10-24 10:14:09--  https://www.dropbox.com/s/jaa5y82qzul6byn/quora_question_pairs_rus.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.8.1, 2620:100:601b:1::a27d:801
Connecting to www.dropbox.com (www.dropbox.com)|162.125.8.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/jaa5y82qzul6byn/quora_question_pairs_rus.csv [following]
--2019-10-24 10:14:09--  https://www.dropbox.com/s/raw/jaa5y82qzul6byn/quora_question_pairs_rus.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc8c8e03a20d8b36bf11099933e8.dl.dropboxusercontent.com/cd/0/inline/ArBkaIY-HUNKyYpy_ESEMdpM3QwkzDqqAZ4JEm1fjIi5wMEqSW92109AlmYtO3khdeSDq0ZBFffejFVdDNpRrFI4zQ5orarQNkQAgG0SN0-lq3K_jjBn10jFYpvWy5mUTYY/file# [following]
--2019-10-24 10:14:10--  https://uc8c8e03a20d8b36bf11099933e8.dl.dropboxusercontent.com/cd/0/inline/ArBkaIY-HUNKyYpy_ESEMdpM3QwkzDqqAZ4JEm1fjIi5wMEqSW92109AlmYtO3khde

In [16]:
# Download the NLTK 'stopwords' corpus, which backs nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
from gensim.models import Word2Vec, KeyedVectors 

!wget 'http://vectors.nlpl.eu/repository/11/181.zip' 

!unzip '181.zip' -d 'fasttext'
 

--2019-10-24 10:19:13--  http://vectors.nlpl.eu/repository/11/181.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2622716217 (2.4G) [application/zip]
Saving to: ‘181.zip’


2019-10-24 10:21:42 (16.9 MB/s) - ‘181.zip’ saved [2622716217/2622716217]

Archive:  181.zip
  inflating: fasttext/meta.json      
  inflating: fasttext/model.model    
  inflating: fasttext/model.model.vectors_ngrams.npy  
  inflating: fasttext/model.model.vectors.npy  
  inflating: fasttext/model.model.vectors_vocab.npy  
  inflating: fasttext/README         


In [19]:
from gensim.models.keyedvectors import KeyedVectors 

# Path of the model file inside the 'fasttext' directory.
fast_model = 'fasttext/model.model'
 
# NOTE(review): fasttext_indexing loads the model from this same path on its
# own, so this module-level copy appears unused by the visible code — confirm
# before keeping a second multi-GB model in memory.
fasttext_model = KeyedVectors.load(fast_model)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [20]:
# Step 1: preprocess the raw questions; the result is saved by open_data itself.
raw_df = open_data()
logging.info('made preprocessed dataframe')
del raw_df

# Step 2: load the preprocessed frame that feeds all three indices.
preproc_df = preproc_opening()

# Step 3: TF-IDF index.
tf_idf_index = tf_idf_indexing(list(preproc_df.question1))
logging.info('made tf-idf dataframe')
del tf_idf_index

# Step 4: BM25 index.
bm25_index = bm25_indexing(list(preproc_df.question1))
logging.info('made bm25 dataframe')
del bm25_index

# Step 5: fastText sentence-vector index.
fasttext_index = fasttext_indexing(preproc_df)
logging.info('made fasttext dataframe')
del fasttext_index

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
