In [None]:
!python -m spacy download ru_core_news_sm

In [2]:
import spacy
import nltk
import pandas as pd
from string import punctuation

In [3]:
# Fetch the NLTK resources used in this notebook: the stop-word lists,
# the 'punkt' tokenizer models and the 'omw-1.4' corpus.
for resource in ('stopwords', 'punkt', 'omw-1.4'):
  nltk.download(resource)

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
class Preprocessing:
  """
  Preprocesses texts to tokenized and lemmatized format.

  The constructor works on a copy of the given DataFrame, so the caller's
  object is left untouched. Retrieve the result through
  get_preprocessed_df() or write it to disk with save_preprocessed_df().

  The DataFrame must contain a 'transcript' column. After the four new
  columns are appended, all columns are renamed positionally, so the input
  is expected to have exactly six columns.
  """

  # string.punctuation covers ASCII only; these typographic marks are common
  # in Russian text and would otherwise be kept as tokens.
  _PUNCT_CHARS = frozenset(punctuation + '«»„“”‘’‹›—–…')

  def __init__(self, df):
    self.nlp = spacy.load("ru_core_news_sm")
    self.stop_words = stopwords.words('russian')
    # Copy so that adding and renaming columns does not leak into the caller's DataFrame
    self.preprocessed = df.copy()

    transcripts = self.preprocessed['transcript']

    # Lists are built explicitly (rather than zip(*...)) so that an empty
    # DataFrame yields empty columns instead of an unpacking error.
    token_pairs = list(transcripts.apply(self.tokenize))
    self.preprocessed['tokens'] = [pair[0] for pair in token_pairs]
    self.preprocessed['tokens_without_stops'] = [pair[1] for pair in token_pairs]

    lemma_pairs = list(transcripts.apply(self.lemmatize))
    self.preprocessed['lemmas'] = [pair[0] for pair in lemma_pairs]
    self.preprocessed['lemmas_without_stops'] = [pair[1] for pair in lemma_pairs]

    # Positional rename: six original columns followed by the four added above
    self.preprocessed.columns = ['ID',
                                'fileID',
                                'transcript',
                                'discourse.type',
                                'stimulus',
                                'time.point',
                                'tokens',
                                'tokens_without_stops',
                                'lemmas',
                                'lemmas_without_stops']

  @staticmethod
  def _as_text(text):
    """
    Map a missing cell value (None or NaN) to an empty string
    Return: the text unchanged, or '' when the value is missing
    """
    if text is None or (isinstance(text, float) and text != text):
      return ''
    return text

  @classmethod
  def _is_word(cls, token):
    """
    Check that a token is neither blank nor made up only of punctuation
    Return: bool
    """
    stripped = token.strip()
    if not stripped:
      return False
    # Character-wise check: a substring test against string.punctuation
    # would let multi-character marks such as '...' through.
    return not all(char in cls._PUNCT_CHARS for char in stripped)

  def _join_filtered(self, pairs):
    """
    Build the two comma-separated strings shared by tokenize() and lemmatize()
    pairs: iterable of (surface form, form to output) tuples
    Return: joined forms with stopwords, joined forms without stopwords
    """
    stops = frozenset(word.lower() for word in self.stop_words)
    # Lowercase before the stop-word lookup so that capitalised stop words
    # (e.g. at the start of a sentence) are filtered as well.
    kept = [(surface.lower(), output.lower())
            for surface, output in pairs
            if self._is_word(surface)]
    with_stops = ', '.join(output for _, output in kept)
    without_stops = ', '.join(output for surface, output in kept if surface not in stops)
    return with_stops, without_stops

  def tokenize(self, text):
    """
    Getting all lowercased tokens, except punctuation marks
    Return: comma-separated string of tokens with stopwords,
            comma-separated string of tokens without stopwords
    """
    tokens = word_tokenize(self._as_text(text))
    return self._join_filtered((token, token) for token in tokens)

  def lemmatize(self, text):
    """
    Getting lowercased lemmas from text with and without stopwords
    Stopwords are matched against the token text, not against the lemma
    Return: comma-separated string of lemmas with stopwords,
            comma-separated string of lemmas without stopwords
    """
    doc = self.nlp(self._as_text(text))
    return self._join_filtered((token.text, token.lemma_) for token in doc)

  def get_preprocessed_df(self):
    """
    Getter for preprocessed dataframe
    Return: pd.DataFrame
    """
    return self.preprocessed

  def save_preprocessed_df(self, path):
    """
    Save preprocessed DataFrame to given path as an Excel file
    Return: None
    """
    self.preprocessed.to_excel(path)