In [None]:
# Install Stanza into the kernel's environment; the imports cell below needs it.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
%pip install stanza



In [None]:
import os
import zipfile
from google.colab import drive
import stanza
import pandas as pd

In [None]:
# Create the mount point, then attach Google Drive to it.
!mkdir -p /content/my_drive
# NOTE(review): this repeats the `drive` import from the imports cell above.
from google.colab import drive
drive.mount('/content/my_drive')


Drive already mounted at /content/my_drive; to attempt to forcibly remount, call drive.mount("/content/my_drive", force_remount=True).


In [None]:
def extract_zip_file(zip_file_path, extract_folder):
    """
    Unpacks a ZIP archive into a target directory.

    Parameters:
    zip_file_path (str): Location of the ZIP archive to read.
    extract_folder (str): Destination directory; created (with parents) if missing.

    Returns:
    extract_folder (str): The destination directory that was passed in.
    """
    # The destination may not exist yet; an already existing one is not an error
    os.makedirs(extract_folder, exist_ok=True)

    # Open the archive for reading and unpack every member into the destination
    with zipfile.ZipFile(zip_file_path) as archive:
        archive.extractall(path=extract_folder)

    return extract_folder

In [None]:
def extract_meta(text):
    """
    Extracts the meta information block from a file's text.

    The block is the span from the first "{" up to and including the first "}"
    that follows it. Nested braces are not handled: the span always ends at the
    first closing brace.

    Parameters:
    text (str): Whole text from txt file.

    Returns:
    meta_info (str): The "{...}" span including both braces, or an empty string
    when the text has no opening brace or no closing brace after it.
    """
    start_marker = "{"
    end_marker = "}"

    start_index = text.find(start_marker)
    if start_index == -1:
        # Without this guard the second search would start at index -1 (the last
        # character) and could return a stray "}" as the "meta" block.
        return ""

    end_index = text.find(end_marker, start_index)
    if end_index == -1:
        # Opening brace without a closing one: there is no complete block
        return ""

    meta_info = text[start_index:end_index + len(end_marker)]
    return meta_info

In [None]:
def extract_text(text):
    """
    Removes the meta information block from a file's text.

    The block is the span from the first "{" up to and including the first "}"
    that follows it. Nested braces are not handled: the removed span always ends
    at the first closing brace.

    Parameters:
    text (str): Whole text from txt file.

    Returns:
    text_no_meta (str): Text without the meta block. The input is returned
    unchanged when it has no opening brace or no closing brace after it.
    """
    start_marker = "{"
    end_marker = "}"

    start_index = text.find(start_marker)
    if start_index == -1:
        # No meta block: slicing with -1 here would duplicate most of the text
        return text

    end_index = text.find(end_marker, start_index)
    if end_index == -1:
        # Unterminated block: leave the text as is instead of repeating its prefix
        return text

    # Keep everything before the block and everything after it
    text_no_meta = text[:start_index] + text[end_index + len(end_marker):]
    return text_no_meta

In [None]:
# Path to the corpus ZIP archive on the mounted Google Drive
zip_file_path = '/content/my_drive/MyDrive/zip2/corpus.zip'
# Directory (also on Drive) that receives the extracted files; later cells reuse this name
extract_folder = '/content/my_drive/MyDrive/extracted/extracted_files/corpus/'
# Unpack the archive; the call returns extract_folder, which is shown as the cell output
extract_zip_file(zip_file_path, extract_folder)


'/content/my_drive/MyDrive/extracted/extracted_files/corpus/'

In [None]:
def file_info(extract_folder):
    """
    Collects ids, names and meta data of the extracted text files.

    Every file in extract_folder whose name ends with '.txt' gets a sequential
    id 'file_<n>' (n starts at 1). Files are visited in sorted name order so the
    ids are the same on every run.

    Parameters:
    extract_folder (str): The directory with the extracted files.

    Returns:
    df_info (pd.DataFrame): Columns 'id', 'name', 'meta'; one row per text file,
    index running from 1 to the number of files.
    all_texts (set): Texts of the files with the meta block removed. Being a set,
    it is unordered and identical texts collapse into one element, so it cannot
    be matched back to the ids in df_info.
    """
    records = []
    all_texts = set()

    # os.listdir returns names in arbitrary order; sort to make the ids reproducible
    for filename in sorted(os.listdir(extract_folder)):
        if not filename.endswith('.txt'):  # only text files are part of the corpus
            continue

        # Explicit encoding instead of the platform default.
        # NOTE(review): assumes the corpus files are UTF-8 — confirm.
        with open(os.path.join(extract_folder, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        records.append((f'file_{len(records) + 1}', filename, extract_meta(text)))
        all_texts.add(extract_text(text))

    # Build the frame once instead of appending row by row; keep the 1-based index
    df_info = pd.DataFrame(records, columns=['id', 'name', 'meta'])
    df_info.index = pd.RangeIndex(1, len(records) + 1)

    return df_info, all_texts


In [None]:
# Display only the metadata table (first element of the returned tuple); the texts are not used here
file_info(extract_folder)[0]

Unnamed: 0,id,name,meta
1,file_1,849538.txt,"{""author"": ""zubkovase"", ""title"": ""35 образоват..."
2,file_2,850232.txt,"{""author"": ""Homyakin"", ""title"": ""Во имя богов ..."
3,file_3,848640.txt,"{""author"": ""svistunov"", ""title"": ""Server Side ..."
4,file_4,846886.txt,"{""author"": ""badcasedaily1"", ""title"": ""Перегруз..."
5,file_5,846498.txt,"{""author"": ""spring_aio"", ""title"": ""Критическая..."
...,...,...,...
210,file_210,846368.txt,"{""author"": ""PatientZero"", ""title"": ""Почему сли..."
211,file_211,850696.txt,"{""author"": ""Cloud4Y"", ""title"": ""Гик-блогер воз..."
212,file_212,849758.txt,"{""author"": ""RodionGork"", ""title"": ""Erlang — кл..."
213,file_213,845974.txt,"{""author"": ""Cloud4Y"", ""title"": ""Doom на дисков..."


In [None]:
def process_stanza(nlp, text):
    """
    Runs the parser on a text and collects each word with its dependency relation.

    Parameters:
    nlp (callable): Parsing model; called as nlp(text). The result must expose
    .sentences, each sentence .words, and each word .text and .deprel.
    text (str): Text without meta information from txt file.

    Returns:
    pairs (list): (word text, deprel) tuples in document order.
    """
    parsed = nlp(text)

    pairs = []
    for sentence in parsed.sentences:
        for word in sentence.words:
            pairs.append((word.text, word.deprel))
    return pairs

In [None]:
def parse_files(extract_folder, nlp):
    """
    Parses all text files and returns one dataframe per file.

    The files are taken from the table produced by file_info, and each dataframe
    is stored under 'df_<id>' (e.g. 'df_file_3' for id 'file_3'), so a dataframe
    can be matched to its row in that table. Every file is re-read here rather
    than taken from the set of texts returned by file_info, because that set has
    no order and merges identical texts.

    Parameters:
    extract_folder (str): The directory with the extracted files.
    nlp (callable): Parsing model, passed on to process_stanza.

    Returns:
    dfs (dict): Keys are names of dataframes ('df_file_<n>'), values are
    dataframes with columns 'token' and 'deprel'.
    """
    df_info, _ = file_info(extract_folder)

    dfs = {}
    for file_id, filename in zip(df_info['id'], df_info['name']):
        # NOTE(review): assumes the corpus files are UTF-8 — confirm.
        with open(os.path.join(extract_folder, filename), 'r', encoding='utf-8') as file:
            text = extract_text(file.read())

        data = process_stanza(nlp, text)
        dfs[f'df_{file_id}'] = pd.DataFrame(data, columns=['token', 'deprel'])
        print(f'made df_{file_id}')

    return dfs

In [None]:
# Build the Russian Stanza pipeline; it is passed to parse_files in a later cell.
# NOTE(review): only word text and deprel are read downstream; pos and lemma are presumably listed because depparse needs them — confirm against the Stanza docs.
nlp_stanza = stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,depparse')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| pos       | syntagrus_charlm   |
| lemma     | syntagrus_nocharlm |
| depparse  | syntagrus_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: depparse
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Done loading processors!


In [None]:
# Parse every extracted text; the result maps 'df_file_<n>' to a token/deprel DataFrame
dataframes = parse_files(extract_folder, nlp_stanza)

made df_file_1
made df_file_2
made df_file_3
made df_file_4
made df_file_5
made df_file_6
made df_file_7
made df_file_8
made df_file_9
made df_file_10
made df_file_11
made df_file_12
made df_file_13
made df_file_14
made df_file_15
made df_file_16
made df_file_17
made df_file_18
made df_file_19
made df_file_20
made df_file_21
made df_file_22
made df_file_23
made df_file_24
made df_file_25
made df_file_26
made df_file_27
made df_file_28
made df_file_29
made df_file_30
made df_file_31
made df_file_32
made df_file_33
made df_file_34
made df_file_35
made df_file_36
made df_file_37
made df_file_38
made df_file_39
made df_file_40
made df_file_41
made df_file_42
made df_file_43
made df_file_44
made df_file_45
made df_file_46
made df_file_47
made df_file_48
made df_file_49
made df_file_50
made df_file_51
made df_file_52
made df_file_53
made df_file_54
made df_file_55
made df_file_56
made df_file_57
made df_file_58
made df_file_59
made df_file_60
made df_file_61
made df_file_62
made df_file_63
m

In [None]:
# Inspect one parsed DataFrame (tokens with their dependency relations)
dataframes['df_file_53']

Unnamed: 0,token,deprel
0,Похоже,parataxis
1,",",punct
2,популярный,amod
3,способ,nsubj
4,преобразования,nmod
...,...,...
778,использовать,xcomp
779,u_strToUpper,flat:foreign
780,и,cc
781,u_strToLower,flat:foreign
