In [65]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.util import ngrams
import re

### Load dataframes:

In [66]:
# Both corpora are decoded explicitly as UTF-8 so the result does not depend
# on the platform's default encoding.

# Undecodable bytes in the anamnesa file are kept as backslash escapes
# (e.g. "\xff") instead of aborting the load.
with open('../data_per_simpCode_00_clean_anamnesa.txt', encoding='utf-8', errors='backslashreplace') as f:
    lines = f.read().splitlines()

# One row per input line, stored in column 0 with the pandas string dtype.
df_anamnesa = pd.DataFrame(lines, dtype='string')

# The status file is decoded strictly: a malformed byte raises
# UnicodeDecodeError, as it did before on a UTF-8 platform.
with open('../data_per_simpCode_00_clean_status.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()

df_status = pd.DataFrame(lines, dtype='string')

### Define statistics functions

In [67]:
def count_all_symbols(df):
    """Return the total number of characters across all rows of column 0.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.

    Returns:
        The sum of the per-row string lengths.
    """
    row_lengths = df[0].str.len()
    return row_lengths.sum()

def punctuation_marks_statistics(df, all_symbols_count):
    """Count occurrences of common punctuation marks in column 0.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.
        all_symbols_count: Total number of characters, used as the
            denominator for the percentage.

    Returns:
        A dict keyed by the regex pattern used for each mark (e.g. ``'\\.'``
        for a full stop), ordered by descending count. Each value is a dict
        with ``'count'`` and ``'percentage'`` entries.
    """
    # Patterns are regular expressions passed to Series.str.count, so regex
    # metacharacters are backslash-escaped. Raw strings keep the exact same
    # pattern text (and therefore the same result keys) while avoiding the
    # invalid-escape-sequence warning that non-raw literals like '\.' trigger.
    punctuation_marks = [
        ' ', r'\.', r'\?', r'\!', r'\-', r'\:', r'\;', "'", '"',
        r'\(', r'\)', r'\{', r'\}', r'\[', r'\]', r'\*',
    ]
    mark_statistic = {}
    for mark in punctuation_marks:
        count = df[0].str.count(mark).sum()
        mark_statistic[mark] = {
            'count': count,
            'percentage': (count / all_symbols_count) * 100,
        }
    # sorted() is stable, so marks with equal counts keep their listed order.
    ranked = sorted(mark_statistic.items(), key=lambda item: item[1]['count'], reverse=True)
    return dict(ranked)

def digits_statistics(df, all_symbols_count):
    """Count ASCII digits (0-9) in column 0 and their share of all symbols.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.
        all_symbols_count: Total number of characters, used as the
            denominator for the percentage.

    Returns:
        A dict with ``'count'`` and ``'percentage'`` entries.
    """
    digit_total = df[0].str.count(r"[0-9]").sum()
    share = (digit_total / all_symbols_count) * 100
    return dict(count=digit_total, percentage=share)

def latin_letters_statistics(df, all_symbols_count):
    """Count ASCII Latin letters (A-Z, a-z) in column 0 and their share.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.
        all_symbols_count: Total number of characters, used as the
            denominator for the percentage.

    Returns:
        A dict with ``'count'`` and ``'percentage'`` entries.
    """
    latin_total = df[0].str.count(r"[A-Za-z]").sum()
    share = (latin_total / all_symbols_count) * 100
    return dict(count=latin_total, percentage=share)

def cyrrilic_letters_statistics(df, all_symbols_count):
    """Count Cyrillic letters in column 0 and their share of all symbols.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.
        all_symbols_count: Total number of characters, used as the
            denominator for the percentage.

    Returns:
        A dict with ``'count'`` and ``'percentage'`` entries.
    """
    # NOTE(review): the range а-я / А-Я does not include ё/Ё or other
    # extended Cyrillic letters - confirm this is sufficient for the corpus.
    cyrillic_total = df[0].str.count(r"[а-яА-Я]").sum()
    share = (cyrillic_total / all_symbols_count) * 100
    return dict(count=cyrillic_total, percentage=share)

def average_sentence_length(df):
    """Compute the average sentence length in characters and in words.

    Each row of column 0 is split into sentences on the literal delimiter
    ``'. '`` (full stop followed by a space). Words are the
    whitespace-separated tokens of a sentence, so runs of spaces do not
    produce empty "words".

    Args:
        df: DataFrame whose column ``0`` holds the text lines.

    Returns:
        A dict with ``'chars_average'`` and ``'words_average'``. Both are
        ``0.0`` when there are no rows.
    """
    total_length_chars = 0
    total_length_words = 0
    count = 0
    for row in df[0]:
        # str.split takes a literal separator, not a regex: the previous
        # '\. ' looked for backslash-dot-space and never split anything.
        for sentence in row.split('. '):
            total_length_chars += len(sentence)
            total_length_words += len(sentence.split())
            count += 1

    if count == 0:
        # No rows at all: avoid dividing by zero.
        return {'chars_average': 0.0, 'words_average': 0.0}

    return {
        'chars_average': total_length_chars / count,
        'words_average': total_length_words / count,
    }

def get_statistics(df):
    """Print a character-level statistics report for the text in column 0.

    The report lists the total symbol count, per-mark punctuation
    statistics, digit / Latin / Cyrillic letter statistics and the average
    sentence length, each on its own line.

    Args:
        df: DataFrame whose column ``0`` holds the text lines.
    """
    total_symbols = count_all_symbols(df)
    print(f"Total symbols count: {total_symbols}")

    # One line per punctuation mark, in the order the helper returns them.
    for mark, mark_stats in punctuation_marks_statistics(df, total_symbols).items():
        print(f"Total count for {mark} is {mark_stats}")

    print(f"Digits statistic is {digits_statistics(df, total_symbols)}")
    print(f"Latin letters statistic is {latin_letters_statistics(df, total_symbols)}")
    print(f"Cyrrilic letters statistic is {cyrrilic_letters_statistics(df, total_symbols)}")
    print(f"Average sentence length is {average_sentence_length(df)}")


### Perform statistics on anamnesa.txt:

In [68]:
get_statistics(df_anamnesa)

Total symbols count: 485501
Total count for   is {'count': 66630, 'percentage': 13.723967612837049}
Total count for \. is {'count': 6194, 'percentage': 1.2757955184438343}
Total count for " is {'count': 4612, 'percentage': 0.9499465500585993}
Total count for \- is {'count': 1919, 'percentage': 0.395261801726464}
Total count for \; is {'count': 1206, 'percentage': 0.2484031958739529}
Total count for \! is {'count': 129, 'percentage': 0.02657049110094521}
Total count for \: is {'count': 105, 'percentage': 0.02162714391937401}
Total count for \? is {'count': 100, 'percentage': 0.02059727992321334}
Total count for \( is {'count': 30, 'percentage': 0.006179183976964002}
Total count for \) is {'count': 30, 'percentage': 0.006179183976964002}
Total count for \* is {'count': 4, 'percentage': 0.0008238911969285336}
Total count for ' is {'count': 3, 'percentage': 0.0006179183976964002}
Total count for \[ is {'count': 2, 'percentage': 0.0004119455984642668}
Total count for \{ is {'count': 0, 'per

### Perform statistics on status.txt:

In [69]:
get_statistics(df_status)

Total symbols count: 8227378
Total count for   is {'count': 1121323, 'percentage': 13.629165938407109}
Total count for \. is {'count': 243213, 'percentage': 2.9561422849418126}
Total count for \- is {'count': 186162, 'percentage': 2.262713588703473}
Total count for " is {'count': 89786, 'percentage': 1.091307583047722}
Total count for \: is {'count': 27130, 'percentage': 0.32975268670042873}
Total count for \; is {'count': 24209, 'percentage': 0.29424927358388053}
Total count for \) is {'count': 2901, 'percentage': 0.035260322304384216}
Total count for \( is {'count': 2866, 'percentage': 0.034834913383097264}
Total count for ' is {'count': 214, 'percentage': 0.0026010716901545064}
Total count for \* is {'count': 198, 'percentage': 0.002406599040423328}
Total count for \! is {'count': 140, 'percentage': 0.0017016356851478076}
Total count for \] is {'count': 130, 'percentage': 0.0015800902790658214}
Total count for \[ is {'count': 123, 'percentage': 0.0014950084948084311}
Total count for