In [1]:
import os
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("whitegrid")

from nltk.tokenize import WhitespaceTokenizer
from string import punctuation

# HW1:  Сравнение стилей текстов
### Выполнили:  Булгаков Дмитрий, Тефикова Алие
### Группа ИАД-2

# 1. Loading data from file 

Составьте самостоятельно как минимум две коллекции
текстов разных стилей (например, коллекция текстов в публицистическом
стиле и коллекция текстов в научном стиле). Коллекции текстов
должны быть достаточно большие (порядка 5000 токенов). Посчитайте
количество токенов и типов в каждой коллекции.

### 1.1 Loading data from files

In [2]:
def remove_control_characters(text_string):
    return ''.join(filter(None, text_string.splitlines()))

### 1.1.1 Reading fiction text from file (War and Peace by Leo Tolstoy)

War and Peace (pre-reform Russian: Война́ и миръ; post-reform Russian: Война́ и мир, translit. Voyná i mir [vɐjˈna i ˈmʲir]) is a novel by the Russian author Leo Tolstoy, which is regarded as a central work of world literature and one of Tolstoy's finest literary achievements

<b>Link:<b> http://www.gutenberg.org/files/2600/2600-0.txt

In [3]:
fiction = open('data/wap.txt', encoding='utf-8').read()
fiction = remove_control_characters(fiction)

### 1.1.2 Parsing journalistic style texts from Returers article corpus

Currently the most widely used test collection for text categorization research, though likely to be superceded over the next few years by RCV1.  The data was originally collected and labeled by Carnegie Group, Inc. and Reuters, Ltd. in the course of developing the CONSTRUE text categorization system. 

<b>Link<b>: http://www.daviddlewis.com/resources/testcollections/reuters21578/

In [4]:
tags_to_remove = ['date', 'topics', 'places', 'people', 'orgs', 'exchanges', 'companies', 
                  'unknown', 'title', 'dateline']
reuters_stopwords = ['reuters']

In [5]:
def parseXmlFile(path, tags, stopwords):
    xml_soup = BeautifulSoup(open(path), 'lxml')
    
    for trash in xml_soup(tags):
        trash.extract()
        
    parsed_text = xml_soup.get_text() # getting text from file 
    parsed_text = remove_control_characters(parsed_text) # removing control characters    
    parsed_text_words = [word for word in parsed_text.split() if word.lower() not in stopwords] # removing stop words
    return ' '.join(parsed_text_words)

In [6]:
journalistic = parseXmlFile('data/reuters.txt', tags=tags_to_remove, stopwords=reuters_stopwords)

### 1.1.3 Reading scientific style texts from PhenoCHF  corpus

PhenoCHF - A corpus consisting of biomedical articles and clinincal records, annotated with phenotypic information related with congestive heart failure (CHF). Various levels of anonotation are included, i.e., entity mentions, their normalisation to concept IDs in the UMLS Metathesarus, and relations involving entity mentions.

<b>Link<b>: http://www.nactem.ac.uk/phenotype/

In [20]:
scientific = open('data/phenoCHF.txt', encoding='utf-8').read()
scientific = remove_control_characters(scientific)

### 1.2 Counting tokens and types

In [21]:
exclude_symbols = set(punctuation + '0123456789'+u'–—'+u'«»'+u'“')

In [22]:
def tokenize(text, exlude_symb):
    text = text.lower()
    text_merged = ''.join(ch for ch in text if ch not in exlude_symb)
    text_tokens = WhitespaceTokenizer().tokenize(text_merged.lower())
    return text_tokens

In [23]:
def print_results(tokens):
    print('N of tokens: ', len(tokens))
    types = nltk.FreqDist(tokens)
    print('N of types:', len(types))
    print(types)

### 1.2.1 Fiction

In [24]:
fiction_tokens = tokenize(fiction, exclude_symbols)

In [25]:
for i in fiction_tokens[:10]: 
    print(i) 

well
prince
so
genoa
and
lucca
are
now
just
family


In [26]:
print_results(fiction_tokens)

N of tokens:  510360
N of types: 56583
<FreqDist with 56583 samples and 510360 outcomes>


### 1.2.2 Journalistic

In [27]:
journalistic_tokens = tokenize(journalistic, exclude_symbols)

In [28]:
for i in journalistic_tokens[:10]: 
    print(i) 

showers
continued
throughout
the
week
inthe
bahia
cocoa
zone
alleviating


In [29]:
print_results(journalistic_tokens)

N of tokens:  308598
N of types: 34354
<FreqDist with 34354 samples and 308598 outcomes>


### 1.2.3 Scientific

In [30]:
scientific_tokens = tokenize(scientific, exclude_symbols)

In [31]:
for i in scientific_tokens[:10]: 
    print(i) 

left
ventricular
disease
occurs
frequentlyin
dialysis
patients
it
may
be


In [32]:
print_results(scientific_tokens)

N of tokens:  32623
N of types: 5222
<FreqDist with 5222 samples and 32623 outcomes>
