In [1]:
import os
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("whitegrid")

from nltk.tokenize import WhitespaceTokenizer
from string import punctuation

# HW1:  Сравнение стилей текстов
### Выполнили:  Булгаков Дмитрий, Тефикова Алие
### Группа ИАД-2

# 1. Loading data from file 

Составьте самостоятельно как минимум две коллекции
текстов разных стилей (например, коллекция текстов в публицистическом
стиле и коллекция текстов в научном стиле). Коллекции текстов
должны быть достаточно большие (порядка 5000 токенов). Посчитайте
количество токенов и типов в каждой коллекции.

### 1.1 Loading data from files

In [2]:
def remove_control_characters(text_string):
    return ''.join(filter(None, text_string.splitlines()))

### 1.1.1 Reading fiction text from file (War and Peace by Leo Tolstoy)

War and Peace (pre-reform Russian: Война́ и миръ; post-reform Russian: Война́ и мир, translit. Voyná i mir [vɐjˈna i ˈmʲir]) is a novel by the Russian author Leo Tolstoy, which is regarded as a central work of world literature and one of Tolstoy's finest literary achievements

<b>Link:<b> http://www.gutenberg.org/files/2600/2600-0.txt

In [3]:
fiction = open('data/wap.txt', encoding='utf-8').read()
fiction = remove_control_characters(fiction)

### 1.1.2 Parsing journalistic style texts from Returers article corpus

Currently the most widely used test collection for text categorization research, though likely to be superceded over the next few years by RCV1.  The data was originally collected and labeled by Carnegie Group, Inc. and Reuters, Ltd. in the course of developing the CONSTRUE text categorization system. 

<b>Link<b>: http://www.daviddlewis.com/resources/testcollections/reuters21578/

In [4]:
tags_to_remove = ['date', 'topics', 'places', 'people', 'orgs', 'exchanges', 'companies', 
                  'unknown', 'title', 'dateline']
reuters_stopwords = ['reuters']

In [5]:
def parseXmlFile(path, tags, stopwords):
    xml_soup = BeautifulSoup(open(path), 'lxml')
    
    for trash in xml_soup(tags):
        trash.extract()
        
    parsed_text = xml_soup.get_text() # getting text from file 
    parsed_text = remove_control_characters(parsed_text) # removing control characters    
    parsed_text_words = [word for word in parsed_text.split() if word.lower() not in stopwords] # removing stop words
    return ' '.join(parsed_text_words)

In [6]:
journalistic = parseXmlFile('data/reuters.txt', tags=tags_to_remove, stopwords=reuters_stopwords)

### 1.1.3 Reading scientific style texts from PhenoCHF  corpus

PhenoCHF - A corpus consisting of biomedical articles and clinincal records, annotated with phenotypic information related with congestive heart failure (CHF). Various levels of anonotation are included, i.e., entity mentions, their normalisation to concept IDs in the UMLS Metathesarus, and relations involving entity mentions.

<b>Link<b>: http://www.nactem.ac.uk/phenotype/

In [7]:
scientific = open('data/phenoCHF.txt', encoding='utf-8').read()
scientific = remove_control_characters(scientific)

### 1.1.4 Reading conversational style texts from Skam, Shameless TV shows

Skam (Norwegian pronunciation: [skɑm]; English: shame) is a Norwegian young adult TV series about the daily life of teenagers at the Hartvig Nissen School (Hartvig Nissens skole), a gymnasium in the wealthy borough of Frogner in West End Oslo. It is produced by NRK P3, which is part of NRK.

<b>Link:<b> https://drive.google.com/drive/folders/0Bxy61gL9aCrhQldNd0E4Vk8tWUk

In [8]:
conversational = open('data/skam_shameless.txt', encoding='utf-8').read()
conversational = remove_control_characters(conversational)

### 1.2 Counting tokens and types

In [9]:
exclude_symbols = set(punctuation + '0123456789'+u'–—'+u'«»'+u'“')

In [10]:
def tokenize(text, exlude_symb):
    text = text.lower()
    text_merged = ''.join(ch for ch in text if ch not in exlude_symb)
    text_tokens = WhitespaceTokenizer().tokenize(text_merged.lower())
    return text_tokens

In [11]:
def print_results(tokens):
    print('N of tokens: ', len(tokens))
    types = nltk.FreqDist(tokens)
    print('N of types:', len(types))
    print(types)

### 1.2.1 Fiction

In [12]:
fiction_tokens = tokenize(fiction, exclude_symbols)

In [13]:
for i in fiction_tokens[:10]: 
    print(i) 

well
prince
so
genoa
and
lucca
are
now
just
family


In [14]:
print_results(fiction_tokens)

N of tokens:  510360
N of types: 56583
<FreqDist with 56583 samples and 510360 outcomes>


### 1.2.2 Journalistic

In [15]:
journalistic_tokens = tokenize(journalistic, exclude_symbols)

In [16]:
for i in journalistic_tokens[:10]: 
    print(i) 

showers
continued
throughout
the
week
inthe
bahia
cocoa
zone
alleviating


In [17]:
print_results(journalistic_tokens)

N of tokens:  308598
N of types: 34354
<FreqDist with 34354 samples and 308598 outcomes>


### 1.2.3 Scientific

In [18]:
scientific_tokens = tokenize(scientific, exclude_symbols)

In [19]:
for i in scientific_tokens[:10]: 
    print(i) 

left
ventricular
disease
occurs
frequentlyin
dialysis
patients
it
may
be


In [20]:
print_results(scientific_tokens)

N of tokens:  32623
N of types: 5222
<FreqDist with 5222 samples and 32623 outcomes>


### 1.2.4 Conversational

In [21]:
conversational_tokens = tokenize(conversational, exclude_symbols)

In [22]:
for i in conversational_tokens[:10]: 
    print(i) 

i
didnt
think
she
was
his
typeneither
did
iwhen
did


In [23]:
print_results(conversational_tokens)

N of tokens:  270145
N of types: 23778
<FreqDist with 23778 samples and 270145 outcomes>


### 2. Counting parts of speech

Используя любой морфологический процессор, которыйвам нравится (pymorphy2, mystem), определите к какой части речиотносятся слова из каждой коллекции текстов. При помощи nltk.FreqDist()составьте частотные словари: часть речи – количество слов, к нейотносящихся

In [33]:
from nltk.corpus import stopwords

for i in stopwords.words('english'):
    print(i)
mystopwords = stopwords.words('english')

i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now
d
ll
m
o
re
ve
y
ain
aren
couldn
didn
doesn
hadn
hasn
haven
isn
ma
mightn
mustn
needn
shan
shouldn
wasn
weren
won
wouldn


In [34]:
import pymorphy2

#### fiction

In [40]:
morph = pymorphy2.MorphAnalyzer()
lemmata = nltk.FreqDist()

mystopwords = stopwords.words('english')

f_types = nltk.FreqDist(fiction_tokens)
for t in f_types:
    try:
        l = morph.parse(t)[0].normal_form
        if l in lemmata:
            lemmata[l] += f_types[t]
        else:
            lemmata[l] = f_types[t]
    except IndexError:
        if t in lemmata:
            lemmata[t] += f_types[t]
        else:
            lemmata[t] = f_types[t]
print('N of lemmata:', len(lemmata))
for i in lemmata.most_common(10):
    print(i[0], i[1])

N of lemmata: 56583
the 28554
and 18642
to 14574
of 13021
a 9085
he 8335
in 7623
his 6770
that 6620
was 6347


In [41]:
lemmata_no_sw = nltk.FreqDist()
for l in lemmata:
    if not l in mystopwords:
        lemmata_no_sw[l] = lemmata[l]
for i in lemmata_no_sw.most_common(20):
    print(i[0], i[1])

said 2463
one 1697
prince 1383
pierre 1214
would 1157
could 941
man 867
andrew 829
went 769
time 748
natásha 722
old 699
know 688
face 687
french 681
men 647
eyes 633
princess 630
thought 625
like 620


#### conversational

In [42]:
morph = pymorphy2.MorphAnalyzer()
lemmata = nltk.FreqDist()

mystopwords = stopwords.words('english')

c_types = nltk.FreqDist(conversational_tokens)
for t in c_types:
    try:
        l = morph.parse(t)[0].normal_form
        if l in lemmata:
            lemmata[l] += c_types[t]
        else:
            lemmata[l] = c_types[t]
    except IndexError:
        if t in lemmata:
            lemmata[t] += c_types[t]
        else:
            lemmata[t] = c_types[t]
print('N of lemmata:', len(lemmata))
for i in lemmata.most_common(10):
    print(i[0], i[1])

N of lemmata: 23778
you 8331
the 7514
a 6606
to 6151
i 4950
of 2836
your 2361
it 2304
in 2249
my 2232


In [43]:
lemmata_no_sw = nltk.FreqDist()
for l in lemmata:
    if not l in mystopwords:
        lemmata_no_sw[l] = lemmata[l]
for i in lemmata_no_sw.most_common(20):
    print(i[0], i[1])

get 1615
dont 1552
know 1358
got 1164
im 1139
like 1131
want 1065
youre 968
go 878
gonna 864
need 801
good 610
cant 590
think 585
take 578
come 558
really 508
one 504
right 501
going 494


#### scientific

In [44]:
morph = pymorphy2.MorphAnalyzer()
lemmata = nltk.FreqDist()

mystopwords = stopwords.words('english')

s_types = nltk.FreqDist(scientific_tokens)
for t in s_types:
    try:
        l = morph.parse(t)[0].normal_form
        if l in lemmata:
            lemmata[l] += s_types[t]
        else:
            lemmata[l] = s_types[t]
    except IndexError:
        if t in lemmata:
            lemmata[t] += s_types[t]
        else:
            lemmata[t] = s_types[t]
print('N of lemmata:', len(lemmata))
for i in lemmata.most_common(10):
    print(i[0], i[1])

N of lemmata: 5222
of 1441
the 1347
in 1114
and 1000
with 643
to 580
a 519
patients 476
is 456
for 319


In [45]:
lemmata_no_sw = nltk.FreqDist()
for l in lemmata:
    if not l in mystopwords:
        lemmata_no_sw[l] = lemmata[l]
for i in lemmata_no_sw.most_common(20):
    print(i[0], i[1])

patients 476
risk 250
disease 223
renal 210
ckd 167
cvd 163
may 161
heart 150
failure 145
dialysis 124
associated 122
esrd 119
levels 117
factors 116
mortality 116
increased 116
studies 107
also 103
kidney 96
cardiovascular 95


#### journalistic

In [50]:
morph = pymorphy2.MorphAnalyzer()
lemmata = nltk.FreqDist()

mystopwords = stopwords.words('english') + [u'mln', u'dlrs', u'vs', u'pct', u'cts', u'us', u'would']

j_types = nltk.FreqDist(journalistic_tokens)
for t in j_types:
    try:
        l = morph.parse(t)[0].normal_form
        if l in lemmata:
            lemmata[l] += j_types[t]
        else:
            lemmata[l] = j_types[t]
    except IndexError:
        if t in lemmata:
            lemmata[t] += j_types[t]
        else:
            lemmata[t] = j_types[t]
print('N of lemmata:', len(lemmata))
for i in lemmata.most_common(10):
    print(i[0], i[1])

N of lemmata: 34354
the 16142
of 8666
to 8377
said 6688
and 6345
a 6179
in 6140
mln 3658
for 3190
dlrs 2873


In [51]:
lemmata_no_sw = nltk.FreqDist()
for l in lemmata:
    if not l in mystopwords:
        lemmata_no_sw[l] = lemmata[l]
for i in lemmata_no_sw.most_common(20):
    print(i[0], i[1])

said 6688
year 1400
billion 1257
company 990
net 974
inc 842
loss 840
bank 765
new 741
corp 714
also 653
last 652
one 643
march 631
sales 601
share 593
stock 570
shares 556
market 553
profit 534
