In [17]:
from pattern.web import Wikipedia, plaintext
import string
import re
from nltk.util import ngrams
from collections import Counter

In [56]:
class WikiParser:
    """Fetch Wikipedia articles and normalise their text for frequency counting.

    Uses ``Wikipedia`` and ``plaintext`` from ``pattern.web`` for retrieval and
    HTML stripping; ``text_cleaning`` itself only needs the standard library.
    """

    # Characters removed by text_cleaning: ASCII punctuation, digits, the en
    # dash and guillemets (the trailing ASCII symbols duplicate entries that
    # string.punctuation already contains; kept to mirror the original set).
    _EXCLUDED_CHARS = string.punctuation + '0123456789–«»(?:[]|$)'
    # Deletion table built once so cleaning is a single pass over the text
    # instead of a linear scan of the exclusion string for every character.
    _DELETE_TABLE = str.maketrans('', '', _EXCLUDED_CHARS)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    _WHITESPACE_RUN = re.compile(r'\s{2,}')

    def __init__(self):
        """The parser keeps no state between calls."""

    def text_cleaning(self, text):
        """Return ``text`` without punctuation/digits, lower-cased.

        Runs of two or more whitespace characters are collapsed into a single
        space. A lone whitespace character (including a single newline or tab)
        is left untouched, and the result is not stripped.

        :param text: the string to clean.
        :return: the cleaned, lower-cased string.
        """
        without_symbols = text.translate(self._DELETE_TABLE)
        collapsed = self._WHITESPACE_RUN.sub(' ', without_symbols)
        return collapsed.lower()

    def get_articles(self, start, depth=1, max_count=1):
        """Return the cleaned text of every article linked from ``start``.

        One request is made per linked title, so this can be slow for articles
        with many links.

        :param start: title of the article whose links are followed.
        :param depth: accepted for interface compatibility; not used - only the
            direct links of ``start`` are followed.
        :param max_count: accepted for interface compatibility; not used -
            every linked article is fetched.
        :return: list of cleaned article texts, in link order. Links whose
            article cannot be retrieved are skipped; if ``start`` itself cannot
            be retrieved the list is empty.
        """
        # One client is enough for the whole crawl.
        wiki = Wikipedia()
        start_article = wiki.article(start)
        # pattern.web returns None when no article is found for a title.
        if start_article is None:
            return []

        cleaned_texts = []
        for title in start_article.links:
            page = wiki.article(title)
            if page is None:
                # Dead or red link: skip it rather than abort the whole crawl.
                continue
            cleaned_texts.append(self.text_cleaning(plaintext(page.source)))
        return cleaned_texts

In [72]:
class TextStatistics:
    """Frequency statistics over a collection of article texts.

    Each article is expected to be a string; tokens are obtained with
    ``str.split()`` (whitespace separated).
    """

    # Words ignored by get_top_words. Stored once as a frozenset so membership
    # tests are constant-time and the collection is not rebuilt on every call.
    _STOP_WORDS = frozenset([
        'a', 'an', 'the', 'as', 'in', 'out', 'on', 'off', 'until', 'of', 'at', 'by', 'for', 'with',
        'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'and', 'but', 'or',
        'to', 'from', 'up', 'down', 'since', 'over', 'under', 'about', 'against', 'like', 'via', 'not',
    ])

    def __init__(self, articles):
        """Store the iterable of article strings to analyse."""
        self.articles = articles

    @staticmethod
    def _split_ranking(ranking):
        """Turn ``[(item, count), ...]`` into ``([item, ...], [count, ...])``."""
        items = [item for item, _ in ranking]
        counts = [count for _, count in ranking]
        return (items, counts)

    def get_top_3grams(self, n):
        """Return the ``n`` most frequent word 3-grams across all articles.

        3-grams are built per article, so they never span two articles.

        :param n: how many 3-grams to return.
        :return: tuple ``(grams, counts)`` of two parallel lists ordered by
            descending frequency; each gram is a tuple of three words.
        """
        counter = Counter()
        for article in self.articles:
            # Feed the n-grams straight into the counter instead of first
            # collecting every occurrence in one large list.
            counter.update(ngrams(article.split(), 3))
        return self._split_ranking(counter.most_common(n))

    def get_top_words(self, n):
        """Return the ``n`` most frequent words, ignoring stop words.

        :param n: how many words to return.
        :return: tuple ``(words, counts)`` of two parallel lists ordered by
            descending frequency.
        """
        stop_words = self._STOP_WORDS
        counter = Counter()
        for article in self.articles:
            counter.update(word for word in article.split() if word not in stop_words)
        return self._split_ranking(counter.most_common(n))

In [77]:
class Experiment:
    """Print word and 3-gram frequency reports for a Wikipedia article.

    Two reports are produced: one over the articles linked from the given
    article, and one over the article itself.
    """

    def __init__(self, article):
        """
        :param article: title of the Wikipedia article to analyse.
        """
        self.article = article
        self.parser = WikiParser()

    @staticmethod
    def _format_ranking(labels, counts):
        """Render parallel label/count lists as ``label : count`` lines."""
        return '\n'.join(label + ' : ' + str(count) for label, count in zip(labels, counts))

    def _print_report(self, title, statistics, top_n):
        """Print the top ``top_n`` 3-grams and words of ``statistics`` under ``title``."""
        # Both rankings are computed before anything is printed.
        grams, gram_counts = statistics.get_top_3grams(top_n)
        words, word_counts = statistics.get_top_words(top_n)

        print('{}\nTop {} 3grams:'.format(title, top_n))
        # Each 3-gram is a sequence of words; show it as a space-separated phrase.
        print(self._format_ranking([' '.join(gram) for gram in grams], gram_counts))
        print('\nTop {} words:'.format(top_n))
        print(self._format_ranking(words, word_counts))

    def show_results(self, links_top_n=20, article_top_n=5):
        """Fetch the texts and print both frequency reports.

        The linked articles are downloaded and reported first, then the article
        itself is downloaded and reported.

        :param links_top_n: number of entries shown in the linked-articles
            report (default 20).
        :param article_top_n: number of entries shown in the report for the
            article itself (default 5).
        """
        linked_texts = self.parser.get_articles(self.article)
        self._print_report('For links in article', TextStatistics(linked_texts), links_top_n)

        article_source = Wikipedia().article(self.article).source
        article_text = self.parser.text_cleaning(plaintext(article_source))
        # The leading newline separates this report from the previous one.
        self._print_report('\nFor article', TextStatistics([article_text]), article_top_n)

In [78]:
# Run the experiment for the "Natural language processing" article and print
# both reports. This downloads the article and each article it links to, so it
# needs network access and can take a while.
x = Experiment('Natural language processing')
x.show_results()

For links in article
Top 20 3grams:
natural language processing : 336
from the original : 306
archived from the : 296
v t e : 277
the original on : 238
the use of : 238
as well as : 223
one of the : 205
a b c : 186
proceedings of the : 182
the european union : 163
cambridge university press : 158
of the european : 155
such as the : 151
the number of : 143
a number of : 141
university press isbn : 140
for example the : 136
a set of : 131
based on the : 130

Top 20 words:
is : 8681
that : 4821
are : 4300
language : 4073
be : 3397
it : 2726
this : 2527
which : 2204
can : 1965
english : 1801
was : 1796
speech : 1739
languages : 1709
retrieved : 1708
such : 1700
words : 1666
also : 1658
have : 1655
other : 1629
word : 1560

For article
Top 5 3grams:
natural language processing : 16
chunk of text : 6
a chunk of : 6
of natural language : 5
proceedings of the : 4

Top 5 words:
language : 59
is : 48
natural : 39
such : 30
processing : 28
