In [3]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import sys, os
import json

In [4]:
os.chdir(sys.path[0])

<h4>Loading the json-list</h4>

In [5]:
def load_data(title):
    with open(title, encoding = 'utf-8') as f:
        return json.load(f)

In [6]:
news_list = load_data('all_text_about_russia.json')

<h4>merge into common text and clean</h4>

In [7]:
text = ' '.join(news_list)
text = text.lower()

In [8]:
import string

# special chars to be removed
spec_chars = string.punctuation + '\n\xa0«»\t—…\r€’”“' 
# function to remove special chars
def remove_chars_from_text(text, chars):
    return "".join([ch for ch in text if ch not in chars])

In [9]:
# in here i remove special chars using remove_chars_from_text()
text = remove_chars_from_text(text, spec_chars)
text = remove_chars_from_text(text, string.digits)

In [10]:
text



<h4>Transform words into tokens using nltk</h4>

In [11]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
text_tokens = word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
text = nltk.Text(text_tokens)
len(text)

73450

In [13]:
from nltk.probability import FreqDist
fdist = FreqDist(text)

In [14]:
fdist

FreqDist({'the': 4232, 'to': 2205, 'and': 2064, 'of': 1984, 'in': 1783, 'a': 1525, 'that': 762, 'for': 731, 'on': 687, 'is': 673, ...})

<h5>There are too many useless words in the text</h5>

<h4>Remove stop-words using nltk</h4>

<h5>I'm going to analyze the text and expand the nltk stopwords dictionary</h5>

In [45]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
stopwords.extend(['russia', 'russian', 'byrussia', 'new', 'russias', 'said', 'us', 'also', 'news', 'one', 'would', 'could', 
                 'since', 'united', 'foreign', 'two', 'may', 'years', 'even', 'last', 'use', 'first', 'second', 'including',
                 'many', 'well', 'see', 'says', 'year', 'take', 'used', 'mr', 'according', 'still', 'around', 'countries',
                  'country', 'state', 'states', 'russians', 'official', 'three', 'made', 'official', 'time', 'group'])
filtered_words = [word for word in text if word not in stopwords]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
fdist = FreqDist(filtered_words)
fdist

FreqDist({'moscow': 169, 'china': 130, 'military': 119, 'people': 112, 'international': 100, 'putin': 97, 'government': 94, 'economic': 85, 'media': 85, 'vaccine': 83, ...})

<h5>50 most mentioned words</h5>

In [47]:
fdist.most_common(50)

[('moscow', 169),
 ('china', 130),
 ('military', 119),
 ('people', 112),
 ('international', 100),
 ('putin', 97),
 ('government', 94),
 ('economic', 85),
 ('media', 85),
 ('vaccine', 83),
 ('reuters', 78),
 ('cookies', 77),
 ('data', 76),
 ('information', 74),
 ('president', 73),
 ('covid', 71),
 ('authorities', 68),
 ('region', 68),
 ('security', 67),
 ('biden', 66),
 ('trade', 66),
 ('navalny', 65),
 ('power', 63),
 ('rights', 62),
 ('mediterranean', 59),
 ('officials', 57),
 ('world', 57),
 ('soviet', 57),
 ('policy', 57),
 ('political', 56),
 ('kremlin', 55),
 ('influence', 55),
 ('gas', 55),
 ('part', 55),
 ('space', 55),
 ('global', 55),
 ('defense', 54),
 ('egypt', 53),
 ('countrys', 51),
 ('vladimir', 51),
 ('oil', 51),
 ('war', 50),
 ('july', 50),
 ('central', 50),
 ('sputnik', 50),
 ('times', 48),
 ('v', 48),
 ('like', 48),
 ('internet', 48),
 ('turkey', 48)]

<h4>Word cloud visualization</h4>

In [48]:
import matplotlib.pyplot as plt
text = filtered_words
text_raw = " ".join(text)

In [49]:
wc = WordCloud(
    background_color= '#c8dbcd',
    height = 600,
    width = 400
)

In [50]:
wc.generate(text_raw)

<wordcloud.wordcloud.WordCloud at 0x21953588100>

<h4>Save to .jpg</h4>

In [51]:
wc.to_file('wordcloud_output.jpg')

<wordcloud.wordcloud.WordCloud at 0x21953588100>