# Word cloud example for a PDF file retrieved from the Web

In [None]:
import os, re, string

### Download PDF

In [None]:
from urllib.request import urlopen

urllink = 'http://www.ysk.gov.tr/doc/karar/dosya/78053/2019-4219.pdf'
# Use the last path segment of the URL as the local file name.
filename = urllink.split('/')[-1]
# Write the response body to disk: the conversion step opens `filename` from
# the local file system, so merely opening the URL is not enough. The `with`
# statement also closes both the HTTP response and the output file.
with urlopen(urllink) as url, open(filename, 'wb') as pdf_file:
    pdf_file.write(url.read())
filename

### Convert PDF to text

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

def convert_pdf_to_txt(path, pages=None):
    """Extract the text content of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read (opened in binary mode).
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is handed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        The extracted text as one string.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()

    # Context managers / ``finally`` guarantee that the buffer, the converter
    # and the input file are released even if page processing raises.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()

In [None]:
# Extract the text of the downloaded PDF (no page restriction is passed).
text = convert_pdf_to_txt(filename)
# Alternative: restrict the extraction to an explicit list of pages.
#text = convert_pdf_to_txt(filename, pages=[0,1])

### Text data munging

In [None]:
# Replace every character that is neither a word character nor whitespace
# (i.e. punctuation) with a space.
text = re.sub(r'[^\w\s]', ' ', text)

# Drop, in this order: line breaks, the repeated page heading, the
# letter-spaced "K A R A R" title. The heading literal only matches once
# the line breaks are gone, so the order matters.
for fragment in ('\n',
                 'T C  YÜKSEK SEÇİM KURULU         Karar No   4219',
                 'K A R A R'):
    text = text.replace(fragment, '')

text = re.sub(r'\d+', r'', text)      # remove digits
text = text.replace('\x0c', '')       # remove form-feed characters
# Collapse runs of spaces, then strip leading/trailing whitespace.
text = re.sub(r' +', r' ', text).strip()

### Define wordcloud function

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

def drawWordCloud(data, color='black', title=None):
    """Plot a word cloud built from the space-joined items of *data*.

    Args:
        data: Iterable of strings; the items are joined with single spaces
            and the resulting text is passed to
            ``WordCloud.generate_from_text``.
        color: Background colour handed to ``WordCloud``.
        title: Title shown above the plot. When ``None`` (the default) the
            notebook-level ``filename`` is used, which keeps existing calls
            unchanged.
    """
    corpus = ' '.join(data)
    wordcloud = WordCloud(background_color=color,
                          max_words=100, width=2000, height=1500
                         ).generate_from_text(corpus)
    plt.figure(figsize=(12, 12))
    # Fall back to the global file name only when no title was supplied.
    plt.title(filename if title is None else title, fontsize=22)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

### Create a corpus and plot wordcloud for all words in file

In [None]:
from nltk.corpus import stopwords

# Tokenise the cleaned text on single spaces.
corpus_with_split = text.split(' ')
# NLTK's Turkish stop-word list, held in a set for fast membership tests.
stopwords_list = set(stopwords.words('turkish'))
# Keep only tokens that are not stop words. The comparison is case-sensitive
# and `text` has not been lower-cased at this point.
filtered_words = [word for word in corpus_with_split if word not in stopwords_list]
print(filtered_words)

### Create a corpus by using TurkishStemmer and plot wordcloud for all words in file

In [None]:
from TurkishStemmer import TurkishStemmer

# Reduce each word to its stem with TurkishStemmer
def TurkishStemmerAnalysis(data):
    """Return a list holding the TurkishStemmer stem of every item in *data*.

    A fresh ``TurkishStemmer`` instance is created on each call; the output
    list has one entry per input item, in the same order.
    """
    stem = TurkishStemmer().stem
    return list(map(stem, data))

In [None]:
# Word cloud of the TurkishStemmer stems of the filtered words, on a white background.
drawWordCloud(TurkishStemmerAnalysis(filtered_words),'white')

### Create a corpus by using snowballstemmer and plot wordcloud for all words in file

In [None]:
from snowballstemmer import stemmer

# Reduce each word to its stem with the Snowball Turkish stemmer
def SnowballStemmerAnalysis(data):
    """Return the result of stemming *data* with snowballstemmer's Turkish stemmer.

    A new ``stemmer('turkish')`` object is built on each call and its
    ``stemWords`` method is applied to the whole sequence at once.
    """
    return stemmer('turkish').stemWords(data)

In [None]:
# Word cloud of the Snowball stems on a blue background. The plotting helper
# defined in this notebook is `drawWordCloud`; `drawWordCloudText` does not
# exist and raised NameError.
drawWordCloud(SnowballStemmerAnalysis(filtered_words), 'blue')

### Create a corpus by using Zemberek and plot word clouds for all words and for individual word classes in the file

In [None]:
import jpype
# Machine-specific locations of the JVM library and the Zemberek jar;
# adjust them to the local installation.
javaPath = r'C:\Program Files\Java\jdk1.8.0_212\jre\bin\server\jvm.dll'
classPath = r'D:\data-science\zemberek\zemberek-tum-2.0.jar'
# Only one JVM can be started per process, so re-running this cell must not
# call startJVM a second time.
if not jpype.isJVMStarted():
    jpype.startJVM(javaPath, '-ea', '-Djava.class.path=%s' % classPath)

In [None]:
# function to find root of individual words
def ZemberekAnalysis(data):
    """Analyse each word with Zemberek and bucket the results by word type.

    For every word, ``zemberek.kelimeCozumle`` is called and only the first
    analysis is used. The string form of its ``kok()`` (root) is split on
    whitespace: the first token is taken as the root and the second as the
    type tag. Words with no analysis are skipped.

    Args:
        data: Iterable of words (strings) to analyse.

    Returns:
        A tuple of six lists ``(words, adjcs, nouns, verbs, specs, abbrs)``:
        roots of all analysed words, roots tagged ``SIFAT``, roots tagged
        ``ISIM``, roots tagged ``FIIL``, original words tagged ``OZEL``, and
        original words longer than one character tagged ``KISALTMA``.

    Raises:
        IndexError: If the string form of a root has fewer than two
            whitespace-separated tokens.
    """
    Tr = jpype.JClass("net.zemberek.tr.yapi.TurkiyeTurkcesi")  # load TurkiyeTurkcesi class
    tr = Tr()                                                  # instantiate the language object
    Zemberek = jpype.JClass("net.zemberek.erisim.Zemberek")    # load Zemberek class
    zemberek = Zemberek(tr)                                    # instantiate the analyser

    words = []
    adjcs = []
    nouns = []
    verbs = []
    specs = []
    abbrs = []

    for t in data:
        ans = zemberek.kelimeCozumle(t)
        if not ans:
            continue

        # Query the root once and split its string form into root and type.
        parts = str(ans[0].kok()).split()
        root, tipo = parts[0], parts[1]

        words.append(root)
        if tipo == 'SIFAT':
            adjcs.append(root)
        elif tipo == 'ISIM':
            nouns.append(root)
        elif tipo == 'FIIL':
            verbs.append(root)
        elif tipo == 'OZEL':
            # This bucket keeps the original word rather than the root.
            specs.append(t)
        elif tipo == 'KISALTMA' and len(t) > 1:
            # Single-character words tagged KISALTMA are left out.
            abbrs.append(t)

    return words, adjcs, nouns, verbs, specs, abbrs

In [None]:
# Run the Zemberek analysis once and unpack the six lists it returns.
allwords, adjectives, nouns, verbs, specials, abbreviations = ZemberekAnalysis(filtered_words)

In [None]:
from collections import Counter

def corpa(data, max_freq=1):
    """Count non-stop-word occurrences and keep the frequent ones.

    Args:
        data: Iterable of words.
        max_freq: Exclusive lower bound despite its name: only words whose
            count is strictly greater than this value are kept.

    Returns:
        A dict mapping each retained word to its integer count, in order of
        first occurrence in *data*.
    """
    excluded = set(stopwords.words('turkish'))
    counts = Counter(word for word in data if word not in excluded)
    return {word: int(count) for word, count in counts.items() if count > max_freq}

In [None]:
def drawWordCloudFreq(data, max_freq=1, color='black', title=''):
    """Plot a word cloud sized by word frequency.

    Args:
        data: Iterable of words; frequencies are computed by ``corpa``.
        max_freq: Threshold forwarded to ``corpa``.
        color: Background colour handed to ``WordCloud``.
        title: Text appended to the fixed plot heading.
    """
    frequencies = corpa(data, max_freq)
    heading = 'T.C. YÜKSEK SEÇİM KURULU - Karar No:4219 - {}'.format(title)

    cloud = WordCloud(background_color=color, max_words=100, width=2000, height=1500)
    cloud = cloud.generate_from_frequencies(frequencies)

    plt.figure(figsize=(12, 12))
    plt.title(heading, fontsize=22)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# `specials` holds the words Zemberek tagged OZEL, i.e. proper nouns (not
# pronouns), so the plot is titled accordingly.
drawWordCloudFreq(specials, 2, color='white', title='Proper Nouns Frequency')

In [None]:
# Abbreviations occurring more than once, on a blue background (no title suffix).
drawWordCloudFreq(abbreviations, 1, color='blue')

In [None]:
# Adjective roots occurring more than twice, on a black background.
drawWordCloudFreq(adjectives, 2, color='black', title='Adjectives Frequency')

In [None]:
# Noun roots occurring more than twice, on a red background.
drawWordCloudFreq(nouns, 2, color='red', title='Nouns Frequency')

In [None]:
# Roots of all analysed words occurring more than twice (no title suffix).
drawWordCloudFreq(allwords, 2, color='black')

In [None]:
#jpype.shutdownJVM()