In [1]:
import sys
from io import StringIO
from pdfminer.layout import LAParams
from pdfminer.high_level import extract_text_to_fp
import os
from natsort import natsorted
import PyPDF2
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from collections import defaultdict
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer
# Module-level NLTK normaliser instances: a WordNet lemmatizer and a Porter stemmer.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

import joblib
import stop_words #greek stopwords

In [2]:
# Folder holding the source PDFs.
mypath = 'stoch'

# Keep only the '.pdf' entries of the folder, prefix each with the folder
# name, and order the resulting paths with natural sorting.
pdf_names = filter(lambda entry: entry.endswith('.pdf'), os.listdir(mypath))
files = natsorted(list(map(lambda entry: os.path.join(mypath, entry), pdf_names)))

In [20]:
# Display the naturally sorted list of PDF paths found in `mypath`.
files

['stoch\\ElearningStoch1.pdf',
 'stoch\\ElearningStoch2.pdf',
 'stoch\\ElearningStoch3.pdf',
 'stoch\\Periodikotita.pdf',
 'stoch\\Stoch1.pdf',
 'stoch\\Stoch2e.pdf',
 'stoch\\StochasticAskiseis.pdf',
 'stoch\\basic.pdf',
 'stoch\\eksisoseisdiaforon.pdf',
 'stoch\\mesosxronosipopriop.pdf',
 'stoch\\perigrafi.pdf',
 'stoch\\tixaiosperipatos.pdf',
 'stoch\\ypologismosxaraktiristikou.pdf']

In [1]:
# Merge every PDF listed in `files` into a single 'merged_stoch.pdf' and record,
# in `chapters`, the cumulative page count reached at the end of each input file.
pdfWriter = PyPDF2.PdfFileWriter()

chapters = []
# Input streams are collected here and closed only after the merged file has
# been written.
# NOTE(review): PyPDF2 page objects appear to read from their source stream
# lazily at write time, so the inputs must stay open until then — confirm
# against the PyPDF2 version in use.
open_inputs = []
try:
    for filename in files:
        # 'rb' = read binary
        pdfFileObj = open(filename, 'rb')
        open_inputs.append(pdfFileObj)
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Take the page count from the reader itself rather than from the loop
        # variable: a PDF with zero pages would otherwise reuse the previous
        # file's last page index (or raise NameError on the first file).
        page_count = pdfReader.numPages
        for pageNum in range(page_count):
            pdfWriter.addPage(pdfReader.getPage(pageNum))
        chapters.append(page_count)

    # 'wb' = write binary; the context manager closes the output even if
    # writing fails.
    with open('merged_stoch'+'.pdf', 'wb') as pdfOutput:
        pdfWriter.write(pdfOutput)
finally:
    # Release every input handle, whether or not the merge succeeded.
    for handle in open_inputs:
        handle.close()

# Per-file page counts -> cumulative last-page number of each file.
chapters = np.cumsum(chapters)

In [4]:
# Show the cumulative page counts (one entry per merged input PDF).
print(chapters)

array([  7,  11,  19,  23,  30,  36,  45,  46,  63,  72,  74,  87, 104],
      dtype=int32)

In [5]:
# Render the merged PDF as HTML into an in-memory text buffer.
output_string = StringIO()
layout_params = LAParams()
with open('./merged_stoch.pdf', 'rb') as merged_pdf:
    extract_text_to_fp(
        merged_pdf,
        output_string,
        laparams=layout_params,
        output_type='html',
        codec=None,
    )

In [6]:
# Pull the generated HTML out of the buffer and parse it with the
# built-in "html.parser" backend.
contents = output_string.getvalue()
bs = BeautifulSoup(markup=contents, features="html.parser")

In [8]:
# Remove layout-only nodes from the parsed HTML: first every <span> whose style
# carries a black/gray/figure 1px border, then every <div> with a figure border.
boxed_span_style = re.compile(r"(border: black 1px solid|border: gray 1px solid|border: figure 1px solid)")
figure_div_style = re.compile(r"(border: figure 1px solid)")

for tag_name, style_pattern in (('span', boxed_span_style), ('div', figure_div_style)):
    for node in bs.find_all(tag_name, style=style_pattern):
        node.decompose()

In [9]:
def preprocess(sentence):
    """Normalise a raw text/HTML fragment into a space-separated token string.

    The input is cast to ``str`` and lower-cased; the literal ``{html}``
    marker, HTML tags, ``http...`` URLs and digit runs are removed; runs of
    non-word characters and underscores become single spaces. The result is
    tokenised on word characters, and only tokens longer than two characters
    that are not in ``stop_words.el()`` are kept.

    Parameters
    ----------
    sentence : object
        Any value convertible with ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    text = str(sentence).lower()
    text = text.replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)      # strip HTML tags
    text = re.sub(r'http\S+', '', text)    # strip URLs
    text = re.sub('[0-9]+', '', text)      # strip digit runs
    text = re.sub(r"\W+|_", " ", text)     # punctuation/underscores -> space
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Fetch the stop-word list once per call instead of once per token.
    # assumes stop_words.el() returns an iterable of word strings — TODO confirm
    greek_stop_words = set(stop_words.el())
    filtered_words = [w for w in tokens if len(w) > 2 and w not in greek_stop_words]

    # The earlier version also stemmed and lemmatised the tokens but discarded
    # those results and returned the filtered tokens; that unused work has been
    # dropped, so the returned value is unchanged.
    return " ".join(filtered_words)
# df['cleanText']=df['Text'].map(lambda s:preprocess(s))

In [4]:
# Walk every <div> of the parsed HTML and build:
#   d : anchor name (compared below as a page number) -> accumulated
#       preprocessed text of the divs that follow that anchor
#   y : one chapter index per anchor, in the same order as the keys of `d`
# A div is treated as an anchor when its style is exactly
# "position:absolute; top:<...>px;" and it has an <a name="..."> child.
page_anchor_style = re.compile(r"^position:absolute; top:.*px;$")

d = defaultdict(str)  # values are strings, so the default factory is str
divs = bs.find_all('div')
y = []
chapt = 0
index = None  # name of the most recent anchor; None until the first one is seen
for tst in divs:
    if page_anchor_style.search(tst.attrs.get('style', '')):
        anchor = tst.a
        # Divs that match the style but carry no <a name=...> are skipped
        # entirely, exactly as the former try/except/continue did.
        if anchor is None or not anchor.has_attr('name'):
            continue
        index = str(anchor['name'])
        d[index] = ''
        y.append(chapt)
    elif index is not None:
        # Content div: append its cleaned text to the current anchor's entry.
        # Divs seen before any anchor have nowhere to go and are ignored
        # (previously this raised NameError).
        d[index] += ' ' + preprocess(tst.text)

    # Advance to the next chapter once the current anchor number reaches the
    # cumulative page count of the current chapter. Explicit guards replace the
    # former bare `except: pass`.
    if index is None or chapt >= len(chapters):
        continue
    try:
        page_number = int(index)
    except ValueError:
        # Anchor name is not numeric; it cannot be compared with page counts.
        continue
    if page_number >= chapters[chapt]:
        chapt += 1

In [19]:
# One row per key of `d`: the accumulated text goes in 'text', the matching
# chapter index from `y` goes in 'y'; the original keys are discarded.
df = (
    pd.DataFrame.from_dict(d, orient='index', dtype='str')
    .reset_index(drop=True)
    .rename(columns={0: 'text'})
    .assign(y=y)
)
df.head(12)

Unnamed: 0,text,y
0,κεφÿαλαιο μαρκοβιανÿες αλυσÿιδες διακριτοÿυ ...,0
1,επαναληπτικÿες μεταβατικÿες καταστÿασεις επι...,0
2,κεφÿαλαιο μαρκοβιανÿες αλυσÿιδες διακριτοÿυ ...,0
3,πιθανÿοτητες χρÿονοι πρÿωτης εισÿοδου καταστ...,0
4,πιθανÿοτητες χρÿονοι πρÿωτης εισÿοδου υπολογ...,0
5,πιθανÿοτητες χρÿονοι πρÿωτης εισÿοδου θέτοντ...,0
6,κεφÿαλαιο μαρκοβιανÿες αλυσÿιδες διακριτοÿυ ...,0
7,πιθανÿοτητες χρÿονοι πρÿωτης εισÿοδου υπολογ...,1
8,κεφÿαλαιο μαρκοβιανÿες αλυσÿιδες διακριτοÿυ ...,1
9,πιθανÿοτητες χρÿονοι πρÿωτης εισÿοδου θέτοντ...,1


In [2]:
def vectorize_df(x_data=None):
    """Fit a TF-IDF model on ``x_data`` and return the dense document-term table.

    Parameters
    ----------
    x_data : pandas.Series, optional
        Documents to vectorise; each one is passed through ``preprocess`` by
        the vectoriser. Defaults to ``None``.

    Returns
    -------
    pandas.DataFrame or None
        One row per document and one column per vocabulary term, or ``None``
        when ``x_data`` is ``None`` or empty.
    """
    # A ``None`` default replaces the former ``pd.Series([])`` default, which
    # was a mutable object created once at definition time. Calling with no
    # argument still returns None.
    if x_data is None or x_data.empty:
        return None

    tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess)
    tfidf_matrix = tfidf_vectorizer.fit_transform(x_data)

    # Newer scikit-learn releases provide get_feature_names_out() and drop
    # get_feature_names(); use whichever the installed version offers.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray rather than the np.matrix that
    # todense() returns; the resulting DataFrame contents are the same.
    return pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# vectorize_dt(df.iloc[0], df['y'].iloc[0])

In [3]:
def _text_pipeline(classifier):
    """Return a TF-IDF (using `preprocess`) + `classifier` pipeline."""
    return Pipeline([
        ('vectorize', TfidfVectorizer(preprocessor=preprocess)),
        ('classifier', classifier),
    ])


# Two classifiers that share the same text-vectorisation front end.
pipelineLR = _text_pipeline(LogisticRegression())
pipelineNB = _text_pipeline(MultinomialNB())

# Fit both on the full data set: text as input, chapter index as label.
for text_model in (pipelineLR, pipelineNB):
    text_model.fit(df['text'], df['y'])

In [15]:
# Persist both fitted pipelines to disk (Naive Bayes first, then logistic regression).
for fitted_model, target_path in (
    (pipelineNB, 'pdf_stoch100%_NV.joblib'),
    (pipelineLR, 'pdf_stoch100%_LogReg.joblib'),
):
    joblib.dump(fitted_model, target_path)

In [27]:
# Reload the saved pipelines, read one line of text from the user, and print
# the label each model predicts for it.
NB = joblib.load('pdf_stoch100%_NV.joblib')
LR = joblib.load('pdf_stoch100%_LogReg.joblib')

txt = input()
query = pd.Series(txt)  # single-element series holding the typed text
for caption, model in (('Naive Bayes:', NB), ('Logistic Regression:', LR)):
    print(caption, model.predict(query)[0])

Υπολογίστε τις οριακές πιθανότητες σε όλες τις Μαρκοβιανές αλυσίδες των προηγούμενων ϕυλλαδίων.
Naive Bayes: 12
Logistic Regression: 12
