In [41]:
%config IPCompleter.greedy=True
from pptx import Presentation
import operator
from pprint import pprint

prs = Presentation("./ch16.pptx")

paragraphs = {}


class Paragraph():
    def __init__(self, level,paragraph_count, text, isBold, block_count):
        self.block_count = block_count
        self.paragraph_count = paragraph_count
        self.level = level
        self.text = text
        self.isBold = isBold
        self.context = None
        self.entities = None
    def getColoredText(self):
        text = []
        for word in self.text.split():
            if word.lower() in self.entities:
                text .append('''<a style="color:red;">''' + word + "</a>")
            else:
                text.append(word)
        return " ".join(text)
    def __str__(self):
        html = "<div>"
        content = "&emsp;"*(self.block_count) + "&#8226;" + self.getColoredText()
        if self.block_count == 0:
            html += "<b>" + content + "</b>"
        else:
            html +=  "</tab>" + content 
        html += str(self.context) + "</div>"
        return html
    def setWordsToHighlight(self,entities):
        if type(entities) == str:
            entities = entities.split()
        self.entities = entities

slides = []
d = {}
slide_count = 0
for ppSlide in prs.slides:
    d[slide_count] = {}
    shape_count = 0
    for shape in ppSlide.shapes:
        if not shape.has_text_frame:
            continue
        else:
            d[slide_count][shape_count] = []
            paragraph_count = 0
            for block in shape.text_frame.paragraphs:
                paragraph = Paragraph(block.level, 
                                      paragraph_count, block.text, block.font.bold, shape_count)
                if block.text == "Chapter 16 Component-based software engineering" or block.text == "19/11/2014" or block.text.isnumeric() or block.text.isspace() or block.text == "":
                    break
                d[slide_count][shape_count].append(paragraph)
                paragraph_count += 1
            shape_count += 1
    slide_count += 1



In [42]:
#sort title blocks
paragraphs = {}
for a in d:
    for b in d[a]:
        for c in d[a][b]:
            if c.block_count == 0:
                paragraphs[c.text] = c

# associated a number with each title block
context_cnt = 0
for title_block in sorted(paragraphs):
    paragraphs[title_block].context = context_cnt
    context_cnt += 1

# assign a number to each paragraph, denoting its title block #
for a in d:
    for b in sorted(d[a]):
        for c in d[a][b]:
            if c.block_count == 0:
                context_count = c.context
            if c.block_count > 0:
                c.context = context_count


In [43]:
general_data = {}
# sort general data
general_count = 0
for a in d:
    for b in sorted(d[a]):
        for c in d[a][b]:
            if c.block_count == 0:
                points = []
            if c.block_count > 0:
                points.append(c)
    general_data[general_count] = points
    general_count += 1



In [44]:
import math
from textblob import TextBlob as tb

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

In [51]:
output = '''
CBSE
Component-based
effective reuse
stand-alone
Independent components 
Middleware
inter-operability
reuse
interfere
well-defined
interfaces
infrastructures
inter-operate
competing
Enterprise
Beans
hindered
communicating
executable entity
published
Heinmann
Composable
Deployable
Documented
Independent
Standardized
parameterized
component
deployability
independence
remote
RPC
URL
EJB
Usage
Deployment
container
generalizing
service
processes
acquisition
developed
external
certification
specification
generaldomain
abstraction
fundamental
trade-off
application-specific
broaden
adaptation
Integrate
rewriting
legacy
enhancement
certification
integrate
ideal
outline
Trust
Requirements
Validation
Navigation
assembling
glue code
Sequential
Hierarchical
Additive
incompatibilities
Parameter
Operation
incompatibility
incompleteness
documentation
Constraint
OCL
library
catalogue
functional
non-functional
emergent
reuse-based
interleaved
wiring
Distributed
Client–server 
Architectural
service
Virtually
distributed
confined
enterprise
resource
openness
concurrency
scalability
fault tolerance
complex
arises
top-down control
security
policies
failure
management
practice
achieve
transparency
logically
open
accepted
programming
web service
open standards
reflects
cope
manage a system
distinction
scaling-up
scaling-out
system instances
attack
incompatible
interception
interruption
modification
fabrication
QoS
threshold
degraded
service
critical
inevitable
procedural
Message-based
sending
remote procedure calls
middleware
callee
parses
sender
receiver
communicate
provision
interaction
internet
remote
external
presentation
data handling
processing
database
multi-tier
client-server
master-slave
architecture
peer-to-peer
real-time
master
computation
aquisition
sensors
two-tier
fat-client
thin-client
legacy
disadvantages
network
javascript
mobile
auto-updates
blurred
p2p
decentralised
semi-centralised
computationally-intensive
stored
managed
security
SaaS
service
configurability
tenancy
multi-tenancy


'''.lower()
#cheat sheet top
html = '''
<head>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.6.2/css/bulma.min.css">
<style>
body {
    column-count:4;
    margin:1px;
    column-gap: 40px;
}
div {
    font-size:80%;
}
.group {
    margin:3px;
    border:1px solid black;
}
</style>
</head>
<body>
'''



for paragraph in sorted(general_data):
    html += '''<div class="group">'''
    for point in general_data[paragraph]:
        point.setWordsToHighlight(output)
        print(str(point))
        print()
        html += str(point)
    html += '''</div>'''
html += '</body>'

with open("cheat_sheet.html","w") as f:
    f.write(html)




<div></tab>&emsp;&#8226;<a style="color:red;">Components</a> and <a style="color:red;">component</a> models50</div>

<div></tab>&emsp;&#8226;<a style="color:red;">CBSE</a> <a style="color:red;">processes</a>50</div>

<div></tab>&emsp;&#8226;<a style="color:red;">Component</a> composition50</div>

<div></tab>&emsp;&#8226;<a style="color:red;">Component-based</a> software engineering (CBSE) is an approach to software development that relies on the <a style="color:red;">reuse</a> of entities called ‘software components’.26</div>

<div></tab>&emsp;&#8226;It emerged from the <a style="color:red;">failure</a> of object-oriented development to support <a style="color:red;">effective</a> reuse. Single object classes are too detailed and specific.26</div>

<div></tab>&emsp;&#8226;<a style="color:red;">Components</a> are more abstract than object classes and can be considered to be <a style="color:red;">stand-alone</a> <a style="color:red;">service</a> providers. They can exist as <a style="colo

In [46]:
print(str(point))

<div></tab>&emsp;&#8226;When choosing compositions, you have to consider required functionality, <a style="color:red;">non-functional</a> <a style="color:red;">requirements</a> and <a style="color:red;">system</a> evolution.37</div>
