# Section 1 - Summary of blog posts

## We start this exploration with a summary of all of the blog posts on Monzo as of June 4th, 2019. We scraped them using a python script, the code for which is available at https://github.com/emielver

The goal of this section is to provide an insight into the nature of the blog posts, and hopefully to see some interesting results. We will look at when the blog posts were written, who wrote them, and in what capacity each author was working when they wrote them. If we see any ideas that we deem worth exploring, we will attempt to do so.

We start by defining some user-created classes, which will aid in our analysis.

In [1]:
class Post:
    
    def __init__(self, author, position, title, date, text):
        self.author = author
        self.position = position
        self.title = title
        self.date = date
        self.text = text

    def get_author(self):
        return self.author

    def get_position(self):
        return self.position
    
    def get_title(self):
        return self.title

    def get_date(self):
        return self.date

    def get_text(self):
        return self.text


In [2]:
class Author:

    def __init__(self, name, position):
        self.name = name
        self.positions = [position]
        self.gender = ''
        self.blogs = []


    def add_blog(self, blog):
        self.blogs.append(blog)

    def add_position(self, position):
        self.positions.append(position)

    def set_gender(self, gender):
        self.gender = gender

    def get_name(self):
        return self.name

    def get_cmp_name(self):
        return "".join(self.name.lower().split())

    def get_position(self):
        return self.position[0]

    def get_cmp_position(self):
        return "".join(self.positions[0].lower().split())

    def get_all_positions(self):
        return self.positions

    def get_blogs(self):
        return self.blogs

    def get_number_blogs(self):
        return len(self.blogs)

    def get_gender(self):
        return self.gender

    def cmp_strings(self, str1, str2):
        return "".join(str1.lower()) == "".join(str2.lower())

    def has_name(self, potential):
        return self.cmp_strings(self.name, potential)

    def has_position(self, potential):
        for position in self.positions:
            if self.cmp_strings(position, potential):
                return True
        return False

We start by importing the necessary libraries.

In [3]:
import glob
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from scipy import stats
import collections
import numpy as np
import datetime
import colorlover as cl
import pickle
import operator
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [4]:
monzo_blue = 'rgb(20, 70, 115)'
grey = 'rgb(204,204,204)'

We set up our API credentials for Plotly

In [5]:
# user, key = open('Secret/secret.txt', 'r').readlines()
# plotly.tools.set_credentials_file(username=user.strip(), api_key=key.strip())
# user = ''
# key = ''

We define our user defined functions (UDFs) which allow us to store all information regarding blogs and authors

In [13]:
def get_blogs():
    blogs = []
    file_names = glob.glob('Data/*.txt')
    for file_name in file_names:
        lines = open(file_name, 'r').readlines()
        title = lines[0].split(":")[1].strip()
        author = lines[1].split(":")[1].strip()
        position = lines[2].split(":")[1].strip()
        date = lines[3].split(":")[1].strip()
        text = lines[4:]
        if len(author.split(" ")) > 3:
            print(file_name)
        new_blog = Post(author, position, title, date, text)
        blogs.append(new_blog)
##        print("Added %s to the list of blogs." % (title)
##              + "Written by %s, working as %s, on %s"
##              % (author, position, date))
    return blogs

def get_authors(blogs):
    authors = []
    for blog in blogs:
        added = False
        author_name = blog.get_author()
        author_position = blog.get_position()
        for author in authors:
            if author.has_name(author_name):
                if not author.has_position(author_position):
                    author.add_position(author_position)
                author.add_blog(blog)
                added = True
        if not added:
            new_author = Author(author_name, author_position)
            new_author.add_blog(blog)
            authors.append(new_author)
            
            
    return authors

We now read in the blogs and the authors, and sort the authors based on how many blog posts they have written in descending order.

In [14]:
blogs = get_blogs()
authors = get_authors(blogs)
authors.sort(key = lambda x: x.get_number_blogs(), reverse = True)
# for author in authors:
#     print("Author %s wrote %d blogs"
#           % (author.get_name(), author.get_number_blogs()))

We manually went through each other to determine their gender. We did this on the basis of their associated picture on their blog posts, however there may be some margin for error. Additionally, we gave Team Monzo a gender of 'None', as we do not know who on the Monzo team actually wrote the blog post.

In [8]:
author_genders = pickle.load(open('authors.pkl', 'rb'))
for author in authors:
    for name, gender in author_genders:
        if author.has_name(name):
            author.set_gender(gender)
            break

FileNotFoundError: [Errno 2] No such file or directory: 'authors.pkl'

Now that the data has been processed and prepared, we look into the first question we want to ask of our data set.

## How active have the blog posts been over the years?

In [9]:
years = {}
for blog in blogs:
    year = blog.get_date().split(" ")[2]
    if year in years:
        years[year] += 1
    else:
        years[year] = 1
intermediate_years = sorted(years.items(), key=lambda kv: kv[0])
years = collections.OrderedDict(intermediate_years)
labels = []
values = []
for key, value in years.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values,
               marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per year',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Year',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')


Consider using IPython.display.IFrame instead



Here we obviously see an increasing trend in the number of blog posts per year. We also have to note that this analysis was done on all blog posts up to and including the 4th of June. We also know that the first blog post was made on the 1st of June, 2015, so the number of blog posts in 2015 is also not completely representative. This then begs the question...

## How many blog posts do we expect in 2019?

### According to a simple linear trend

In [11]:
xi = [float(labels[i]) for i in range(4,1,-1)]
y = [float(values[i]) for i in range(4,1,-1)]
slope, intercept, r_value, p_value, std_err = stats.linregress(xi, y)

In [12]:
print("According to a linear fit, we find a slope of %f and an intercept of %f, with an R^2 of %f"
     % (slope, intercept, r_value))

According to a linear fit, we find a slope of 10.000000 and an intercept of -20031.333333, with an R^2 of 0.151509


Having all the relevant coefficients, we see what our prediction would be for 2019

In [13]:
intercept + slope * 2019

158.66666666666788

That isn't very good, but we also only have an R^2 of 0.15, and we can see just by eyeballing the graph that a linear trend is not present. It looks much more like an exponential trend, or some sort of higher order polynomial. We first start with a quadratic trend, and then explore the explanatory power of an exponential trend.

### According to a quadratic trend

In [14]:
coefs = np.polyfit(xi, y, deg = 2)

In [17]:
coefs[0]* 2019 **2 + coefs[1] * 2019 + coefs[2]

120.99999982118607

Maybe a quadratic equation also doesn't help. Let's look at an exponential trend. One way we can do this is by taking the logarithm of the y values, and then fitting a linear trend. We can then reverse this transformation to get the expected number of blog posts in 2019 according to an exponential trend.

### According to an exponential trend

In [18]:
logged_y = np.log(y)
slope, intercept, r_value, p_value, std_err = stats.linregress(xi, logged_y)

In [19]:
print("According to an exponential fit, we find a slope of %f and an intercept of %f, with an R^2 of %f"
     % (slope, intercept, r_value))

According to an exponential fit, we find a slope of 0.090335 and an intercept of -177.355207, with an R^2 of 0.216316


Plugging in 2019 and taking the exponential to this result yields the expected number of blog posts in 2019.

In [20]:
np.exp(intercept + slope*2019)

153.1147110362509

Again, we do not find a very realistic result, however this is more promising than previous results, suggesting that there is stronger evidence for an exponential trend. We can also chalk up the lacklustre results to a lack of data, since we are only looking over 3 complete years. Hopefully we find better results once we have a larger data set to look at. Therefore, the logical next step is to look at the blog posts per month.

## Which month has the highest number of blog posts, and how does this vary over the months?

In [21]:
months = {"Jan" : 0, "Feb" : 0, "Mar" : 0, "Apr" : 0, 
         "May" : 0, "Jun" : 0, "Jul" : 0, "Aug" : 0, 
         "Sep" : 0, "Oct" : 0, "Nov" : 0,
         "Dec" : 0}
for blog in blogs:
    month = blog.get_date().split(" ")[1]
    months[month] += 1
labels = []
values = []
for key, value in months.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per month',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Month',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

Here, we don't get many insights. We see that in May has the highest number of blog posts, and that the first 5 months on average have more blog posts than the last 7 months. An obvious explanation for this could be that this analysis was carried out at the beginning of June in 2019, and so we are still missing the data for June onwards in 2019. We can quickly remedy this by only looking up until December 2018, which yields the following:

In [23]:
months = {"Jan" : 0, "Feb" : 0, "Mar" : 0, "Apr" : 0, 
         "May" : 0, "Jun" : 0, "Jul" : 0, "Aug" : 0, 
         "Sep" : 0, "Oct" : 0, "Nov" : 0,
         "Dec" : 0}
for blog in blogs:
    day, month, year = blog.get_date().split(" ")
    if year != '2019':
        months[month] += 1
labels = []
values = []
for key, value in months.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per month (excluding 2019)',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Month',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

This looks a little more like we expected. We now have a bias against the first 5 months, because Monzo's first blog post was in June 2015. If we wanted a more fair representation, we would only look at blog posts in full years, so 2016 until 2018. However, considering there were only 16 blog posts in 2015 in total, we do not expect much to change. Furthermore, it is quite straightforward to adapt the above code to exclude 2015 along with 2019.

Instead of looking at the monthly figures aggregated over the years, we can instead create month/year pairings, and see the natural evolution over time a bit better. We do this in the following code:

In [25]:
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
names = month_names[:]
months = {}
for year in range(2015,2020):
    if year == 2015:
        names = month_names[5:]
    elif year == 2019:
        names = month_names[:5]
    else:
        names = month_names
    for name in names:
        month = name + " " + str(year)
        months[month] = 0
for blog in blogs:
    month = " ".join(blog.get_date().split(" ")[1:])
    if month not in months:
        pass
    else:
        months[month] += 1

labels = []
values = []
for key, value in months.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per month/year combination',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Month',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

Here we see a very clear progression. August 2018 does seem to peak sharply above the rest, with a reason for this being that August was the deadline for "The Big List", initially published in May of 2018 (https://monzo.com/blog/2018/05/22/making-monzo-better/). "The Big List" set up a list of features Monzo was determined to introduce within three months, and by August 2018 they had managed to implement 11 out of the 14 features (https://monzo.com/blog/2018/08/22/end-of-the-big-list/). It is fair to say this can largely explain the spike in the number of blogs posted in August, and so if Monzo wants to increase its blog posts again it may be helpful to introduce another wishlist of features with a deadline.

# What happens when we aggregate to quarters of the year? Do we find other interesting results?

In [27]:
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
names = quarters[:]
quarter_year = {}
for year in range(2015,2020):
    if year == 2015:
        names = quarters[1:]
    elif year == 2019:
        names = quarters[:2]
    else:
        names = quarters
    for quarter in names:
        quarter = quarter + " " + str(year)
        quarter_year[quarter] = 0
quarter = ""
for blog in blogs:
    month, year = blog.get_date().split(" ")[1:]
    if month in months[:3]:
        quarter = quarters[0]
    elif month in months[3:6]:
        quarter = quarters[1]
    elif month in months[6:9]:
        quarter = quarters[2]
    else:
        quarter = quarters[3]
    name = quarter + " " + year
    quarter_year[name] += 1

labels = []
values = []
for key, value in quarter_year.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per quarter/year combination',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Quarter',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

We obviously see the same general increasing pattern as we did using monthly data, but would not be able to immediately identify the spike in blog posts in Q3 of 2018, like we could using monthly data. We do, however, see a surge in blog post volume in 2015 as the company is starting up, with a fairly equal distribution in 2016. In 2017 we see a general uptick, with Q1 blog posts doubling in volume compared to the previous year. In 2018, we see a ramping up in Q2 and Q3, which is sustained in Q4. This could again be as a result of the introduction and follow-through of "The Big List" that Monzo introduced. 

Now we can also look at if there are distinct differences in the number of blog posts per day of the month. This should be (largely) unaffected by incomplete years, and so we should be able to run this analysis over the whole sample.

## Looking at day of the month

In [30]:
days = {}
for blog in blogs:
    day = blog.get_date().split(" ")[0]
    if day in days:
        days[day] += 1
    else:
        days[day] = 1
intermediate_days = sorted(days.items(), key=lambda kv: int(kv[0]))
days = collections.OrderedDict(intermediate_days)
labels = []
values = []
for key, value in days.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color' : monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per day of the month',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Day',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data = data, layout=layout)
py.iplot(fig, filename='basic-bar')

It looks pretty normal, but we can't infer too much from this. Obviously we expect the number of blog posts on the 31st to be low, since this is the least commonly occuring day of the month. However, it might be more interesting to look at the days of the week.

## Which day of the week is most productive?

In [31]:
days_of_week = {"Monday" : 0, "Tuesday" : 0, "Wednesday" : 0,
               "Thursday" : 0, "Friday" : 0, "Saturday" : 0, 
               "Sunday" : 0}
for blog in blogs:
    day = datetime.datetime.strptime(blog.get_date(), '%d %b %Y').strftime('%A')
    days_of_week[day] += 1
    if day == 'Sunday' or day == 'Saturday':
        print("%s was posted on %s, %s." 
              % (blog.get_title(), day, blog.get_date()))
labels = []
values = []
for key, value in days_of_week.items():
    labels.append(key)
    values.append(value)


data = [go.Bar(x=labels,y=values, marker = {'color': monzo_blue})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per day of the week',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='day',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

How to manage money when you’re terrible with money was posted on Sunday, 3 Mar 2019.
Why Monzo Cards Aren't Working Today was posted on Sunday, 5 Mar 2017.
#mondohack was posted on Sunday, 22 Nov 2015.


## Surprise, surprise, Monzonauts don't like to work on weekends
If they have to then they post on Sundays. However, this only happens once every two years, and since it has already happened in 2019, no one should be posting on a Sunday until 2021.

# So who is actually writing all of these blogposts?

In [33]:
labels = []
values = []
for author in authors[:10]:
    labels.append(author.get_name())
    values.append(author.get_number_blogs())
labels.append("Other")
rest = 0
for author in authors[10:]:
    rest += author.get_number_blogs()
values.append(rest)

colours = [monzo_blue for x in range(10)]
colours.append(grey)

data = [go.Bar(x=labels,y=values, marker = {'color' : colours})]
layout = go.Layout(
    title=go.layout.Title(
        text='Monzo blog posts per author',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Author',
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Number of blog posts',
        )
    ))
fig = go.Figure(data= data, layout=layout)
py.iplot(fig, filename='basic-bar')

# Look into gender balance of blog posts over time

In [34]:
gender_count = {'female' : 0, 'male' : 0, 'None' : 0}
for author in authors:
    gender_count[author.get_gender()] += author.get_number_blogs()
    
labels = []
values = []
for key, value in gender_count.items():
    labels.append(key)
    values.append(value)
colours = [monzo_blue]
colours.append(grey)
trace = go.Bar(x = labels[:2], y = values[:2], 
               marker = {'color' : colours},)
py.iplot([trace], filename = 'gender_bar_chart')

In [35]:
all_labels = []
all_values = []
gender_count = {}
for author in authors:
    gender = author.get_gender()
    authors_blogs = author.get_blogs()
    for blog in authors_blogs:
        year = blog.get_date().split(" ")[2]
        gender_year = gender + " " + str(year)
        if gender_year in gender_count:
            gender_count[gender_year] += 1
        else:
            gender_count[gender_year] = 1

years = [2015, 2016, 2017, 2018, 2019]
for year in years:
    all_labels.append([key.split(" ")[0] for key, value in gender_count.items() if key.split(" ")[1] == str(year)])
    all_values.append([value for key, value in gender_count.items() if key.split(" ")[1] == str(year)])

trace1 = go.Bar(
    y=['2015', '2016', '2017', '2018', '2019'],
    x=[all_values[0][0], all_values[1][0], all_values[2][1],
       all_values[3][1], all_values[4][1]],
    name='Males',
    orientation = 'h',
    marker = dict(
        color = grey,
        line = dict(
            color = grey,
            width = 3)
    )
)
trace2 = go.Bar(
    y=['2015', '2016', '2017', '2018', '2019'],
    x=[all_values[0][1], all_values[1][1], all_values[2][0],
       all_values[3][0], all_values[4][0]],
    name='Females',
    orientation = 'h',
    marker = dict(
        color = monzo_blue,
        line = dict(
            color = monzo_blue,
            width = 3)
    )
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='stack',
    title = "Absolute Male/Female split of authorship"
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='marker-h-bar')

Now we can see a definitive increase in the number of posts made by women over the last couple of years. To put things more into perspective, we can look at the change in the percentage of total blog posts contributed by women over the last five years.

In [36]:
female_ratio = []
for i in range(len(years)):
    for j in range(len(all_labels[i])):
        if all_labels[i][j] == 'female':
            break
    females = all_values[i][j]
    total = sum([int(x) for x in all_values[i]])
    female_ratio.append((females/total) * 100)
    
trace0 = go.Scatter(
    x = years,
    y = female_ratio,
    mode = 'lines+markers',
    name = 'lines'
)
data = [trace0]
layout = go.Layout(
    title=go.layout.Title(
        text='Percentage of female authored blog posts per year',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Year',
        ),
        dtick = 1.0
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='% of Female authored blog posts')))
fig = go.Figure(data = data, layout=layout)
py.iplot(fig, filename='basic-bar')

We see a huge increase in the percentage of women authoring blog posts after a consistently low authorship of around 22% for the first three years of Monzo's operation. In 2018, we see the balance is fairly even, with roughly 47% of posts being penned by women, a huge upward surge from the years prior. In 2019 this upwards trend continues, with 72% of all blog posts this year being authored by women.

## Having looked at gender, we now look at the positions of the authors

In [38]:
df = pd.DataFrame(None, columns = ['Position', 'Count15', 'Count16', 'Count17',
                                   'Count18', 'Count19'])
position_count = {}
for blog in blogs:
    position = blog.get_position()
    year = blog.get_date().split(" ")[2]
    year_index = (int(year) - 2015)
    if position in position_count:
        position_count[position][year_index] += 1
    else:
        position_count[position] = [0 for x in range(5)]
        position_count[position][year_index] += 1
for key, value in position_count.items():
    df = df.append({'Position' : key, 'Count15' : value[0], 'Count16' : value[1],
                   'Count17' : value[2], 'Count18' : value[3], 'Count19' : value[4]}, ignore_index=True)

In [39]:
trace = go.Table(
    header=dict(values=['Position', '2015', '2016', '2017', '2018', '2019'],
                fill = dict(color='#C2D4FF'),
                align = ['left'] * 5),
    cells=dict(values=[df.Position, df.Count15, df.Count16, df.Count17,
                       df.Count18, df.Count19],
               fill = dict(color='#F5F8FF'),
               align = ['left'] * 5))
data = [trace] 
py.iplot(data, filename = 'pandas_table')

Here we can infer the introduction of new roles and titles within Monzo, and we can also see that the official titles stated on blog posts are constantly changing as time goes on.

# Section 2 - Analysis of text data

The goals we are setting out to achieve in this section are as follows:

a) Topic modeling to see if we can create buckets of words which characterise certain blog posts

b) Automated Document Summarisation to make summaries of blog posts

c) Clustering of blog posts and seeing if we can identify the overarching theme in each cluster


## Start with part 2a

We start by importing all the libraries we will need

In [41]:
import gensim
from gensim import corpora, models, similarities
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import NormModel
from gensim.sklearn_api import Text2BowTransformer
from gensim.models.wrappers import DtmModel
from gensim.similarities import MatrixSimilarity
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from operator import itemgetter

In [42]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emielv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emielv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/emielv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/emielv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### An important part of any data analysis, but especially when doing Machine Learning, is the preprocessing of the data. Luckily, we have functions in nltk and gensim that will aide us in this.

In [43]:
documents = []
for blog in blogs:
    temp_text = blog.get_text()
    all_words = " ".join(temp_text)
    documents.append(all_words)

525


Now that we have loaded our documents into an array, we can begin preprocessing. We start by simplifying the penn tags that will be supplied to us by nltk. We use the coding of 'a' for adjective, 'r' for adverb, 'v' for verb, and 'n' for noun. Then we preprocess by removing stop words, getting rid of punctuation and making every word lower case. We also lemmatize every word, which means reducing it to its stem. This is why we have to classify every word into a noun, adjective, adverb, or verb.

In [44]:
def simplify(penn_tag):
    pre = penn_tag[0]
    if (pre == 'J'):
        return 'a'
    elif (pre == 'R'):
        return 'r'
    elif (pre == 'V'):
        return 'v'
    else:
        return 'n'

def pre_process(text):
    stop_words = stopwords.words('english')
    tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
    lem = WordNetLemmatizer()
    return [lem.lemmatize(token, simplify(pos)) for token, pos in nltk.pos_tag(tokens)
           if token not in stop_words]

In [45]:
corpus = [pre_process(document) for document in documents]
dictionary = Dictionary(corpus)
bag_of_words = [dictionary.doc2bow(line) for line in corpus]

In [46]:
def flatten_corpus(corpus):
    flat = []
    for doc in corpus:
        for word in doc:
            flat.append(word)
    return flat

def compute_ngrams(sequence, n = 1):
    return zip(*[sequence[index:] for index in range(n)])

def get_top_ngrams(corpus, ngram_length = 1, limit = 10):
    if len(corpus) < 1000:
        corpus = flatten_corpus(corpus)

    ngrams = compute_ngrams(corpus, ngram_length)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                             key = itemgetter(1), reverse = True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                    for text, freq in sorted_ngrams]
    return sorted_ngrams


In [51]:
get_top_ngrams(corpus, 3, 20)

[('let u know', 118),
 ('monzo money tip', 67),
 ('current account switch', 59),
 ('recently update name', 54),
 ('update name monzo', 54),
 ('name monzo read', 54),
 ('account switch service', 52),
 ('pic twitter com', 48),
 ('u know think', 46),
 ('account tab app', 42),
 ('follow monzo money', 42),
 ('want make sure', 38),
 ('monzo current account', 38),
 ('customer support team', 36),
 ('love hear think', 35),
 ('discussion community forum', 35),
 ('high interest rate', 34),
 ('set direct debit', 34),
 ('set money aside', 33),
 ('head account tab', 33)]

Having found the most common sequences of words of whatever length we want, this allows us to get a quick overview of what's being written on Monzo blogs. Using the words we found an ngram length of 1, we get a pretty nice summary of what Monzo is all about. But maybe we can use extra information to see more about what specifically is being written about. We do this using the concept of weighted tag-based phrase extraction.

In [52]:
def get_only_type(corpus, code):
    tag = ''
    if code.lower() == 'adjective':
        tag = 'J'
    elif code.lower() == 'verb':
        tag = 'V'
    elif code.lower() == 'adverb':
        tag = 'R'
    else:
        tag = 'N'
    flat = flatten_corpus(corpus)
    all_tags = nltk.pos_tag(flat)
    return [word for index, word in enumerate(flat)
           if all_tags[index][1][0] == tag]

In [53]:
noun_corpus = get_only_type(corpus, 'Noun')
get_top_ngrams(noun_corpus, 3, 20)

[('monzo money tip', 55),
 ('account switch service', 52),
 ('pic twitter com', 37),
 ('customer support team', 36),
 ('debit stand order', 33),
 ('money work everyone', 28),
 ('join discussion community', 24),
 ('email email protect', 23),
 ('customer operation team', 23),
 ('sort code account', 23),
 ('money tip monzo', 23),
 ('account number sort', 22),
 ('number sort code', 22),
 ('code account number', 22),
 ('use monzo account', 21),
 ('support video tag', 21),
 ('guide tip money', 21),
 ('tip money monzo', 21),
 ('money monzo money', 21),
 ('tip monzo money', 21)]

This should give a better idea of what the keyphrases are from Monzo's blog posts, and we can tweak this number to look at longer or shorter phrases. We can also look at what kinds of verbs are used, or whatever other kind of word we want.

## Now we move on to Topic Modeling

First, we perform Latent Semantic Indexing (LSI) on the corpus. We let the model find up to 25 topics, although this is chosen quite arbitrarily. We then print out the top 10 topics, with the 10 most commonly occuring words. We then see if we can identify some sort of theme within the topics created.

In [54]:
lsi = models.lsimodel.LsiModel(corpus=bag_of_words, id2word=dictionary, num_topics=25)
lsi.print_topics(10)

[(0,
  '0.284*"monzo" + 0.262*"make" + 0.228*"account" + 0.223*"money" + 0.197*"use" + 0.196*"work" + 0.184*"get" + 0.156*"help" + 0.150*"people" + 0.149*"pay"'),
 (1,
  '-0.320*"money" + -0.258*"interest" + -0.251*"rate" + -0.244*"pay" + -0.198*"saving" + 0.182*"team" + 0.182*"monzo" + -0.174*"isa" + -0.171*"credit" + 0.160*"customer"'),
 (2,
  '-0.572*"account" + -0.320*"bank" + -0.227*"payment" + -0.206*"monzo" + -0.186*"card" + 0.152*"pension" + -0.151*"current" + 0.137*"team" + -0.135*"switch" + 0.126*"work"'),
 (3,
  '0.393*"credit" + 0.234*"rate" + -0.231*"monzo" + 0.225*"interest" + 0.217*"use" + -0.196*"money" + 0.193*"card" + -0.159*"save" + -0.158*"pot" + 0.158*"score"'),
 (4,
  '0.340*"rate" + 0.293*"interest" + -0.293*"money" + 0.244*"isa" + -0.200*"get" + 0.169*"people" + 0.162*"account" + 0.147*"team" + 0.143*"customer" + -0.131*"go"'),
 (5,
  '-0.343*"isa" + 0.263*"credit" + 0.201*"pay" + -0.187*"saving" + -0.180*"use" + 0.172*"card" + -0.171*"investment" + 0.160*"monzo

At a first glance, it is hard to distinguish between the topics presented. A large reason for this is the limited scope of our data set. Furthermore, LSI works much better on larger data sets, due to the fact that we are reducing the dimensionality of the data. The larger the data set and the more varied the sources of the data set, the easier it is to generate overarching topics or clusters of words, which again is not really the case for our data. However, we can see that the third topic has something to do with switching to Monzo as the primary current account of customers. This was a move that happened in the last year, and we can see evidence for it by the fact that words such as current, account, switch, work, and pension are included in this topic. Furthermore, we can definitely see that the fourth topic seems to be about savings, with words like credit, interest, isa (Individual Savings Account), and pot (the name for a savings account in Monzo terminology) being group together. Additionally, perhaps the last topic looked more at additional crowdfunding of customers, as this is how Monzo was initially funded. We can see this in the inclusion of words like investment, help, invest, and crowdfunding in the topic.

Since LSI is mostly for very large data sets,We repeat the procedure with a Latent Dirichlet Analysis (LDA). The difference in LSI and LDA is that in LDA we assume a generative process for the document, using Bayesian inference to make inference on the parameters. Keep in mind the difference in LSI vs LDA, where LSI uses maximum likelihood estimation and LDA uses Bayes estimation. Bayes estimation takes the priors for parameters into account, and if those priors are accurate then better parameter estimates are yielded. This is often of benefit with a small corpus, which we have. Therefore, we expect the results of the LDA to be more fitting than those of LSI.

In [55]:
lda = models.ldamodel.LdaModel(corpus=bag_of_words, id2word=dictionary,
                               num_topics=25, update_every=1, passes=100)
lda.print_topics(10)

[(1,
  '0.015*"make" + 0.013*"use" + 0.011*"data" + 0.011*"api" + 0.009*"system" + 0.009*"time" + 0.009*"build" + 0.009*"framework" + 0.008*"could" + 0.007*"need"'),
 (4,
  '0.018*"make" + 0.018*"term" + 0.017*"bank" + 0.016*"write" + 0.013*"condition" + 0.011*"europe" + 0.010*"c" + 0.010*"need" + 0.009*"sepa" + 0.008*"t"'),
 (16,
  '0.021*"insurance" + 0.020*"office" + 0.015*"cost" + 0.012*"dog" + 0.011*"policy" + 0.011*"home" + 0.010*"cardiff" + 0.010*"may" + 0.009*"vega" + 0.009*"cover"'),
 (2,
  '0.024*"money" + 0.020*"get" + 0.013*"go" + 0.010*"like" + 0.009*"work" + 0.008*"buy" + 0.008*"spend" + 0.008*"cost" + 0.007*"thing" + 0.007*"year"'),
 (23,
  '0.040*"pot" + 0.034*"money" + 0.025*"set" + 0.023*"monzo" + 0.022*"save" + 0.021*"saving" + 0.020*"add" + 0.020*"payment" + 0.018*"month" + 0.018*"spending"'),
 (17,
  '0.021*"node" + 0.016*"envoy" + 0.013*"etcd" + 0.012*"cluster" + 0.011*"peer" + 0.010*"use" + 0.010*"internal" + 0.008*"service" + 0.007*"new" + 0.006*"zone"'),
 (18,


In [56]:
labels = []
values = []
for number_topics in range(10,102):
    lda = models.ldamodel.LdaModel(corpus=bag_of_words, id2word=dictionary, random_state = 12, 
                                   num_topics=number_topics, update_every=1, 
                                   passes=10)
    labels.append(number_topics)
    values.append(lda.log_perplexity(bag_of_words))

In [57]:
trace0 = go.Scatter(
    x = labels,
    y = values,
    mode = 'lines+markers',
    name = 'lines'
)
data = [trace0]
layout = go.Layout(
    title=go.layout.Title(
        text='log perplexity against number of topics',
        x=0
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text='Number of topics',
        ),
        dtick = 1.0
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text='Log perplexity')))
fig = go.Figure(data = data, layout=layout)
py.iplot(fig, filename='perplexity-scatter')


Consider using IPython.display.IFrame instead



The graph seems to suggest that we should take 96 as the number of topics, and so we do so. Then, we get the following 10 most significant topics.

In [62]:
lda = models.ldamodel.LdaModel(corpus=bag_of_words, id2word=dictionary, random_state=12, 
                               num_topics=96, update_every=1, passes=100)
lda.print_topics(10)

[(73,
  '0.109*"account" + 0.042*"switch" + 0.037*"bank" + 0.026*"monzo" + 0.024*"current" + 0.016*"money" + 0.013*"new" + 0.013*"open" + 0.013*"payment" + 0.012*"make"'),
 (25,
  '0.098*"flight" + 0.063*"airline" + 0.049*"book" + 0.034*"travel" + 0.024*"get" + 0.023*"holiday" + 0.022*"back" + 0.021*"claim" + 0.020*"contact" + 0.018*"cancel"'),
 (15,
  '0.044*"money" + 0.026*"get" + 0.016*"go" + 0.014*"like" + 0.014*"save" + 0.013*"spend" + 0.011*"want" + 0.011*"buy" + 0.011*"work" + 0.010*"think"'),
 (88,
  '0.024*"bank" + 0.022*"use" + 0.021*"api" + 0.018*"data" + 0.015*"customer" + 0.014*"monzo" + 0.012*"make" + 0.012*"information" + 0.010*"developer" + 0.009*"fraud"'),
 (18,
  '0.178*"name" + 0.024*"new" + 0.024*"mark" + 0.024*"trade" + 0.018*"idea" + 0.018*"register" + 0.018*"potential" + 0.016*"shortlist" + 0.016*"sound" + 0.015*"suggestion"'),
 (95,
  '0.032*"customer" + 0.025*"card" + 0.019*"monzo" + 0.017*"use" + 0.017*"u" + 0.016*"ticketmaster" + 0.014*"accessibility" + 0.011

One limitation of LDA is that we need to specify the number of topics beforehand, and the model will fit the data to these parameters. An obvious consequence of this is if you underspecify the number of topics, then unrelated terms may be grouped together, which could lead to false inference of the relatedness of terms. Additionally, overspecifying the number of topics could lead to very homogenous topics created by the model, and therefore leads to an inability to say anything about the results found. That is where the Hierarchical Dirichlet Process (HDP) comes in. Here we do not specify the number of topics, with this instead being derived from the data itself. For the HDP model function from gensim, we need to specify a bag-of-words model and a Dictionary

In [63]:
model = models.HdpModel(bag_of_words, id2word=dictionary)
print(len(model.get_topics()))

150


In [64]:
model.print_topics(10)

[(0,
  '0.014*monzo + 0.010*make + 0.010*use + 0.009*get + 0.009*account + 0.008*money + 0.007*card + 0.007*like + 0.007*pay + 0.006*work'),
 (1,
  '0.017*monzo + 0.012*make + 0.011*account + 0.010*work + 0.008*pay + 0.008*get + 0.007*use + 0.007*month + 0.007*money + 0.007*payment'),
 (2,
  '0.009*request + 0.009*crowdfunding + 0.009*make + 0.009*use + 0.008*u + 0.007*could + 0.006*app + 0.006*go + 0.006*time + 0.006*load'),
 (3,
  '0.014*monzo + 0.010*account + 0.008*make + 0.007*card + 0.007*use + 0.006*work + 0.006*u + 0.006*bank + 0.006*customer + 0.006*new'),
 (4,
  '0.011*monzo + 0.009*make + 0.007*april + 0.007*twitter + 0.007*pot + 0.007*work + 0.006*com + 0.006*help + 0.006*pic + 0.006*get'),
 (5,
  '0.004*make + 0.004*use + 0.004*data + 0.004*monzo + 0.003*time + 0.003*team + 0.003*one + 0.002*build + 0.002*u + 0.002*work'),
 (6,
  '0.005*monzo + 0.004*account + 0.004*money + 0.004*bank + 0.003*work + 0.003*get + 0.003*people + 0.003*u + 0.003*make + 0.003*help'),
 (7,
  '0.

Having looked at LSA, LDA, and HDP, we now look at Dynamic Topic Modeling. This will allow us to see if the topics change over time, as Monzo has grown as a company.

Lastly, we can look at Guided LDA. In Guided LDA, we specify certain terms and give them topic numbers. Therefore, we can steer the direction that LDA will shift in by selecting certain keywords that we think will appear in different topics. To do so, we can simply pass on a dictionary of certain terms and their topic numbers in the 'eta' argument of the LDA model function that we were using previously. However, we should first create a function to generate this eta, which we do now.

In [65]:
def create_eta(priors, dictionary, number_topics):
    eta = np.full(shape=(number_topics, len(dictionary)), fill_value=1) # create a (ntopics, nterms) matrix and fill with 1
    for word, topic in priors.items(): # for each word in the list of priors
        key_index = [index for index,term in dictionary.items() if term==word] # look up the word in the dictionary
        if len(key_index): # if it's in the dictionary
            eta[topic,key_index[0]] = 1e10  # put a large number in there
    eta = np.divide(eta, eta.sum(axis=0)) # normalize so that the probabilities sum to 1 over all topics
    return eta

Using this function, we can now create an eta and pass it along to our model creation function. It is important to note that if we simply pass along an empty eta, we will get the same output as with normal LDA, as we are not 'guiding' the process in any sense.

In [80]:
prior = {'plus' : 0, 'pot' : 1, 'isa' : 1, 'android' : 2, 'ios' : 2, 'golden' : 3,
        'crowdfunding' : 4, 'support' : 5,  'apple' : 6}
number_topics = 100
eta = create_eta(prior, dictionary, number_topics)
guided_lda = models.ldamodel.LdaModel(corpus=bag_of_words, id2word=dictionary, random_state = 12,
                               num_topics=number_topics, update_every=1, passes=100,
                              eta=eta)
guided_lda.print_topics(10)

[(66,
  '0.176*"community" + 0.084*"event" + 0.068*"host" + 0.023*"wider" + 0.022*"office" + 0.022*"new" + 0.018*"idea" + 0.018*"space" + 0.016*"involve" + 0.015*"look"'),
 (96,
  '0.030*"monzo" + 0.026*"make" + 0.016*"feature" + 0.015*"work" + 0.015*"help" + 0.015*"u" + 0.014*"see" + 0.014*"app" + 0.013*"add" + 0.012*"month"'),
 (28,
  '0.029*"android" + 0.024*"pay" + 0.018*"give" + 0.018*"top" + 0.017*"card" + 0.014*"app" + 0.013*"today" + 0.011*"store" + 0.011*"debit" + 0.011*"update"'),
 (33,
  '0.011*"service" + 0.011*"cost" + 0.009*"wedding" + 0.009*"people" + 0.008*"option" + 0.008*"design" + 0.008*"old" + 0.007*"even" + 0.007*"year" + 0.007*"user"'),
 (89,
  '0.030*"loan" + 0.026*"overdraft" + 0.022*"car" + 0.019*"unsecured" + 0.018*"secure" + 0.016*"use" + 0.016*"card" + 0.016*"credit" + 0.014*"borrow" + 0.013*"purchase"'),
 (36,
  '0.048*"money" + 0.017*"get" + 0.015*"think" + 0.015*"couple" + 0.014*"go" + 0.013*"say" + 0.013*"would" + 0.013*"year" + 0.012*"pay" + 0.011*"u"')

In [81]:
guided_lda.print_topics(100)

[(0,
  '0.011*"plus" + 0.000*"get" + 0.000*"money" + 0.000*"save" + 0.000*"work" + 0.000*"mortgage" + 0.000*"pay" + 0.000*"need" + 0.000*"year" + 0.000*"use"'),
 (1,
  '0.061*"golden" + 0.059*"ticket" + 0.034*"wait" + 0.033*"friend" + 0.032*"list" + 0.031*"monzo" + 0.028*"alpha" + 0.023*"email" + 0.021*"invite" + 0.018*"vote"'),
 (2,
  '0.122*"mondo" + 0.024*"name" + 0.023*"recently" + 0.019*"update" + 0.018*"transaction" + 0.015*"read" + 0.015*"every" + 0.012*"user" + 0.011*"student" + 0.010*"get"'),
 (3,
  '0.013*"make" + 0.012*"need" + 0.010*"monzo" + 0.008*"one" + 0.008*"write" + 0.008*"use" + 0.007*"way" + 0.007*"thing" + 0.007*"u" + 0.007*"team"'),
 (4,
  '0.055*"matt" + 0.047*"sian" + 0.024*"monzo" + 0.022*"next" + 0.020*"student" + 0.015*"go" + 0.013*"card" + 0.013*"user" + 0.013*"maxed" + 0.013*"wonder"'),
 (5,
  '0.079*"investment" + 0.072*"invest" + 0.068*"crowdfunding" + 0.048*"round" + 0.041*"investor" + 0.027*"million" + 0.023*"monzo" + 0.019*"raise" + 0.015*"pre" + 0.015

However, given the arbirtrary ordering of the blogs, as a result of the glob library used to read them all in, this does not allow us to easily verify if the results are accurate. Therefore, we take a concrete example. Let us see for example what the predicted topics are for the second blog post in our data set, titled ‘I put £1 in a Pot every time I swore during Ramadan’. Now obviously, we expect this document to be about savings, since the 'Pot' is Monzo terminology for a savings account. First, we need to find the document in our data set.

In [82]:
blog_index = -1
for index, blog in enumerate(blogs):
    if blog.get_title() == "Block gambling transactions from your Monzo account":
        blog_index = index
print(blogs[blog_index].get_title())

Block gambling transactions from your Monzo account


Having found the correct blog, we now see what document topics our LDA believes it belongs to, which is relatively straightforward to do:

In [83]:
lda.get_document_topics(bag_of_words[blog_index])

[(8, 0.090974025),
 (13, 0.5695669),
 (55, 0.22475727),
 (65, 0.06227863),
 (78, 0.05018204)]

In [84]:
lda.show_topic(78)

[('monzo', 0.0607184),
 ('app', 0.025906827),
 ('pay', 0.023707861),
 ('feature', 0.023042062),
 ('android', 0.020099118),
 ('account', 0.016735727),
 ('share', 0.01527054),
 ('work', 0.014750406),
 ('add', 0.014287875),
 ('bill', 0.013862641)]

In [85]:
blog_index = -1
for index, blog in enumerate(blogs):
    if blog.get_title() == "Should I use a credit card for my holiday?":
        blog_index = index
print(blogs[blog_index].get_title())

Should I use a credit card for my holiday?


In [86]:
lda.get_document_topics(bag_of_words[blog_index])

[(17, 0.10478089),
 (25, 0.06570986),
 (36, 0.04847568),
 (43, 0.18249317),
 (51, 0.29793718),
 (56, 0.028525414),
 (87, 0.099248886),
 (89, 0.1707452)]

In [91]:
lda.show_topic(87)

[('bank', 0.035058785),
 ('charge', 0.02711987),
 ('abroad', 0.01717822),
 ('account', 0.016829802),
 ('fee', 0.016416285),
 ('free', 0.015912335),
 ('withdrawal', 0.014838539),
 ('make', 0.01450894),
 ('atm', 0.013768934),
 ('money', 0.013627977)]

## Now, we look at Automated Document Summarization

We start by using the conventional summarisation function from gensim.

In [92]:
from gensim.summarization import summarize, keywords
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import networkx

def text_summarization_gensim(text, summary_ratio=0.5):
    summary = summarize(text, split=True, ratio=summary_ratio)
    for sentence in summary:
        print(sentence)



Since we want to see how this works in a more detailed manner, we start by creating our own summariser using Latent Semantic Analysis

In [93]:
def parse_document(document):
    document = re.sub('\n', ' ', document)
    if isinstance(document, str):
        document = document
    elif isinstance(document, unicode):
        return unicodedata.normalize('NFKD', document).encode('ascii',
        'ignore')
    else:
        raise ValueError('Document is not string or unicode!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt

def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1,
                                     ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1,
                                     ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1,
                                     ngram_range=(1, 1))
    else:
        pass
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix


In [94]:
def lsa_text_summarizer(documents, num_sentences=2,
                        num_topics=2, feature_type='frequency',
                        sv_threshold=0.5):
    vec, dt_matrix = build_feature_matrix(documents,
                                          feature_type=feature_type)
    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)
    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    top_sentence_indices = (salience_scores.argsort()
                            [-num_sentences:][::-1])
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        print(documents[index])

In [95]:
print(blogs[blog_index].get_title())
lsa_text_summarizer(parse_document(documents[blog_index]), num_sentences = 4)

Should I use a credit card for my holiday?
There are three main reasons why you might use a credit card for your holiday: to borrow money, protect your purchases, and access your money abroad.
Try estimating costs by:    Tracking flight prices with Google Flights  Using an app to predict flight and hotel prices, such as Hopper    Browsing credit deals on comparison sites – just remember that you may not get the advertised rate (the representative APR)    How much interest will you pay?
Even though you could stand to save money by taking advantage of deals when you see them, once you factor in the interest you’ll pay for borrowing money, you could actually end up spending more than you would if you just saved up and waited.
Holidays aren’t cheap, so it’s important to get your money back if something goes wrong – like if your flight is cancelled or your hotel isn’t up to scratch.


In [96]:
print(blogs[blog_index].get_title())
summary = text_summarization_gensim(documents[blog_index], summary_ratio = 0.05)

Should I use a credit card for my holiday?
There are three main reasons why you might use a credit card for your holiday: to borrow money, protect your purchases, and access your money abroad.
Should I use a credit card to pay for my holiday?
Most debit and credit cards charge a fee for withdrawing or spending money abroad.


The detecting of new sentences could do with some work, but for the rest the results are quite impressive. However, I would say the gensim function works better. You can adapt it using the summary ratio parameter, just like with our function and the number of sentences and topics parameters. Lastly, we look at an algorithm based on Google's TextRank algorithm

In [97]:
def textrank_text_summarizer(document, num_sentences=None,
                             feature_type='frequency',
                            summary_ratio = None):
    if not (num_sentences or summary_ratio):
        print("Enter either a maximum number of sentences or a summary ratio")
        return
    elif not num_sentences:
        num_sentences = int(summary_ratio * len(document))
    vec, dt_matrix = build_feature_matrix(document,
                                      feature_type='tfidf')
    similarity_matrix = (dt_matrix * dt_matrix.T)
    similarity_graph = networkx.from_scipy_sparse_matrix(
        similarity_matrix)
    scores = networkx.pagerank(similarity_graph)
    ranked_sentences = sorted(((score, index)
                                for index, score
                                in scores.items()),
                              reverse=True)
    top_sentence_indices = [ranked_sentences[index][1]
                            for index in range(num_sentences)]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        print(document[index])

In [100]:
print(blogs[blog_index].get_title())
textrank_text_summarizer(parse_document(documents[blog_index]),
                         summary_ratio = 0.10)

Should I use a credit card for my holiday?
There are three main reasons why you might use a credit card for your holiday: to borrow money, protect your purchases, and access your money abroad.
It depends on how fast the price of your holiday goes up, how much by, and what you pay to use credit.
Even though you could stand to save money by taking advantage of deals when you see them, once you factor in the interest you’ll pay for borrowing money, you could actually end up spending more than you would if you just saved up and waited.
Only use credit to buy your holiday if you’re sure you can repay it.
The difference is they aren’t a form of credit, so you won’t be borrowing money to use them.


These results are very impressive! This is a very good summary of the blog post, and answers the question in the title of the post.

## Here we start clustering the documents, but to first get an overview we compute the cosine similarity

In [101]:
def get_cosine_similarity(bag_of_words, num_features):
    index = MatrixSimilarity(bag_of_words, num_features = num_features)
    cosine_similarity = []
    for document in bag_of_words:
        cosine_similarity.append(index[document])
    return cosine_similarity

In [102]:
cosine_similarity = get_cosine_similarity(bag_of_words, num_features=len(dictionary))

trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title='Frequency cosine similarity heatmap',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic-heatmap')

The logical progression is now to look at the cosine similarity of all documents, now using the TF-IDF representation rather than the simple BOW representation. Luckily, there are functions that allow us to convert between the two fairly quickly.

In [103]:
tfidf_bag = TfidfVectorizer().fit_transform(documents)
cosine_similarity = get_cosine_similarity(tfidf_bag.todense(), tfidf_bag.shape[1])

trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title='TF-IDF cosine similarity Heatmap',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap')

We see a much sharper and more refined image here, but this is to be expected as a result of using the TF-IDF instead of simple counts. TF-IDF highlights the differences between documents by putting extra weight on those terms which are common in the document but not common in the rest of the corpus.

## Now we look at clustering methods for these documents


In [104]:
from sklearn.cluster import KMeans, AffinityPropagation
from collections import Counter

In [105]:
def k_means_cluster(matrix, num_clusters=10):
    k_means = KMeans(n_clusters = num_clusters,
                     random_state = 12,
                    max_iter = 1000000)
    k_means.fit(matrix)
    clusters = k_means.labels_
    return k_means, clusters


def affinity_propagation_cluster(matrix):
    sim = matrix * matrix.T
    sim = sim.todense()
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters

def print_clusters(documents, clusters, num_clusters):
    for cluster_number in range(num_clusters):
        print('~~~~~~~Cluster Number %d~~~~~' % cluster_number)
        for index, cluster in enumerate(clusters):
            if cluster == cluster_number:
                print(documents[index].get_title(), documents[index].get_date())
            


In [106]:
num_km_clusters = 8
km, km_clusters = k_means_cluster(tfidf_bag, num_km_clusters)
km_cluster_count_placeholder = Counter(km_clusters)
km_cluster_count = [0 for x in km_cluster_count_placeholder.keys()]
for key, value in km_cluster_count_placeholder.items():
    km_cluster_count[int(key)] = value

print_clusters(blogs, km_clusters, num_km_clusters)


~~~~~~~Cluster Number 0~~~~~
See your Year in Monzo! 8 Jan 2019
Keep track of what you’re owed with Shared Tabs 13 Dec 2018
New Monzo Update 12 Sep 2016
Monzo Monthly Update 6 Feb 2018
Monzo Monthly Update 5 Jul 2018
See your receipts in Monzo with Flux 31 Jan 2019
How Much?! Introducing  Your Spending 22 Jan 2016
Monzo Monthly Update 28 Feb 2018
Monzo Monthly Update 9 Jan 2019
Pulse Comes to Android 14 Feb 2018
Monzo with Friends 14 Nov 2016
How We Build the Monzo App 2 Mar 2018
Custom Monzo app icons on iOS 3 Jul 2018
Monzo Monthly Update 6 Mar 2019
Pay on the go using Monzo with Apple Pay 17 May 2018
No more waiting list! 24 May 2018
We’re halfway through The Big List! An update 3 Jul 2018
Testing Marketplace Beta 1 Mar 2018
Spending on Android! New Update Out Now 14 Nov 2016
Give the gift of Monzo this Christmas 🎁 11 Dec 2018
Save together with Pots on joint accounts! 10 Dec 2018
Payments on Android! 6 Dec 2016
Half a Million Current Accounts! 19 Mar 2018
Mondo on Android 25 May 20

In [107]:
data = [go.Bar(
            x=[i for i in range(len(km_cluster_count))],
            y=km_cluster_count,
            text=km_cluster_count,
            textposition='auto'
    )]
layout = go.Layout(
    title='Number of articles per cluster (K-means Propagation)',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cluster-size-bar')


Consider using IPython.display.IFrame instead



In [108]:
ap, ap_clusters = affinity_propagation_cluster(tfidf_bag)
ap_cluster_count_placeholder = Counter(ap_clusters)
num_ap_clusters = len(ap_cluster_count_placeholder)
ap_cluster_count = [0 for x in ap_cluster_count_placeholder.keys()]
for key, value in ap_cluster_count_placeholder.items():
    ap_cluster_count[int(key)] = value

data = [go.Bar(
            x=[i for i in range(len(ap_cluster_count))],
            y=ap_cluster_count,
            text=ap_cluster_count,
            textposition='auto'
    )]
layout = go.Layout(
    title='Number of articles per cluster (Affinity Propagation)',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cluster-size-bar')


Consider using IPython.display.IFrame instead



In [109]:
print_clusters(blogs, ap_clusters, len(ap_cluster_count))

~~~~~~~Cluster Number 0~~~~~
Flight delays and cancellations – know your rights 18 Dec 2018
How to be vegan and stay within budget 3 Jun 2019
Just Landed 25 May 2018
How do card payments work? 13 Jun 2018
Train delays and cancellations – know your rights 22 Jan 2019
7 ways to save money on train tickets 22 Jan 2019
What is a guarantor? 4 Oct 2018
Launching Android Pay Top Ups! 22 May 2017
​​Do you donate? You explain how you give to charity 9 May 2019
Inflation 30 Oct 2018
Search Launches on Android! 11 Apr 2017
Home insurance – do you really need it? 19 Feb 2019
Icelandic airline WOW Air has gone bust. Have you been affected? 28 Mar 2019
~~~~~~~Cluster Number 1~~~~~
How to sell stuff online 13 Mar 2019
How to look after your mental health for free 20 Dec 2018
Returning purchases – know your rights 14 Dec 2018
What to do if your card's been lost or stolen 11 Jun 2018
How to protect yourself against APP fraud 23 May 2019
Things we wish we’d known about money 7 Aug 2018
How to rent stude

Now that we have clustered the blog posts, let us look at topic modeling and cosine similairty of the clusters. First, we look at the cosine similarity. If we reorder the blogs according to cluster membership and replot the heatmap of cosine similarity, do we see an increase in the level of structure in the map? We will compare the clusterings we found using affinity propagation and using k-means

In [110]:
def order_documents(cluster, documents, n_clusters):
    ordered_documents = []
    switch_indices = []
    for i in range(n_clusters):
        for ind, document in enumerate(documents):
            if cluster[ind] == i:
                ordered_documents.append(document)
        switch_indices.append(len(ordered_documents))
    return switch_indices, ordered_documents

In [111]:
km_indices, km_ordered_documents = order_documents(km_clusters, documents, num_km_clusters)
tfidf_bag = TfidfVectorizer().fit_transform(km_ordered_documents)
index = MatrixSimilarity(tfidf_bag.todense(), num_features= tfidf_bag.shape[1])
cosine_similarity  = []
for document in tfidf_bag:
    cosine_similarity.append(index[document])


trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title='TF-IDF K-means ordered cosine similarity heatmap',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap')

In [112]:
ap_indices, ap_ordered_documents = order_documents(ap_clusters, documents, num_ap_clusters)
tfidf_bag = TfidfVectorizer().fit_transform(ap_ordered_documents)
index = MatrixSimilarity(tfidf_bag.todense(), num_features= tfidf_bag.shape[1])
cosine_similarity  = []
for document in tfidf_bag:
    cosine_similarity.append(index[document])


trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title= 'TF-IDF Affinity Propogation ordered cosine similarity heatmap',
    shapes = [
        # Line Vertical
        {
            'type': 'line',
            'x0': 23,
            'y0': 0,
            'x1': 23,
            'y1': 525,
            'line': {
                'color': 'rgb(0, 0, 0)',
                'width': 1.5,
            },
        }])
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap')

While there is more structure, it is still not very clear. What if we looked at the cosine similarty within and between the clusters. For the sake of convenience, let us only analyse this for the case of affinity propogation, but our functions will also work when clustering using k-means.

In [113]:
def get_documents_in_cluster(clusters, documents, cluster_number):
    documents_list = []
    for document, cluster in zip(documents, clusters):
        if cluster == cluster_number:
            documents_list.append(document)
    return documents_list

def get_all_clusters(clusters, documents, num_clusters):
    cluster_master_list = []
    for i in range(num_clusters):
        cluster_master_list.append(get_documents_in_cluster(clusters, documents, i))
    return cluster_master_list

In [114]:
new_corpus = []
for doc in corpus:
    new_corpus.append(' '.join(doc))

In [115]:
cluster_list = get_all_clusters(km_clusters, new_corpus, num_km_clusters)

In [116]:
cosine_within = []
for cluster in cluster_list:
    tfidf_bag = TfidfVectorizer().fit_transform(cluster)
    cosine_similarity = get_cosine_similarity(tfidf_bag.todense(), tfidf_bag.shape[1])
    cosine_within.append(cosine_similarity)

In [117]:
cluster_number = 2
trace = go.Heatmap(z=cosine_within[cluster_number])
data=[trace]
layout = go.Layout(
    title='TF-IDF K-means ordered cosine similarity heatmap within cluster number ' + str(cluster_number),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap-cluster-1')

We can play around with this to see the different clusters and their heatmaps, and we can do the same with the clusters generated by Affinity Propagation. Now, we move on to looking at the clustering between clusters. To do this, we aggregate all text in all documents within a cluster, and treat this aggregated data as one document. Then, we compute the cosine similarity between all the documents. We should have as many documents as there are clusters, and so now we expect 8 documents.

In [118]:
def aggregate_clusters(clusters, documents, num_clusters,):
    new_documents = []
    cluster_master_list = get_all_clusters(clusters, documents, num_clusters)
    for cluster in cluster_master_list:
        new_doc = ''
        for document in cluster:
            new_doc += document
        new_documents.append(new_doc)
    return new_documents

In [119]:
new_documents = aggregate_clusters(km_clusters, new_corpus, num_km_clusters)

In [120]:
tfidf_bag = TfidfVectorizer().fit_transform(new_documents)
cosine_similarity = get_cosine_similarity(tfidf_bag.todense(), tfidf_bag.shape[1])
trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title='TF-IDF K-means cosine similarity heatmap between clusters',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap-between-clusters')

So as we can see, the cosine similarity is quite high between clusters. Note that in this scale, the lowest cosine-similarity is about 0.30, in stark contrast to the cosine similarities between individual documents which reaches about 0.02. In the end, all blog posts are talking about Monzo so this is also not a very surprising result. Lastly, we look at if we can find differences when performing topic modeling within and between clusters, and again we look at only using clusters that we found using K-means, although the AP approach is analogous. 

In [121]:
lda_list = []
for cluster in cluster_list:
    new_corp = [elem.split(" ") for elem in cluster]
    new_dic = Dictionary(new_corp)
    new_bag = [new_dic.doc2bow(line) for line in new_corp]
    lda = models.ldamodel.LdaModel(corpus=new_bag, 
                               id2word=new_dic,
                               random_state=12, num_topics=10, update_every=1, passes=100)
    lda_list.append(lda)



In [122]:
lda_list[0].print_topics(5)

[(4,
  '0.019*"monzo" + 0.017*"make" + 0.017*"feature" + 0.017*"work" + 0.016*"app" + 0.015*"u" + 0.013*"get" + 0.010*"new" + 0.010*"help" + 0.009*"team"'),
 (6,
  '0.023*"account" + 0.016*"monzo" + 0.015*"work" + 0.014*"current" + 0.012*"card" + 0.010*"get" + 0.009*"make" + 0.008*"u" + 0.007*"people" + 0.007*"register"'),
 (3,
  '0.032*"energy" + 0.020*"supplier" + 0.019*"apple" + 0.013*"wallet" + 0.013*"switch" + 0.012*"pay" + 0.009*"card" + 0.008*"option" + 0.007*"instruction" + 0.007*"ovo"'),
 (0,
  '0.016*"update" + 0.014*"app" + 0.013*"new" + 0.013*"search" + 0.012*"transaction" + 0.011*"screen" + 0.011*"answer" + 0.010*"support" + 0.010*"card" + 0.009*"user"'),
 (9,
  '0.020*"make" + 0.012*"see" + 0.012*"pin" + 0.011*"purchase" + 0.011*"u" + 0.011*"transaction" + 0.010*"monzo" + 0.009*"information" + 0.009*"work" + 0.008*"app"')]

In [123]:
lda_list[1].print_topics(5)

[(7,
  '0.032*"return" + 0.023*"product" + 0.017*"buy" + 0.016*"get" + 0.016*"may" + 0.015*"right" + 0.014*"day" + 0.014*"seller" + 0.011*"refund" + 0.011*"item"'),
 (5,
  '0.036*"pay" + 0.021*"debt" + 0.020*"loan" + 0.019*"tax" + 0.018*"income" + 0.016*"get" + 0.015*"much" + 0.014*"earn" + 0.014*"money" + 0.013*"year"'),
 (1,
  '0.020*"insurance" + 0.011*"get" + 0.011*"free" + 0.010*"damage" + 0.008*"may" + 0.008*"home" + 0.008*"help" + 0.008*"building" + 0.008*"cover" + 0.007*"offer"'),
 (6,
  '0.022*"bank" + 0.016*"fraud" + 0.015*"monzo" + 0.012*"make" + 0.011*"number" + 0.011*"money" + 0.011*"get" + 0.008*"password" + 0.008*"customer" + 0.008*"someone"'),
 (4,
  '0.025*"monzo" + 0.020*"money" + 0.017*"pay" + 0.015*"payment" + 0.014*"use" + 0.013*"month" + 0.013*"budget" + 0.012*"bill" + 0.012*"like" + 0.012*"get"')]

In [124]:
lda_list[2].print_topics(5)

[(2,
  '0.080*"interest" + 0.075*"rate" + 0.024*"may" + 0.023*"account" + 0.022*"pay" + 0.019*"saving" + 0.015*"credit" + 0.014*"earn" + 0.014*"borrow" + 0.014*"good"'),
 (6,
  '0.083*"inflation" + 0.035*"price" + 0.030*"rate" + 0.019*"bank" + 0.015*"rise" + 0.013*"student" + 0.013*"rpi" + 0.013*"cpi" + 0.013*"base" + 0.012*"loan"'),
 (1,
  '0.001*"rate" + 0.001*"interest" + 0.001*"saving" + 0.001*"pension" + 0.001*"pay" + 0.001*"money" + 0.001*"get" + 0.001*"base" + 0.001*"account" + 0.001*"isa"'),
 (9,
  '0.001*"pension" + 0.001*"pay" + 0.001*"rate" + 0.001*"saving" + 0.001*"tax" + 0.001*"money" + 0.001*"interest" + 0.001*"get" + 0.001*"may" + 0.001*"account"'),
 (8,
  '0.001*"pension" + 0.001*"rate" + 0.001*"saving" + 0.001*"money" + 0.001*"get" + 0.001*"isa" + 0.001*"tax" + 0.001*"pay" + 0.001*"account" + 0.001*"may"')]

In [125]:
new_documents = aggregate_clusters(ap_clusters, new_corpus, num_ap_clusters)
tfidf_bag = TfidfVectorizer().fit_transform(new_documents)
cosine_similarity = get_cosine_similarity(tfidf_bag.todense(), tfidf_bag.shape[1])
trace = go.Heatmap(z=cosine_similarity)
data=[trace]
layout = go.Layout(
    title='TF-IDF AP cosine similarity heatmap between clusters',
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='tfidf-heatmap-between-clusters')

In [126]:
cluster_list = get_all_clusters(ap_clusters, new_corpus, num_ap_clusters)
lda_list = []
for cluster in cluster_list:
    new_corp = [elem.split(" ") for elem in cluster]
    new_dic = Dictionary(new_corp)
    new_bag = [new_dic.doc2bow(line) for line in new_corp]
    lda = models.ldamodel.LdaModel(corpus=new_bag, 
                               id2word=new_dic,
                               random_state=12, num_topics=10, update_every=1, passes=100)
    lda_list.append(lda)



In [127]:
lda_list[0].print_topics(5)

[(2,
  '0.052*"insurance" + 0.025*"damage" + 0.020*"cover" + 0.020*"home" + 0.020*"may" + 0.020*"building" + 0.015*"claim" + 0.015*"content" + 0.015*"policy" + 0.014*"pay"'),
 (7,
  '0.001*"may" + 0.001*"ticket" + 0.001*"train" + 0.001*"give" + 0.001*"get" + 0.001*"money" + 0.001*"charity" + 0.001*"grimshaw" + 0.001*"goshcharity" + 0.001*"gmfa_uk"'),
 (4,
  '0.036*"flight" + 0.023*"travel" + 0.023*"book" + 0.021*"get" + 0.021*"airline" + 0.018*"back" + 0.018*"contact" + 0.018*"wow" + 0.016*"air" + 0.014*"atol"'),
 (9,
  '0.049*"inflation" + 0.045*"rate" + 0.024*"interest" + 0.021*"price" + 0.015*"bank" + 0.014*"money" + 0.012*"may" + 0.011*"use" + 0.011*"base" + 0.011*"go"'),
 (3,
  '0.034*"may" + 0.021*"charity" + 0.020*"give" + 0.013*"flight" + 0.012*"support" + 0.012*"get" + 0.011*"month" + 0.011*"donate" + 0.011*"airline" + 0.010*"money"')]

In [128]:
lda_list[1].print_topics(5)

[(0,
  '0.034*"product" + 0.028*"return" + 0.022*"seller" + 0.019*"right" + 0.018*"get" + 0.017*"day" + 0.017*"refund" + 0.016*"may" + 0.015*"buy" + 0.013*"store"'),
 (6,
  '0.018*"challenge" + 0.017*"saving" + 0.017*"week" + 0.014*"set" + 0.013*"get" + 0.012*"help" + 0.012*"save" + 0.011*"one" + 0.011*"bank" + 0.011*"piggy"'),
 (5,
  '0.031*"budget" + 0.026*"monzo" + 0.020*"use" + 0.017*"money" + 0.017*"expense" + 0.016*"need" + 0.015*"pay" + 0.015*"bill" + 0.014*"account" + 0.014*"zero"'),
 (8,
  '0.001*"monzo" + 0.001*"pay" + 0.001*"free" + 0.001*"need" + 0.001*"money" + 0.001*"get" + 0.001*"charge" + 0.001*"like" + 0.001*"budget" + 0.001*"use"'),
 (4,
  '0.008*"go" + 0.008*"make" + 0.008*"save" + 0.008*"get" + 0.007*"try" + 0.007*"less" + 0.007*"reduce" + 0.007*"food" + 0.007*"year" + 0.007*"could"')]

In [129]:
lda_list[2].print_topics(5)

[(5,
  '0.077*"account" + 0.034*"bank" + 0.026*"money" + 0.026*"open" + 0.026*"new" + 0.022*"switch" + 0.017*"payment" + 0.015*"want" + 0.015*"one" + 0.014*"monzo"'),
 (1,
  '0.002*"account" + 0.002*"card" + 0.002*"money" + 0.002*"overdraft" + 0.002*"bank" + 0.002*"use" + 0.002*"utility" + 0.002*"yet" + 0.002*"water" + 0.002*"walk"'),
 (2,
  '0.069*"account" + 0.061*"bank" + 0.015*"open" + 0.015*"money" + 0.013*"monzo" + 0.013*"credit" + 0.013*"one" + 0.011*"get" + 0.011*"take" + 0.011*"want"'),
 (9,
  '0.075*"overdraft" + 0.024*"credit" + 0.020*"limit" + 0.020*"account" + 0.018*"charge" + 0.018*"use" + 0.017*"app" + 0.013*"want" + 0.013*"check" + 0.013*"switch"'),
 (8,
  '0.002*"overdraft" + 0.002*"account" + 0.002*"monzo" + 0.002*"make" + 0.002*"like" + 0.002*"limit" + 0.002*"know" + 0.002*"need" + 0.002*"money" + 0.002*"find"')]