# BBC Articles Spinner
* This is a small NLP project target on using Markov Models in building article spinner applications.
* Here, we'll be dealing with a BBC News corpus containing business texts.

In [1]:
# Loading our corpus.
import pandas as pd
import numpy as np

corpus = pd.read_csv('/kaggle/input/bbc-business/06_bbc_text_cls.csv')
corpus = corpus[corpus.labels=='business']['text'] # As mentioned, we'll be just focusing in business articles.
corpus.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

## Data Treatment
* Now, we must apply some transformations that are going to optimize the model's learning.

In [2]:
from nltk import word_tokenize
from string import punctuation
from typing import List

def separate_paragraphs(s:str)->List[List[str]]:
    '''
        Turns a given string into list of tokens per paragraph.
        
        Note: It already lowercases the text.
        
        Parameter
        ---------
        s: str
            The provided text
            
        Returns
        -------
        An array mentioning the tokens from each of the string's paragraphs.
    '''
    paragraphs = s.split('\n\n') 
    return list(map(word_tokenize, paragraphs))
     

def treat(series:pd.Series)->List[List[str]]:
    '''
        Treats the corpus looking for better model fitting.
        
        Parameter
        ---------
        series: `pd.Series`
            A series contaning our documents.
            
        Returns
        -------
        The treated texts ready for model consumption.
    '''
    series = map(str.lower, series)
    return list(map(separate_paragraphs, series))

# Applying the transformations to our news corpus. 
corpus = treat(corpus)

In [3]:
import numpy as np
from collections import Counter
from typing import List

class MarkovModel:
    '''
        A Second-Order Markov Model designed for writing poems.
        
        Method
        ------
        `fit`: It fits the model's probability distributions  according to a provided list of texts tokens.
        
        Attributes
        ----------
        `pi`: A storage for first token probabilities.
        `a`: The first order transition matrix for second token probabilities.
        `a2`: The second order transition matrix
    '''
    def __pi(self, X:List[List[str]]):
        '''
            Builds the model's pi vector.
            
            Parameter
            ---------
            `X`: List[List[str]]
        '''
        self.pi = Counter(len(p) for x in X for p in x)
        self.pi = {token:count/len(X) for token, count in self.pi.items()}
        
    def __a(self, X:List[List[str]]):
        '''
            Builds the first order transition matrix.
            
            Parameter
            ---------
            `X`: List[List[str]]
        '''
        counter = Counter(x[0]+'<sep>'+x[1] for x in X) 
        denom = Counter(x[0] for x in X)
        self.a = {}
        for key in counter.keys():
            i,j = key.split('<sep>')
            if i not in self.a.keys():
                self.a[i] = {}
            self.a[i][j] = counter[key]/denom[i]
        
            
    def __a2(self, X:List[List[str]]):
        '''
            Builds the second order transition matrix.
            
            Parameter
            ---------
            `X`: List[List[str]]
        '''
        counter = Counter(x[i-2]+'<sep>'+x[i-1]+'<sep>'+x[i] for x in X for i in range(2, len(x)))
        denom = Counter(x[i-1]+'<sep>'+x[i] for x in X for i in range(1, len(x)-1))
        self.a2 = {}
        for key in counter.keys():
            i,j,k = key.split('<sep>')
            if i not in self.a2.keys():
                self.a2[i] = {}
            if j not in self.a2[i].keys():
                self.a2[i][j] = {}
            self.a2[i][j][k] = counter[key]/denom[i+'<sep>'+j]
        
    def fit(self, X:List[List[str]]):
        '''
            Constructs the model's vector and state transition matrices.
            
            Parameter
            ---------
            `X`: List[List[str]]
        '''
        self.__pi(X)
#         self.__a(X)
#         self.__a2(X)
        
    def write(self):
        '''
            Writes a poem
        '''
        first_token = np.random.choice(list(self.pi.keys()), p=list(self.pi.values()))
        second_token = np.random.choice(list(self.a[first_token].keys()), p=list(self.a[first_token].values()))
        sentence = [first_token, second_token]
        while True:
            penultimate, last = sentence[-2], sentence[-1]
            next_probas = self.a2[penultimate][last]
            next_token = np.random.choice(list(next_probas.keys()), p=list(next_probas.values()))
            if next_token=='<eos>':
                break
            else:
                sentence.append(next_token)
        return ' '.join(sentence)
            
model = MarkovModel()
model.fit(corpus)
model.pi
# model.fit(corpus[0])
# print(model.write())

{6: 0.37254901960784315,
 28: 0.08431372549019608,
 63: 0.03725490196078431,
 127: 0.021568627450980392,
 140: 0.013725490196078431,
 126: 0.0196078431372549,
 5: 0.34509803921568627,
 29: 0.07254901960784314,
 168: 0.011764705882352941,
 231: 0.0058823529411764705,
 154: 0.00784313725490196,
 105: 0.027450980392156862,
 7: 0.16470588235294117,
 15: 0.011764705882352941,
 81: 0.03333333333333333,
 20: 0.0784313725490196,
 186: 0.00980392156862745,
 163: 0.0058823529411764705,
 210: 0.00784313725490196,
 4: 0.09019607843137255,
 21: 0.0784313725490196,
 60: 0.0196078431372549,
 128: 0.00980392156862745,
 27: 0.10392156862745099,
 146: 0.03137254901960784,
 120: 0.027450980392156862,
 25: 0.10392156862745099,
 61: 0.047058823529411764,
 279: 0.00196078431372549,
 33: 0.01764705882352941,
 54: 0.00392156862745098,
 51: 0.01568627450980392,
 122: 0.023529411764705882,
 99: 0.011764705882352941,
 121: 0.025490196078431372,
 23: 0.09607843137254903,
 79: 0.021568627450980392,
 108: 0.0215686

In [4]:
a = []
for text in corpus:
    for p in text:
        try:
            a.append(p[0])
        except:
            print(p)
a[:5]

[]
[]
[]


['ad', 'quarterly', 'the', 'time', 'time']

<p style='color:red'>Existem parágrafos vazios que estão atrapalhando a construção do pi. </p>