# BBC Articles Spinner
* This is a small NLP project that uses Markov Models to build an article spinner application.
* Here, we'll be dealing with a BBC News corpus containing business texts.

In [1]:
# Loading our corpus.
import pandas as pd
import numpy as np

corpus = pd.read_csv('/kaggle/input/bbc-business/06_bbc_text_cls.csv')
# As mentioned, we only work with business articles: keep the `text` column of the rows labelled 'business'.
corpus = corpus.loc[corpus.labels == 'business', 'text']
corpus.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

## Data Treatment
* Now, we must apply some transformations that prepare the texts for the model: each article is lowercased and then tokenized paragraph by paragraph.

In [2]:
from nltk import word_tokenize
from string import punctuation
from typing import List

def separate_paragraphs(s:str)->List[List[str]]:
    '''
        Turns a given string into a list of tokens per paragraph.
        
        Note: The text is tokenized exactly as given; this function does not change its case
        (`treat` lowercases each document before calling it).
        
        Parameter
        ---------
        s: str
            The provided text, with paragraphs separated by a blank line (two consecutive newline characters).
            
        Returns
        -------
        A list with one list of tokens for each non-blank paragraph of the string.
    '''
    # Runs of more than two newlines produce empty or whitespace-only chunks when splitting.
    # They contain no tokens, so they are skipped instead of becoming empty paragraphs.
    return [word_tokenize(p) for p in s.split('\n\n') if p.strip()]
     

def treat(series:pd.Series)->List[List[List[str]]]:
    '''
        Treats the corpus looking for better model fitting: every document is lowercased
        and then split into tokenized paragraphs.
        
        Parameter
        ---------
        series: `pd.Series`
            A series containing our documents (one string per entry).
            
        Returns
        -------
        The treated texts ready for model consumption: one entry per document, each entry being
        a list of paragraphs, and each paragraph a list of tokens from the lowercased text.
    '''
    return [separate_paragraphs(str.lower(document)) for document in series]

# Applying the transformations to our news corpus.
# Note: `corpus` is rebound here from the Series of raw texts to nested lists of tokens, so this
# cell cannot be re-run on its own (lowercasing would fail on lists); re-run the loading cell first.
corpus = treat(corpus)

In [3]:
import numpy as np
from collections import Counter
from typing import List

class MarkovModel:
    '''
        A Second-Order Markov Model designed for performing article spinning.
        
        For every pair of context tokens (the token right before and the token right after a position),
        it stores the probability distribution of the token observed between them.
        
        Methods
        -------
        `fit`: It fits the model's probability distributions according to a provided list of paragraphs tokens from a myriad of texts.
        `spin`: It collects the context pairs surrounding the replaceable tokens of a text.
        
        Attributes
        ----------
        `a2`: The second order transition matrix, stored as a nested dictionary where
              `a2[previous][following][middle]` is the estimated probability of `middle`
              appearing between `previous` and `following`. Only available after calling `fit`.
    '''
        
            
    def __a2(self, X:List[List[List[str]]]):
        '''
            Builds the second order transition matrix.
            
            Parameter
            ---------
            `X`: List[List[List[str]]]
                The collection of texts as lists of tokens from each paragraph.
        '''
        # Count every (previous, following, middle) triple. Tuples are used as keys instead of a
        # joined string so that tokens containing the separator text cannot corrupt the counts.
        triples = Counter(
            (p[i-1], p[i+1], p[i])
            for x in X for p in x for i in range(1, len(p)-1)
        )
        
        # Number of times each (previous, following) context was seen, whatever the middle token.
        contexts = Counter()
        for (previous, following, _), count in triples.items():
            contexts[(previous, following)] += count
        
        self.a2 = {}
        for (previous, following, middle), count in triples.items():
            distribution = self.a2.setdefault(previous, {}).setdefault(following, {})
            distribution[middle] = count/contexts[(previous, following)]
        
    def fit(self, X:List[List[List[str]]]):
        '''
            Constructs the model's second order transition matrix (`a2`).
            
            Parameter
            ---------
            `X`: List[List[List[str]]]
                The collection of texts as lists of tokens from each paragraph.
        '''
        self.__a2(X)
        
    def spin(self, X:List[List[str]], tokens_per_paragraph:int=2):
        '''
            Collects, for every paragraph of a text, the context pair surrounding each interior token.
            
            Note: The replacement step is not implemented yet, so only the contexts are returned
            and `tokens_per_paragraph` currently has no effect.
            
            Parameters
            ----------
            `X`: List[List[str]]
                A single text as a list of tokens from each paragraph.
            `tokens_per_paragraph`: int
                Presumably the number of tokens to replace in each paragraph (currently unused).
                
            Returns
            -------
            One list per paragraph, holding a `[previous, following]` pair for each interior token.
            Paragraphs with fewer than three tokens yield an empty list.
        '''
        return [[[p[i-1], p[i+1]] for i in range(1, len(p)-1)] for p in X]
            
# Fitting the model on the treated corpus.
model = MarkovModel()
model.fit(corpus)
# To inspect a few of the learned context tokens: list(model.a2.keys())[:5]

In [4]:
import numpy as np

# Scratch check: drawing 2 items without replacement from a 2-item list returns both items in random order.
np.random.choice(['oi', 'tudo'], 2, replace=False)

array(['tudo', 'oi'], dtype='<U4')

<p style='color:red'> TODO: Build the spinning function </p>