# BBC Articles Spinner
* This is a small NLP project that uses Markov Models to build an article spinner application.
* Here, we'll be dealing with a BBC News corpus containing business texts.

In [1]:
# Loading our corpus.
import pandas as pd
import numpy as np

# Read the BBC dataset, then keep only the 'text' column of the rows labelled as business news.
corpus = pd.read_csv('/kaggle/input/bbc-business/06_bbc_text_cls.csv')
corpus = corpus.loc[corpus['labels'] == 'business', 'text']
corpus.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

## Data Treatment
* Now we lowercase every article and split it into tokenized paragraphs, which is the input format the model is fitted on.

In [2]:
from nltk import word_tokenize
from string import punctuation
from typing import List

def separate_paragraphs(s:str)->List[List[str]]:
    '''
        Splits a text into paragraphs and tokenizes each of them.

        Paragraphs are delimited by a blank line (two consecutive newline characters).
        Paragraphs that are empty or contain only whitespace are discarded, so every
        returned paragraph comes from text that has actual content.

        Note: The text is NOT lowercased here; the casing of `s` is kept as provided.

        Parameter
        ---------
        s: str
            The provided text

        Returns
        -------
        A list with one entry per paragraph, each entry being the list of that paragraph's tokens.
    '''
    # `strip()` also rejects paragraphs made only of whitespace (e.g. the leftover newline
    # produced by three consecutive newlines), which would otherwise tokenize to an empty list.
    paragraphs = [p for p in s.split('\n\n') if p.strip()]
    return [word_tokenize(p) for p in paragraphs]
     

def treat(series:pd.Series)->List[List[List[str]]]:
    '''
        Prepares the raw documents for model fitting.

        Each document is lowercased and then broken into tokenized paragraphs.

        Parameter
        ---------
        series: `pd.Series`
            A series containing our documents.

        Returns
        -------
        One entry per document, each holding that document's paragraphs as lists of tokens.
    '''
    return [separate_paragraphs(str.lower(document)) for document in series]

# Applying the transformations to our news corpus.
# NOTE: this rebinds `corpus` from the Series of raw texts to nested lists of tokens, so the
# cell is not idempotent: running it a second time without reloading the data would hand
# lists (instead of strings) to `str.lower` inside `treat`.
corpus = treat(corpus)

In [3]:
import numpy as np
from collections import Counter
from nltk.tokenize.treebank import TreebankWordDetokenizer
from typing import List

class MarkovModel:
    '''
        A Markov-style model designed for performing article spinning.

        For every (previous token, next token) pair seen while fitting, the model stores the
        empirical probability distribution of the token found between them. Spinning replaces
        randomly chosen tokens of a text by sampling from that distribution.

        Methods
        -------
        `fit`: It fits the model's probability distributions according to a provided list of paragraphs tokens from a myriad of texts.
        `spin`: It replaces random tokens of a tokenized article and returns the resulting text.

        Attributes
        ----------
        `a2`: Nested dict where `a2[previous][next][middle]` is the probability of `middle`
              appearing between `previous` and `next`.
    '''

    def __init__(self):
        # Start with an empty table so that calling `spin` before `fit` leaves the text
        # untouched instead of failing on a missing attribute.
        self.a2 = {}

    def __a2(self, X:List[List[List[str]]]):
        '''
            Builds the transition table `a2` from the training texts.

            Parameter
            ---------
            `X`: List[List[List[str]]]
                The collection of texts as lists of tokens from each paragraph.
        '''
        # Count how often each middle token occurs between a (previous, next) pair.
        # Nested dicts are keyed by the tokens themselves, so no separator string is needed
        # and tokens may contain any text.
        counts = {}
        for text in X:
            for paragraph in text:
                for i in range(1, len(paragraph)-1):
                    middles = counts.setdefault(paragraph[i-1], {}).setdefault(paragraph[i+1], {})
                    middles[paragraph[i]] = middles.get(paragraph[i], 0) + 1

        # Turn the counts of every (previous, next) context into probabilities.
        self.a2 = {}
        for previous, by_next in counts.items():
            self.a2[previous] = {}
            for following, middles in by_next.items():
                total = sum(middles.values())
                self.a2[previous][following] = {token: count/total for token, count in middles.items()}

    def fit(self, X:List[List[List[str]]]):
        '''
            Estimates the model's probability distributions.

            Parameter
            ---------
            `X`: List[List[List[str]]]
                The collection of texts as lists of tokens from each paragraph.
        '''
        self.__a2(X)

    def _spin_paragraph(self, tokens:List[str], n_tokens:int)->List[str]:
        '''
            Returns a copy of one paragraph with up to `n_tokens` interior tokens resampled.

            The first and the last token are never replaced because they lack one of the two
            neighbours. Positions whose (previous, next) context was not seen during fitting
            keep their original token. `tokens` itself is not modified.
        '''
        spun = list(tokens)
        candidates = list(range(1, len(spun)-1))
        if not candidates:
            return spun
        # Never ask for more distinct positions than the paragraph offers; sampling without
        # replacement would raise otherwise.
        chosen = np.random.choice(candidates, size=min(n_tokens, len(candidates)), replace=False)
        # All replacements are sampled from the original neighbours first and applied afterwards,
        # so one replacement never changes the context of another.
        replacements = {}
        for position in chosen:
            distribution = self.a2.get(spun[position-1], {}).get(spun[position+1])
            if not distribution:
                continue
            sampled = np.random.choice(list(distribution.keys()), p=list(distribution.values()), size=1)[0]
            replacements[position] = str(sampled)
        for position, token in replacements.items():
            spun[position] = token
        return spun

    def spin(self, X:List[List[str]], tokens_per_paragraph:int=2)->str:
        '''
            Performs the Article Spinning

            The provided token lists are left untouched; the spinning happens on copies.

            Parameters
            ----------
            `X`: List[List[str]]
                An instance of text tokens segregated by paragraphs.
            `tokens_per_paragraph`: int, defaults to 2
                The maximum amount of tokens to spin in each paragraph. Paragraphs with fewer
                replaceable tokens get all of them spun.

            Returns
            -------
            The spinned article, with paragraphs separated by a blank line.

            Raises
            ------
            ValueError
                If `tokens_per_paragraph` is negative.
        '''
        if tokens_per_paragraph < 0:
            raise ValueError('`tokens_per_paragraph` must be non-negative.')
        detokenizer = TreebankWordDetokenizer()
        spun_paragraphs = [self._spin_paragraph(p, tokens_per_paragraph) for p in X]
        return '\n\n'.join([detokenizer.detokenize(p) for p in spun_paragraphs])

            
# Seed NumPy's global generator so the randomly spun tokens are the same on every run.
np.random.seed(42)

model = MarkovModel()
model.fit(corpus)
model.spin(corpus[20], 4)

"rank 'set to write off film 'ray '\n\nleisure group rank could unveil plans to extend its film services unit and sell its media business,reports claim.\n\nrank,formerly famous for the carry on series,will expose the shake-up at the announcement of its results on friday,the sunday telegraph reported . advisors goldman sachs are understood to have valued its demerged deluxe film unit at £300m,the report added . speculation of a possible shake-up has mounted since rank announced a study into a possible demerger in september . since mike smith's appointment as chief executive in 1999,the group has focused on fewer businesses and embarked on a major cost-cutting programme which has seen it dispose of a number of businesses,including the odeon cinema chain and the pinewood studios . the move to the group with three core divisions:gaming,hard rock and deluxe films,which provides technical services to hollywood studios.\n\nrank now aims to concentrate on its gaming,bars and hotels business,in