In [47]:
import os

import pandas as pd

from rouge import Rouge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [44]:
# Directory holding the SwissText CSV files. Set the SWISSTEXT_DATA_DIR
# environment variable to run the notebook on another machine; the original
# local path is kept as the fallback so existing setups behave as before.
DATA_PATH = os.environ.get(
    'SWISSTEXT_DATA_DIR',
    '/Users/michaelwellner/Workspaces/thesis--text-summarization/data/swisstext',
)

# Training split; the pipeline in this notebook reads its 'source' and
# 'summary' columns.
# NOTE(review): some rows in the recorded pipeline output show mojibake
# (e.g. 'HansestÃ¤dten') — confirm the file's encoding / whether the CSV
# itself contains double-encoded text.
data = pd.read_csv(os.path.join(DATA_PATH, 'data_train.csv'))

In [88]:
class BaselineModel(BaseEstimator, TransformerMixin):
    """Lead-words baseline summarizer.

    The predicted summary of a text is simply its first ``words``
    space-separated tokens.

    Parameters
    ----------
    words : int, default=12
        Number of leading tokens of ``source`` to keep as the summary.
    """

    def __init__(self, words: int = 12):
        # scikit-learn reads constructor arguments back through
        # getattr(self, "<parameter name>") in get_params(), clone() and the
        # estimator repr. The value therefore has to live in a public
        # attribute with the same name as the parameter; a name-mangled
        # ``self.__words`` makes all of those raise AttributeError.
        self.words = words

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the transformer API.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Add a ``prediction_summary`` column built from ``X["source"]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a string column ``source``.

        Returns
        -------
        pandas.DataFrame
            A new frame with all columns of ``X`` plus ``prediction_summary``.
            The input frame is left unmodified.
        """
        limit = self.words
        # Split on a single space (not on arbitrary whitespace) so the tokens
        # are exactly the ones the original implementation produced.
        lead = X["source"].map(lambda text: ' '.join(text.split(' ')[:limit]))
        # assign() returns a new frame instead of writing into the caller's.
        return X.assign(prediction_summary=lead)

    
class RougeScorer(BaseEstimator, TransformerMixin):
    """Transformer that appends ROUGE scores to every row.

    Each row's ``prediction_summary`` is scored against its ``summary`` and
    nine columns are added: ``rouge_{1,2,l}_{f,p,r}``.
    """

    # (output column prefix, key in the dict returned by Rouge.get_scores)
    _METRICS = (("rouge_1", "rouge-1"), ("rouge_2", "rouge-2"), ("rouge_l", "rouge-l"))
    # Per-metric statistics read from the Rouge result; the names double as
    # the column suffixes.
    _STATS = ("f", "p", "r")

    def __init__(self):
        self.__rouge = Rouge()

    def fit(self, X, y=None, x=None):
        """Nothing is learned; present only to satisfy the transformer API.

        Parameters
        ----------
        X : ignored
        y : ignored
            Accepted under the name scikit-learn uses, so callers that pass
            ``y=`` by keyword work.
        x : ignored
            Retained only so callers written against the previous
            ``fit(X, x=None)`` signature keep working.

        Returns
        -------
        self
        """
        return self

    def score(self, X):
        """Score one row and return it extended with the ROUGE columns.

        Parameters
        ----------
        X : pandas.Series
            A row providing ``prediction_summary`` and ``summary``.

        Returns
        -------
        pandas.Series
            A copy of ``X`` with the nine ``rouge_*`` entries appended.
        """
        # get_scores() on a single hypothesis/reference pair returns a
        # one-element list; only that element is used.
        result = self.__rouge.get_scores(X['prediction_summary'], X['summary'])[0]

        # Work on a copy: this method is used as a DataFrame.apply callback,
        # and mutating the object pandas passes in is not supported.
        scored = X.copy()
        for prefix, metric in self._METRICS:
            for stat in self._STATS:
                scored[prefix + '_' + stat] = result[metric][stat]
        return scored

    def transform(self, X):
        """Apply :meth:`score` to every row of ``X``.

        Returns
        -------
        pandas.DataFrame
            The rows of ``X`` with the ``rouge_*`` columns added.
        """
        return X.apply(self.score, axis=1)

In [89]:
# Two-stage pipeline: the lead-words baseline writes `prediction_summary`,
# then the scorer appends the ROUGE columns computed against `summary`.
model, scorer = BaselineModel(), RougeScorer()

pipeline = Pipeline(steps=[("summarizer", model), ("scorer", scorer)])

In [90]:
# Run the baseline + scoring pipeline on a small slice of the training data
# and display the resulting frame. The slice is copied so that the pipeline
# never touches `data` itself.
# NOTE(review): `.loc[:100]` is a label slice and includes the end label, so
# with a default integer index it selects 101 rows (0..100). If exactly 100
# rows were intended, `.iloc[:100]` / `.head(100)` would be needed — confirm.
pipeline.fit_transform(data.loc[:100].copy())

Unnamed: 0,source,summary,prediction_summary,rouge_1_f,rouge_1_p,rouge_1_r,rouge_2_f,rouge_2_p,rouge_2_r,rouge_l_f,rouge_l_p,rouge_l_r
0,Minghella war der Sohn italienisch-schottische...,"Anthony Minghella, CBE war ein britischer Film...",Minghella war der Sohn italienisch-schottische...,0.076923,0.083333,0.071429,0.000000,0.000000,0.000000,0.080000,0.090909,0.071429
1,Ende der 1940er Jahre wurde eine erste Auteur-...,Die Auteur-Theorie ist eine Filmtheorie und di...,Ende der 1940er Jahre wurde eine erste Auteur-...,0.117647,0.333333,0.071429,0.000000,0.000000,0.000000,0.105263,0.250000,0.066667
2,"Al Pacino, geboren in Manhattan, ist der Sohn ...","Alfredo James ""Al"" Pacino ist ein US-amerikani...","Al Pacino, geboren in Manhattan, ist der Sohn ...",0.106667,0.333333,0.063492,0.000000,0.000000,0.000000,0.119403,0.400000,0.070175
3,Der Name der Alkalimetalle leitet sich von dem...,Als Alkalimetalle werden die chemischen Elemen...,Der Name der Alkalimetalle leitet sich von dem...,0.045977,0.166667,0.026667,0.000000,0.000000,0.000000,0.053333,0.166667,0.031746
4,Die Arbeit ist bereits seit dem Altertum Gegen...,Das deutsche Arbeitsrecht ist ein Rechtsgebiet...,Die Arbeit ist bereits seit dem Altertum Gegen...,0.105263,0.166667,0.076923,0.000000,0.000000,0.000000,0.114286,0.166667,0.086957
...,...,...,...,...,...,...,...,...,...,...,...,...
96,Die Stadt liegt zwischen den Hansestädten Ros...,Ribnitz-Damgarten ist eine Stadt im Landkreis ...,Die Stadt liegt zwischen den Hansestädten Ros...,0.131579,0.416667,0.078125,0.027027,0.090909,0.015873,0.149254,0.416667,0.090909
97,Die Fläche des Rhein-Lahn-Kreises beträgt 78...,Der Rhein-Lahn-Kreis ist eine Gebietskörpersc...,Die Fläche des Rhein-Lahn-Kreises beträgt 78...,0.100000,0.250000,0.062500,0.034483,0.090909,0.021277,0.120000,0.250000,0.078947
98,"Im Film war der ""Split Screen"" vor allem in de...",Split Screen oder Bildschirmaufteilung ist ein...,"Im Film war der ""Split Screen"" vor allem in de...",0.052632,0.083333,0.038462,0.000000,0.000000,0.000000,0.057143,0.083333,0.043478
99,Sylvester Gardenzio Stallone wurde 1946 in New...,"Sylvester ""Sly"" Gardenzio Stallone ist ein US-...",Sylvester Gardenzio Stallone wurde 1946 in New...,0.210526,0.500000,0.133333,0.036364,0.090909,0.022727,0.240000,0.500000,0.157895
