In [49]:
import pandas as pd

# Read the raw summaries and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call; confirm against the producer).
data = (
    pd.read_csv('unprocessed_summaries.csv')
    .drop(columns=['Unnamed: 0'])
)
# Column overview: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3821 entries, 0 to 3820
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paragraph         3821 non-null   object
 1   genre             3821 non-null   object
 2   bart_summary_30   3821 non-null   object
 3   bart_summary_50   3821 non-null   object
 4   bart_summary_100  3821 non-null   object
 5   xsum_summary_30   3821 non-null   object
 6   xsum_summary_50   3821 non-null   object
 7   xsum_summary_100  3821 non-null   object
dtypes: object(8)
memory usage: 238.9+ KB


In [52]:
from pandarallel import pandarallel
# Summary columns trimmed by default.
SUMMARY_COLUMNS = (
    'bart_summary_30', 'bart_summary_50', 'bart_summary_100',
    'xsum_summary_30', 'xsum_summary_50', 'xsum_summary_100',
)
# Tokens treated as the end of a sentence. '...' needs no entry of its own:
# it ends in '.', so the last '.' is never earlier than the end of the last '...'.
STOP_CHARS = ('.', '?', '!', ';')


def process_termination_point(row, summary_columns=SUMMARY_COLUMNS, stop_chars=STOP_CHARS):
    """Cut each summary off after its last sentence terminator.

    For every column in ``summary_columns`` the text is truncated so that it
    ends with the last occurrence of any token in ``stop_chars``; whatever
    follows (an unfinished sentence) is dropped.

    Limitation: every occurrence of a stop token counts, so a period inside
    an abbreviation can become the cut point (a summary may end up as "Mr.").

    Parameters
    ----------
    row : mapping-like with ``.copy()`` (e.g. a ``pandas.Series`` or a dict)
        One record holding the summary columns.
    summary_columns : iterable of str, optional
        Columns to trim. Defaults to the six bart/xsum summary columns.
    stop_chars : iterable of str, optional
        Terminator tokens. Defaults to ``('.', '?', '!', ';')``.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with the trimmed summaries. A summary becomes
        ``None`` when it contains no stop token or is not a string
        (for example a missing value).
    """
    # Work on a copy: the object handed in by DataFrame.apply must not be mutated.
    trimmed = row.copy()
    for column in summary_columns:
        text = row[column]
        if not isinstance(text, str):
            # Missing / non-text cell: nothing to trim, mark it as unusable.
            trimmed[column] = None
            continue
        # End position (exclusive) of the right-most stop token, 0 if there is none.
        cut = max(
            (text.rfind(stop) + len(stop) for stop in stop_chars if stop in text),
            default=0,
        )
        trimmed[column] = text[:cut] if cut else None
    return trimmed

# Set up pandarallel before the first parallel_* call.
pandarallel.initialize()
# Run the trimming function once per row (axis=1) through pandarallel.
processed_data = data.parallel_apply(process_termination_point, axis=1)
# Bare last expression so the notebook renders the resulting frame.
processed_data

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Unnamed: 0,paragraph,genre,bart_summary_30,bart_summary_50,bart_summary_100,xsum_summary_30,xsum_summary_50,xsum_summary_100
0,“Then I can wait in the next room”. “Not at al...,detective,,"“I know, my dear Watson, that you share my lov...","“I know, my dear Watson, that you share my lov...",,"Holmes, one of the world's most famous cases ...","Holmes, one of the world's most famous cases ..."
1,"“You did, Doctor, but none the less you must c...",detective,Mr.,Mr. Jabez Wilson has been good enough to call ...,Mr. Jabez Wilson has been good enough to call ...,The case of a man who was at the centre of Lo...,The case of a man who was at the centre of Lo...,The case of a man who was at the centre of Lo...
2,"I did not gain very much, however, by my inspe...",detective,Sherlock Holmes’ quick eye took in my occupati...,Sherlock Holmes’ quick eye took in my occupati...,Sherlock Holmes’ quick eye took in my occupati...,"I at the home of Jabez Wilson, I had to look ...","I at the home of Jabez Wilson, I had to look ...","I at the home of Jabez Wilson, I had to look ..."
3,"“How did you know, for example, that I did man...",detective,"“How did you know, for example, that I did man...","“Your hands, my dear sir. Your right hand is q...","“Your hands, my dear sir. Your right hand is q...",One of the most senior members of the Royal F...,One of the most senior members of the Royal F...,One of the most senior members of the Royal F...
4,"” “Well, but China? ” “The fish that you have ...",detective,The fish that you have tattooed immediately ab...,The fish that you have tattooed immediately ab...,The fish that you have tattooed immediately ab...,"Holmes, one of the most famous characters in ...","Holmes, one of the most famous characters in ...","Holmes, one of the most famous characters in ..."
...,...,...,...,...,...,...,...,...
3816,"Aye, lad, thou shalt sit here in my own screwe...",adventure,"“Oh! spite of million villains, this makes me ...","“Oh! spite of million villains, this makes me ...","“Oh! spite of million villains, this makes me ...","One of the most famous characters, Stubb, are...","One of the most famous characters, Stubb, are...","One of the most famous characters, Stubb, are..."
3817,"Sir, I must go with ye”. “If thou speakest thu...",adventure,"“If thou speakest thus to me much more, Ahab’s...","“If thou speakest thus to me much more, Ahab’s...","“If thou speakest thus to me much more, Ahab’s...","One of the biggest stories of the year, the s...","One of the biggest stories of the year, the s...",
3818,"True art thou, lad, as the circumference to it...",adventure,"“True art thou, lad, as the circumference to i...","“True art thou, lad, as the circumference to i...","“True art thou, lad, as the circumference to i...","A tale of the story of the Lord of the Year, ...","A tale of the story of the Lord of the Year, ...","A tale of the story of the Lord of the Year, ..."
3819,"What? neither lock, nor bolt, nor bar; and yet...",adventure,"""It must be the spell;","""It must be the spell; he told me to stay here...","""It must be the spell; he told me to stay here...","A new chair at the end of the year-long, ther...","A new chair at the end of the year-long, ther...","A new chair at the end of the year-long, ther..."


In [53]:
# Re-check the non-null counts: summaries without any sentence terminator
# were replaced by None in process_termination_point.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3821 entries, 0 to 3820
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   paragraph         3821 non-null   object
 1   genre             3821 non-null   object
 2   bart_summary_30   3236 non-null   object
 3   bart_summary_50   3718 non-null   object
 4   bart_summary_100  3788 non-null   object
 5   xsum_summary_30   3208 non-null   object
 6   xsum_summary_50   3808 non-null   object
 7   xsum_summary_100  3447 non-null   object
dtypes: object(8)
memory usage: 238.9+ KB


In [55]:
# Export only the rows in which every column is populated, without the index.
(
    processed_data
    .dropna()
    .to_csv('summaries_processed_by_termination_point.csv', index=False)
)

In [63]:
# Keep rows whose xsum_summary_100 is present, draw a reproducible
# 3000-row sample (fixed random_state) and write it out without the index.
(
    processed_data
    .dropna(subset=['xsum_summary_100'])
    .sample(3000, random_state=3294)
    .to_csv('xsum_summary_100.csv', index=False)
)