# Text sentencizer

## Configuration

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import itertools
from tqdm import tqdm_notebook as tqdm 
from time import time  # To time our operations

## Data Extraction & Cleaning

In [8]:
# Load the raw F1000 reviews and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- TODO confirm).
df = pd.read_csv('../data/reviews/f1000_reviews.csv').drop(columns=['Unnamed: 0'])

In [9]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10738 entries, 0 to 10737
Data columns (total 3 columns):
manuscript_ID    10738 non-null object
review_ID        10738 non-null object
review           10738 non-null object
dtypes: object(3)
memory usage: 251.8+ KB


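Convert the placeholder strings 'None'/'none' to NaN, drop every row that has a missing value in any column, and remove duplicate reviews (rows sharing the same `review` text). Reviews shorter than two characters are discarded as well.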

In [10]:
# Treat the literal strings 'None' / 'none' as missing values.
df = df.replace(to_replace=['None', 'none'], value=np.nan)

# Build the cleaned frame in one chain:
#   1. drop rows with a missing value in any column (equivalent to restricting
#      dropna to the columns that contain NaNs, which is what was done before),
#   2. keep only the first occurrence of each distinct review text,
#   3. drop rows whose 'review' has fewer than two characters
#      (written as ~(len < 2) so non-string cells, whose length is NaN, are kept),
#   4. renumber the rows 0..n-1.
data_df = (
    df.dropna()
      .drop_duplicates(subset=['review'])
      .loc[lambda frame: ~(frame.review.str.len() < 2)]
      .reset_index(drop=True)
)

In [11]:
# Summarise the cleaned frame to check how many rows survived the filtering.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7297 entries, 0 to 7296
Data columns (total 3 columns):
manuscript_ID    7297 non-null object
review_ID        7297 non-null object
review           7297 non-null object
dtypes: object(3)
memory usage: 171.1+ KB


Test the sentencizer on a single, randomly chosen review.

In [12]:
from peertax.sentencizer_LDA import custom_sentencizer as cs
from random import randint

# Pick one review at random as a smoke test for the sentencizer.
# random.randint includes BOTH endpoints, so the upper bound must be len - 1:
# data_df is indexed 0..len-1 after reset_index(drop=True), and a draw equal to
# len(data_df) would make the .loc lookup below raise KeyError.
num = randint(0, len(data_df) - 1)

# The sentencizer is called with a list of review texts (one element here).
txt_test = [data_df.loc[num, 'review']]
txt_after = cs(txt_test)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [13]:
# Show each raw input text via repr so whitespace and newlines are visible.
for raw_review in txt_test:
    print(f"{raw_review!r}")

'\n            This is an interesting paper showing that purified rainbow trout RBC exposed to IPNV do not get infected by the virus, but nevertheless raise an innate antiviral response. The latter is shown by the induction of IFN and a few IFN related genes and that conditioned media from IPNV exposed RBC inhibits IPNV infection of the susceptibe cell line CHSE-214. Many methods are used to approach the hypotheses and the results appear provide new information on the very intriguing role of fish red blood cells in interaction with viruses, in particular by showing that the cells can induce an antiviral immune response without being infected.\n             The main comments are related to how the authors have interpreted the results far beyond what they have shown, by drawing links to adaptive immune mechanisms and vaccination approaches, while they could have discussed antiviral protection mechanisms in further detail and with more scientific basis. There are also some missing informa

In [14]:
# Print every sentence extracted from the first (and only) test review.
# end='\n\n\n' emits the sentence's own newline plus the two blank lines that
# separate consecutive sentences in the output.
for sentence in txt_after[0]:
    print(sentence, end='\n\n\n')

This is an interesting paper showing that purified rainbow trout RBC exposed to IPNV do not get infected by the virus, but nevertheless raise an innate antiviral response.


The latter is shown by the induction of IFN and a few IFN related genes and that conditioned media from IPNV exposed RBC inhibits IPNV infection of the susceptibe cell line CHSE-214.


Many methods are used to approach the hypotheses and the results appear provide new information on the very intriguing role of fish red blood cells in interaction with viruses, in particular by showing that the cells can induce an antiviral immune response without being infected.


The main comments are related to how the authors have interpreted the results far beyond what they have shown, by drawing links to adaptive immune mechanisms and vaccination approaches, while they could have discussed antiviral protection mechanisms in further detail and with more scientific basis.


There are also some missing information regarding the ex

Run sentencizer.

In [16]:
# Sentencize every review, store the result in a new 'sentences' column,
# and report the elapsed wall-clock time in minutes.
start = time()
data_df['sentences'] = cs(data_df['review'])
elapsed_mins = round((time() - start) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

HBox(children=(IntProgress(value=0, max=7297), HTML(value='')))


Time to clean up everything: 3.9 mins


In [17]:
# Turn the literal string 'nan' into a real NaN wherever it appears in the frame.
data_df.replace(to_replace=['nan'], value=np.nan, inplace=True)

# Remove the rows whose 'sentences' entry is missing, then summarise what is left.
rows_without_sentences = data_df.index[data_df.sentences.isna().values]
data_df.drop(index=rows_without_sentences, inplace=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7297 entries, 0 to 7296
Data columns (total 4 columns):
manuscript_ID    7297 non-null object
review_ID        7297 non-null object
review           7297 non-null object
sentences        7297 non-null object
dtypes: object(4)
memory usage: 285.0+ KB


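Flatten the dataframe so that each row holds a single sentence together with the IDs of the manuscript and review it came from.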

In [19]:
# One output row per sentence: each review's identifiers are repeated once for
# every element of its 'sentences' entry, and the entries themselves are
# concatenated in row order so the three columns stay aligned.
sentences_per_review = data_df.sentences.str.len()
flatten_df = pd.DataFrame({
    "manuscript_ID": np.repeat(data_df.manuscript_ID.values, sentences_per_review),
    "review_ID": np.repeat(data_df.review_ID.values, sentences_per_review),
    "sentences": list(itertools.chain.from_iterable(data_df.sentences)),
})
flatten_df.head()

Unnamed: 0,manuscript_ID,review_ID,sentences
0,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,This is an interesting article.
1,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,"However, I am sure there are some sections whe..."
2,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,I confirm that I have read this submission and...
3,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r100,This paper has a number of serious flaws.
4,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r100,a) The literature is quoted selectively and is...


In [20]:
# Summarise the flattened frame (one row per sentence).
flatten_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134393 entries, 0 to 134392
Data columns (total 3 columns):
manuscript_ID    134393 non-null object
review_ID        134393 non-null object
sentences        134393 non-null object
dtypes: object(3)
memory usage: 3.1+ MB


Clean the results.

In [24]:
# Turn the literal string 'nan' into a real NaN wherever it appears in the frame.
flatten_df.replace(to_replace=['nan'], value=np.nan, inplace=True)

# A sentence is discarded when its length is undefined (missing value) or when
# it has fewer than ten characters (garbage). NaN < 10 evaluates to False, so
# the two conditions never overlap and can be applied in a single drop.
sentence_lengths = flatten_df.sentences.str.len()
is_unusable = sentence_lengths.isna() | (sentence_lengths < 10)
flatten_df.drop(index=flatten_df.index[is_unusable.values], inplace=True)

# Renumber the surviving rows 0..n-1 and summarise the result.
flatten_df.reset_index(drop=True, inplace=True)
flatten_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130846 entries, 0 to 130845
Data columns (total 3 columns):
manuscript_ID    130846 non-null object
review_ID        130846 non-null object
sentences        130846 non-null object
dtypes: object(3)
memory usage: 3.0+ MB


Save dataframe with sentences.

In [26]:
# Persist the flattened, cleaned sentence table as a pickle.
# The path is relative, so it resolves against the notebook's working directory.
path_save_pickle = "../pickles/f1000_sentencized.pkl"
flatten_df.to_pickle(path_save_pickle)