# Text sentencizer

## Configuration

In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import itertools
from tqdm import tqdm_notebook as tqdm 
from time import time  # To time our operations

## Data Extraction & Cleaning

In [2]:
# Load the F1000 reviews and discard the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- confirm).
reviews_csv = '../data/reviews/f1000_reviews.csv'
df = pd.read_csv(reviews_csv).drop(columns=['Unnamed: 0'])

In [3]:
# Summarise the freshly loaded table: row count, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10738 entries, 0 to 10737
Data columns (total 3 columns):
manuscript_ID    10738 non-null object
review_ID        10738 non-null object
review           10738 non-null object
dtypes: object(3)
memory usage: 251.8+ KB


Remove rows that have a null entry in any column, then remove duplicated reviews (rows sharing the same text in the `review` column). Reviews shorter than two characters are discarded as well.

In [4]:
# Treat the literal strings 'None'/'none' as missing values
df = df.replace(to_replace=['None', 'none'], value=np.nan)
# One boolean per column: does it contain at least one missing value?
d = df.isna().any()
# Drop rows with a gap in any flagged column, keep a single row per distinct
# 'review' text, discard reviews shorter than two characters and finally
# renumber the rows from zero.
data_df = (
    df.dropna(subset=d[d].index.values)
      .drop_duplicates(subset=['review'])
      .loc[lambda frame: ~(frame.review.str.len() < 2)]
      .reset_index(drop=True)
)

In [5]:
# Check how many reviews remain after the cleaning step.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7297 entries, 0 to 7296
Data columns (total 3 columns):
manuscript_ID    7297 non-null object
review_ID        7297 non-null object
review           7297 non-null object
dtypes: object(3)
memory usage: 171.1+ KB


Test the preprocessing function on one randomly selected review.

In [6]:
from peertax.sentencizer_LDA import custom_sentencizer as cs
from random import randint
# Pick one review at random as a smoke test for the sentencizer.
# randint() includes BOTH endpoints, so the upper bound has to be len - 1:
# data_df carries a 0..len-1 index (reset_index above) and looking up the
# label len(data_df) with .loc would raise a KeyError.
num = randint(0, len(data_df) - 1)
# The sentencizer is handed a collection of texts -- here a one-element list.
txt_test = [data_df.loc[num,'review']]
# NOTE(review): the result is indexed per input text below (txt_after[0]);
# presumably one collection of sentences per text -- confirm in peertax.
txt_after = cs(txt_test)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [7]:
# Show the untouched input with its escape sequences visible (repr form)
for raw_review in txt_test:
    print('{!r}'.format(raw_review))

'\n            The article is supposed to focus on the possible resistance mechanism to drug\xa0therapy\xa0in breast cancer. Yet, most literature review and examples cited are based on colorectal cancer, and none of the breast cancer trials are mentioned. The article is not articulated in a way to guide the readers point-to-point and rather scarce in consummating all the points systematically.\xa0\n             \n             The author also did not clearly point out what are the current problems and resistance mechanism in breast cancer, what are the current approaches to overcome these problems and what is the possible outlook in overcoming drug resistance in breast cancer.\n            I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.\n        '


In [8]:
# Print each extracted sentence of the test review, followed by two blank lines
for sentence in txt_after[0]:
    print('{}\n\n'.format(sentence))

The article is supposed to focus on the possible resistance mechanism to drugtherapyin breast cancer.


Yet, most literature review and examples cited are based on colorectal cancer, and none of the breast cancer trials are mentioned.


The article is not articulated in a way to guide the readers point-to-point and rather scarce in consummating all the points systematically.


The author also did not clearly point out what are the current problems and resistance mechanism in breast cancer, what are the current approaches to overcome these problems and what is the possible outlook in overcoming drug resistance in breast cancer.


I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.




Run sentencizer.

In [9]:
# Sentencize every review and report the wall-clock time it took
t = time()
data_df['sentences'] = cs(data_df['review'])
elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

HBox(children=(IntProgress(value=0, max=7297), HTML(value='')))


Time to clean up everything: 3.49 mins


In [10]:
# Turn the literal string 'nan' into a real missing value
data_df.replace(to_replace=['nan'], value=np.nan, inplace=True)
# Remove every row whose 'sentences' entry is missing
no_sentences = data_df.sentences.isna()
data_df.drop(index=data_df.index[no_sentences.values], inplace=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7297 entries, 0 to 7296
Data columns (total 4 columns):
manuscript_ID    7297 non-null object
review_ID        7297 non-null object
review           7297 non-null object
sentences        7297 non-null object
dtypes: object(4)
memory usage: 285.0+ KB


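Flatten the dataframe so that each row holds a single sentence together with the IDs of the manuscript and review it came from.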

In [11]:
# One output row per sentence: repeat each review's identifiers as many times
# as that review has sentences, and lay all sentences out end to end.
sentence_counts = data_df.sentences.str.len()
flatten_df = pd.DataFrame({
    "manuscript_ID": np.repeat(data_df.manuscript_ID.values, sentence_counts),
    "review_ID": np.repeat(data_df.review_ID.values, sentence_counts),
    "sentences": [sentence
                  for review_sentences in data_df.sentences
                  for sentence in review_sentences],
})
flatten_df.head()

Unnamed: 0,manuscript_ID,review_ID,sentences
0,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,This is an interesting article.
1,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,"However, I am sure there are some sections whe..."
2,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r101,I confirm that I have read this submission and...
3,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r100,This paper has a number of serious flaws.
4,10.12688/f1000research.1-1.v1,10.5256/f1000research.50.r100,a) The literature is quoted selectively and is...


In [12]:
# Inspect the flattened table (one row per sentence).
flatten_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136061 entries, 0 to 136060
Data columns (total 3 columns):
manuscript_ID    136061 non-null object
review_ID        136061 non-null object
sentences        136061 non-null object
dtypes: object(3)
memory usage: 3.1+ MB


Clean the results.

In [13]:
# Turn the literal string 'nan' into a real missing value
flatten_df.replace(to_replace=['nan'], value=np.nan, inplace=True)
# Character count per sentence (NaN where the length is undefined)
sentence_len = flatten_df.sentences.str.len()
# Rows to discard: no measurable length, or fewer than ten characters (garbage)
unusable = sentence_len.isna() | (sentence_len < 10)
flatten_df.drop(index=flatten_df.index[unusable.values], inplace=True)
# Renumber the surviving rows from zero
flatten_df.reset_index(drop=True, inplace=True)
flatten_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132419 entries, 0 to 132418
Data columns (total 3 columns):
manuscript_ID    132419 non-null object
review_ID        132419 non-null object
sentences        132419 non-null object
dtypes: object(3)
memory usage: 3.0+ MB


Save dataframe with sentences.

In [14]:
# Write the sentence table as a tab-separated file.
path_save_tsv = "../pickles/f1000_sentencized.tsv"
# DataFrame.to_csv takes the delimiter through `sep`; `separator` is not an
# accepted keyword and makes the call fail with a TypeError.
# NOTE(review): with QUOTE_NONE and no escapechar the writer raises if a
# sentence contains a tab or a line break -- confirm the sentencizer strips them.
flatten_df.to_csv(path_save_tsv, sep='\t', quoting=csv.QUOTE_NONE)