# Text sentencizer

## Configuration

In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import itertools
from tqdm import tqdm_notebook as tqdm 
from time import time  # To time our operations

## Data Extraction & Cleaning

In [None]:
# Load the raw F1000 reviews and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- TODO confirm)
reviews_csv = '../data/reviews/f1000_reviews.csv'
df = pd.read_csv(reviews_csv).drop(columns=['Unnamed: 0'])

In [None]:
# Summarize the freshly loaded frame (columns, non-null counts, dtypes)
df.info()

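Replace the placeholder strings 'None'/'none' with NaN, drop every row that has a missing value in any column, remove duplicate entries based on the 'review' column, and discard reviews shorter than two characters.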

In [None]:
# Treat the literal strings 'None' / 'none' as missing values
df = df.replace(to_replace=['None', 'none'], value=np.nan)
# Boolean Series indexed by column name: True where the column holds at least one NaN
d = df.isna().any()
nan_columns = d.index[d.values].values
# Drop rows with a NaN in any of those columns, then keep one row per distinct review text
data_df = df.dropna(subset=nan_columns).drop_duplicates(subset=['review']).copy()
# Drop entries whose 'review' text has fewer than two characters
too_short = data_df['review'].str.len() < 2
data_df.drop(index=data_df.index[too_short.values], inplace=True)
# Renumber the rows 0..n-1
data_df.reset_index(drop=True, inplace=True)

In [None]:
# Summarize the cleaned frame to check how many rows survived the filtering
data_df.info()

Test preprocessing function.

In [None]:
from peertax.sentencizer_LDA import custom_sentencizer as cs
from random import randint
# Pick one review at random. randint() includes BOTH endpoints, so the upper
# bound has to be len(data_df) - 1: data_df was re-indexed to 0..n-1, and a
# value of len(data_df) would make the .loc lookup below raise a KeyError.
num = randint(0, len(data_df) - 1)
# cs is called with a list here, so wrap the single review text in one
txt_test = [data_df.loc[num, 'review']]
txt_after = cs(txt_test)

In [None]:
# Print the sampled review as a repr so escape sequences are visible
for review_text in txt_test:
    print('{!r}'.format(review_text))

In [None]:
# Print every item of the first sentencizer result, each followed by blank lines
for sentence in txt_after[0]:
    print(sentence, '\n', sep='\n')

Run sentencizer.

In [None]:
# Run the sentencizer over every review and report the wall-clock duration in minutes
start = time()
data_df['sentences'] = cs(data_df['review'])
elapsed_mins = round((time() - start) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

In [None]:
# Turn the placeholder string 'nan' into a real missing value (applies to every column)
data_df.replace(to_replace=['nan'], value=np.nan, inplace=True)
# Remove the rows whose 'sentences' entry is missing
data_df.dropna(subset=['sentences'], inplace=True)
data_df.info()

Flatten the dataframe so that each row holds a single sentence together with its manuscript and review IDs.

In [None]:
# Number of items in each row's 'sentences' entry; every ID is repeated that
# many times so it lines up with the flattened sentence list
sentence_counts = data_df['sentences'].str.len()
flatten_df = pd.DataFrame({
    "manuscript_ID": np.repeat(data_df['manuscript_ID'].values, sentence_counts),
    "review_ID": np.repeat(data_df['review_ID'].values, sentence_counts),
    "sentences": [
        sentence
        for review_sentences in data_df['sentences']
        for sentence in review_sentences
    ],
})
flatten_df.head()

In [None]:
# Summarize the flattened, one-sentence-per-row frame
flatten_df.info()

Clean the results.

In [None]:
# Turn the placeholder string 'nan' into a real missing value
flatten_df.replace(to_replace=['nan'], value=np.nan, inplace=True)
# Length of each sentence; NaN where the entry is missing
sentence_len = flatten_df['sentences'].str.len()
# Discard missing entries as well as entries shorter than ten characters (garbage)
unusable = sentence_len.isna() | (sentence_len < 10)
flatten_df.drop(index=flatten_df.index[unusable.values], inplace=True)
# Renumber the rows 0..n-1
flatten_df.reset_index(drop=True, inplace=True)
flatten_df.info()

Save dataframe with sentences.

In [None]:
# Write the sentence-level frame to a tab-separated file with quoting disabled.
# The frame's index is written as well, since index=False is not passed.
# NOTE(review): with csv.QUOTE_NONE and no escapechar, the csv writer is documented
# to raise an error when a field needs escaping (e.g. it contains the delimiter) --
# confirm that sentences can never contain tab or newline characters.
path_save_tsv = "../pickles/f1000_sentencized.tsv"
flatten_df.to_csv(path_save_tsv, sep = '\t', quoting=csv.QUOTE_NONE)