# Tokenization of sentences (preparation for LDA)

N.B.! This notebook requires a dataframe of "sentencized" texts.

Run "Text_Sentencizer.ipynb" first to create "../pickles/f1000_sentencized.tsv", which is the file loaded below.

## Configuration

In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import itertools

from tqdm import tqdm_notebook as tqdm 

from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import logging  # gensim reports its progress through the logging module

# Show INFO-level messages, each prefixed with level name and wall-clock time.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

## Data Loading

In [None]:
# Read the sentence-level table (tab-separated, no quote handling) and drop
# the 'Unnamed: 0' column -- presumably the index saved by the previous
# notebook; verify against Text_Sentencizer.ipynb.
flatten_df = (
    pd.read_csv(
        '../pickles/f1000_sentencized.tsv',
        sep='\t',
        quoting=csv.QUOTE_NONE,
    )
    .drop(columns=['Unnamed: 0'])
)
flatten_df.info()

Remove standard sentences.

In [None]:
# Standard reviewer statements (singular "I" and plural "We" variants).
# Rows whose sentence matches one of these strings exactly are removed.
std_sentence = [
    'I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.',
    'I confirm that I have read this submission and believe that I have an appropriate level of expertise to state that I do not consider it to be of an acceptable scientific standard, for reasons outlined above.',
    'I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above.',
    'We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard.',
    'We confirm that we have read this submission and believe that we have an appropriate level of expertise to state that we do not consider it to be of an acceptable scientific standard, for reasons outlined above.',
    'We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however we have significant reservations, as outlined above.',
]

# Keep every row that is NOT a standard statement, then renumber rows 0..n-1.
is_std_sentence = flatten_df.sentences.isin(std_sentence)
flatten_df = flatten_df[~is_std_sentence].reset_index(drop=True)
flatten_df.info()

Test preprocessing function.

In [None]:
from peertax.tokenizer_LDA import custom_tokenizer as ct
from random import randint

# Pick one random row to eyeball the tokenizer on a single sentence.
# randint() includes BOTH endpoints, so the upper bound has to be the last
# valid row label (len - 1); len(flatten_df) itself is not a label of the
# freshly reset index and would make .loc raise a KeyError.
num = randint(0, len(flatten_df) - 1)
sent_test = [flatten_df.loc[num, 'sentences']]  # one-element list passed to ct()
sent_after = ct(sent_test)

In [None]:
# Sentence picked at random, before tokenization (one-element list).
print(sent_test)

In [None]:
# Output of the tokenizer for the same sentence.
print(sent_after)

Run tokenizer.

In [None]:
# Tokenize every sentence and report how long it took, in minutes.
t = time()
txt = ct(flatten_df['sentences'])
elapsed_min = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_min))

Put the results in a DataFrame to remove missing values.

Don't remove duplicates because they are still reviews!

In [None]:
# One 'token' entry per input sentence; rows with a missing value are dropped
# while the surviving rows keep their original index labels.
df_clean = pd.DataFrame({'token': txt}).dropna()
df_clean.info()

Merge with the initial dataset to retain indexing. This assigns a provisional 'token' column, which is replaced after the bigrams and trigrams have been created.

In [None]:
# Column-wise inner join on the index: only rows present in both frames
# survive, i.e. sentences whose token entry was dropped above disappear here.
df_cleaned = pd.concat(
    [flatten_df, df_clean],
    axis=1,
    join='inner',
)
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned.info()

In [None]:
# Preview the merged frame (original columns plus the provisional 'token').
df_cleaned.head()

Create bigrams.

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Split each token string on whitespace -> list of token lists (one per row).
sent = list(map(str.split, df_cleaned['token']))

In [None]:
# Phrase (bigram) model trained on the tokenized sentences. min_count=30 and
# progress_per=100000 are passed straight to gensim -- see the gensim Phrases
# documentation for their exact meaning.
phrases_bi = Phrases(sent, min_count=30, progress_per=100000)

In [None]:
# Phraser wrapper around the trained bigram model; used later to transform the corpus.
bigram = Phraser(phrases_bi)

Create trigrams.

In [None]:
# Second phrase model, trained on the bigram-transformed corpus
# (phrases_bi[sent]) with the same min_count / progress_per settings.
phrases_tri = Phrases(phrases_bi[sent], min_count=30, progress_per=100000)

In [None]:
# Phraser wrapper around the trained trigram model; used later to transform the corpus.
trigram = Phraser(phrases_tri)

Transform the corpus based on the bigrams & trigrams

In [None]:
# Apply the bigram model first, then the trigram model, to the tokenized sentences.
sentences = trigram[bigram[sent]]

Define figure_conv() and figure_conv_array(), which map figure-reference tokens (e.g. fig_a, figure_b) found during the last step to the single token 'figure'.

In [None]:
def figure_conv(text):
    """Map a figure-reference token to the single token 'figure'.

    Args:
        text: One token.

    Returns:
        'figure' when ``text`` is one of the listed figure aliases,
        otherwise ``text`` unchanged.
    """
    figure_aliases = (
        'figure', 'figure_a', 'figure_b', 'figure_c',
        'fig', 'fig_a', 'fig_b', 'fig_c',
        'figure_figure_supplement',
    )
    return 'figure' if text in figure_aliases else text
    
def figure_conv_array(doc):
    """Apply figure_conv to every token of ``doc`` and return the results as a new list."""
    return list(map(figure_conv, doc))

In [None]:
# Normalize figure tokens in every document; the result is a plain list of token lists.
sentences = [figure_conv_array(r) for r in sentences]

As a sanity check of the cleaning and of the added bigrams & trigrams, count the token frequencies and inspect the most frequent tokens.

In [None]:
# Count how often each token occurs across all documents.
word_freq = defaultdict(int)
# The loop variables are deliberately NOT called `sent`: that name holds the
# tokenized corpus used by the phrase-model cells, and overwriting it here
# would make any re-run of those cells operate on a single sentence.
for doc_tokens in sentences:
    for token in doc_tokens:
        word_freq[token] += 1
len(word_freq)  # vocabulary size

In [None]:
# The 20 most frequent tokens, in descending order of count.
sorted(word_freq, key=word_freq.get, reverse=True)[:20]

Replace 'token' column with new, actual 'tokens'

In [None]:
# Overwrite the provisional column with the final token lists (one list per row).
df_cleaned['token'] = list(sentences)
df_cleaned.info()

Condense 'token' column as a string.

In [None]:
# Store each token list as one comma-separated string.
# The strings are built from `sentences` rather than from the column itself:
# reading and overwriting df_cleaned['token'] in the same statement is not
# idempotent (a second run would join the *characters* of the already-joined
# string), whereas this form gives the same result however often it is run.
df_cleaned['token'] = [','.join(tokens) for tokens in sentences]
df_cleaned.head()

Save dataframe with tokens

In [None]:
# Write the tokenized frame as a tab-separated file without quoting.
# NOTE(review): no `index` argument is given, so the row index is written as
# well (pandas default) -- presumably what shows up as 'Unnamed: 0' when the
# file is read back; confirm with the downstream notebooks before changing it.
path_save_tsv = "../pickles/f1000_tokenized_LDA_sentence_0.tsv"
df_cleaned.to_csv(path_save_tsv, sep='\t', quoting=csv.QUOTE_NONE)