# Dataset Restructuring Notebook
Let the carnage begin

Let's set up things for Colab and non-Colab (just in case...)

In [1]:
import os
import sys

if colab := 'google_colab' in sys.modules:
    from google.colab import drive
    drive.mount("/content/drive")
    # in case we'll do a shared drive, this will have to be changed.
    # For now it is basically a placeholder
    BASE_PATH = "drive/MyDrive/HLT/ProjectAthena/"
    sys.path.insert(0,BASE_PATH)

    !pip install wordcloud
    !pip install -U scikit-learn
    !pip install -U nltk
    !pip install -U seaborn
else:
    BASE_PATH = ".."

In [2]:
# More imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

import nltk

from typing import List, Dict, Optional, Tuple

In [3]:
# Load the dataset from the CSV file located under BASE_PATH,
# then preview its first rows as the cell output
df = pd.read_csv(os.path.join(BASE_PATH,'philosophy_data.csv'))
df.head()

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
0,Plato - Complete Works,Plato,plato,"What's new, Socrates, to make you leave your ...","What's new, Socrates, to make you leave your ...",-350,1997,125,"what's new, socrates, to make you leave your ...","['what', 'new', 'socrates', 'to', 'make', 'you...","what be new , Socrates , to make -PRON- lea..."
1,Plato - Complete Works,Plato,plato,Surely you are not prosecuting anyone before t...,Surely you are not prosecuting anyone before t...,-350,1997,69,surely you are not prosecuting anyone before t...,"['surely', 'you', 'are', 'not', 'prosecuting',...",surely -PRON- be not prosecute anyone before ...
2,Plato - Complete Works,Plato,plato,The Athenians do not call this a prosecution b...,The Athenians do not call this a prosecution b...,-350,1997,74,the athenians do not call this a prosecution b...,"['the', 'athenians', 'do', 'not', 'call', 'thi...",the Athenians do not call this a prosecution ...
3,Plato - Complete Works,Plato,plato,What is this you say?,What is this you say?,-350,1997,21,what is this you say?,"['what', 'is', 'this', 'you', 'say']",what be this -PRON- say ?
4,Plato - Complete Works,Plato,plato,"Someone must have indicted you, for you are no...","Someone must have indicted you, for you are no...",-350,1997,101,"someone must have indicted you, for you are no...","['someone', 'must', 'have', 'indicted', 'you',...","someone must have indict -PRON- , for -PRON- ..."


## The Concatenation
Let's do it

Our objective is easy: the dataset has too many sentences that are too short. We need to "weld" sentences together to make them longer. Our dataset will be shorter, and composed of more significant sentences. Our models will be harder, better, faster, stronger.

Of course this is easier said than done. I'll be doing it in a very rough way. That is: concatenate! Even in doing so, extra care has to be taken.

### Recap from the previous episodes

In [4]:
# Recap of the sentence lengths. Note that `sentence_length` counts characters,
# not tokens: e.g. "What is this you say?" has sentence_length 21 and 5 tokens.
# The f-strings use double quotes outside so that the single-quoted column name
# inside the braces is also valid on Python versions older than 3.12.
print(f"The overall average sentence length (in characters) is: {df['sentence_length'].mean()}")
print(f"The longest sentence has {df['sentence_length'].max()} characters")
print('Whereas, per school the average length (in characters) is:')
# Last expression of the cell: displayed as the per-school table
df.groupby('school')['sentence_length'].mean()


The overall average sentence length (in tokens) is: 150.79096361499745
The longest sentence has 2649 tokens
Whereas, per school the average length (in tokens) is:


school
analytic           119.025205
aristotle          153.224953
capitalism         187.576289
communism          152.752311
continental        171.792060
empiricism         183.638051
feminism           153.083928
german_idealism    180.251329
nietzsche          116.599867
phenomenology      145.913345
plato              114.938018
rationalism        163.958996
stoicism           137.056410
Name: sentence_length, dtype: float64

### Now let's do business

For now, just as a beginning, we set a target length in terms of characters (the `sentence_length` column counts characters, not tokens)

In [5]:
# Length budget for a merged sentence: check_conditions compares the
# accumulated `sentence_length` values against it.
# Arbitrarily set
MAX_CHAR = 1074

Check whether the sentence at a given index can be merged with the sentences accumulated before it

In [6]:
def check_conditions(df:pd.DataFrame, 
                     index:int,
                     accumulated_l:int,
                     max_char:Optional[int]=None) -> bool:
    """Tell whether the sentence at ``index`` can join the chunk being built.

    Args:
        df: dataframe holding (at least) the 'title' and 'sentence_length'
            columns, addressed by label through ``.loc``.
        index: label of the candidate sentence.
        accumulated_l: total 'sentence_length' already accumulated in the
            current chunk (0 when the chunk is still empty).
        max_char: length budget; when None the module-level MAX_CHAR is used.

    Returns:
        True if the sentence can be appended to the current chunk,
        False otherwise.
    """
    # Past the last row there is nothing left to merge; answering False here
    # (instead of raising a KeyError) lets a caller's loop terminate cleanly.
    if index >= len(df):
        return False

    # The first sentence of a chunk is always accepted. Otherwise a change of
    # book, or a single sentence longer than the budget, would be rejected
    # forever and the caller could never move past it.
    if index == 0 or accumulated_l == 0:
        return True

    limit = MAX_CHAR if max_char is None else max_char

    # I skip checks on authors and schools.
    # If the book changes then the author changes,
    # and therefore the school changes as well (this has been checked)

    # Let's skip the controls on info such as original publication date
    # (who needs those anyway)
    same_book = df['title'].loc[index-1] == df['title'].loc[index]

    # check that by adding the new sentence the total length is within bounds
    fits = accumulated_l + df['sentence_length'].loc[index] < limit + 2

    # bool(...) so that a plain Python bool is returned rather than numpy.bool_
    return bool(same_book and fits)

In [7]:
# let's declare a new dataframe:
# it starts empty, with the same columns as df, and is the container
# for the merged (concatenated) sentences
df_new = pd.DataFrame(columns=df.columns)

In [8]:
# Greedily weld consecutive sentences into longer chunks.
# Every chunk becomes one record; the dataframe is built once at the end
# (growing it with pd.concat at each iteration is quadratic and leaves every
# row with the same index label).
n_rows = len(df)    # whole dataset (a debugging cap of 100 rows was used before)
merged_rows = []

i = 0
while i < n_rows:
    first = i       # first row of the chunk: the per-book metadata is read from it
    spacy_parts = []
    lowered_parts = []
    token_parts = []    # pandas loads the token lists as strings: "['a', 'b']"
    lemma_parts = []
    sentence_length = 0

    # The first sentence of a chunk is always taken, so `i` advances at every
    # pass of the outer loop; further sentences are added while there are rows
    # left and check_conditions allows it.
    while True:
        spacy_parts.append(df.at[i, 'sentence_spacy'])
        lowered_parts.append(df.at[i, 'sentence_lowered'])
        tokens = df.at[i, 'tokenized_txt'][1:-1]    # drop the surrounding brackets
        if tokens:                                   # skip empty token lists
            token_parts.append(tokens)
        lemma_parts.append(df.at[i, 'lemmatized_str'].strip())
        # sum of the original lengths (the joining spaces are not counted)
        sentence_length += int(df.at[i, 'sentence_length'])
        i += 1
        if i >= n_rows or not check_conditions(df, index=i, accumulated_l=sentence_length):
            break

    sentence = ' '.join(spacy_parts)
    merged_rows.append({
        # metadata comes from the chunk's own first row, not from the row
        # that follows the chunk
        'title': df.at[first, 'title'],
        'author': df.at[first, 'author'],
        'school': df.at[first, 'school'],
        'sentence_spacy': sentence,
        'sentence_str': sentence,   # same text as sentence_spacy, as before
        'original_publication_date': df.at[first, 'original_publication_date'],
        'corpus_edition_date': df.at[first, 'corpus_edition_date'],
        'sentence_length': sentence_length,
        'sentence_lowered': ' '.join(lowered_parts),
        'tokenized_txt': '[' + ', '.join(token_parts) + ']',
        # single space between sentences so that lemmas are not fused together
        'lemmatized_str': ' '.join(lemma_parts)
    })

# Same column layout as df; columns without a value in the records stay empty
df_new = pd.DataFrame(merged_rows, columns=df.columns)
   

iteration 0
i increased to 1
i increased to 2
i increased to 3
i increased to 4
i increased to 5
i increased to 6
i increased to 7
i increased to 8
i increased to 9
i increased to 10
i increased to 11
i increased to 12
i increased to 13
i increased to 14
i increased to 15
iteration 15
i increased to 16
i increased to 17
i increased to 18
i increased to 19
i increased to 20
i increased to 21
iteration 21
i increased to 22
i increased to 23
i increased to 24
i increased to 25
i increased to 26
i increased to 27
i increased to 28
i increased to 29
i increased to 30
iteration 30
i increased to 31
i increased to 32
i increased to 33
i increased to 34
i increased to 35
i increased to 36
i increased to 37
i increased to 38
i increased to 39
i increased to 40
i increased to 41
i increased to 42
iteration 42
i increased to 43
i increased to 44
i increased to 45
i increased to 46
i increased to 47
i increased to 48
i increased to 49
i increased to 50
i increased to 51
i increased to 52
iteration

### Last checks, and save

In [10]:
# Report the (rows, columns) shape of the merged dataframe
print(f'New dataframe is of shape {df_new.shape}')

<class 'pandas.core.frame.DataFrame'>


In [None]:
# Check whether any merged sentence is longer than the MAX_CHAR budget
# (sentence_length counts characters): prints True if at least one is.
# The comparison has to be done element-wise *before* reducing with .any();
# `.unique().any() > MAX_CHAR` compared a single boolean with MAX_CHAR and
# therefore always printed False.
print((df_new['sentence_length'] > MAX_CHAR).any())