# Dataset Restructuring Notebook
Let the carnage begin

Let's set up things for Colab and non-Colab (just in case...)

In [1]:
import os
import sys

if colab := 'google_colab' in sys.modules:
    from google.colab import drive
    drive.mount("/content/drive")
    # in case we'll do a shared drive, this will have to be changed.
    # For now it is basically a placeholder
    BASE_PATH = "drive/MyDrive/HLT/ProjectAthena/"
    sys.path.insert(0,BASE_PATH)

    !pip install wordcloud
    !pip install -U scikit-learn
    !pip install -U nltk
    !pip install -U seaborn
else:
    BASE_PATH = ".."

In [2]:
# More imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

import nltk

from typing import List, Dict, Optional, Tuple

In [3]:
df = pd.read_csv(os.path.join(BASE_PATH,'philosophy_data.csv'))
df.head()

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
0,Plato - Complete Works,Plato,plato,"What's new, Socrates, to make you leave your ...","What's new, Socrates, to make you leave your ...",-350,1997,125,"what's new, socrates, to make you leave your ...","['what', 'new', 'socrates', 'to', 'make', 'you...","what be new , Socrates , to make -PRON- lea..."
1,Plato - Complete Works,Plato,plato,Surely you are not prosecuting anyone before t...,Surely you are not prosecuting anyone before t...,-350,1997,69,surely you are not prosecuting anyone before t...,"['surely', 'you', 'are', 'not', 'prosecuting',...",surely -PRON- be not prosecute anyone before ...
2,Plato - Complete Works,Plato,plato,The Athenians do not call this a prosecution b...,The Athenians do not call this a prosecution b...,-350,1997,74,the athenians do not call this a prosecution b...,"['the', 'athenians', 'do', 'not', 'call', 'thi...",the Athenians do not call this a prosecution ...
3,Plato - Complete Works,Plato,plato,What is this you say?,What is this you say?,-350,1997,21,what is this you say?,"['what', 'is', 'this', 'you', 'say']",what be this -PRON- say ?
4,Plato - Complete Works,Plato,plato,"Someone must have indicted you, for you are no...","Someone must have indicted you, for you are no...",-350,1997,101,"someone must have indicted you, for you are no...","['someone', 'must', 'have', 'indicted', 'you',...","someone must have indict -PRON- , for -PRON- ..."


## The Concatenation
Let's do it

Our objective is easy: the dataset has too many, too short sentences. We need "weld" sentences together to make them longer. Our dataset will be shorter, and composed of more significant sentences. Our models will be harder, better faster, stronger.

Of course this is easier said than done. I'll be doing it in a very rough way. That is: concatenate! Even in doing so, extra care has to be taken.

### Recap from the previous episodes

In [4]:
# average sentence length in tokens:
print(f'The overall average sentence length (in tokens) is: {df['sentence_length'].mean()}')
print(f'The longest sentence has {df['sentence_length'].max()} tokens')
print('Whereas, per school the average length (in tokens) is:')
df.groupby('school')['sentence_length'].mean()


The overall average sentence length (in tokens) is: 150.79096361499745
The longest sentence has 2649 tokens
Whereas, per school the average length (in tokens) is:


school
analytic           119.025205
aristotle          153.224953
capitalism         187.576289
communism          152.752311
continental        171.792060
empiricism         183.638051
feminism           153.083928
german_idealism    180.251329
nietzsche          116.599867
phenomenology      145.913345
plato              114.938018
rationalism        163.958996
stoicism           137.056410
Name: sentence_length, dtype: float64

### Now let's do business

For now, just as a beginning, we set a target length in terms of tokens

In [13]:
# Arbitrarily set
MAX_TOKENS = 1074

In [14]:
def check_conditions(df:pd.DataFrame, index:int) -> bool:
    check = True
    if index > 0:
        prev = index-1
        check = check and (df['title'].loc[prev] == df['title'].loc[index])
        # I skip checks on authors and schools.
        # If the book changes then the author changes, 
        # and therefore the school changes as well (this has been checked)
        
        # Let's skip the controls on info such as original publication date 
        # (who needs those anyway)
        
        # 
        check = check and (df['sentence_length'].loc[prev] + df['sentence_length'].loc[index] < MAX_TOKENS + 2)

    return check

In [11]:
# let's declare a new dataframe
df_new = pd.DataFrame(columns=df.columns)

In [15]:
for i in range(100):#df.shape[0]:
    sentence_spacy = ''
    sentence_length = 0
    sentence_lowered = ''
    tokenized_txt = ''      # the fact that pandas turns it into a string is ridiculous
    lemmatized_str = ''

    # hopefully I don't mess up with the indexing...
    while check_conditions(df,i):
        sentence_spacy += df['sentence_spacy'].loc[i] + ' '
        sentence_length += 1 + df['sentence_length'].loc[i]
        sentence_lowered += df['sentence_lowered'].loc[i] + ' '
        tokenized_txt += df['tokenized_txt'].loc[i][1:-1] + ', '
        lemmatized_str += df['lemmatized_str'].loc[i]
        i += 1


    row = {
        'title': df['title'].loc[i],
        'author': df['author'].loc[i],
        'school': df['school'].loc[i],
        'sentence_spacy': sentence_spacy[:-1],
        'sentence_str': sentence_spacy[:-1],
        'original_publication_date': df['original_publication_date'].loc[i],
        'corpus_edition_date': df['corpus_edition_date'].loc[i],
        'sentence_lenght': sentence_length,
        'sentence_lowered': sentence_lowered[:-1],
        'tokenized_txt': '[' + tokenized_txt[:-2] + ']',
        'lemmatized_str': lemmatized_str
    }

    df_new = df_new.append(row, ignore_index=True)


    

AttributeError: 'DataFrame' object has no attribute 'append'

In [10]:
print(type(df_new))

<class 'pandas.core.frame.DataFrame'>
