In [2]:
import os
import glob
import re
from unidecode import unidecode
import csv
import pandas as pd
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, normalize

# Mount Google Drive so the project files under /content/drive can be read and
# written by the cells below. google.colab is the Colab runtime package, so
# this cell is expected to run inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Pipeline switches read by later cells:
#   poems_clean - True loads an existing poems.csv, False rebuilds it.
#   poems_split - gates the call to splitting_poems.
poems_clean, poems_split = False, False

In [4]:
# Project root on the mounted Drive; every other path hangs off it.
incerto_dir = '/content/drive/MyDrive/incerto-autore/'

# Output locations: processed CSVs and generated figures.
figures_dir = os.path.join(incerto_dir, 'output', 'figures')
new_poems_dir = os.path.join(incerto_dir, 'data', 'poems')

# Input location: the original .txt poems and the list of files found there.
og_poems_dir = os.path.join(incerto_dir, 'data', 'original-poems')
poems_files = glob.glob(os.path.join(og_poems_dir, '*.txt'))

## Cleaning poems


In [None]:
# Sanity check: how many original .txt poem files the glob picked up.
len(poems_files)

189

In [None]:
def cleaning_poems(_new_poems_dir, _poems_files):
  """Write one cleaned row per poem file to ``poems.csv``.

  Each file is read as UTF-8 (a leading BOM is tolerated). Every line has its
  punctuation replaced by spaces and is passed through ``unidecode``; the
  standalone word ``et`` is rewritten as ``e``. The remaining words of the
  whole file are joined with single spaces.

  Args:
    _new_poems_dir: directory in which ``poems.csv`` is created (overwritten
      if it already exists).
    _poems_files: iterable of paths to the poem ``.txt`` files. They are
      processed in sorted order. The author is taken to be the part of the
      file name before the first underscore.

  Returns:
    None. The result is the CSV file with columns filename, author, poem.
  """
  out_path = os.path.join(_new_poems_dir, 'poems.csv')

  # Explicit encoding so the CSV does not depend on the platform default.
  with open(out_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['filename', 'author', 'poem'])

    for txtfile in sorted(_poems_files):
      # basename instead of split('/') so the path separator does not matter.
      filename = os.path.basename(txtfile)
      authorname = filename.split('_')[0]

      words = []
      with open(txtfile, 'r', encoding='utf-8-sig') as f:
        for line in f:
          line_nopunct = re.sub(r'[^\w\s]', ' ', line.strip())  # no punctuation
          # unidecode's second positional argument is `errors`, not an
          # encoding; rely on its default handling of unmappable characters.
          line_noacc = unidecode(line_nopunct)  # no accents
          # Split on any whitespace so 'et' is also caught next to a tab.
          words.extend('e' if word == 'et' else word
                       for word in line_noacc.split())

      writer.writerow([filename, authorname, ' '.join(words)])

In [None]:
# cleaning_poems writes poems.csv into new_poems_dir, so that is where it has
# to be read back from.
poems_csv_path = os.path.join(new_poems_dir, 'poems.csv')

# Rebuild the CSV unless it is flagged as already cleaned.
if not poems_clean:
  cleaning_poems(new_poems_dir, poems_files)

# Load on both paths so poems_df is always defined afterwards.
poems_df = pd.read_csv(poems_csv_path)
print(len(poems_df))

## Splitting poems into sonnets

In [10]:
def _clean_verses(verses):
  """Return the given verses as one normalised, space-separated string.

  Punctuation is replaced by spaces, the text is passed through ``unidecode``
  and the standalone word ``et`` becomes ``e``. An empty string is returned
  when no words are left.
  """
  words = []
  for verse in verses:
    verse_nopunct = re.sub(r'[^\w\s]', ' ', verse)  # no punctuation
    # unidecode's second positional argument is `errors`, not an encoding.
    verse_noacc = unidecode(verse_nopunct)  # no accents
    # split() discards tabs/newlines, so no separate newline regex is needed.
    words.extend('e' if word == 'et' else word for word in verse_noacc.split())
  return ' '.join(words)


def splitting_poems(_new_poems_dir, _og_poems_dir):
  """Write ``poems_split.csv``, cutting long poems into 14-line chunks.

  Reads ``poems_whole.csv`` from ``_new_poems_dir`` (columns used: label,
  filename, author, poem). For every row whose author is not 'Petrarca' the
  original text file is opened from ``_og_poems_dir``:

  * files with 12 to 16 lines are copied through using the row's existing
    label and poem text;
  * any other file is cut into consecutive groups of 14 raw lines; each
    non-empty group is cleaned and written with the label ``<label>_<k>``,
    k counting from 1.

  Rows by 'Petrarca' are skipped. Previously they fell through to the file
  handling with the path left over from the preceding row, which duplicated
  that poem under Petrarca's label (or raised NameError on a first row).

  Args:
    _new_poems_dir: directory holding ``poems_whole.csv``; the output file is
      created there too.
    _og_poems_dir: directory holding the original ``.txt`` files.

  Returns:
    None. The result is the CSV file with columns label, filename, author,
    poem.
  """
  sonnet_min, sonnet_max = 12, 16
  n = 14  # lines per chunk

  # Read the input first: a missing poems_whole.csv then fails before the
  # output file has been truncated.
  df = pd.read_csv(os.path.join(_new_poems_dir, 'poems_whole.csv'))

  out_path = os.path.join(_new_poems_dir, 'poems_split.csv')
  with open(out_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['label', 'filename', 'author', 'poem'])

    for _, row in df.iterrows():
      if row['author'] == 'Petrarca':
        continue

      txtfile = os.path.join(_og_poems_dir, row['filename'])
      with open(txtfile, 'r', encoding='utf-8-sig') as f:
        filelines = f.readlines()

      if sonnet_min <= len(filelines) <= sonnet_max:
        # Already sonnet-sized: keep the row as it is.
        writer.writerow([row['label'], row['filename'], row['author'], row['poem']])
        continue

      label = row['label']
      lines = [l.strip() for l in filelines]
      ix = 0
      for i in range(0, len(lines), n):
        clean_chunk = _clean_verses(lines[i:i+n])
        if clean_chunk:
          # Number only the chunks that actually contain text.
          ix += 1
          writer.writerow([f'{label}_{ix}', row['filename'], row['author'], clean_chunk])

In [11]:
# Regenerate poems_split.csv only when the poems_split switch is on.
# NOTE(review): the poems_clean cell treats True as "already done, just load",
# whereas here True triggers the work - confirm which polarity is intended.
if poems_split:
  splitting_poems(_new_poems_dir=new_poems_dir, _og_poems_dir=og_poems_dir)

DV16
GG5


## Petrarca's poems

In [None]:
# Read the whole Petrarca source file into one string (BOM tolerated).
with open(
    os.path.join(og_poems_dir, 'petrarca', 'Petrarca_Rvf_for_VF_analysis_5Oct2022_utf8.txt'),
    'r',
    encoding='utf_8_sig',
) as f:
  lines = f.read()

# Blocks are separated by an empty line; keep only those carrying a '**'
# marker (which also rules out blocks that are just a newline).
poems = [block for block in lines.split('\n\n') if '**' in block]

In [None]:
# Write one cleaned row per Petrarca poem to petrarca.csv.
# Explicit encoding so the CSV does not depend on the platform default.
with open(os.path.join(new_poems_dir, 'petrarca.csv'), 'w', newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(['label', 'filename', 'author', 'poem'])
  filename = 'Petrarca_Rvf_for_VF_analysis_5Oct2022_utf8.txt'
  authorname = 'Petrarca'

  for poem in poems:
    new_poem = []
    for line in poem.split('\n'):

      if '**' in line:
        # The '**' line carries the poem number: label is 'P' + first token.
        # replace() removes the marker itself; str.strip('**') would instead
        # treat its argument as a set of characters. This is also the form the
        # petrarca_split.csv cell uses, so both files get the same labels.
        label = 'P' + line.replace('**', '').split(' ')[0]
        continue

      line_nopunct = re.sub(r'[^\w\s]', ' ', line)  # no punctuation
      # unidecode's second positional argument is `errors`, not an encoding;
      # rely on its default handling of unmappable characters.
      line_noacc = unidecode(line_nopunct)  # no accents
      # Split on any whitespace so 'et' is also caught next to a tab.
      new_poem.extend('e' if word == 'et' else word for word in line_noacc.split())

    clean_poem = ' '.join(new_poem)  # single spaces between words

    writer.writerow([label, filename, authorname, clean_poem])

In [None]:
def _clean_petrarca_verses(verses):
  """Return the given verses as one normalised, space-separated string.

  Punctuation is replaced by spaces, the text is passed through ``unidecode``
  and the standalone word ``et`` becomes ``e``. An empty string is returned
  when no words are left.
  """
  words = []
  for verse in verses:
    verse_nopunct = re.sub(r'[^\w\s]', ' ', verse)  # no punctuation
    # unidecode's second positional argument is `errors`, not an encoding.
    verse_noacc = unidecode(verse_nopunct)  # no accents
    # split() discards tabs/newlines, so no separate newline regex is needed
    # (the earlier pattern was wrapped in literal '/' characters).
    words.extend('e' if word == 'et' else word for word in verse_noacc.split())
  return ' '.join(words)


# Write petrarca_split.csv: sonnet-sized poems as one row, longer or shorter
# ones cut into 14-line chunks labelled '<label>_<k>'.
with open(os.path.join(new_poems_dir, 'petrarca_split.csv'), 'w', newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(['label', 'filename', 'author', 'poem'])
  filename = 'Petrarca_Rvf_for_VF_analysis_5Oct2022_utf8.txt'
  authorname = 'Petrarca'
  n = 14  # lines per chunk

  for poem in poems:

    poemlines = poem.split('\n')
    # The size test and the chunking both ignore the first two lines of the
    # block. NOTE(review): presumably a leading empty line plus the '**'
    # header - confirm against the source file.
    body = poemlines[2:]

    if 12 <= len(body) <= 16:

      # Label from the '**' line; text from every other line of the block.
      for line in poemlines:
        if '**' in line:
          label = 'P' + line.replace('**', '').split(' ')[0]

      clean_poem = _clean_petrarca_verses(l for l in poemlines if '**' not in l)

      writer.writerow([label, filename, authorname, clean_poem])

    else:

      # Only the first five lines are searched for the '**' header here.
      for line in poemlines[:5]:
        if '**' in line:
          label = 'P' + line.replace('**', '').split(' ')[0]

      ix = 0
      for i in range(0, len(body), n):
        clean_chunk = _clean_petrarca_verses(body[i:i+n])

        if clean_chunk:
          # Number only the chunks that actually contain text.
          ix += 1
          code = f'{label}_{ix}'
          writer.writerow([code, filename, authorname, clean_chunk])