# Packages and definitions

In [1]:
import requests as req
import pandas as pd
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from termcolor import colored
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
try :
  import transformers
except :
  !pip install transformers
  import transformers
from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.utils.class_weight import compute_class_weight
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from itertools import compress
import seaborn as sns
from tqdm import *
import time

# torch.cuda.is_available() returns a boolean to check if the GPU can be used or not
if torch.cuda.is_available():
  # if CUDA is available set 'cuda' as the device
  device = 'cuda'
  # and then print the name of the GPU
  print('DEVICE = ', colored(torch.cuda.get_device_name(0), "green" ) )
else:
  # else, set 'cpu' as device
  device = 'cpu'
  # just print than the CPU is used. Alternatively you can check your CPU with the following command (linux based) in the next cell:
  # ! lscpu
  print('DEVICE = ', colored('CPU', "blue"))

git_url = "https://raw.githubusercontent.com/chlolv/NLP_Project/main/Data/"
H1_url = "H1.txt"
H2_url = "H2.txt"
H3_url = "H3.txt"
H4_url = "H4.txt"
H5_url = "H5.txt"
H6_url = "H6.txt"
H7_url = "H7.txt"

H1 = req.get(git_url + H1_url)
H1 = H1.text
H2 = req.get(git_url + H2_url)
H2 = H2.text
H3 = req.get(git_url + H3_url)
H3 = H3.text
H4 = req.get(git_url + H4_url)
H4 = H4.text
H5 = req.get(git_url + H5_url)
H5 = H5.text
H6 = req.get(git_url + H6_url)
H6 = H6.text
H7 = req.get(git_url + H7_url)
H7 = H7.text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chloe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chloe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


DEVICE =  [34mCPU[0m


# Processing HP1

In [2]:
H1_processed = H1
H1_processed = H1_processed.split('\r\n\r\n')
H1_processed = [sentence.strip() for sentence in H1_processed]

remove_list = []
for i in range(0, len(H1_processed)) :
  paragraph = H1_processed[i]
  if "CHAPTER" in paragraph :
    remove_list.append(i)
    remove_list.append(i+1)
for index in sorted(remove_list, reverse = True) :
  del H1_processed[index]
H1_processed = [paragraph for paragraph in H1_processed if paragraph not in ["Harry Potter and the Sorcerer's Stone", 'THE END']]

# Processing HP2

In [3]:
H2_processed = H2
H2_processed = re.sub('\r\n[0-9]\r\n|\r\n[0-9][0-9]\r\n|\r\n[0-9][0-9][0-9]\r\n|\r\n[0-9][0-9][0-9][0-9]\r\n|\r\n[0-9][0-9][0-9][0-9][0-9]\r\n', ' ', H2_processed)
H2_processed = re.sub('\r\n|\r\n.\r\n|\r\n..\r\n|\r\n...\r\n|\r\n....\r\n|\r\n.....\r\n', ' ', H2_processed)
H2_processed = re.sub('\*.\*|\*..\*|\*...\*|\*....\*|\*.....\*|\*......\*|\*.......\*|\*........\*|\*.........\*|\*..........\*', '', H2_processed) 
H2_processed = H2_processed[202:]

remove_list = []
H2_processed = sent_tokenize(H2_processed)
for i in reversed(range(1,len(H2_processed))) :
  paragraph = H2_processed[i]
  if paragraph[0].islower() or paragraph[0] == '.' :
    H2_processed[i-1] += ' ' + paragraph
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H2_processed[index]

# Processing HP3

In [4]:
H3_processed = H3
H3_processed = re.sub('\\xad', '', H3_processed)
H3_processed = H3_processed.split('\r\n\r\n')
H3_processed = [paragraph.strip() for paragraph in H3_processed]

remove_list = []
for i in range(1, len(H3_processed)) :
  paragraph = H3_processed[i]
  if "CHAPTER" in paragraph :
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H3_processed[index]

remove_list = []
for i in reversed(range(1,len(H3_processed))) :
  paragraph = H3_processed[i]
  try :
    if paragraph[0].islower() or paragraph[0] == '.' :
      H3_processed[i-1] += ' ' + paragraph
      remove_list.append(i)
  except :
    H3_processed[i-1] += ' ' + paragraph
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H3_processed[index]

# Processing HP4

In [5]:
H4_processed = H4
H4_processed = re.sub('�', '-', H4_processed)
H4_processed = H4_processed.split('\n\n')
H4_processed = [paragraph.strip() for paragraph in H4_processed]

remove_list = []
for i in range(1, len(H4_processed)) :
  paragraph = H4_processed[i]
  if "CHAPTER" in paragraph :
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H4_processed[index]

H4_processed = H4_processed[2:]

# Processing HP5

In [6]:
H5_processed = H5
H5_processed = re.sub('�', "\'", H5_processed)
H5_processed = H5_processed.split('\n')
H5_processed = [paragraph.strip() for paragraph in H5_processed]

remove_list = []
for i in range(1, len(H5_processed)) :
  paragraph = H5_processed[i]
  if "CHAPTER" in paragraph :
    remove_list.append(i)
    remove_list.append(i+1)
for index in sorted(remove_list, reverse = True) :
  del H5_processed[index]

remove_list = []
for i in reversed(range(1,len(H5_processed))) :
  paragraph = H5_processed[i]
  try :
    if paragraph[0].islower() or paragraph[0] == '.' :
      H5_processed[i-1] += ' ' + paragraph
      remove_list.append(i)
  except :
    H5_processed[i-1] += ' ' + paragraph
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H5_processed[index]

H5_processed = H5_processed[2:]

# Processing HP6

In [7]:
H6_processed = H6
H6_processed = H6_processed.split('\n')
H6_processed = [paragraph.strip() for paragraph in H6_processed]

remove_list = []
for i in range(1, len(H6_processed)) :
  paragraph = H6_processed[i]
  if re.match('Chapter [0-9]', paragraph) :
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H6_processed[index]

H6_processed = H6_processed[32:]

# Processing HP7

In [8]:
H7_processed = H7
H7_processed = re.sub('�', "\'", H7_processed)
H7_processed = H7_processed.split('\n')
H7_processed = [paragraph.strip() for paragraph in H7_processed]

remove_list = []
for i in range(1, len(H7_processed)) :
  paragraph = H7_processed[i]
  if paragraph[:7] == 'Chapter' :
    remove_list.append(i)
for index in sorted(remove_list, reverse = True) :
  del H7_processed[index]

H7_processed = H7_processed[1:]

# Merging all HPs

In [9]:
taille_min_para = 30
book_label_list = []
H = []
for book in range(1,8) :
  current_H = globals()['H' + str(book) + '_processed']
  remove_list = []
  for i in reversed(range(0, len(current_H))) :
    paragraph = current_H[i]
    if len(paragraph.split()) < taille_min_para : # Split is on spaces (word count)
      remove_list.append(i)
      current_H[i-1] += ' '
      current_H[i-1] += current_H[i]
  for index in sorted(remove_list, reverse = True) :
    del current_H[index]
  for paragraph in current_H :
      book_label_list.append(book)
  H += current_H

In [14]:
df_H = pd.DataFrame({'HP': H})
df_book = pd.DataFrame({ 'book' : book_label_list})

In [15]:
from google.colab import files

df_H.to_csv('H_series.csv', index = False)
files.download('H_series.csv')

df_book.to_csv('book_labels.csv', index = False)
files.download('book_labels.csv')

ModuleNotFoundError: No module named 'google'