In [None]:
import functools
import json
import re
import string
from collections import Counter

import nltk
import pandas as pd
import spacy
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


file_path = 'path/to/your/en_df_senza_libro_extra.json'

# Load the newline-delimited JSON file (one record per line) into a pandas DataFrame
df = pd.read_json(file_path, lines=True)

# Preview the first rows rather than dumping the entire frame into the output
df.head()

In [None]:


def custom_tokenizer(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    return word_tokenize(text)

@functools.lru_cache(maxsize=1)
def _load_nlp_resources():
    """Load the spaCy pipelines and the combined stop-word set once per kernel.

    Loading a spaCy model from disk is slow, so the result is memoised and
    reused by every ``preprocess_text`` call instead of being rebuilt each time.

    Returns:
        tuple: ``(nlp, stop_words)`` where ``nlp`` is the ``en_core_web_lg``
        pipeline and ``stop_words`` is the union of NLTK's English stop words
        with spaCy's Italian and Spanish default stop words.
    """
    nlp = spacy.load('en_core_web_lg')
    nlp_it = spacy.load('it_core_news_sm')
    nlp_es = spacy.load('es_core_news_sm')
    stop_words = (
        frozenset(stopwords.words('english'))
        | nlp_it.Defaults.stop_words
        | nlp_es.Defaults.stop_words
    )
    return nlp, stop_words


def preprocess_text(text):
    """Clean a document and return its lemma bigrams that contain a named entity.

    The text is stripped of footnote citations, HTML tags, URLs, e-mail
    addresses, phone numbers, tokens containing digits or attached punctuation,
    Roman numerals and words with a character repeated three or more times.
    The remainder is tokenized, lower-cased, reduced to ASCII alphabetic
    tokens, filtered against English/Italian/Spanish stop words, lemmatized
    with spaCy and paired into bigrams.

    Args:
        text: The raw document text.

    Returns:
        list: ``((lemma_1, lemma_2), entity_label)`` pairs, one for every
        combination of a bigram and a recognised entity whose lower-cased text
        equals one of the bigram's lemmas. Empty or non-string input (for
        example a NaN cell) yields an empty list.
    """
    if not isinstance(text, str) or not text:
        return []

    nlp, stop_words = _load_nlp_resources()

    # Remove footnote citations
    footnote_pattern = r'\d+\s+[A-Z]\.\s+[A-Z][a-z]+,?\s+“[^”]+”,\s+cit\.?,\s+(p\.|pp\.)\s+\d+(-\d+)?\.?'
    text = re.sub(footnote_pattern, '', text)

    # Regular expression pattern for websites
    website_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Regular expression pattern for emails
    email_pattern = r'\S+@\S+'

    # Regular expression pattern for phone numbers
    phone_number_pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'

    # Remove HTML tags, websites, emails and phone numbers FIRST: the digit and
    # punctuation clean-up below would otherwise break them apart so these
    # patterns no longer match, letting fragments (e.g. URL path segments)
    # leak into the tokens.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(website_pattern, '', text)
    text = re.sub(email_pattern, '', text)
    text = re.sub(phone_number_pattern, '', text)

    # Remove words connected to numbers
    text = re.sub(r'\b\w*\d\w*\b', '', text)

    # Words connected to numbers with specific punctuation.
    # No digits survive the previous substitution, so this is effectively a
    # no-op; it is kept so the pipeline still works if steps are reordered.
    text = re.sub(r'\b\w+[-/]\d+|\d+[-/]\w+\b', '', text)

    # Remove words connected to punctuation.
    # NOTE(review): this also deletes any word directly followed by , . ! ? ; :
    # (e.g. the last word of every sentence) - confirm that this is intended.
    text = re.sub(r'\w*[\d,.!?;:]+\w*', '', text)

    # Remove standalone numeric values (also a no-op once digits are gone)
    text = re.sub(r'\b\d+\b', '', text)

    # Remove all occurrences of 'º'
    text = re.sub(r'º+', '', text)

    # Remove Roman numerals
    text = re.sub(r'\b(?:i{1,3}|iv|v|vi{1,3}|ix|x|xi{1,3}|xl|l|lx|xc|c|cc|ccc|cd|d|dc|dcc|dccc|cm|m|mm|mmm)\b', '', text, flags=re.IGNORECASE)

    # Remove words with three same characters in sequence
    text = re.sub(r'\b\w*(\w)\1{2,}\w*\b', '', text)

    # Remove ASCII punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenize with NLTK and keep lower-cased, purely alphabetic tokens
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]

    # Remove words with accents by filtering out tokens that are not ASCII
    tokens = [token for token in tokens if all(ord(char) < 128 for char in token)]

    # Remove stopwords (tokens are already lower-cased at this point)
    tokens = [word for word in tokens if word not in stop_words]

    # Remove specific words and single-character tokens
    exclusion_list = ['—', ',', '–', '’,', '``', 'et', 'eg', 'al']
    tokens = [word for word in tokens if word not in exclusion_list and len(word) > 1]

    # Lemmatize the tokens and build bigrams from the non-stop-word lemmas
    doc = nlp(' '.join(tokens))
    tokens_final = [token.lemma_ for token in doc if not token.is_stop]
    bigrams = list(ngrams(tokens_final, 2))

    # Pair every bigram with each entity whose text equals one of its lemmas.
    # NOTE(review): bigram members are per-token lemmas, so an entity whose
    # text spans several words presumably never matches here - confirm.
    named_entities = [(entity.text.lower(), entity.label_) for entity in doc.ents]
    named_entity_bigrams = [
        (bigram, label)
        for bigram in bigrams
        for word, label in named_entities
        if word in bigram
    ]

    return named_entity_bigrams  # Or return named_entities to get a separate list of entities


In [None]:
# Preprocess a single document separately to examine the output.

# `df` is the frame loaded from the JSON file above. Row 24 is the 25th
# document; column 1 is assumed to hold its text - TODO confirm the column.
text_to_preprocess = df.iloc[24, 1]
twenty_fifth_row = preprocess_text(text_to_preprocess)


print("\nPreprocessed Text:")
print(twenty_fifth_row)

In [None]:
# Examine the 100 most frequent terms with their entity labels and check for duplicates

nlp = spacy.load("en_core_web_lg")

tks = twenty_fifth_row

# Count how often each term occurs
word_counter = Counter(tks)

# Get the 100 most common terms
most_common_words = word_counter.most_common(100)


def _describe_term(term):
    """Return ``(text, label)`` for a counted term.

    Plain strings get their entity label from spaCy (``'None'`` when no entity
    is found). ``(words, label)`` pairs, such as the output of
    ``preprocess_text``, already carry a label; spaCy only accepts text input,
    so their words are joined into one string instead of being re-parsed.
    """
    if isinstance(term, str):
        entities = nlp(term).ents
        return term, (entities[0].label_ if entities else 'None')
    words, label = term
    text = words if isinstance(words, str) else " ".join(words)
    return text, label


described_terms = [(*_describe_term(term), count) for term, count in most_common_words]

# Check for duplicates in the top 100 most frequent words. Counter keys are
# unique, so a duplicate here means the same text was counted under more than
# one entity label.
most_common_terms = [text for text, _label, _count in described_terms]
most_common_duplicates = [term for term, count in Counter(most_common_terms).items() if count > 1]
if most_common_duplicates:
    print("Duplicate terms in the top 100 most frequent words:", most_common_duplicates)
else:
    print("There are no duplicate terms in the top 100 most frequent words.")


print("Top 100 most frequent words:")
for word, named_entity, count in described_terms:
    print(f"{word}: {count} , {named_entity}")

In [None]:
import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer


def summarize_text(text, sentences_count=3):
    """Summarize ``text`` with sumy's TextRank summarizer.

    Args:
        text: The document to summarize.
        sentences_count: Number of sentences requested from the summarizer
            (defaults to 3).

    Returns:
        str: The selected sentences joined by single spaces. Non-string input
        (for example ``None`` or a NaN cell when applied over a DataFrame
        column) and blank strings yield an empty string.
    """
    # Missing values reach this function as None/NaN when it is applied over a
    # column; the parser only accepts strings, so short-circuit here.
    if not isinstance(text, str) or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count)

    return " ".join(str(sentence) for sentence in summary)

# Summarize each document's partial text and store the result in a new column
df["summarized_text"] = df["original_text_partial"].map(summarize_text)


In [None]:
# Export the frame as newline-delimited JSON (one record per line)

file_path = '/mnt/data/en_df_senza_libro_extra.json'

# Write the records to disk
df.to_json(path_or_buf=file_path, lines=True, orient='records')