In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the Multi-News corpus; the returned object is indexed further down by
# the split names 'train', 'validation' and 'test'.
multinews = load_dataset(path="multi_news")

# Bare last expression so the notebook renders the object's repr.
multinews

In [None]:
import pandas as pd

# Convert each split of the DatasetDict to a pandas DataFrame.
# Dataset.to_pandas() converts the split in a single call, whereas
# pd.DataFrame(dataset) has to iterate over the split row by row in Python.
train_df = multinews['train'].to_pandas()
validation_df = multinews['validation'].to_pandas()
test_df = multinews['test'].to_pandas()

# Stack the three splits into one frame with a fresh 0..n-1 index; the
# cells below analyse the corpus as a whole rather than split by split.
multinews_df = pd.concat([train_df, validation_df, test_df], ignore_index=True)

DATA UNDERSTANDING

In [None]:
# Overview of the dataset structure.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
multinews_df.info()
print(multinews_df.head())

# Dataset size (rows across all three concatenated splits)
num_documents = len(multinews_df)
print(f"Number of documents: {num_documents}")

DATA QUALITY

In [None]:
# Per-column count of null entries
missing_values = multinews_df.isna().sum()
print("Missing values:\n", missing_values)

# Keep only the rows in which both the document and its summary are non-null,
# then report how many such complete pairs there are.
# NOTE(review): empty strings are not null, so a blank document or summary
# would still pass this check — confirm whether any exist.
consistent_pairs = multinews_df[multinews_df[['document', 'summary']].notna().all(axis=1)]
print(f"Consistent document-summary pairs: {len(consistent_pairs)}")

In [None]:
# Flag rows whose (document, summary) pair repeats a row seen before
duplicates = multinews_df.duplicated(subset=['document', 'summary'])
print(f"Number of duplicate pairs: {duplicates.sum()}")

# Remove duplicates (the first occurrence of each pair is kept).
# .copy() makes `df` an independent frame, so adding the length columns to it
# in a later cell does not trigger pandas' SettingWithCopyWarning.
df = multinews_df.drop_duplicates(subset=['document', 'summary']).copy()

LENGTH ANALYSIS

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenizer data: older NLTK releases load the 'punkt' resource, while recent
# releases look for 'punkt_tab' instead. Fetch both so sent_tokenize and
# word_tokenize work on a fresh environment regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Document and summary length analysis.
# Each column stores, per row, the number of sentences / word tokens that
# NLTK finds in the corresponding text.
df['doc_length_sentences'] = df['document'].apply(lambda x: len(sent_tokenize(x)))
df['doc_length_words'] = df['document'].apply(lambda x: len(word_tokenize(x)))

df['summary_length_sentences'] = df['summary'].apply(lambda x: len(sent_tokenize(x)))
df['summary_length_words'] = df['summary'].apply(lambda x: len(word_tokenize(x)))

In [None]:
# Word-count extremes and medians for documents and summaries.
# The length columns are added to the de-duplicated frame `df`, never to
# `multinews_df`, so they must be read from `df`; looking them up on
# `multinews_df` raises KeyError on a fresh Restart & Run All.
max_doc_length = df['doc_length_words'].max()
print("Maximum document length:", max_doc_length)

min_doc_length = df['doc_length_words'].min()
print("Minimum document length:", min_doc_length)

max_summary_length = df['summary_length_words'].max()
print("Maximum summary length:", max_summary_length)

min_summary_length = df['summary_length_words'].min()
print("Minimum summary length:", min_summary_length)

# Calculate median lengths
median_doc_length = df['doc_length_words'].median()
median_summary_length = df['summary_length_words'].median()

print(f"Median document length (in words): {median_doc_length}")
print(f"Median summary length (in words): {median_summary_length}")

In [None]:
# Distribution of lengths
import matplotlib.pyplot as plt

# The word-count columns exist only on the de-duplicated frame `df`
# (they are never added to `multinews_df`), so both histograms read from `df`.

# Plot the histogram for Document Length
plt.figure(figsize=(10, 6))
plt.hist(df['doc_length_words'], bins=2000, alpha=0.7, color=(.90, .4, .4), ec='black')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.xlim([0, 18000])  # Clip the x-axis; values beyond this are not shown
plt.ylim([0, 7000])  # Same y-range as the summary plot below
plt.title('Distribution of Document Lengths')
plt.show()

# Plot the histogram for Summary Length
plt.figure(figsize=(10, 6))
plt.hist(df['summary_length_words'], bins=50, alpha=0.7, color='lightblue', ec='black')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.xlim([0, 1200])  # Clip the x-axis; values beyond this are not shown
plt.ylim([0, 7000])  # Same y-range as the document plot above
plt.title('Distribution of Summary Lengths')
plt.show()

In [None]:
# ANALYSIS OF THE NUMBER OF SOURCE TEXTS PER DOCUMENT

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def count_sources(document: str) -> int:
    """Return how many '|||||'-separated segments `document` consists of.

    A text without any separator counts as a single segment, so the result
    is always at least 1 (including for the empty string).
    """
    segments = document.split('|||||')
    return len(segments)

# Store the per-row source count as a new column.
# NOTE(review): this works on `multinews_df` (duplicates included), whereas the
# length columns are computed on the de-duplicated `df` — confirm this is intended.
multinews_df['num_sources'] = multinews_df['document'].map(count_sources)

# Tally how many rows have each source count, ordered by the count itself
# (value_counts alone would order by frequency).
source_counts = (
    multinews_df['num_sources']
    .value_counts()
    .sort_index()
)


In [None]:
# Summary statistics of the per-row source count: compute all three first,
# then report them in min / max / median order.
min_sources = multinews_df['num_sources'].min()
max_sources = multinews_df['num_sources'].max()
median_sources = multinews_df['num_sources'].median()

print("Minimum number of sources:", min_sources)
print("Maximum number of sources:", max_sources)
print("Median number of sources:", median_sources)

In [None]:
# Bar chart: one bar per distinct source count, bar height = number of rows
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(source_counts.index, source_counts.values, color=(.90, .4, .4), ec='black')
ax.set_xlabel('Number of Sources')
ax.set_ylabel('Number of Documents')
ax.set_title('Number of Sources in Documents')
# Put a tick on every source count that actually occurs
ax.set_xticks(source_counts.index)
plt.show()