# **Information Processing and Retrieval**

**Project developed by:**
- Diogo Fonte - up202004175
- Rodrigo Figueiredo - up202005216
- Sofia Rodrigo  - up202301429
- Vítor Cavaleiro - up202004724

## **Environment Setup**

In [None]:
import pandas as pd
import numpy as np
import os
import json

---

# Data Preparation

## Data Ingestion

### All The News - Collection of Articles from 18 publishers

In [None]:
# The original file is a .db file, which was exported as a JSON file using SQLiteStudio.

# Load the exported table. The context manager guarantees that the file
# handle is closed once the JSON document has been parsed.
with open("../data/all-the-news/all-the-news-conv.json", encoding="utf8") as f:
    data = json.load(f)
table = data["objects"][0]

# get rows and columns
columns = table["columns"]
rows = table["rows"]

# get column names
column_names = [column["name"] for column in columns]

# Build a column-oriented dictionary: every column name maps to the list of
# values found at that position in each row.
result = {
    column_name: [row[i] for row in rows]
    for i, column_name in enumerate(column_names)
}

# NOTE: the DataFrame index is written too; it comes back as the
# 'Unnamed: 0' column that is dropped after the file is read again.
pd.DataFrame.from_dict(result).to_csv('all_the_news.csv', encoding='utf-8')

In [None]:
all_the_news = pd.read_csv('all_the_news.csv', encoding='utf-8')
all_the_news.isna().sum()

In [None]:
all_the_news.head()

In [None]:
# Discard the columns that are not needed and align the remaining column
# names with the schema shared with the BBC dataset.
irrelevant_columns = ['Unnamed: 0', 'id', 'year', 'month', 'digital']

# rename() looks every ORIGINAL label up in the mapping exactly once, so the
# old "category" becomes "source" while "section" takes over "category".
column_renames = {
    "publication": "publisher",
    "category": "source",
    "section": "category",
}

all_the_news = all_the_news.drop(columns=irrelevant_columns).rename(columns=column_renames)
all_the_news.head()

In [None]:
rows_count = all_the_news.shape[0]
print("Number of rows: ", rows_count)

### BBC News

In [None]:
main_folder = "../data/bbc_news_collection/"

# One plain dict per article. The DataFrame is built once at the end instead
# of creating (and later concatenating) a one-row DataFrame for every file.
news = []

# Iterate through subfolders of the 5 categories (business, entertainment, politics, sport, tech)
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    for filename in os.listdir(subfolder_path):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(subfolder_path, filename), "r", encoding="utf-8") as file:
            lines = file.readlines()

        if not lines:
            # An empty file has no title line; skip it instead of failing
            # with an IndexError on lines[0].
            continue

        news.append({
            "title": lines[0].strip(),  # The first line is the title
            "author": np.nan,  # No author information
            "date": "2005-12-31",  # Same fixed date for every BBC article
            "content": "".join(lines[1:]).replace("\n", " ").strip(),  # The rest is the content
            "publisher": "BBC",
            "source": "website",
            "category": subfolder,  # The subfolder name is the category
            "url": np.nan,
        })

# Passing the column list keeps the column order stable and yields an empty
# (but well-formed) frame when no article was found.
bbc_news = pd.DataFrame(
    news,
    columns=["title", "author", "date", "content", "publisher", "source", "category", "url"],
)
bbc_news.to_csv("bbc_articles.csv", index=False)
bbc_news.head()

In [None]:
rows_count_bbc = bbc_news.shape[0]
print("Number of rows: ", rows_count_bbc)

## Merge of Datasets

In [None]:
news_dataset = pd.concat([all_the_news, bbc_news], ignore_index=True)
news_dataset.head()

In [None]:
rows_count_news_dataset = news_dataset.shape[0]
print("Number of rows: ", rows_count_news_dataset)

---

## Data Cleaning

In [None]:
text_columns = ['title', 'author', 'content', 'publisher', 'source', 'category', 'url']

# strip the text columns
news_dataset[text_columns] = news_dataset[text_columns].apply(lambda x: x.str.strip())

# Remove duplicates (first by title, then by content)
news_dataset = news_dataset.drop_duplicates(subset=['title'], keep='first')
news_dataset = news_dataset.drop_duplicates(subset=['content'], keep='first')

# Replace empty strings with NaN
news_dataset[text_columns] = news_dataset[text_columns].replace('', np.nan)

# drop rows without a title or without content
news_dataset = news_dataset.dropna(subset=['title', 'content'])

# Normalise the 'date' column to 'YYYY-MM-DD' strings.
# Both 'YYYY/MM/DD' and 'YYYY-MM-DD' inputs are accepted: the BBC articles are
# ingested with a hyphenated date, which a single '%Y/%m/%d' format would
# coerce to NaT. Values matching neither format still become NaT/NaN.
raw_dates = news_dataset['date']
parsed_dates = pd.to_datetime(raw_dates, format='%Y/%m/%d', errors='coerce')
parsed_dates = parsed_dates.fillna(
    pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
)
# Dropping and re-adding the column keeps 'date' as the last column at this
# point, and avoids calling .dt on a column that was assigned through .loc.
news_dataset = news_dataset.drop(columns=['date']).assign(
    date=parsed_dates.dt.strftime('%Y-%m-%d')
)

# Remove "\n" from author column
news_dataset['author'] = news_dataset['author'].str.replace("\n", "", regex=False)

# add the document code as "A" + the row's index label
news_dataset['code'] = "A" + news_dataset.index.astype(str)

news_dataset.head()

In [None]:
# Show the rows whose content starts with a "." character.
# NOTE(review): an earlier version of this comment referred to the prefix
# ", I want to receive updates from partners and sponsors., , " - confirm
# which prefix this check is meant to catch.
starts_with_dot = news_dataset['content'].str.startswith(".")
news_dataset[starts_with_dot]

In [None]:
rows_count_news_dataset = news_dataset.shape[0]
print("Number of rows: ", rows_count_news_dataset)

---

## Dataset Resizing

In [None]:
# Split the dataset by position: rows in the first 90% are only kept when
# their title matches one of the topics below, the last 10% are kept in full.
# NOTE: the name 'first_70_percent' does not match the 0.9 cut-off used here.
num_rows = int(len(news_dataset) * 0.9)
first_70_percent = news_dataset.iloc[:num_rows]
news_dataset = news_dataset.iloc[num_rows:]  # The remaining rows of the DataFrame

# Keep the rows whose title mentions ANY of the topics (case-insensitive);
# the alternation is equivalent to OR-ing one contains() call per topic.
topic_pattern = 'Trump|Lebron|FBI|gun'
title_matches = first_70_percent['title'].str.contains(topic_pattern, case=False)
filtered_data = first_70_percent[title_matches]

news_dataset = pd.concat([filtered_data, news_dataset], ignore_index=True)

In [None]:
rows_count_news_dataset = news_dataset.shape[0]
print("Number of rows: ", rows_count_news_dataset)

---

## Data Processing

In [None]:
# installation in the command line first: pip install clean-text

from cleantext.sklearn import CleanTransformer

cleaner = CleanTransformer(no_punct=False, lower=False)

news_dataset['content'] = cleaner.transform(news_dataset['content'])

---

## Semantic Analysis

In [None]:
# installation in the command line first: pip install sentence_transformers
from sentence_transformers import SentenceTransformer

def embed_title(model, title):
    """Encode one title with ``model`` and return its embedding as a list of floats.

    ``model.encode`` is called with a one-element list, and the first (only)
    embedding of the result is converted element by element to ``float``.
    """
    first_embedding = model.encode([title])[0]
    return list(map(float, first_embedding))

def update_dataframe_with_embeddings(df):
    """Add a 'vector' column holding the embedding of every row's title.

    The 'all-MiniLM-L6-v2' SentenceTransformer model is loaded and each value
    of ``df['title']`` is encoded with ``embed_title``. The frame is modified
    in place and also returned. If the model cannot be initialised, the error
    is printed and ``df`` is returned unchanged (without a 'vector' column).
    """
    try:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print("Error occurred initializing model:", e)
        return df

    def to_vector(title):
        return embed_title(model, title)

    df['vector'] = df['title'].apply(to_vector)
    return df

update_dataframe_with_embeddings(news_dataset)

---

## Generate JSON database

In [None]:
# select every column except keyphrases and url
news_database = news_dataset[['title', 'author', 'date', 'content', 'publisher', 'source', 'category', 'code', 'vector']]

# generate json file to data folder
news_database.to_json('./../solr/news.json', orient='records')

---

## Data Analysis

#### Keyphrases Extraction

In [None]:
# installation in the command line first: pip install rake-nltk
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Rake() without arguments uses the English stopwords from NLTK and all
# punctuation characters by default.
r = Rake()


def extract_keywords(row):
    """Return the ranked RAKE phrases of a row's title and content, joined by ';'."""
    text = ' '.join((row['title'], row['content']))
    r.extract_keywords_from_text(text)
    return ';'.join(r.get_ranked_phrases())


# Apply the function row by row to create the new 'keyphrases' column
news_dataset['keyphrases'] = news_dataset.apply(extract_keywords, axis=1)

In [None]:
news_dataset.head()

#### Named Entity Recognition and Word Clouds

In [None]:
# installation in the command line first: pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# set stopwords
stopwords = set(STOPWORDS)

import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
wordcloud = WordCloud(stopwords=stopwords, background_color="white", collocations=False).generate(' '.join(news_dataset['keyphrases']))

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

A sample of 10,000 rows is selected and the keyphrases of each sampled row are processed with spaCy to identify named entities (such as names of people or places). Two strings are then built: one containing all the identified entities and another containing their corresponding labels. Finally, a word cloud is generated for the entities and another for the labels.

In [None]:
# Select a sample of at most 10000 rows: sample() raises a ValueError when
# n is larger than the number of rows, so n is capped at the dataset size.
sample_size = min(10000, len(news_dataset))
news_content = news_dataset.sample(n=sample_size)['keyphrases']

all_entities = []
all_labels = []

# Collect all entities and labels. nlp.pipe() streams the texts through the
# pipeline in batches and yields one Doc per text, in order.
for doc in nlp.pipe(news_content):
    for ent in doc.ents:
        all_entities.append(ent.text)
        all_labels.append(ent.label_)

entity_string = ' '.join(all_entities)
labels_string = ' '.join(all_labels)


def _show_wordcloud(text, title):
    """Render ``text`` as a word cloud below a left-aligned ``title``."""
    cloud = WordCloud(stopwords=stopwords, background_color="white", collocations=False).generate(text)

    plt.title(title, fontsize=20, fontweight='bold', pad=20, loc='left', color='black', fontfamily='serif', y=1.02)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


# create wordclouds
_show_wordcloud(entity_string, 'Entities')
_show_wordcloud(labels_string, 'Labels')


### Characterization

In [None]:
news_dataset_analysis = news_dataset.copy()

#### Pie Charts

In [None]:
# Pie chart for Source distribution
threshold = 0.05  # Sources below this share of the rows are merged into 'Others'

source_counts = news_dataset_analysis['source'].value_counts()
source_share = source_counts / source_counts.sum()
small_sources = source_share[source_share < threshold].index

# Relabel the small sources; every other value (including NaN) is kept as is.
source_column = news_dataset_analysis['source']
news_dataset_analysis['source'] = source_column.where(~source_column.isin(small_sources), 'Others')
source_counts = news_dataset_analysis['source'].value_counts()

plt.figure(figsize=(6, 6))
plt.pie(source_counts, labels=source_counts.index, autopct='%1.1f%%', startangle=90)
plt.title("Source Distribution")
plt.show()

In [None]:
# Pie chart for Publisher distribution
# The threshold is defined locally: the global 'threshold' is rebound to 0.03
# by the year-distribution cell, so relying on it made this chart depend on
# the order in which the cells were executed.
publisher_threshold = 0.05  # Publishers below this share are merged into 'Others'

publisher_counts = news_dataset_analysis['publisher'].value_counts()
publisher_share = publisher_counts / publisher_counts.sum()
small_publishers = publisher_share[publisher_share < publisher_threshold].index

# Relabel the small publishers; every other value (including NaN) is kept as is.
publisher_column = news_dataset_analysis['publisher']
news_dataset_analysis['publisher'] = publisher_column.where(
    ~publisher_column.isin(small_publishers), 'Others'
)
publisher_counts = news_dataset_analysis['publisher'].value_counts()

plt.figure(figsize=(6, 6))
plt.pie(publisher_counts, labels=publisher_counts.index, autopct='%1.1f%%', startangle=90)
plt.title("Publisher Distribution")
plt.show()

In [None]:
# Pie chart for Year distribution
def extract_year(date_str):
    """Return the year of a 'YYYY-MM-DD' style string as an int.

    The text before the first '-' is converted with ``int``. ``None`` is
    returned when the value is not a string (e.g. NaN or None) or when that
    leading part is not a valid integer.
    """
    try:
        return int(date_str.split('-')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the value cannot be split like a str;
        # ValueError: the leading part is not numeric.
        # Only these are caught so that KeyboardInterrupt/SystemExit propagate.
        return None

# Derive the publication year and ignore the rows where it is unknown
news_dataset_analysis['year'] = news_dataset_analysis['date'].apply(extract_year)

df = news_dataset_analysis.dropna(subset=['year'])
year_counts = df['year'].value_counts()

threshold = 0.03  # Years below this share of the rows are merged into 'Others'
year_share = year_counts / year_counts.sum()
small_years = year_share[year_share < threshold].index

# Add the combined 'Others' entry first, then remove the individual small years
others_total = year_counts[small_years].sum()
year_counts.loc['Others'] = others_total
year_counts = year_counts.drop(small_years)

plt.figure(figsize=(6, 6))
plt.pie(year_counts, labels=year_counts.index, autopct='%1.1f%%', startangle=90)
plt.title("Year Distribution")
plt.show()

#### Histograms

In [None]:
news_dataset_analysis['article_length'] = news_dataset_analysis['content'].apply(len)

In [None]:
# 95 bin edges, 500 characters apart: 0, 500, ..., 47000
bin_width = 500
bin_edges = list(range(0, 95 * bin_width, bin_width))

# Create a histogram of article lengths
plt.figure(figsize=(8, 6))
plt.hist(news_dataset_analysis['article_length'], bins=bin_edges, edgecolor='k')
plt.title('Article Length Histogram')
plt.xlabel('Article Length (Characters)')
plt.ylabel('Number of Articles')
plt.grid(True)
plt.show()

In [None]:
# Create a box plot of article lengths
plt.figure(figsize=(8, 6))
plt.boxplot(news_dataset_analysis['article_length'], vert=False)
# With vert=False the box is drawn horizontally, so the article length runs
# along the x-axis and the label belongs there.
plt.xlabel('Article Length (Characters)')
plt.title('Article Length Box Plot')
plt.grid(True)
plt.show()

#### Heatmaps

In [None]:
import seaborn as sns

news_dataset_analysis['year'] = news_dataset_analysis['date'].apply(extract_year)
df = news_dataset_analysis.dropna(subset=['year'])

# One row per year, with the number of non-null titles in a single column
heatmap_data = df.pivot_table(index='year', values='title', aggfunc='count')
# Rows with an unknown year were dropped above, so the index can safely be
# shown as whole numbers instead of floats such as 2017.0.
heatmap_data.index = heatmap_data.index.astype(int)
heatmap_data.columns = ['Number of Articles']

# Create the heatmap to analyse the number of articles published each year
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, fmt="d", cmap="YlGnBu")
# The years are the rows of the heatmap (y-axis); the only column holds the counts.
plt.xlabel('')
plt.ylabel('Year of Publication')
plt.title('Publication Year Heatmap')
plt.show()


#### Scatter Plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Work on a copy (assign() returns a new frame) with three derived columns:
#   year           - year of publication
#   content_length - number of characters in the content
#   ratio          - content length divided by the publication year
aux_df = news_dataset_analysis.assign(
    year=lambda frame: pd.to_datetime(frame['date']).dt.year,
    content_length=lambda frame: frame['content'].apply(len),
    ratio=lambda frame: frame['content_length'] / frame['year'],
)

# Create a scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(aux_df['year'], aux_df['ratio'], s=100, alpha=0.7)
plt.title('Scatter Plot of Content Length vs. Publication Year')
plt.xlabel('Publication Year')
plt.ylabel('Content Length / Year')
plt.grid(True)
plt.show()


---

## Evaluation

### Building qrels file - 1st information need

In [None]:
# Keyword groups for the 1st information need. A row is kept only when its
# keyphrases match at least one term of EVERY group (terms are used as
# case-sensitive regular-expression alternatives).
keyword_groups = [
    [' trump ', ' donald '],
    ['in-migration', 'immigration'],
    ['exile', 'deportation', 'expatriation', 'transportation'],
    ['refugee', 'refugees'],
    ['migrant', 'migrants'],
    ['ban'],
]

result = news_dataset
for group in keyword_groups:
    result = result[result['keyphrases'].str.contains('|'.join(group))]

# print the number of rows
rows_count = result.shape[0]
print("Number of rows: ", rows_count)

# First the bare list of document codes ...
for code in result['code']:
    print(code)

# ... then every code together with its title. ('code' is used as the loop
# variable so that the builtin id() is not shadowed.)
for code, title in zip(result['code'], result['title']):
    print(code)
    print(title)
    print('-----------------')
    

### Building qrels file - 2nd information need

In [None]:
# create list of possible items to be used for evaluation
# NOTE(review): this cell never rebuilds `result`, so it prints whatever the
# last executed filter produced - add the filters for this information need.

# print the number of rows
rows_count = result.shape[0]
print("Number of rows: ", rows_count)

# First the bare list of document codes ...
for code in result['code']:
    print(code)

# ... then every code together with its title. ('code' is used as the loop
# variable so that the builtin id() is not shadowed.)
for code, title in zip(result['code'], result['title']):
    print(code)
    print(title)
    print('-----------------')

### Building qrels file - 3rd information need

In [None]:
# create list of possible items to be used for evaluation
# NOTE(review): this cell never rebuilds `result`, so it prints whatever the
# last executed filter produced - add the filters for this information need.

# print the number of rows
rows_count = result.shape[0]
print("Number of rows: ", rows_count)

# First the bare list of document codes ...
for code in result['code']:
    print(code)

# ... then every code together with its title. ('code' is used as the loop
# variable so that the builtin id() is not shadowed.)
for code, title in zip(result['code'], result['title']):
    print(code)
    print(title)
    print('-----------------')

### Building qrels file - 4th information need

In [None]:
# create list of possible items to be used for evaluation
# NOTE(review): this cell never rebuilds `result`, so it prints whatever the
# last executed filter produced - add the filters for this information need.

# print the number of rows
rows_count = result.shape[0]
print("Number of rows: ", rows_count)

# First the bare list of document codes ...
for code in result['code']:
    print(code)

# ... then every code together with its title. ('code' is used as the loop
# variable so that the builtin id() is not shadowed.)
for code, title in zip(result['code'], result['title']):
    print(code)
    print(title)
    print('-----------------')

### Evaluation Metrics

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import PrecisionRecallDisplay
import requests

Query 1: Find news articles where Trump spoke on the immigration crisis

Query 2: Find news about LeBron's good performances in lost games

Query 3: Find articles related to homicides investigated by the FBI in 2017

Query 4: Find news articles regarding the conflicts between republicans and democrats about gun ownership

In [None]:
QRELS1_FILE = "./../data_evaluation/qrels/query1_qrels.txt"
QRELS2_FILE = "./../data_evaluation/qrels/query2_qrels.txt"
QRELS3_FILE = "./../data_evaluation/qrels/query3_qrels.txt"
QRELS4_FILE = "./../data_evaluation/qrels/query4_qrels.txt"

QUERY1_URL_WITHOUT_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title+content&q=Trump+immigration+crisis&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score'
QUERY1_URL_WITH_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title^2.5+content&q=Trump+immigration+crisis&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score&bq=title:Trump^2.5 title:immigration^1.5 content:Trump^2.5 content:immigration^1.5'

QUERY2_URL_WITHOUT_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title+content&q=LeBron+good+performance+lost+game+points&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score'
QUERY2_URL_WITH_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title^1.5+content&q=LeBron+good+performance+lost+game+points&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score&bq=title:Lebron^3 title:lost^1.5 title:game^1.5 content:Lebron^2.5 content:lost^2 content:game^1.5'

QUERY3_URL_WITHOUT_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title+content&q=homicide+FBI&indent=true&q.op=AND&fq=date:[2017-01-01T00:00:00Z TO 2017-12-31T23:59:59Z]&fl=code,title,author,date,publisher,category,content,score'
QUERY3_URL_WITH_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title^2+content&q=homicide+FBI&indent=true&q.op=AND&fq=date:[2017-01-01T00:00:00Z TO 2017-12-31T23:59:59Z]&fl=code,title,author,date,publisher,category,content,score&bq=title:homicides^2.0 content:homicides^2.0'

QUERY4_URL_WITHOUT_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title+content&q=Republicans+Democrats+"gun+ownership"+conflicts&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score'
QUERY4_URL_WITH_BOOST = 'http://localhost:8983/solr/news/select?defType=edismax&qf=title^2.5+content&q=Republicans+Democrats+"gun+ownership"+conflicts&indent=true&q.op=AND&fl=code,title,author,date,publisher,category,content,score&bq=title:gun^2.5 title:conflicts^1.5 content:gun^2.5 content:conflicts^1.5'

# Read qrels to extract relevant documents
# Change to QRELS2_FILE, QRELS3_FILE or QRELS4_FILE to see the results for other queries.
# Blank lines are skipped so they are not counted as relevant documents
# (which would inflate the recall denominator), and the file is closed afterwards.
with open(QRELS1_FILE, encoding="utf-8") as qrels_file:
    relevant = [line.strip() for line in qrels_file if line.strip()]

# Get query results from Solr instance
# Without boost -> change to with boost to compare
response = requests.get(QUERY1_URL_WITHOUT_BOOST, timeout=30)
# Fail with the HTTP error itself rather than a KeyError on the JSON body
response.raise_for_status()
results = response.json()['response']['docs']

In [None]:
# METRICS TABLE
# Registry of the available metric functions, keyed by function name
metrics = {}


def metric(f):
    """Decorator: register ``f`` in ``metrics`` under its own name.

    Returns the function stored in the registry for that name.
    """
    return metrics.setdefault(f.__name__, f)


@metric
def ap(results, relevant):
    """Average Precision.

    ``results`` is the ranked list of documents (dicts with a 'code' key) and
    ``relevant`` the collection of relevant codes. The precision is taken at
    the rank of every relevant document that was retrieved and averaged over
    those documents; 0.0 is returned when none was retrieved.
    """
    relevant_codes = set(relevant)  # constant-time membership tests
    precision_values = []

    for rank, doc in enumerate(results, start=1):
        if doc['code'] in relevant_codes:
            # Number of relevant documents seen so far, divided by the rank
            precision_values.append((len(precision_values) + 1) / rank)

    if not precision_values:
        return 0.0

    return sum(precision_values) / len(precision_values)


@metric
def p10(results, relevant, n=10):
    """Precision at N: share of the first ``n`` results that are relevant.

    The count is always divided by ``n``, even when fewer results exist.
    """
    relevant_codes = set(relevant)  # constant-time membership tests
    return sum(1 for doc in results[:n] if doc['code'] in relevant_codes) / n


def calculate_metric(key, results, relevant):
    """Evaluate the metric registered under ``key`` for the given results."""
    return metrics[key](results, relevant)

# Metrics to be calculated: registry key -> label shown in the table
evaluation_metrics = {
    'ap': 'Average Precision',
    'p10': 'Precision at 10 (P@10)'
}

# Calculate all metrics and export results as LaTeX table.
# The header is stored as the first data row, which is why the table is
# written with header=False.
table_rows = [['Metric', 'Value']]
for m, label in evaluation_metrics.items():
    table_rows.append([label, calculate_metric(m, results, relevant)])
df = pd.DataFrame(table_rows)

latex_table = df.to_latex(index=False, header=False)
with open('results.tex', 'w') as tf:
    tf.write(latex_table)

In [None]:
import numpy as np
import copy
import matplotlib.pyplot as plt

# Precision and recall after each rank. A running count of the relevant
# documents replaces re-scanning results[:idx] for every rank.
relevant_codes = set(relevant)
precision = []
recall = []
hits = 0
for idx, doc in enumerate(results, start=1):
    if doc['code'] in relevant_codes:
        hits += 1
    precision.append(hits / idx)
    recall.append(hits / len(relevant))

# Keep the raw precision values for the dashed curve
precision2 = list(precision)

# interpolation: walking from the last rank to the first, every precision
# value is raised to the value that follows it whenever that one is higher
for i in range(len(recall) - 2, -1, -1):
    if precision[i + 1] > precision[i]:
        precision[i] = precision[i + 1]

# plotting...
# The colour is given only through `color=`; a colour code in the format
# string as well would be reported by matplotlib as a redundant definition.
fig, ax = plt.subplots()
for i in range(len(recall) - 1):
    ax.plot((recall[i], recall[i]), (precision[i], precision[i + 1]), '-', color='red')  # vertical
    ax.plot((recall[i], recall[i + 1]), (precision[i + 1], precision[i + 1]), '-', color='red')  # horizontal

ax.plot(recall, precision2, '--', color='blue')
ax.set_xlabel("recall")
ax.set_ylabel("precision")
plt.savefig('precision_recall.png')
# plt.show() matches the other cells and also works with non-GUI backends
plt.show()