In [None]:
#Import libraries
import ast

import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud, STOPWORDS

# Exploratory Data Analysis

In [None]:
# Relative paths of the two input CSV files (article corpus and queries)
CORPUS_FILEPATH, QUERIES_FILEPATH = (
    "dataset/corpus.csv",
    "dataset/queries.csv",
)

## Corpus dataset

In [None]:
# Load the article corpus CSV into a DataFrame
corpus = pd.read_csv(filepath_or_buffer=CORPUS_FILEPATH)

In [None]:
# Print the (rows, columns) shape, then preview the first five rows
corpus_shape = corpus.shape
print(corpus_shape)
corpus.head(n=5)

In [None]:
# Number of distinct values in the 'source' column (missing values excluded)
source_column = corpus['source']
source_column.nunique(dropna=True)

In [None]:
# Split every article body on whitespace and store the token count per article
body_tokens = corpus['body'].str.split()
corpus['num_tokens'] = body_tokens.str.len()
corpus['num_tokens'].describe()

In [None]:
# Smallest and largest values of the 'published_at' column
published_at = corpus['published_at']
(published_at.min(), published_at.max())

In [None]:
# Print the DataFrame summary (column dtypes, non-null counts, memory usage)
corpus.info()

In [None]:
# Count the non-null titles attributed to each author, then average those counts
articles_per_author = corpus.groupby('author')['title'].count()
articles_per_author.mean()

In [None]:
# Bar chart of the number of articles per category, most frequent first.
# An explicit Axes is used so the figure carries its own title and axis labels,
# and plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots(figsize=(10, 5))
corpus['category'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Number of articles per category')
ax.set_xlabel('Category')
ax.set_ylabel('Number of articles')
plt.show()

In [None]:
# Share of articles per source, keeping the ten most frequent sources
source_share = corpus['source'].value_counts(normalize=True)
source_share.head(10)

In [None]:
# Among articles with no author, the share published by each source
author_missing = corpus['author'].isna()
corpus.loc[author_missing, 'source'].value_counts(normalize=True)

**Insights**:
- The corpus dataset contains 609 articles published on 49 websites in Q4 2023
- On average, an article's body contains 1746 tokens
- The author is missing in only 68 articles (11% of cases)
- 41% of articles with a missing author were published on "Fox News"
- On average, each author publishes 1.8 articles
- "Sports" is the most frequent category
- "Sporting News" is the most frequent website

In [None]:
# Concatenate all article bodies and titles into a single text blob.
# Missing values are dropped first: str.join rejects the float NaN that pandas
# uses for empty CSV cells.
body_text = " ".join(corpus['body'].dropna().astype(str))
title_text = " ".join(corpus['title'].dropna().astype(str))
# Join with a space so the last body word and the first title word stay separate tokens
body_title_text = " ".join([body_text, title_text])

In [None]:
def generate_word_cloud(text:str)->None:
    """Build a word cloud from `text` and display it in a matplotlib figure.

    Args:
        text: Raw text from which the word cloud is generated.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    cloud_options = {
        'width': 800,
        'height': 400,
        'background_color': 'white',
        'stopwords': set(STOPWORDS),  # copy of the STOPWORDS set imported from wordcloud
        'min_font_size': 10,
    }
    rendered_cloud = WordCloud(**cloud_options).generate(text)

    # Draw the cloud as an image on a 10x5 figure with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(rendered_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Render the word cloud for the combined body and title text
generate_word_cloud(body_title_text)

The word cloud shows the most frequent keywords. The words "Game", "Team", "Season", and "Player" appear to relate to the Sports category.

## Query dataset

In [None]:
# Load the queries CSV into a DataFrame
queries = pd.read_csv(filepath_or_buffer=QUERIES_FILEPATH)

In [None]:
# Print the (rows, columns) shape, then preview the first five rows
queries_shape = queries.shape
print(queries_shape)
queries.head(n=5)

In [None]:
# Example of a query-article match: print the first query and its result,
# then look up one corpus row by a hard-coded uuid
example_uuid = '414e032731794e7ca2b7ad46eb11df41'
print(queries.loc[0, 'query'])
print(queries.loc[0, 'result'])
corpus.loc[corpus['uuid'] == example_uuid]

In [None]:
def count_results(raw_result) -> int:
    """Return the number of matched articles stored in one `result` cell.

    `result` is loaded by `pd.read_csv`, so each cell is a string; calling `len`
    on it directly would count characters rather than articles. The string is
    therefore parsed into a Python object first.

    NOTE(review): assumes each cell is a Python list literal such as
    "['uuid1', 'uuid2']" -- confirm against the example printed earlier.
    `ast.literal_eval` raises ValueError/SyntaxError if the format differs,
    which is preferable to silently reporting a wrong count.

    Args:
        raw_result: One cell of the `result` column (string, collection, or NaN).

    Returns:
        The number of items in the parsed collection; 0 for a missing cell;
        1 for any other scalar value.
    """
    if isinstance(raw_result, str):
        raw_result = ast.literal_eval(raw_result)
    if isinstance(raw_result, (list, tuple, set, frozenset)):
        return len(raw_result)
    # A missing cell holds no matches; any other scalar is a single match
    return 0 if pd.isna(raw_result) else 1


# Compute the length of the result set for each query.
# The average reported in the Insights cell should be re-checked after re-running this.
queries['result_len'] = queries['result'].apply(count_results)
# Compute the distribution of result_len
queries['result_len'].describe()

In [None]:
# Split every query on whitespace and store the token count per query
query_tokens = queries['query'].str.split()
queries['num_tokens'] = query_tokens.str.len()
queries['num_tokens'].describe()

**Insights**:
- 2330 queries in total
- On average, each query matches 75 articles
- On average, each query contains 46 tokens