In [1]:
#Import libraries
import ast

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Exploratory Data Analysis

In [2]:
# Locations of the two CSV inputs, relative to the notebook's working directory.
CORPUS_FILEPATH = "dataset/corpus.csv"  # article corpus
QUERIES_FILEPATH = "dataset/queries.csv"  # queries and their result lists

## Corpus dataset

In [3]:
# Load the article corpus CSV into a DataFrame
corpus = pd.read_csv(CORPUS_FILEPATH)

In [4]:
# Report the (rows, columns) dimensions, then preview the first five records
corpus_dims = corpus.shape
print(corpus_dims)
corpus.head(5)

(609, 8)


Unnamed: 0,title,author,source,published_at,category,url,body,uuid
0,200+ of the best deals from Amazon's Cyber Mon...,,Mashable,2023-11-27T08:45:59+00:00,entertainment,https://mashable.com/article/cyber-monday-deal...,"Table of Contents Table of Contents Echo, Fire...",162d0b25487d4fabb788d10c3112312a
1,ASX set to drop as Wall Street’s September slu...,Stan Choe,The Sydney Morning Herald,2023-09-26T19:11:30+00:00,business,https://www.smh.com.au/business/markets/asx-se...,"ETF provider Betashares, which manages $30 bil...",9baae8f1b1b54a77b72f18fb47a46688
2,Amazon sellers sound off on the FTC's 'long-ov...,,Cnbc | World Business News Leader,2023-10-06T21:31:00+00:00,business,https://www.cnbc.com/2023/10/06/amazon-sellers...,A worker sorts out parcels in the outbound doc...,ed80d097383b431a96618a0b62b154d6
3,"Christmas Day preview: 49ers, Ravens square of...","Colum Dell, Yardbarker",Yardbarker,2023-12-24T23:34:39+00:00,sports,https://www.yardbarker.com/nfl/articles/christ...,"Christmas Day isn't just for the NBA, as the N...",a1a4cd07573247f891de321560f97d09
4,"Raiders vs. Lions live score, updates, highlig...",Dan Treacy,Sporting News,2023-10-30T22:20:03+00:00,sports,https://www.sportingnews.com/us/nfl/news/raide...,The Lions just needed to get themselves back i...,046d5f4e21f34edda2bd4613e13afc38


In [5]:
# Number of distinct (non-null) publishing sources
corpus['source'].nunique(dropna=True)

49

In [None]:
# Count whitespace-separated tokens in every article body and summarise them
body_tokens = corpus['body'].str.split()
corpus['num_tokens'] = body_tokens.str.len()
corpus['num_tokens'].describe()

In [None]:
# Earliest and latest publication timestamps in the corpus
published_at = corpus['published_at']
(published_at.min(), published_at.max())

In [None]:
# Column dtypes and non-null counts (shows which columns have missing values)
corpus.info()

In [None]:
# Average number of articles per author: count titles per author, then take the mean
articles_per_author = corpus.groupby(['author'])['title'].count()
articles_per_author.mean()

In [None]:
# Bar chart of the number of articles per category (most frequent first).
# An explicit Axes is used so the figure carries a title and axis labels and
# can be read on its own; plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots()
corpus['category'].value_counts().plot(kind='bar', ax=ax)
ax.set(title='Articles per category', xlabel='Category', ylabel='Number of articles')
plt.show()

In [None]:
# Share of articles contributed by each of the ten most frequent sources
source_share = corpus['source'].value_counts(normalize=True)
source_share.head(10)

In [None]:
# Among articles with no author, the share that comes from each source
author_missing = corpus['author'].isna()
corpus.loc[author_missing, 'source'].value_counts(normalize=True)

**Insights**:
- The corpus dataset contains 609 articles published on 49 websites around Q4 2023 (the sample above includes an article dated 2023-09-26)
- On average, an article body contains 1746 whitespace-separated tokens
- The author is missing in only 68 articles (11% of cases)
- 41% of articles with missing author have been published in "Fox News"
- On average, each author published 1.8 articles
- "Sports" is the most frequent category
- "Sporting News" is the most frequent website

In [None]:
#concatenate text from body and title
# Missing values are dropped and everything is cast to str so that a NaN cell
# cannot make str.join raise a TypeError.
body_text = " ".join(corpus['body'].dropna().astype(str))
title_text = " ".join(corpus['title'].dropna().astype(str))
# Join with a space: plain `body_text + title_text` would fuse the last word of
# the bodies with the first word of the titles into a single bogus token.
body_title_text = " ".join([body_text, title_text])

In [None]:
def generate_word_cloud(text: str) -> None:
    """Build a word cloud from ``text`` and display it with matplotlib.

    The cloud is rendered at 800x400 on a white background, with the
    ``wordcloud`` package's STOPWORDS set excluded and a minimum font size
    of 10.

    Args:
        text: The text whose words are drawn in the cloud.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    cloud_options = dict(
        width=800,
        height=400,
        background_color='white',
        stopwords=set(STOPWORDS),
        min_font_size=10,
    )
    rendered_cloud = WordCloud(**cloud_options).generate(text)

    # Draw the cloud as an image on a 10x5 figure, without axis ticks or frame
    plt.figure(figsize=(10, 5))
    plt.imshow(rendered_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud over the combined article bodies and titles
generate_word_cloud(body_title_text)

The word cloud shows the most frequent keywords. Words such as "Game", "Team", "Season", and "Player" appear to relate to the Sports category.

## Query dataset

In [6]:
# Load the queries CSV into a DataFrame
queries = pd.read_csv(QUERIES_FILEPATH)

In [7]:
# Report the (rows, columns) dimensions, then preview the first five records
queries_dims = queries.shape
print(queries_dims)
queries.head(5)

(2330, 2)


Unnamed: 0,query,result
0,Between the Sky Sports report on Manchester Un...,"['414e032731794e7ca2b7ad46eb11df41', '61bd8181..."
1,Considering the information from an article by...,[]
2,Does the Sporting News article attribute the r...,"['b563f26ea6714e02a9cd178fa7b2479d', 'e6a68318..."
3,Between the report from CBSSports.com publishe...,"['12949c3389e642b0acb55c4e6641f092', 'b8d042ab..."
4,Do the articles from Sporting News on 'Line Sh...,"['0deb1f49aee6427bad32e9f0fb3aa77e', 'f2c1b7a0..."


In [None]:
# Example of a query, its result value, and one corpus article it refers to
first_query = queries.loc[0]
print(first_query['query'])
print(first_query['result'])
corpus.loc[corpus['uuid'] == '414e032731794e7ca2b7ad46eb11df41']

In [None]:
#compute the length of the result set for each query
def _count_results(raw_result) -> int:
    """Return the number of article UUIDs in one ``result`` cell.

    ``pd.read_csv`` does not parse list literals, so each cell arrives as the
    string form of a list (e.g. "['uuid1', 'uuid2']" or "[]"). Calling ``len``
    on that string counts characters, so the string is parsed into a real list
    first. Values that are not strings are measured as they are.

    Args:
        raw_result: One value from the ``result`` column.

    Returns:
        The number of items in the parsed result list.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    if isinstance(raw_result, str):
        raw_result = ast.literal_eval(raw_result)
    return len(raw_result)

queries['result_len'] = queries['result'].apply(_count_results)
#compute the distribution of result_len
queries['result_len'].describe()

In [None]:
# Count whitespace-separated tokens in every query text and summarise them
query_tokens = queries['query'].str.split()
queries['num_tokens'] = query_tokens.str.len()
queries['num_tokens'].describe()

**Insights**:
- 2330 queries in total
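- On average, the `result` value of a query has length 75. Because `result` is read from the CSV as a string, this figure counts characters, not matched articles (TODO: recompute on the parsed lists)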
- On average, each query contains 46 tokens