## News Data Mining

In [3]:
import pandas as pd
from pygooglenews import GoogleNews

In [11]:
gn = GoogleNews()
search = gn.search('apple', from_ = '2024-02-01', to_ = '2024-02-20')

In [None]:
search['entries']

In [None]:
# Initialize an empty list to hold the article data
articles = []

# Iterate over the entries and extract the required information
for entry in search['entries']:
    article = {
        'title': entry['title'],
        'link': entry['link'],
        'published': entry['published'],
        'source': entry['source']['title'] if 'source' in entry else 'Unknown'
    }
    articles.append(article)

# Convert the list of articles into a DataFrame
df = pd.DataFrame(articles)
df

# Data Collection

In [61]:
import os
import pandas as pd
from pygooglenews import GoogleNews
from datetime import datetime, timedelta
from tqdm import tqdm

def collect_articles(query, output_file_path, start_date='2009-01-01', end_date='2009-01-02'):
    # Initialize GoogleNews
    gn = GoogleNews()

    # Convert string dates to datetime objects
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')

    # Initialize an empty list to hold the article data
    articles = []
    total_articles = 0

    # Calculate the total number of days for the progress bar
    total_days = (end - start).days + 1  # +1 to include the end date

    # Track the last month processed
    last_date = start

    # Iterate through each day in the specified date range with tqdm
    for day in tqdm(range(total_days), desc="Collecting articles"):
        current_date = start + timedelta(days=day)

        # Check if the month has changed or it's the last day
        if current_date.month != last_date.month or current_date == end:
            if articles:
                # Convert the list of articles into a DataFrame and save
                df = pd.DataFrame(articles)
                file_exists = os.path.exists(output_file_path)
                df.to_csv(output_file_path, mode='a', index=False, header=not file_exists)

                # Update total articles count
                total_articles += len(articles)

                # Reset articles list for the new month
                articles = []

            last_date = current_date

        # Format current_date for the search
        search_date = current_date.strftime('%Y-%m-%d')
        to_date = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')

        # Perform the search for the current day
        search = gn.search(query, from_=search_date, to_=to_date)

        # Iterate over the entries and extract the required information
        for entry in search['entries']:
            article = {
                'title': entry['title'],
                'link': entry['link'],
                'published': entry['published'],
                'source': entry['source']['title'] if 'source' in entry else 'Unknown'
            }
            articles.append(article)

    # Return total articles saved
    return total_articles



### Function to remove non company related articles

In [62]:
from transformers import BertTokenizer, BertModel
import torch
import intel_extension_for_pytorch as ipex
from tqdm import tqdm

# Check if a GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def filter_titles_by_keywords(df, inclusion_keywords, exclusion_keywords):
    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Convert model to use IPEX optimized paths
    model = model.to(ipex.DEVICE).eval()  # Use eval mode since we are doing inference

    def get_embedding(text):
        # Encode text and move tensors to the same device as model
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(ipex.DEVICE)
        # Perform the forward pass without computing gradients (inference mode)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    # Precompute keyword embeddings and move them to the device
    inclusion_keyword_embeddings = [get_embedding(keyword).detach() for keyword in inclusion_keywords]
    exclusion_keyword_embeddings = [get_embedding(keyword).detach() for keyword in exclusion_keywords]

    def is_related(title, positive_threshold=0.624, negative_threshold=0.9):
        title_embedding = get_embedding(title).detach()
        # Compute cosine similarities and check against thresholds
        positive_similarities = [torch.cosine_similarity(title_embedding, keyword_embedding) for keyword_embedding in inclusion_keyword_embeddings]
        is_positive = any(similarity.item() > positive_threshold for similarity in positive_similarities)

        negative_similarities = [torch.cosine_similarity(title_embedding, keyword_embedding) for keyword_embedding in exclusion_keyword_embeddings]
        is_negative = any(similarity.item() > negative_threshold for similarity in negative_similarities)

        return is_positive and not is_negative

    # Apply the is_related function to each title
    df['is_related'] = [is_related(title) for title in tqdm(df['title'], desc="Filtering Titles By Keywords")]

    # Filter the DataFrame to keep only related entries and drop the 'is_related' column
    related_df = df[df['is_related']].copy().drop(columns=['is_related'])

    return related_df

# Data Cleaning/ Preparation

In [63]:
import os

def clean_dataset(file_path, inclusion_keywords, exclusion_keywords):
    # Load the dataset
    original_df = pd.read_csv(file_path)
    df = original_df.copy()

    # Extract the base name of the file without extension
    base_name = os.path.basename(file_path).replace('.csv', '')

    # Remove duplicated data
    df.drop_duplicates(subset=['title', 'link', 'published'], inplace=True)

    # Remove entries with null values
    df.dropna(inplace=True)

    # Remove entries with non-ASCII characters
    df = df[df['title'].map(lambda x: x.isascii()) & df['source'].map(lambda x: x.isascii())]

    # Remove non-company-related articles
    df = filter_titles_by_keywords(df, inclusion_keywords, exclusion_keywords)

    # Convert 'published' column to datetime to ensure proper sorting
    df['published'] = pd.to_datetime(df['published'])

    # Order the dataset by date
    df.sort_values(by='published', inplace=True)

    # Define cleaned and removed file paths according to the new directory structure
    cleaned_file_path = os.path.join('data', 'cleaned', f'{base_name}_cleaned.csv')
    removed_file_path = os.path.join('data', 'removed', f'{base_name}_removed.csv')

    # Save the cleaned dataset back to a CSV file
    df.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned dataset saved to {cleaned_file_path}")

    # Identify all removed entries as the difference between the original and cleaned datasets
    removed_entries = original_df[~original_df.index.isin(df.index)]

    # Save the removed entries to a separate CSV file for inspection
    removed_entries.to_csv(removed_file_path, index=False)
    print(f"Removed entries saved to {removed_file_path}")

    return df


In [64]:
def get_articles_for(query, inclusion_keywords, exclusion_keywords):
    # Path to raw dataset
    file_path = f'./data/raw/{query}.csv' 
    # Collect the articles
    num_data = collect_articles(query, file_path)
    print(f"Collected {num_data} articles and exported to '{file_path}'")
    # Clean the dataset
    cleaned_df = clean_dataset(file_path, inclusion_keywords, exclusion_keywords)


# # Specify your search query
# query = 'apple'

# # Keywords to compute keyword embeddings and compare with article tites
# inclusion_keywords = []

# exclusion_keywords = []

# get_articles_for(query, inclusion_keywords, exclusion_keywords)

# # Path to your dataset
# file_path = './data/raw/apple.csv'



In [65]:

# Keywords to compute keyword embeddings and compare with article tites
microsoft_inclusion_keywords = [
    # Products and Hardware
    "Surface", "Surface Pro", "Surface Laptop", "Surface Book",
    "Xbox", "Xbox One", "Xbox Series X", "Xbox Series S",
    "Microsoft PC", "Microsoft HoloLens", "Windows Phone",

    # Software and Services
    "Windows updates", "Windows 10", "Windows 11", "Microsoft 365",
    "Microsoft Office", "Outlook", "OneDrive",
    "Microsoft Teams", "Skype", "Bing",
    "Microsoft Edge", "Internet Explorer",
    "Azure", "SQL Server", "Visual Studio",

    # Technology and Innovations
    "Cloud computing", "AI", "Machine Learning",
    "Microsoft Cognitive Services", "Azure AI",
    "Microsoft Security", "Windows Defender",
    "GitHub", "Power Platform", "Dynamics 365",

    # Corporate and Culture
    "Satya Nadella", "Bill Gates", "Paul Allen",
    "Microsoft Store", "Microsoft Research",
    "Microsoft Campus", "Redmond",
    "Microsoft Build", "Microsoft Ignite",
    "Microsoft Philanthropies", "Code.org",

    # Events and Conferences
    "Microsoft Inspire", "Build Conference",
    "Microsoft Developer Days", "TechEd",
    "Xbox game releases", "Halo release",

    # Competitors and Industry
    "Microsoft vs. Apple", "Microsoft vs. Amazon", "Microsoft vs. Google",
    "Microsoft's market share", "Cloud market",
    "PC market", "Gaming market", "Enterprise solutions",
]

microsoft_exclusion_keywords = [
    # Unrelated "windows"
    "glass windows", "car windows", "house windows", "window cleaning", "window treatments",
    
    # Unrelated "surface"
    "surface area", "surface water", "surface texture", "surface cleaning", "surface design",

    # Common phrases or idioms
    "opening a window of opportunity", "window into the soul", "surface-level examination", "only scratched the surface",
    
    # Geographical references
    "Windows Valley", "Surface Creek"
]

companies_keywords = {
    'microsoft': {
        'inclusion': microsoft_inclusion_keywords,
        'exclusion': microsoft_exclusion_keywords
    },
    # Add more company queries with their keywords here
}

for company, keywords in companies_keywords.items():
    get_articles_for(company, keywords['inclusion'], keywords['exclusion'])

Collecting articles: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


Collected 10 articles and exported to './data/raw/microsoft.csv'


Filtering Titles By Keywords: 100%|██████████| 10/10 [00:00<00:00, 12.25it/s]

Cleaned dataset saved to data\cleaned\microsoft_cleaned.csv
Removed entries saved to data\removed\microsoft_removed.csv





# Keywords to compute keyword embeddings and compare with article tites
inclusion_keywords = [
    # Products and Hardware
    "iMac", "iMac Pro",
    "Mac Mini", "Mac Pro",
    "iPhone", "iPhone SE", "iPhone X", "iPhone 11", "iPhone 12", "iPhone 13",
    "iPad Pro", "iPad Air", "iPad Mini", "iPadOS",
    "Apple Watch Series", "watchOS",
    "iPod", "iPod touch", "iPod Shuffle"

    # Software and Services
    "iOS updates", "macOS updates", "watchOS updates",
    "iCloud", "iCloud Drive", "iCloud Photo Library",
    "iTunes", "Apple Music", "Apple Podcasts",
    "Apple TV+", "Apple TV app", "Apple Originals",
    "Apple Arcade", "Game Center", "App Store"

    # Technology and Innovations
    "phones", "Retina display", "Liquid Retina", "Super Retina",
    "Face ID", "Touch ID", "Apple Pay Cash",
    "Apple M2 Chip", "A15 Bionic", "H1 Chip",
    "TrueDepth camera", "Night mode", "Deep Fusion",
    "LiDAR Scanner", "ARKit",

    # Corporate and Culture
    "Tim Cook", "Steve Jobs", "Steve Wozniak", "Jony Ive",
    "Apple Store", "Genius Bar", "Today at Apple",
    "Apple Campus 2", "Spaceship campus",
    "Infinite Loop", "One Apple Park Way",
    "Apple Design Awards", "Apple Entrepreneur Camp",
    "AppleInsider", "MacRumors"

    # Events and Conferences
    "Special Events", "Apple Keynotes",
    "Apple Spring Event", "Apple Fall Event",
    "iPhone launch event", "iPad launch event",
    "Apple Developer Forums", "Tech Talks",
    "Apple Design Awards", "Shot on iPhone Challenge",

    # Competitors and Industry
    "Apple vs. Samsung", "Apple vs. Google", "Apple vs. Amazon", "Apple vs. Microsoft",
    "iOS vs. Android",
    "Mac vs. PC",
    "Apple's market share",
    "Innovations in consumer electronics",
    "Trends in technology and mobile computing",

    # Potentially ambiguous headlines
    "Repurposing Your Dead Mac",
    "Apple surprises us with a new, more-talkative iPod shuffle",
    "Not Only Was Steve Jobs Sick, He Had A Liver Transplant",
    "What's driving Steve Jobs?",
    "A Suicide at an Apple Manufacturer in China",
    "Compensation: $44000 And a MacBook - Cult of Mac",
    "The iTunes App Store Rolls with the Travel Season",
    "Steve Jobs Explains His Weight Loss in Healthnote",
    "Apple iPod touch (3rd Generation) 32GB Review: iPod touch Review",
    "He Put the Mac in Mackintosh",
    "Apple's Iconic Ad",
    "Apple's Latest Ad Takes Aim at Microsoft's 'Laptop Hunters' Campaign"
]

exclusion_keywords = [
    # Fruit and agriculture
    "apple orchard", "apple harvest", "apple picking", "apple variety", "apple cider",
    "apple pie", "apple crumble", "apple sauce", "apple dessert", "apple recipe",
    "apple nutrition", "apple health benefits", "apple tree", "apple seed", "apple farming",

    # Geographical references
    "Big Apple", "Apple Hill", "Apple City", "Apple River"
    
    # Common phrases or idioms
    "apple of my eye", "bad apple", "apple doesn't fall far from the tree", "upset the apple cart", "compare apples to oranges",
    "eating healthy",
    
    # Unrelated products or services
    "apple shampoo", "apple cosmetics", "apple fragrance", "apple scented", "apple flavor",
    "apple soda", "apple juice", "apple wine", "apple brandy", "apple beer",
]

# Extend the existing exclusion keywords list with the new ones
exclusion_keywords.extend([
    "Bikes With Non-EPA Tagged Exhausts Out of Big Apple",
    "Copycat Fragrances",
    "Swiss apple is key to Michelle Obama's youthful looks",
    "Learn to grow tropical fruit at home in Houston",
    # Add any other previously identified exclusion keywords here...
])

# Clean the dataset
cleaned_df = clean_dataset(file_path, inclusion_keywords, exclusion_keywords)
cleaned_df