## News Data Mining

### Imports

In [1]:
import os
import pandas as pd
from pygooglenews import GoogleNews
from datetime import datetime, timedelta
from tqdm import tqdm

# Data Collection

### Collect Articles

In [2]:
def _append_articles_to_csv(articles, output_file_path):
    """Append a batch of article records to the CSV at ``output_file_path``.

    The header row is written only when the file does not exist yet, so
    successive calls build up a single well-formed CSV. An empty batch is a
    no-op.
    """
    if not articles:
        return
    write_header = not os.path.exists(output_file_path)
    pd.DataFrame(articles).to_csv(output_file_path, mode='a', index=False, header=write_header)


def collect_articles(query, output_file_path, start_date='2009-01-01', end_date='2024-02-20'):
    """Collect Google News entries for ``query`` day by day and append them to a CSV.

    One search is issued per day from ``start_date`` through ``end_date``
    (both inclusive). Collected entries are buffered in memory and appended
    to ``output_file_path`` every time the calendar month changes, and once
    more after the last day, so a long run does not keep everything in memory.

    Parameters:
    - query: Search term passed to ``GoogleNews.search``.
    - output_file_path: CSV file the articles are appended to. Its parent
      directory is created when missing. If the file already exists, new rows
      are appended below the existing ones without a second header.
    - start_date: First day to search, formatted as 'YYYY-MM-DD'.
    - end_date: Last day to search, formatted as 'YYYY-MM-DD'.

    Returns:
    - The number of entries collected (also the highest 'id' assigned).

    A failure while processing one day (e.g. a dropped connection) is printed
    and that day is skipped; the run continues with the next day.
    """
    gn = GoogleNews()

    # os.path.dirname() returns '' for a bare file name, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')

    articles = []
    total_entries = 0

    # +1 so that the end date itself is searched as well
    total_days = (end - start).days + 1

    # (year, month) of the batch currently being buffered
    batch_month = (start.year, start.month)

    for day in tqdm(range(total_days), desc=f"Collecting articles from {start_date} to {end_date}"):
        current_date = start + timedelta(days=day)
        try:
            # Flush the buffer to disk whenever a new month starts
            current_month = (current_date.year, current_date.month)
            if current_month != batch_month:
                _append_articles_to_csv(articles, output_file_path)
                articles = []
                batch_month = current_month

            # Search window: the current day up to the following day
            search_date = current_date.strftime('%Y-%m-%d')
            to_date = (current_date + timedelta(days=1)).strftime('%Y-%m-%d')
            search = gn.search(query, from_=search_date, to_=to_date)

            for entry in search['entries']:
                total_entries += 1
                # A missing source (or a source without a title) must not
                # abort the remaining entries of the day.
                source = entry.get('source') or {}
                articles.append({
                    'id': total_entries,
                    'title': entry['title'],
                    'link': entry['link'],
                    'published': entry['published'],
                    'source': source.get('title', 'Unknown'),
                })

        except Exception as e:
            # Best effort: report the failed day and carry on with the next one
            print(f"Error processing {current_date.strftime('%Y-%m-%d')}: {e}")

    # Save whatever is still buffered from the final month
    _append_articles_to_csv(articles, output_file_path)

    return total_entries



# Data Cleaning / Preparation

### Saving Files

In [3]:
def save_file(df, base_name, cleaned_or_removed, filter_status):
    """
    Write a DataFrame to '<base_name>/<filter_status>/<base_name>_<cleaned_or_removed>_<filter_status>.csv'.

    Parameters:
    - df: DataFrame to write (the index is not written).
    - base_name: Name used both for the top-level folder and as the file name prefix.
    - cleaned_or_removed: Label for the kind of rows in the DataFrame (e.g. 'cleaned' or 'removed').
    - filter_status: Sub-folder and file name suffix (e.g. 'unfiltered' or 'filtered').

    The outcome is reported with print(); a failed write is reported, not raised.
    """
    # Make sure <base_name>/<filter_status>/ is there before writing into it
    target_dir = os.path.join(base_name, filter_status)
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(
        target_dir,
        f"{base_name}_{cleaned_or_removed}_{filter_status}.csv",
    )

    try:
        df.to_csv(destination, index=False)
    except Exception as e:
        print(f"Failed to save {cleaned_or_removed} dataset to {destination}. Error: {e}")
    else:
        print(f"{cleaned_or_removed} dataset saved to {destination}")

### Filter Articles By Keywords

In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Check if a GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def filter_titles_by_keywords(df, inclusion_keywords, exclusion_keywords, exclusion_included,
                              positive_threshold=0.625, negative_threshold=0.73):
    """
    Keep the rows of ``df`` whose title is semantically close to the inclusion keywords.

    Titles and keywords are embedded with 'bert-base-uncased' (mean of the
    last hidden state) and compared by cosine similarity. A title is kept when
    it is more similar than ``positive_threshold`` to at least one inclusion
    keyword and, if ``exclusion_included`` is true, not more similar than
    ``negative_threshold`` to any exclusion keyword.

    Parameters:
    - df: DataFrame with a 'title' column of strings. It is not modified.
    - inclusion_keywords: Phrases describing relevant articles.
    - exclusion_keywords: Phrases describing irrelevant articles; ignored
      unless ``exclusion_included`` is true.
    - exclusion_included: Whether the exclusion keywords are applied.
    - positive_threshold: Similarity a title must exceed for an inclusion match.
    - negative_threshold: Similarity a title must exceed for an exclusion match.

    Returns:
    - A new DataFrame (copy) holding only the related rows, with the same
      columns and index labels as ``df``.
    """
    # Initialize tokenizer and model; the model lives on the GPU when available
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)

    def get_embedding(text):
        # Encode text and move tensors to the same device as the model
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
        # Inference only, so no gradients are needed
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    def embed_keywords(keywords):
        # One row per keyword; None when there is nothing to compare against
        if not keywords:
            return None
        return torch.cat([get_embedding(keyword) for keyword in keywords], dim=0)

    def exceeds_threshold(title_embedding, keyword_embeddings, threshold):
        if keyword_embeddings is None:
            return False
        # The single title row is broadcast against every keyword row
        similarities = torch.cosine_similarity(title_embedding, keyword_embeddings)
        return bool((similarities > threshold).any())

    # Keyword embeddings are computed once, up front, instead of once per title
    inclusion_embeddings = embed_keywords(inclusion_keywords)
    exclusion_embeddings = embed_keywords(exclusion_keywords) if exclusion_included else None

    def is_related(title):
        title_embedding = get_embedding(title)
        if not exceeds_threshold(title_embedding, inclusion_embeddings, positive_threshold):
            return False
        return not exceeds_threshold(title_embedding, exclusion_embeddings, negative_threshold)

    # Build the mask separately instead of adding a helper column to df, so
    # the caller's DataFrame is left untouched.
    flags = [is_related(title) for title in tqdm(df['title'], desc="Filtering Titles By Keywords")]
    mask = pd.Series(flags, index=df.index, dtype=bool)

    return df[mask].copy()


### Clean Dataset

In [5]:
def clean_dataset(file_path, inclusion_keywords, exclusion_keywords, exclusion_included):
    """
    Clean a raw articles CSV, filter it by keywords and save every intermediate result.

    Steps:
    1. Drop rows duplicated on ('title', 'link', 'published') and rows with nulls.
    2. Replace the typographic apostrophe (’) with ' in all text columns.
    3. Drop rows whose 'title' or 'source' contains non-ASCII characters.
    4. Parse 'published' as datetime and sort by it.
    5. Filter the remaining titles with ``filter_titles_by_keywords``.

    Four files are written through ``save_file``: the cleaned and the removed
    rows before keyword filtering ('unfiltered'), and the cleaned and the
    removed rows after it ('filtered').

    Parameters:
    - file_path: Path of the raw CSV; needs 'title', 'link', 'published' and 'source' columns.
    - inclusion_keywords, exclusion_keywords, exclusion_included: Passed on to
      ``filter_titles_by_keywords``.

    Returns:
    - The DataFrame of rows that survived cleaning and keyword filtering.
    """
    # Load the dataset
    original_df = pd.read_csv(file_path)

    # File name without its extension; splitext keeps names that contain dots intact
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Remove duplicated rows and rows with null values
    unfiltered_df = (
        original_df
        .drop_duplicates(subset=['title', 'link', 'published'])
        .dropna()
        .copy()
    )

    # Normalise typographic apostrophes so they do not fail the ASCII check below
    for col in unfiltered_df.select_dtypes(include='object').columns:
        unfiltered_df[col] = unfiltered_df[col].str.replace('’', '\'', regex=False)

    # Remove entries with non-ASCII characters in the title or the source
    ascii_only = (
        unfiltered_df['title'].map(lambda x: x.isascii())
        & unfiltered_df['source'].map(lambda x: x.isascii())
    )
    # .copy() so the column assignment below works on an independent frame
    unfiltered_df = unfiltered_df[ascii_only].copy()

    # Convert 'published' to datetime so sorting is chronological, not lexical
    unfiltered_df['published'] = pd.to_datetime(unfiltered_df['published'])

    # Order by date; a stable sort keeps the file order among equal timestamps
    unfiltered_df = unfiltered_df.sort_values(by='published', kind='stable')

    # Save the cleaned dataset before keyword filtering
    save_file(unfiltered_df, base_name, 'cleaned', 'unfiltered')

    # Rows of the raw file that did not survive the cleaning steps
    removed_unfiltered_df = original_df[~original_df.index.isin(unfiltered_df.index)]
    save_file(removed_unfiltered_df, base_name, 'removed', 'unfiltered')

    # Filter non-company-related articles. A copy is handed over so that
    # helper columns the filter may add never end up in the files saved below.
    filtered_df = filter_titles_by_keywords(
        unfiltered_df.copy(), inclusion_keywords, exclusion_keywords, exclusion_included
    )

    # Save the kept and the rejected rows after keyword filtering
    save_file(filtered_df, base_name, 'cleaned', 'filtered')
    removed_filtered_df = unfiltered_df[~unfiltered_df.index.isin(filtered_df.index)]
    save_file(removed_filtered_df, base_name, 'removed', 'filtered')

    return filtered_df

### Get Articles For

In [6]:
def get_articles_for(query, inclusion_keywords, exclusion_keywords, exclusion_included, start_date='2009-01-01', end_date='2024-02-20'):
    """
    Collect the articles for ``query`` and run the cleaning pipeline on them.

    The raw articles are written to './<query>/<query>.csv' by
    ``collect_articles``; ``clean_dataset`` then reads that file and saves the
    cleaned/removed datasets.

    Parameters:
    - query: Search term; also used as folder and file name of the raw dataset.
    - inclusion_keywords, exclusion_keywords, exclusion_included: Keyword
      filter settings passed on to ``clean_dataset``.
    - start_date, end_date: Date range ('YYYY-MM-DD') passed on to ``collect_articles``.

    Returns:
    - The filtered DataFrame returned by ``clean_dataset``, so callers can use
      the result without re-reading the saved CSV.
    """
    # Path to raw dataset
    file_path = f'./{query}/{query}.csv'

    # Collect the articles
    num_data = collect_articles(query, file_path, start_date, end_date)
    print(f"Collected {num_data} articles and exported to '{file_path}'")

    # Clean the dataset and hand the result back to the caller
    return clean_dataset(file_path, inclusion_keywords, exclusion_keywords, exclusion_included)



In [7]:
# Phrases whose embeddings are compared with article titles: a title close to
# any of these is treated as being about The Boeing Company.
boeing_inclusion_keywords = [
    # --- Commercial aircraft ---
    "Boeing 737",
    "Boeing 747",
    "Boeing 767",
    "Boeing 777",
    "Boeing 787 Dreamliner",
    "Boeing MAX",
    "Boeing Business Jet",
    "Sky Interior",
    "Boeing Freighter",

    # --- Defense, space & security ---
    "Boeing Defense",
    "Boeing Space",
    "Boeing Security",
    "F/A-18 Super Hornet",
    "EA-18G Growler",
    "Boeing P-8 Poseidon",
    "CH-47 Chinook",
    "AH-64 Apache",
    "V-22 Osprey",
    "Boeing Satellites",
    "Boeing Missile Defense",
    "Boeing Autonomous Systems",

    # --- Services ---
    "Boeing Global Services",
    "Boeing Support",
    "Boeing Training",
    "Boeing Parts",
    "Boeing Analytics",
    "Boeing Digital Solutions",

    # --- Technology and innovations ---
    "Boeing Research",
    "Boeing Technology",
    "Boeing Innovations",
    "Boeing Aerodynamics",
    "Boeing Materials Technology",
    "Boeing Environmental Performance",
    "Boeing Fuel Efficiency",

    # --- Corporate and culture ---
    "Boeing CEO",
    "Boeing Leadership",
    "Boeing Culture",
    "Boeing Stock",
    "Boeing Shareholders",
    "Boeing Financials",
    "Boeing Ethics",
    "Boeing Careers",
    "Boeing History",

    # --- Events and conferences ---
    "Boeing Air Show",
    "Paris Air Show",
    "Farnborough Air Show",
    "Boeing Earnings Call",
    "Boeing Investor Relations",

    # --- Key personnel and locations ---
    "Dennis Muilenburg",
    "Dave Calhoun",
    "William Boeing",
    "Boeing Chicago",
    "Boeing Seattle",
    "Boeing Everett",

    # --- Notable incidents and developments ---
    "Boeing 737 MAX crisis",
    "Boeing groundings",
    "Boeing safety",
    "Boeing return to service",
    "Boeing MCAS",
    "Boeing Starliner",

    # --- Potentially ambiguous headlines ---
    "Boeing Is Likely to Overtake Airbus",
]

# Phrases for titles that mention "Boeing" or aviation without being about
# The Boeing Company; a title close to any of these is rejected.
boeing_exclusion_keywords = [
    # --- Common names and unrelated "boeing" ---
    "Boeing family",
    "Boeing Field",
    "Boeing Creek",

    # --- General aviation unrelated to The Boeing Company ---
    "Cessna",
    "Airbus",
    "Embraer",
    "Lockheed Martin",
    "Private jet",
    "Airplane model",
    "Flight school",

    # --- Common phrases or idioms ---
    "On a wing and a prayer",
    "Fly by the seat of one's pants",
    "When pigs fly",
    "Jet set lifestyle",
    "Sky-high expectations",
    "Take under one's wing",

    # --- Geographical references ---
    "Boeing Avenue",
    "Boeing Way",
    "Boeing Park",

    # --- Completely unrelated contexts ---
    "Boeing boing sound",
    "Boeing out of control",
    "Boeing from work",
]

# Phrases describing articles about BlackRock, the asset manager; a title
# close to any of these is treated as relevant.
blackrock_inclusion_keywords = [
    # --- Financial products and services ---
    "BlackRock funds",
    "iShares",
    "ETFs",
    "BlackRock Mutual Funds",
    "Aladdin",
    "BlackRock Solutions",
    "Risk management",
    "Asset management",
    "BlackRock Alternative Investors",
    "Portfolio management",
    "BlackRock Retirement Solutions",
    "BlackRock Liquidity",

    # --- Investments and strategies ---
    "Equity investments",
    "Fixed income",
    "Multi-asset solutions",
    "Systematic investing",
    "Factor-based strategies",
    "Sustainable investing",
    "Impact investing",
    "BlackRock Global Allocation Fund",

    # --- Corporate information and culture ---
    "BlackRock CEO",
    "Larry Fink",
    "BlackRock leadership team",
    "BlackRock Board of Directors",
    "BlackRock Annual Report",
    "BlackRock ESG",
    "BlackRock corporate culture",
    "BlackRock investment approach",
    "BlackRock careers",

    # --- Technology and innovations ---
    "BlackRock financial technology",
    "FinTech",
    "Robo-advisors",
    "Digital wealth management",
    "Investment analytics",

    # --- Key personnel and locations ---
    "Robert Kapito",
    "Barbara Novick",
    "Susan Wagner",
    "Laurence D. Fink",
    "Robert S. Kapito",
    "Gary Shedlin",
    "Christopher J. Meade",
    "Mark Wiedman",
    "BlackRock New York headquarters",
    "BlackRock offices worldwide",

    # --- Events and conferences ---
    "BlackRock earnings call",
    "BlackRock investor day",
    "BlackRock conference",

    # --- Notable developments and initiatives ---
    "BlackRock's climate change strategy",
    "BlackRock's stake in companies",
    "BlackRock shareholder activism",
    "BlackRock market outlook",

    # --- Regulatory and legal aspects ---
    "BlackRock SEC filings",
    "BlackRock compliance",
    "Financial regulations affecting BlackRock",
]

# Phrases for "black rock" in senses other than the asset manager; a title
# close to any of these is rejected.
blackrock_exclusion_keywords = [
    # --- Geographical references and natural features ---
    "Black rock formations",
    "Black Rock City",
    "Black Rock Desert",
    "Black rocks in geology",
    "Black Rock Forest",
    "Black Rock Beach",

    # --- Common phrases or idioms ---
    "As black as a rock",
    "Between a rock and a hard place",

    # --- Cultural references ---
    "Black Rock musical festival",
    "Black Rock art installation",

    # --- Unrelated businesses and products ---
    "Black rock grill",
    "Black rock coffee",
    "Black rock landscaping",
    "Black rock chic fashion",
    "Black Rock Bar & Grill",

    # --- Miscellaneous ---
    "BlackRock mountain biking",
    "Black Rock sailing",
    "Black rock candy",
    "Black rock mining",
]

# Phrases describing articles about Amazon, the company; a title close to any
# of these is treated as relevant. Each phrase is listed once: a repeated
# entry would only be embedded and compared a second time without changing
# the outcome.
amazon_inclusion_keywords = [
    # Online Retail and Services
    "Amazon Prime", "Amazon Marketplace", "Amazon Fresh", "Amazon Pantry",
    "Amazon Fashion", "Amazon Basics", "Prime Day", "Kindle Store",
    "Amazon Music", "Amazon Books", "Amazon Go", "Amazon Drive",

    # Amazon Devices
    "Kindle", "Fire Tablet", "Fire TV", "Amazon Echo", "Alexa",
    "Echo Dot", "Amazon Fire Stick", "Kindle Oasis", "Echo Show",

    # Amazon Web Services (AWS)
    "AWS", "Amazon S3", "EC2", "AWS Lambda", "Amazon RDS",
    "Amazon DynamoDB", "AWS CloudFront", "Amazon Aurora",
    "AWS Marketplace", "Amazon CloudWatch",

    # Technology and Innovations
    "Amazon Robotics", "Drone delivery", "Amazon AI",
    "Amazon Machine Learning", "Amazon Lex",

    # Corporate and Culture
    "Jeff Bezos", "Andy Jassy", "Amazon leadership",
    "Amazon stock", "NASDAQ:AMZN", "Amazon annual report",
    "Amazon sustainability", "Amazon career",

    # Events and Conferences
    "re:Invent", "Amazon Web Services Summit",

    # Key Personnel (Jeff Bezos and Andy Jassy are already listed above)
    "Werner Vogels", "Dave Clark",
    "Jeff Blackburn", "Brian Olsavsky", "Douglas Herrington",

    # Notable Developments
    "Amazon HQ2", "Amazon Air", "Amazon fulfillment center",
    "Amazon second headquarters", "Amazon delivery network",

    # Legal and Regulatory Issues
    "Amazon antitrust", "Amazon tax", "Amazon labor practices",

    # Amazon Studios and Video
    "Amazon Original Series", "Amazon Studios", "Prime Video",
]

# Phrases for "Amazon" in senses other than the company; a title close to any
# of these is rejected. Every line ends with a comma: without it Python joins
# neighbouring string literals into a single, unintended keyword.
amazon_exclusion_keywords = [
    # Amazon Rainforest and River
    "Amazon rainforest", "Amazon river", "Amazon basin",
    "Amazon deforestation", "Amazon conservation",

    # Mythology and Ancient History
    "Amazon warriors", "Amazons", "Amazon mythology",

    # Unrelated Businesses and Entities
    "Amazon fish", "Amazon parrot", "Amazon snakes",

    # Common Phrases and Other Uses
    "Amazon-like", "Amazonian climate", "Amazon rainforest reviews",

    # Geographical Locations
    "Amazon region", "Amazon state", "Hotel Amazon",
    "Travel", "Trip",

    # Ambiguous Phrases
    "Amazon of industry", "Amazon of the North",
]


In [9]:

# Run the collection + cleaning pipeline for every configured company.

# NOTE(review): these two dates are defined but not passed to
# get_articles_for() below, so its default date range is used — confirm
# whether they were meant to be forwarded.
start_date = '2013-01-01'
end_date = '2013-01-05'

# Per-company keyword configuration; the key doubles as the search query.
companies_keywords = {
    'Boeing': dict(
        inclusion=boeing_inclusion_keywords,
        exclusion=boeing_exclusion_keywords,
        exclusion_included=True,
    ),
    'Blackrock': dict(
        inclusion=blackrock_inclusion_keywords,
        exclusion=blackrock_exclusion_keywords,
        exclusion_included=True,
    ),
    'Amazon': dict(
        inclusion=amazon_inclusion_keywords,
        exclusion=amazon_exclusion_keywords,
        exclusion_included=True,
    ),
    # Add more company queries with their keywords here
}

for company in companies_keywords:
    keywords = companies_keywords[company]
    print(f"Working on {company}...")
    get_articles_for(
        company,
        inclusion_keywords=keywords['inclusion'],
        exclusion_keywords=keywords['exclusion'],
        exclusion_included=keywords['exclusion_included'],
    )

Working on Boeing...


Collecting articles from 2009-01-01 to 2024-02-20:   7%|▋         | 396/5529 [57:02<1257:42:46, 882.09s/it]

Error processing 2010-01-31: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


Collecting articles from 2009-01-01 to 2024-02-20: 100%|██████████| 5529/5529 [2:37:11<00:00,  1.71s/it]   


Collected 124174 articles and exported to './Boeing/Boeing.csv'
cleaned dataset saved to Boeing\unfiltered\Boeing_cleaned_unfiltered.csv
removed dataset saved to Boeing\unfiltered\Boeing_removed_unfiltered.csv


Filtering Titles By Keywords: 100%|██████████| 63822/63822 [1:19:31<00:00, 13.38it/s]


cleaned dataset saved to Boeing\filtered\Boeing_cleaned_filtered.csv
removed dataset saved to Boeing\filtered\Boeing_removed_filtered.csv
Working on Blackrock...


Collecting articles from 2009-01-01 to 2024-02-20: 100%|██████████| 5529/5529 [1:43:13<00:00,  1.12s/it]


Collected 39619 articles and exported to './Blackrock/Blackrock.csv'
cleaned dataset saved to Blackrock\unfiltered\Blackrock_cleaned_unfiltered.csv
removed dataset saved to Blackrock\unfiltered\Blackrock_removed_unfiltered.csv


Filtering Titles By Keywords: 100%|██████████| 20046/20046 [25:27<00:00, 13.13it/s]


cleaned dataset saved to Blackrock\filtered\Blackrock_cleaned_filtered.csv
removed dataset saved to Blackrock\filtered\Blackrock_removed_filtered.csv
Working on Amazon...


Collecting articles from 2009-01-01 to 2024-02-20: 100%|██████████| 5529/5529 [1:50:59<00:00,  1.20s/it]  


Collected 214441 articles and exported to './Amazon/Amazon.csv'
cleaned dataset saved to Amazon\unfiltered\Amazon_cleaned_unfiltered.csv
removed dataset saved to Amazon\unfiltered\Amazon_removed_unfiltered.csv


Filtering Titles By Keywords: 100%|██████████| 107780/107780 [2:13:46<00:00, 13.43it/s] 


cleaned dataset saved to Amazon\filtered\Amazon_cleaned_filtered.csv
removed dataset saved to Amazon\filtered\Amazon_removed_filtered.csv


# Keywords to compute keyword embeddings and compare with article titles.
# A title close to any of these is treated as being about Apple, the company.
# Each phrase is listed once: a repeated entry would only be embedded and
# compared a second time without changing the outcome.
apple_inclusion_keywords = [
    # Products and Hardware
    "iMac", "iMac Pro",
    "Mac Mini", "Mac Pro",
    "iPhone", "iPhone SE", "iPhone X", "iPhone 11", "iPhone 12", "iPhone 13",
    "iPad Pro", "iPad Air", "iPad Mini", "iPadOS",
    "Apple Watch Series", "watchOS",
    "iPod", "iPod touch", "iPod Shuffle",

    # Software and Services
    "iOS updates", "macOS updates", "watchOS updates",
    "iCloud", "iCloud Drive", "iCloud Photo Library",
    "iTunes", "Apple Music", "Apple Podcasts",
    "Apple TV+", "Apple TV app", "Apple Originals",
    "Apple Arcade", "Game Center", "App Store",

    # Technology and Innovations
    "phones", "Retina display", "Liquid Retina", "Super Retina",
    "Face ID", "Touch ID", "Apple Pay Cash",
    "Apple M2 Chip", "A15 Bionic", "H1 Chip",
    "TrueDepth camera", "Night mode", "Deep Fusion",
    "LiDAR Scanner", "ARKit",

    # Corporate and Culture
    "Tim Cook", "Steve Jobs", "Steve Wozniak", "Jony Ive",
    "Apple Store", "Genius Bar", "Today at Apple",
    "Apple Campus 2", "Spaceship campus",
    "Infinite Loop", "One Apple Park Way",
    "Apple Design Awards", "Apple Entrepreneur Camp",
    "AppleInsider", "MacRumors",

    # Events and Conferences ("Apple Design Awards" is already listed above)
    "Special Events", "Apple Keynotes",
    "Apple Spring Event", "Apple Fall Event",
    "iPhone launch event", "iPad launch event",
    "Apple Developer Forums", "Tech Talks",
    "Shot on iPhone Challenge",

    # Competitors and Industry
    "Apple vs. Samsung", "Apple vs. Google", "Apple vs. Amazon", "Apple vs. Microsoft",
    "iOS vs. Android",
    "Mac vs. PC",
    "Apple's market share",
    "Innovations in consumer electronics",
    "Trends in technology and mobile computing",

    # Potentially ambiguous headlines
    "Repurposing Your Dead Mac",
    "Apple surprises us with a new, more-talkative iPod shuffle",
    "Not Only Was Steve Jobs Sick, He Had A Liver Transplant",
    "What's driving Steve Jobs?",
    "A Suicide at an Apple Manufacturer in China",
    "Compensation: $44000 And a MacBook - Cult of Mac",
    "The iTunes App Store Rolls with the Travel Season",
    "Steve Jobs Explains His Weight Loss in Healthnote",
    "Apple iPod touch (3rd Generation) 32GB Review: iPod touch Review",
    "He Put the Mac in Mackintosh",
    "Apple's Iconic Ad",
    "Apple's Latest Ad Takes Aim at Microsoft's 'Laptop Hunters' Campaign",
]

# Phrases for "apple" in senses other than the company; a title close to any
# of these is rejected. Every line ends with a comma: without it Python joins
# neighbouring string literals into a single, unintended keyword.
apple_exclusion_keywords = [
    # Fruit and agriculture
    "apple orchard", "apple harvest", "apple picking", "apple variety", "apple cider", "apples",
    "apple pie", "apple crumble", "apple sauce", "apple dessert", "apple recipe",
    "apple nutrition", "apple health benefits", "apple tree", "apple seed", "apple farming",

    # Geographical references
    "Big Apple", "Apple Hill", "Apple City", "Apple River", "Apple Blossom Festival",

    # Common phrases or idioms
    "apple of my eye", "bad apple", "apple doesn't fall far from the tree", "upset the apple cart", "compare apples to oranges",
    "eating healthy",

    # Unrelated products or services
    "apple shampoo", "apple cosmetics", "apple fragrance", "apple scented", "apple flavor",
    "apple soda", "apple juice", "apple wine", "apple brandy", "apple beer",
]

# Extend the existing exclusion keywords list with headlines that were
# previously identified as unrelated
apple_exclusion_keywords.extend([
    "Bikes With Non-EPA Tagged Exhausts Out of Big Apple",
    "Copycat Fragrances",
    "Swiss apple is key to Michelle Obama's youthful looks",
    "Learn to grow tropical fruit at home in Houston",
    # Add any other previously identified exclusion keywords here...
])

# Clean the Apple dataset with the Apple keyword lists defined above.
# clean_dataset() takes four arguments (the last one switches the exclusion
# keywords on) and returns the rows kept after keyword filtering.
# NOTE(review): the raw file location is assumed to be './Apple/apple.csv' —
# confirm the path before running.
file_path = './Apple/apple.csv'
cleaned_unfiltered_df = clean_dataset(file_path, apple_inclusion_keywords, apple_exclusion_keywords, True)
cleaned_unfiltered_df

In [None]:
# gn = GoogleNews()
# search = gn.search('apple', from_ = '2024-02-01', to_ = '2024-02-20')

In [None]:
# # Initialize an empty list to hold the article data
# articles = []

# # Iterate over the entries and extract the required information
# for entry in search['entries']:
#     article = {
#         'title': entry['title'],
#         'link': entry['link'],
#         'published': entry['published'],
#         'source': entry['source']['title'] if 'source' in entry else 'Unknown'
#     }
#     articles.append(article)

# # Convert the list of articles into a DataFrame
# df = pd.DataFrame(articles)
# df

In [None]:
# file_path = f'./Apple/apple.csv' 

# year = 2018

# month = 4

# # Load the CSV file into a DataFrame
# df = pd.read_csv(file_path)

# # Convert the date column to datetime format
# # Replace 'published' with the name of your date column
# df['published'] = pd.to_datetime(df['published'])

# # Filter the DataFrame for the desired year
# # Replace 2020 with the year you want to extract
# df_year = df[(df['published'].dt.year == year) & (df['published'].dt.month == month)]

# # Save the filtered DataFrame to a new CSV file
# df_year.to_csv(f'./test_data/raw/apple_{year}.csv', index=False)

In [None]:
# def merge_datasets(file_path1, file_path2, output_file_path):
#     # Load the datasets from the given file paths
#     df1 = pd.read_csv(file_path1)
#     df2 = pd.read_csv(file_path2)
    
#     # Assuming 'id' is the column name for IDs in your datasets
#     # Adjust IDs in df2 to continue from the last ID in df1
#     last_id_df1 = df1['id'].max()
#     df2['id'] = df2['id'].apply(lambda x: x + last_id_df1)

#     # Merge the datasets without resetting the index
#     merged_df = pd.concat([df1, df2], ignore_index=True)
    
#     # Save the resulting dataset to the specified file
#     merged_df.to_csv(output_file_path, index=False)

#     print(f"Merged dataset saved to {output_file_path}")

In [None]:
# def drop_dates(file_path, date_to_remove):

#     df = pd.read_csv(file_path)

#     df['published'] = pd.to_datetime(df['published'])

#     # Convert the string to a datetime object for comparison
#     date_to_remove = pd.to_datetime(date_to_remove)

#     # Filter out the entries for the specified date
#     df_filtered = df[df['published'].dt.date != date_to_remove.date()]

#     # Save the resulting dataset to the specified file
#     df_filtered.to_csv(file_path, index=False)