# MSc Thesis Applied Data Science - Filtering Tweets

## Initialisation

In [None]:
# Import modules
import pandas as pd
import numpy as np
from datetime import timedelta

## Define Function for filtering

In [None]:
def filter_telegram_data_on_dates_and_keywords(dataset_path, date_keywords_dict, output_path, chunksize=100000,
                                               days_before=20, days_after=5):
    """Filter a ';'-delimited CSV on date windows and keywords, and save the results.

    For every ``target date -> keywords`` entry, rows are kept when their
    ``date`` lies in ``[target - days_before, target + days_after]`` (both
    bounds inclusive) and their ``body`` contains at least one keyword
    (case-insensitive, literal substring match). Rows are de-duplicated on
    ``id`` within each target date.

    The dataset is streamed in chunks and read only once, regardless of how
    many target dates are requested. Results are written after the pass
    completes: one ``filtered_data_<YYYY-MM-DD>.csv`` per target date plus a
    combined ``filtered_data_all_dates.csv``, all inside ``output_path``.

    Args:
        dataset_path: Path of the CSV file; it must provide the columns
            ``id``, ``body`` and ``date``.
        date_keywords_dict: Mapping of a date string (anything
            ``pd.to_datetime`` accepts) to an iterable of keyword strings.
            Note that an empty keyword list yields an empty pattern, which
            matches every row inside the date window.
        output_path: Directory receiving the CSV files; created if missing.
        chunksize: Number of CSV rows read per chunk.
        days_before: Days before each target date included in the window.
        days_after: Days after each target date included in the window. The
            upper bound is exactly ``target + days_after``; for a date-only
            key that is midnight, so later timestamps on that final day
            fall outside the window.

    Returns:
        A DataFrame with the columns ``id``, ``body`` and ``date`` holding
        the rows of all target dates, concatenated in dictionary order.
        Rows matching several target dates appear once per target date.
    """
    import os
    import re

    output_columns = ['id', 'body', 'date']

    # Create the output directory up front so a bad location fails before the
    # (potentially long) pass over the dataset instead of after it.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # One entry per target date: its window bounds, keyword regex and the
    # matching pieces collected while streaming the file.
    windows = []
    for target_date_str, keywords in date_keywords_dict.items():
        target_date = pd.to_datetime(target_date_str)
        print(f'Filtering data for target date: {target_date} with keywords: {keywords}')
        windows.append({
            'target_date': target_date,
            'start_date': target_date - timedelta(days=days_before),
            'end_date': target_date + timedelta(days=days_after),
            # Escape each keyword so characters such as '.', '(' or '+' are
            # matched literally instead of being interpreted as regex syntax.
            'pattern': '|'.join(re.escape(keyword) for keyword in keywords),
            'matches': [],
        })

    if windows:
        for chunk in pd.read_csv(dataset_path, chunksize=chunksize, delimiter=';'):
            # Parse dates once per chunk; unparseable values become NaT and are dropped.
            chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')
            chunk = chunk.dropna(subset=['date'])
            if chunk.empty:
                continue

            for window in windows:
                in_window = (chunk['date'] >= window['start_date']) & (chunk['date'] <= window['end_date'])
                candidates = chunk.loc[in_window, output_columns]
                if candidates.empty:
                    continue

                # Run the (comparatively expensive) keyword search only on
                # rows that already passed the date filter.
                has_keyword = candidates['body'].str.contains(window['pattern'], case=False, na=False)
                hits = candidates[has_keyword]
                if not hits.empty:
                    window['matches'].append(hits)

    filtered_dataframes = []
    for window in windows:
        if window['matches']:
            # Concatenate once per date (not once per chunk) and remove duplicates.
            date_df = pd.concat(window['matches']).drop_duplicates(subset='id')
        else:
            date_df = pd.DataFrame(columns=output_columns)

        # Save each date's data to a separate file
        output_file = f"{output_path}/filtered_data_{window['target_date'].date()}.csv"
        date_df.to_csv(output_file, index=False)
        filtered_dataframes.append(date_df)

    # Concatenate all the filtered DataFrames into one final DataFrame; empty
    # frames are left out so they cannot influence the resulting dtypes.
    non_empty = [df for df in filtered_dataframes if not df.empty]
    if non_empty:
        final_df = pd.concat(non_empty, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=output_columns)

    # Save the final concatenated DataFrame
    final_output_file = f'{output_path}/filtered_data_all_dates.csv'
    final_df.to_csv(final_output_file, index=False)

    return final_df

## Filtering

In [None]:
# Input CSV location and the directory that receives the filtered CSV files
# (placeholders; adjust before running).
dataset_path = '/path/to/dataset'
output_path = '/path/to/output/dataset'

# Incident date -> keywords (names and account handles) searched in the message body
date_keywords_dict = {
    '2021-07-15': [
        'Geert Wilders',
        '@geertwilderspvv',
        'wilders',
    ],
    '2022-01-05': [
        'Sigrid Kaag',
        '@SigridKaag',
        'kaag',
    ],
    '2022-07-06': [
        'Christianne van der Wal',
        'Christianne van der Wal-Zeggelink',
        '@MinisterNenS',
        'van der Wal',
        'Zeggelink',
    ],
    '2022-06-15': [
        'Dilan Yeşilgöz',
        'Dilan Yesilgoz',
        '@DilanYesilgoz',
        'Yesilgoz',
    ],
    '2021-11-02': [
        'Mark Rutte',
        'Hugo de Jonge',
        'Rutte',
        'de Jonge',
        '@MinPres',
        '@hugodejonge',
    ],
    '2021-10-06': [
        'Mark Rutte',
        'Rutte',
        '@MinPres',
    ],
    '2021-01-10': [
        'Hugo de Jonge',
        'de Jonge',
        '@hugodejonge',
    ],
    '2021-05-15': [
        'Hugo de Jonge',
        'de Jonge',
        '@hugodejonge',
        'Mark Rutte',
        'Rutte',
        '@MinPres',
        'Jaap van Dissel',
        'van Dissel',
        '@RIVM_vDissel',
    ],
}

# Run the filter for every incident and show the first rows of the combined result
filtered_df = filter_telegram_data_on_dates_and_keywords(
    dataset_path=dataset_path,
    date_keywords_dict=date_keywords_dict,
    output_path=output_path,
)
print(filtered_df.head())