In [15]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [16]:
def find_string_in_messages(df, string_to_find):
    """
    Return the rows of df whose 'message' text contains string_to_find.

    The match is case-insensitive and rows with a missing message never match.
    Note that ``str.contains`` is called with its default settings, so
    string_to_find is interpreted as a regular expression pattern.

    :param: df: dataframe of interest (needs a 'message' column)
    :param: string_to_find: the string to find in message
    :return: a new dataframe of the matching rows; the original row labels are
        kept in an 'index' column and the rows are renumbered from 0
    """
    is_match = df['message'].str.contains(string_to_find, case=False, na=False)
    # reset_index() without drop moves the old row labels into an 'index' column
    return df.loc[is_match].reset_index()

def get_account_message_counts(df):
    """
    Count how many rows (messages) each account has in df.

    :param: df: dataframe of interest (needs an 'account' column)
    :return: dataframe with one row per account and the columns 'account' and
        'count', ordered from the most to the least active account
    """
    return (
        df.groupby(['account'])
        .size()                          # number of rows per account
        .reset_index(name='count')       # Series -> two-column dataframe
        .sort_values(by='count', ascending=False)
    )

def get_topic_df(df, keywords):
    """
    Creates a subset df of messages that include any string from a list of keywords.

    Matching is case-insensitive and rows with a missing message never match.
    The keywords are joined with '|' and passed to ``str.contains``, so each
    keyword is interpreted as a regular expression pattern.

    :param: df: dataframe of interest (needs a 'message' column)
    :param: keywords: list of strings to select for; a single string is treated
        as a one-element list and empty strings are ignored
    :return: a new dataframe holding the matching rows, renumbered from 0; it is
        empty when no usable keyword is given
    """
    # A bare string would otherwise be joined character by character ("d|e|a|l").
    if isinstance(keywords, str):
        keywords = [keywords]

    # An empty alternative in the pattern matches every non-null message, so
    # drop empty keywords and short-circuit when nothing is left to search for.
    patterns = [keyword for keyword in keywords if keyword]
    if not patterns:
        return df.iloc[0:0].reset_index(drop=True)

    matches = df['message'].str.contains('|'.join(patterns), na=False, case=False)
    return df[matches].reset_index(drop=True)

In [17]:
def clean_yint_data(file_url, bot_keywords=None, bot_posts_per_day=10):
    """
    Takes in a hosted csv file of yint messages and returns a cleaned dataframe
    with the messages of likely bot accounts removed.

    An account is treated as a bot when BOTH of the following hold:
      * at least one of its messages matches one of the bot keywords, and
      * its total number of messages exceeds bot_posts_per_day multiplied by the
        number of whole days covered by the data (at least one day).

    Note: recommend doing spot checks to ensure real users are not being removed

    :param: file_url: the hosted location of a raw csv file containing y*int data;
        the first column is parsed as dates and the file must provide 'time',
        'account' and 'message' columns
    :param: bot_keywords: keywords that mark an account as a potential bot;
        defaults to ['deal', 'sale', 'rumble'] (commercial deals bots and bots
        promoting usage of the rumble app)
    :param: bot_posts_per_day: average posts per day above which a potential bot
        account is removed; defaults to 10
    :return: dataframe without the bot accounts' messages; the original row
        labels are kept in an 'index' column and the rows are renumbered from 0
    """
    if bot_keywords is None:
        bot_keywords = ['deal', 'sale', 'rumble']

    # Date parsing already infers the format; the deprecated
    # infer_datetime_format flag is intentionally not passed.
    df = pd.read_csv(file_url, parse_dates=[0])

    # Create time frame for data. Series.min()/max() skip missing timestamps,
    # which the Python builtins min()/max() do not handle reliably.
    time_span = df['time'].max() - df['time'].min()
    days = time_span.days if pd.notna(time_span) else 0
    # Data covering less than 24 hours would give days == 0 and therefore a
    # threshold of 0, flagging every keyword-matching account; use at least 1 day.
    days = max(days, 1)
    # total number of posts over the whole period that would probably be a bot
    bot_level_posting = bot_posts_per_day * days

    # Accounts with at least one message mentioning a bot keyword
    potential_bot_accounts = get_topic_df(df, bot_keywords)['account'].unique()

    # Count ALL messages of those accounts, not only the keyword matches, so
    # users that merely mention a keyword once are not treated as bots
    potential_bot_df = df[df['account'].isin(potential_bot_accounts)]
    account_sums = get_account_message_counts(potential_bot_df)
    # NOTE: Recommend doing spot checks every so often to confirm you are not removing real users
    bots = account_sums.loc[account_sums['count'] > bot_level_posting]
    bot_accounts = bots['account'].unique()

    # Create a df that doesn't include bots; reset_index() returns a new frame
    # instead of mutating a filtered slice in place
    cleaned_df = df[~df['account'].isin(bot_accounts)].reset_index()
    return cleaned_df

### Running the notebook
Paste the URL of your hosted CSV file into the `url` variable below, then run the cell to create the cleaned dataset.

In [18]:
# Edit the url variable here: it must point at the hosted raw csv file of
# y*int messages that should be cleaned.
url = 'https://raw.githubusercontent.com/data-challengers/DC5/main/DC5-Data/YInt%20Social%20Media%20Data/YInt.csv'
# Read the csv and remove the accounts that clean_yint_data flags as bots;
# the cleaned dataframe is kept in cleaned_df.
cleaned_df = clean_yint_data(url)