In [22]:
import pandas as pd
import numpy as np
from datetime import timedelta
import plotly.graph_objects as go

In [23]:
def find_string_in_messages(df, string_to_find):
    """
    Find a string in df and return a subset df that of messages that include the string
    :param: df: dataframe of interest
    :param: string_to_find: the string to find in message
    """
    subset_df = df[df['message'].str.contains(string_to_find, na=False, case=False)]
    subset_df.reset_index(inplace=True)
    return subset_df

def get_account_message_counts(df):
    """
    For each account in the dataframe, creates a count for how many messages they have sent
    and returns a dataframe that contains account names and count.
    :param: df: dataframe of interest
    """
    account_sums = df.groupby(['account']).size().reset_index(name='count')
    account_sums = account_sums.sort_values(by=['count'], ascending=False)
    return account_sums

def get_topic_df(df, keywords):
    """
    Creates a subset df of messages that include strings from a list of keywords
    :param: df: dataframe of interest
    :param: keywords: list of strings to select for
    """
    mask = df[df['message'].str.contains('|'.join(keywords), na=False, case=False)]
    mask.reset_index(inplace=True, drop=True)
    return mask

In [24]:
def clean_yint_data(file_url):
    """
    Takes in a hosted csv file of yint messages and returns a cleaned dataframe
    Removes standard bots to allow gov officials to quickly remove non-community member posts
    
    Removes commercial bots promoting deals and bots promoting usage of the rumble app
    
    Note: recommend doing spot checks to ensure real users are not being removed
    
    :param: file_url: the hosted location of a raw csv file containing y*int data
    """
    df = pd.read_csv(file_url, parse_dates=[0], infer_datetime_format=True)
    
    # Create time frame for data
    start_date = min(df['time'])
    end_date = max(df['time'])
    delta = end_date - start_date
    days = delta.days
    # get a standard amount of posts per day that would probably be a bot
    # in this case, I set the standard to 10+ posts a day
    bot_level_posting = 10*days 
    
    # Eliminate "deals" bots promoting commercial info
    bot_keywords = ['deal', 'sale', 'rumble']
    potential_bot_df = get_topic_df(df, bot_keywords)
    # Get list of accounts that mention deals
    potential_bot_accounts = potential_bot_df['account'].unique()
    
    # Remove users that don't seem like bots
    boolean_series = df['account'].isin(potential_bot_accounts)
    potential_bot_df = df[boolean_series]
    account_sums = get_account_message_counts(potential_bot_df)
    # NOTE: Recommend doing spot checks every so often to confirm you are not removing real users
    bots = account_sums.loc[account_sums['count'] > bot_level_posting]
    # Get list of bot account names
    bot_accounts = bots['account'].unique()


    # Create a df that doesn't include bots
    cleaned_df = df[~df['account'].isin(bot_accounts)]
    cleaned_df.reset_index(inplace=True)
    return cleaned_df

### Running the notebook
Paste the url of your hosted csv below to create the cleaned dataset. This will save a cleaned version of the data as a csv.

In [25]:
# Edit the url variable here:
url = 'https://raw.githubusercontent.com/data-challengers/DC5/main/DC5-Data/YInt%20Social%20Media%20Data/YInt.csv'
df = clean_yint_data(url)
df.to_csv('../additional_data/cleaned_yint_data.csv', index=False) # output csv

### Create earthquake-related message plots by neighborhood

The following plot can be used to allocate resources by neighborhood, depending on which have the highest number of earthquake messages. It also outputs a csv that saves all relevant messages for further deep dives by city officials.

In [26]:
# Plot frequency of messaging from neighborhoods over time
def plot_messages_over_time(df, title):
    fig = go.Figure()
    # plot all classifications as different lines
    for i in range(0, len(neighborhoods)):
        # filter for specific classification
        expr = df['location'] == neighborhoods[i]
        fig.add_trace(go.Scatter(
            x=df[expr]['hour'],
            y=df[expr]['count'],
            name=neighborhoods[i],
            connectgaps=False,
            mode='lines')
        )
        
    fig.update_layout(
        title_text=title,
        xaxis_title="Time",
        yaxis_title="Count",
        legend_title="Neighborhood")
    
    fig.show()

def create_neighborhood_grouped_count_plot(df, title):
    grouped_df = create_message_counts(df)
    fig_df = fill_df_nas(grouped_df, 'hour', 'location', neighborhoods)
    plot_messages_over_time(fig_df, title)

def fill_df_nas(df, time_col, group_col, group_arr):
    """
    Expands dataframe to include all x-axis values for every group, and
    fills dataframes with NAs when there are no observations for the specified group.
    
    Modified function that includes NA values for all time points within range from min-max times observed.
    
    Useful for Plotly graphs in mode='lines+markers'
    :param: df: dataframe of interest
    :param: time_col: string name of column that contains time variable (or generally, the x variable)
    :param: group_col: string name of column that contains the groups to plot over different traces
    :param: group_arr: list or numpy array of all unique observations in df['group_col']
    """
    # Sort by time for graphing
    df = df.sort_values(by=[time_col])
    # Get series time range between min and max time points
    time_range = pd.date_range(df[time_col].min(), df[time_col].max(), freq='H')
    
    df_series = pd.Series(np.tile(group_arr, len(time_range)))
    df_idx_series = time_range \
        .repeat(len(group_arr))
    new_df = pd.DataFrame({time_col: df_idx_series,
                          group_col: df_series})
    df_with_nas = pd.merge(new_df, df, on=[time_col, group_col], how='left')
    return df_with_nas
    
# Round times to the hour (to be grouped for plotting)
def create_message_counts(df):
    df['hour'] = df['time'].dt.floor('h')
    # Create count aggregate of classification appearances over time
    grouped_df = df.groupby(['location', 'hour']).size().reset_index(name='count')
    return grouped_df

# Get list of neighborhoods being used in Yint dataset
neighborhoods = df['location'].unique()
alert_topics = ['quake', 'quaking', 'aftershock', 'shake', 'shaking', 'outage', 'sewage', 'fire']
alert_df = get_topic_df(df, alert_topics)
alert_df.to_csv('../additional_data/yint_alerts.csv', index=False) 
create_neighborhood_grouped_count_plot(alert_df, 'Y*Int Earthquake-Related Messages')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

