In [1]:
import glob
import json
import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
def read_files(directory):
    '''
    Reads all JSON files in a given directory into a single pandas dataframe.
    (Disclaimer: this function written by ChatGPT and edited by me)
    
    Parameters:
    -----------
    directory : str
        The path to the directory containing the JSON files.
        
    Returns:
    --------
    pandas.DataFrame
        A single dataframe containing all data from the JSON files.
    '''
   
    # Initialize an empty list to store the dataframes
    dfs = []

    # Loop through all the JSON files in the directory
    for filename in glob.glob(directory + '/*.json'):

        # Read the JSON file into a pandas dataframe
        with open(filename, 'r') as f:
            json_data = json.load(f)
            df = pd.DataFrame(json_data)
            dfs.append(df)

    # Concatenate all the dataframes into a single dataframe
    df = pd.concat(dfs, ignore_index=True)
    
    return df

In [3]:
def set_date_interval(df):
    """
    Adds four columns to a pandas DataFrame that contain the date 7 days before and
    after each date in the DataFrame, the original date in YYYY-MM-DD format, and a week ID.


    Args:
        df (pandas.DataFrame): The DataFrame to which the date interval columns will be added. 
        This DataFrame must contain a 'date' column that contains datetime objects.

    Returns:
        pandas.DataFrame: The input DataFrame with four new columns: '7_days_before', 
        '7_days_after', 'date_str', and 'week_id'.

        These columns contain the date 7 days before and after each date in the 'date' column of the input DataFrame,
        the date in YYYY-MM-DD format, and a week identifier in the format 'YEAR_WEEK'.

    Disclaimer:
        This docstring was written by ChatGPT, a large language model trained by OpenAI.
    """
    df['datetime'] = pd.to_datetime(df.date)
    
    df['date_str'] = df.datetime.dt.strftime("%Y-%m-%d")
    df['7_days_before'] = df['datetime'] - pd.DateOffset(days=7)
    df['7_days_after'] = df['datetime'] + pd.DateOffset(days=7)
    df['week_id'] = df.datetime.dt.year.astype(str) + "_" + df.datetime.dt.isocalendar().week.astype(str)
    
    return df

In [4]:
def compute_group_statistics(df: pd.DataFrame, group: list, columns: list) -> pd.DataFrame:
    """
    For each item of interest, compute the average and standard deviation from each group.

    Args:
        df (pd.DataFrame): The input DataFrame.
        group (list): A list containing the columns over which a groupby will be applied.
        columns (list): A list of column names to compute the statistics.

    Returns:
        A DataFrame with the average and standard deviation for each group.

    Disclaimer:
        This docstring was written by ChatGPT, a large language model trained by OpenAI.
    """
    
    # Compute the average and standard deviation from each group
    avgs = df.groupby(group)[columns].mean().reset_index()
    stdevs = df.groupby(group)[columns].std().reset_index()
    percentiles = df.groupby(group)[columns].quantile(.9).reset_index()
    
    # Rename the columns to avoid naming conflicts
    avgs = avgs.rename(columns = {
        col_name: f'group_avg_for_{col_name}' for col_name in avgs.columns if col_name not in ['user', 'date_str', 'week_id']
    })

    stdevs = stdevs.rename(columns = {
        col_name: f'group_stdev_for_{col_name}'for col_name in stdevs.columns if col_name not in ['user', 'date_str', 'week_id']
    })
    
    percentiles = percentiles.rename(columns = {
        col_name: f'group_90th_percentile_for_{col_name}'for col_name in percentiles.columns if col_name not in ['user', 'date_str', 'week_id']
    })    

    # Merge the two DataFrames and return the result
    result = pd.merge(avgs, stdevs, on=group, suffixes=["","_y"]).merge(percentiles, on=group, suffixes=["", "_y"])
    result = result.drop(columns=[col for col in result.columns if "_y" in col])
    
    return result


In [5]:
def compute_diffs(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    For each item, computes how many standard deviations we are away from the average.

    Args:
    - df (pd.DataFrame): The pandas DataFrame containing the data.
    - column (str): The name of the column to compute differences for.

    Returns:
    - The input DataFrame with a new column added, containing the computed differences.

    Example:
    >>> data = pd.DataFrame({'values': [1, 2, 3, 4, 5],
                             'group_avg_for_values': [3, 3, 3, 3, 3],
                             'group_stdev_for_values': [1, 1, 1, 1, 1]})
    >>> result = compute_diffs(data, 'values')
    >>> print(result)
       values  group_avg_for_values  group_stdev_for_values  values_diff
    0       1                     3                       1         -2.0
    1       2                     3                       1         -1.0
    2       3                     3                       1          0.0
    3       4                     3                       1          1.0
    4       5                     3                       1          2.0
    
    Disclaimer:
        This docstring was written by ChatGPT, a large language model trained by OpenAI.
    """
    
    df[f'{column}_diff'] = (df[column] - df[f'group_avg_for_{column}']) / df[f'group_stdev_for_{column}']

    return df

In [6]:
def compute_quantiles(df: pd.DataFrame, group: list, column: str) -> pd.DataFrame:
    """
    For each item, compute the quantile relative to all other items in the same group.
    
    
    Args:
    - df (pd.DataFrame): The pandas DataFrame containing the data.
    - group (list): A list containing the columns over which a groupby will be applied.
    - column (str): The name of the column to compute differences for.

    Returns:
    - The input DataFrame with a new column added, containing the computed quantiles
    
    Disclaimer:
        This docstring was written by ChatGPT, a large language model trained by OpenAI.
    """
        
    # Adapted from https://stackoverflow.com/questions/37093088/pandas-create-percentile-field-based-on-groupby-with-level-1/37093160#37093160
    df[f'percentile_for_{column}'] = df.groupby(group) \
        .total_engagement \
        .apply(lambda series: \
                pd.Series(
                    [stats.percentileofscore(series, row, kind='rank') for row in series], # See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.percentileofscore.html
                    index=series.index
                )
              )
    
    df[f'percentile_for_{column}'] = 100 - df[f'percentile_for_{column}']
    df[f'percentile_for_{column}'] = df[f'percentile_for_{column}'].round()
               
    return df

In [7]:
def main():
    # Reads the collected tweets
    df = read_files("../../output/mvp/1.raw_tweets/jsons/")
    
    # Creates date columns
    df = set_date_interval(df)
    
    # Creates measures of interest: total engagement, engagement with no interaction,
    # engagement with interaction, interaction ratios
    df['total_engagement'] = df.like_count + df.retweet_count + df.quote_count + df.reply_count
#     df['non_interactive_engagement'] = df.like_count + df.retweet_count
#     df['interactive_engagement'] = df.quote_count + df.reply_count
#     df['interactive_ratio'] = df.interactive_engagement / df.total_engagement
    
#     # Computes the averages and standard deviations for the group
#     gpby = compute_group_statistics(df, ['user_id', 'week_id'], ["like_count", "retweet_count", "quote_count", "reply_count",
#     "total_engagement", "non_interactive_engagement", "interactive_engagement","interactive_ratio"])
    
#     gpby = compute_group_statistics(df, ['user', 'week_id'], ["total_engagement"])
    
    # Joins the computed values with the entire data frame
#     df = df.merge(gpby, on=['user', 'date_str'])
#     df = df.merge(gpby, on=['user', 'week_id'])    
    df = compute_quantiles(df, ['user', 'week_id'], "total_engagement")
    
    # Computes the differences
#     for column in ["like_count", "retweet_count", "quote_count", "reply_count",
#         'total_engagement', 'non_interactive_engagement', 'interactive_engagement', 'interactive_ratio']:
#         df = compute_diffs(df, column)
        
    df.to_csv("../../output/mvp/2.tweets_with_relative_engagement/concatenated-tweets.csv", index=False)
        
    return df

In [8]:
if __name__ == "__main__":
    df = main()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda series: \
