In [1]:
'''
This scripts uses a series of keywords to detect
if a tweet refers to climate or environmental protests.
'''

'\nThis scripts uses a series of keywords to detect\nif a tweet refers to climate or environmental protests.\n'

In [2]:
import numpy as np
import pandas as pd

In [3]:
def read_csv(path):
    """
    Reads a CSV file and returns a pandas DataFrame object.

    Args:
    path (str): The file path of the CSV file to read.

    Returns:
    pandas.DataFrame: The contents of the CSV file as a pandas DataFrame object.
    """
    
    return pd.read_csv(path)

In [4]:
def detect_keywords(df, protest_keywords, climate_keywords):
    """
    Detects if any of the given keywords in both lists are present in a row of the input DataFrame.

    Args:
        df (pandas.DataFrame): The input DataFrame to be searched.
        protest_keywords (list): A list of keywords related to protest to search for in the DataFrame.
        climate_keywords (list): A list of keywords related to protest to search for in the DataFrame.


    Returns:
        pandas.DataFrame: A new DataFrame with columns indicating whether any of the keywords were present in each row.

    Raises:
        ValueError: If the input DataFrame is empty or if the keywords list is empty.

    Disclaimer: The docstring for this function was written with the help of ChatGPT, a language model trained by OpenAI.

    Example:
        If we have a DataFrame named 'df' with a column named 'text' that contains text data, and a list of keywords,
        we can detect the presence of the keywords in the following way:

        >>> keywords = ["climate change", "global warming", "environment"]
        >>> df_with_keywords = detect_keywords(df, keywords)
        >>> print(df_with_keywords)
                text               has_protest_keywords   has_climate_keywords
            0   foo bar            False                  False
            1   climate protest    True                   True
            2   climate bar        False                  True
            3   baz protest        True                   False
    """
    if df.empty:
        raise ValueError("Input DataFrame is empty.")
    if not protest_keywords or not climate_keywords:
        raise ValueError("Keywords list is empty.")
    

    # Join the list of keywords into a regex pattern with the '|' operator and capture groups
    def make_pattern(keywords):
        """
        Generates a regular expression pattern that matches any of the provided keywords.

        Args:
            keywords (list): A list of strings containing the keywords to match.

        Returns:
            str: A regular expression pattern that matches any of the provided keywords.
        """
        
        pattern =  "|".join([f'({word.lower()})' for word in keywords])
        return pattern
    
    
    # Creates regex patterns for the two sets of keywords
    protest_pattern = make_pattern(protest_keywords)
    climate_pattern = make_pattern(climate_keywords)

    # Use str.contains() to search for the pattern in each row of the DataFrame
    has_protest_keywords = df.raw_content.str.contains(protest_pattern, case=False, na=False)
    has_climate_keywords = df.raw_content.str.contains(climate_pattern, case=False, na=False)
    
    # Boolean arithmetic to detect which entries have at least one keyword in both lists
    has_both_keywords = (has_protest_keywords) & (has_climate_keywords)
    
    # Add the boolean results as a new column in the DataFrame
    df = df.assign(has_protest_keywords=has_protest_keywords) \
       .assign(has_climate_keywords=has_climate_keywords) \
       .assign(has_both_keywords=has_both_keywords)
        
    # Now we will discover which were the found tokens as well.
    # For eficiency sake, we will do this calculation only on rows where
    # a match was found.

    
    def extract_matches(df, pattern):
        """
        Extracts matches for a regular expression pattern from a pandas DataFrame column.

        Args:
            df (pandas.DataFrame): The DataFrame containing the text data.
            pattern (str): A regular expression pattern to search for in the DataFrame column.

        Returns:
            list: A list of the matched tokens for each row in the DataFrame.
        """

        # Extract the detected tokens and turns them into a bidiemnsional list
        tokens_found = df.raw_content.str.lower().str.extract(pattern)
        
        # Remove nans. Now we have one array of the found words only
        tokens_found = tokens_found.apply(lambda row: [item for item in row if pd.notna(item)], axis=1)
        
        return tokens_found
    
    
    df_matches = df[(df.has_protest_keywords) & (df.has_climate_keywords) ]
    df_no_matches = df[(~df.has_protest_keywords) | (~df.has_climate_keywords)]
    
    protest_tokens = extract_matches(df_matches, protest_pattern)
    climate_tokens = extract_matches(df_matches, climate_pattern)
        
    df_matches['protest_tokens'] = protest_tokens
    df_matches['climate_tokens'] = climate_tokens
    
    df_no_matches['protest_tokens'] = "not calculated, as the text didn't match keywords present in both lists"
    df_no_matches['climate_tokens'] = "not calculated, as the text didn't match keywords present in both lists"

    
    return pd.concat([df_matches, df_no_matches])
    


In [7]:
def main():

    df = read_csv("../../output/mvp/2.tweets_with_relative_engagement/concatenated-tweets.csv")

    protest_keywords = [ "greta",
        "thunberg",
        "painting",
        "soup",
        "gallery",
        "mashed",
        "potatoes",
        "museum",
        "Greta Thunberg",
        "Extinction Rebellion",
        "Fridays for Future",
        "Sunrise Movement",
        "Just Stop Oil",
        "activism",
        "activist",
        "march",
        "demonstration",
        "rally",
        "protest",
        "manifest",
        "strike",
    ]

    climate_keywords = [
        "climate change",
        "climate crisis",
        "global warming",
        "environment",
        "climate justice",
        "renewable energy",
        "fossil fuels",
        "carbon emissions",
        "greenhouse gases",
        "climate emergency",
        "climate action",
        "climate policy",
    ]
    
    df = detect_keywords(df, protest_keywords, climate_keywords)
    
    df.to_csv("../../output/mvp/3.tweets_with_keyword_detection/tweets-with-keywords.csv", sep="|")

In [8]:
%%time
if __name__ == "__main__":
    main()

  has_protest_keywords = df.raw_content.str.contains(protest_pattern, case=False, na=False)
  has_climate_keywords = df.raw_content.str.contains(climate_pattern, case=False, na=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_matches['protest_tokens'] = protest_tokens
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_matches['climate_tokens'] = climate_tokens
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

CPU times: user 9min 57s, sys: 14.8 s, total: 10min 12s
Wall time: 10min 16s
