In [44]:
import pandas as pd
pre_data2 = pd.read_csv("pre_dataset2.csv")

text_column = 'post_body_text'

In [45]:
import re

# Ensure the column is string type to avoid errors with non-text data
pre_data2[text_column] = pre_data2[text_column].astype(str)

# Remove URLs 
pre_data2[text_column] = pre_data2[text_column].str.replace(r'https?://\S+|www\.\S+', '', regex=True)

# Remove emojis using a comprehensive regex pattern
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & pictographs
    "\U0001F680-\U0001F6FF"  # Transport & map symbols
    "\U0001F700-\U0001F77F"  # Alchemical symbols
    "\U0001F1E0-\U0001F1FF"  # Flag (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
pre_data2[text_column] = pre_data2[text_column].str.replace(emoji_pattern, '', regex=True)

# Remove hashtags and the entire word associated
pre_data2[text_column] = pre_data2[text_column].str.replace(r'#\w+', '', regex=True)

# Clean up extra whitespace
pre_data2[text_column] = pre_data2[text_column].str.strip()  # Remove leading/trailing spaces
pre_data2[text_column] = pre_data2[text_column].str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single one


In [46]:
# Remove all repeated posts 
pre_data2.drop_duplicates(subset=[text_column], keep='first', inplace=True)

In [47]:
# change published_at dates
pre_data2['published_at'] = pd.to_datetime(pre_data2['published_at'])

# Extract just the date part (YYYY-MM-DD)
pre_data2['published_at'] = pre_data2['published_at'].dt.date

print(pre_data2[['published_at']].head())

  published_at
0   2020-11-24
1   2020-11-24
2   2020-11-24
3   2020-11-24
5   2020-11-23


In [48]:
# Save the cleaned data to a new CSV file 
pre_data2.to_csv("cleaned_dataset1.csv", index=False)

In [49]:
cleaned1 = pd.read_csv("cleaned_dataset1.csv")
print(cleaned1)
print(cleaned1.columns)

          PostId published_at  \
0      430137083   2020-11-24   
1      440975315   2020-11-24   
2      440975410   2020-11-24   
3      232168406   2020-11-24   
4      154357285   2020-11-23   
...          ...          ...   
23810  412473029   2019-11-25   
23811  476186768   2019-11-25   
23812  487272409   2019-11-25   
23813  375101593   2019-11-25   
23814   82156817   2019-11-25   

                                          post_body_text  
0      BCW Pittsburgh is proud to sponsor the Black E...  
1      RT @KarinesReyes87: Mayor Dinkins set the city...  
2      Mayor Dinkins set the city on the right track ...  
3      <p>DeRay, Kaya, Sam, and De'Ara dive into the ...  
4      President-elect Biden, in choosing Shuwanza Go...  
...                                                  ...  
23810  How Queen and Slim Pays Homage to Decades of B...  
23811  RT @Dlw20161950: HUGE! Black Support for Presi...  
23812  Dada a largada de BLACK WEEK de Nanum Coreano....  
23813  Overti

In [50]:
# 1. Create a mask for posts containing the keywords.
keyword_mask = cleaned1['post_body_text'].str.contains('black|african american', case=False, na=False)

# 2. Create a mask for posts that are 20 or more characters long.
length_mask = cleaned1['post_body_text'].str.len() >= 50

# 3. Apply both masks to filter the DataFrame.
cleaned1 = cleaned1[keyword_mask & length_mask].copy()


In [51]:
# removing irrelevant key words

irrelevant_keywords = [
    "pepper",        
    "friday",       
    "cat", 
    "cats"          
    "dress",        
    "coffee",       
    "tea",           
    "hole",         
    "box",           
    "market",        
    "licorice",     
    "ink",           
    "color",         
    "shoes",         
    "screen",        
    "belt",           
    "plague",
    "hair", 
    "sheep",
    "magic",
    "out",
    "list", 
    "window",
    "shirt",
    "photo"
]

def is_irrelevant(post_body_text, irrelevant_terms):
    """
    Checks if a given text contains any of the irrelevant terms,
    indicating a non-racial usage of "black".
    The check is case-insensitive and looks for whole words.
    """
    if pd.isna(post_body_text): 
        return False
    post_body_text_lower = str(post_body_text).lower() 
    for term in irrelevant_terms:
        if re.search(r'\b' + re.escape(term) + r'\b', post_body_text_lower):
            return True
    return False

In [52]:
# Ensure the 'text' column is clean 
cleaned1['post_body_text'] = cleaned1['post_body_text'].astype(str)

# Create a boolean mask: True for relevant posts, False for irrelevant ones.
relevant_mask = ~cleaned1['post_body_text'].apply(lambda x: is_irrelevant(x, irrelevant_keywords))

# Filter the DataFrame using the mask
cleaned2 = cleaned1[relevant_mask].copy() # .copy() to avoid SettingWithCopyWarning

cleaned2 = cleaned2.reset_index(drop=True)

print("--- Cleaned Data (Irrelevant Posts Removed) ---")
print(cleaned2)
print(f"\nNumber of posts after cleaning: {len(cleaned2)}")
print("\n" + "="*50 + "\n")


--- Cleaned Data (Irrelevant Posts Removed) ---
          PostId published_at  \
0      430137083   2020-11-24   
1      440975315   2020-11-24   
2      232168406   2020-11-24   
3      314080797   2020-11-23   
4      475916367   2020-11-21   
...          ...          ...   
16784  412473029   2019-11-25   
16785  476186768   2019-11-25   
16786  487272409   2019-11-25   
16787  375101593   2019-11-25   
16788   82156817   2019-11-25   

                                          post_body_text  
0      BCW Pittsburgh is proud to sponsor the Black E...  
1      RT @KarinesReyes87: Mayor Dinkins set the city...  
2      <p>DeRay, Kaya, Sam, and De'Ara dive into the ...  
3      FOR IMMEDIATE RELEASE PA AUDITOR GENERAL-ELECT...  
4      My question is how'd the African American vote...  
...                                                  ...  
16784  How Queen and Slim Pays Homage to Decades of B...  
16785  RT @Dlw20161950: HUGE! Black Support for Presi...  
16786  Dada a largada de

In [53]:
cleaned2.to_csv('cleaned_dataset2.csv', index=False)

In [54]:
# Dividing pre and post 
cleaned2 = pd.read_csv("cleaned_dataset2.csv")

cleaned2['published_at'] = pd.to_datetime(cleaned2['published_at'], errors='coerce')
split_date = pd.Timestamp("2020-05-25")

# Filter into pre-event and post-event datasets
pre_cleaned2 = cleaned2[cleaned2['published_at'] < split_date]
post_cleaned2 = cleaned2[cleaned2['published_at'] >= split_date]


pre_cleaned2.to_csv("pre_blm_cleaned2.csv", index=False)
post_cleaned2.to_csv("post_blm_cleaned2.csv", index=False)

In [None]:
pre_cleaned2 = pd.read_csv("pre_blm_cleaned2.csv")
print(pre_cleaned2)

         PostId published_at  \
0     476721818   2020-05-24   
1     476721844   2020-05-24   
2     313534965   2020-05-23   
3       1302491   2020-05-23   
4     475545296   2020-05-22   
...         ...          ...   
3742  412473029   2019-11-25   
3743  476186768   2019-11-25   
3744  487272409   2019-11-25   
3745  375101593   2019-11-25   
3746   82156817   2019-11-25   

                                         post_body_text  
0     RT @DonaldDynasty: The Democrats like to prete...  
1     The Democrats like to pretend to be the party ...  
2     RT @SecretaryCarson: It is disheartening to se...  
3     Trump 2020 senior advisor Katrina Pierson comm...  
4     The Story of Black Appalachia is rarely told.....  
...                                                 ...  
3742  How Queen and Slim Pays Homage to Decades of B...  
3743  RT @Dlw20161950: HUGE! Black Support for Presi...  
3744  Dada a largada de BLACK WEEK de Nanum Coreano....  
3745  Overtime Off the Record we ta

In [56]:
post_cleaned2 = pd.read_csv("post_blm_cleaned2.csv")
print(post_cleaned2)

          PostId published_at  \
0      430137083   2020-11-24   
1      440975315   2020-11-24   
2      232168406   2020-11-24   
3      314080797   2020-11-23   
4      475916367   2020-11-21   
...          ...          ...   
13037  269941062   2020-05-25   
13038  153189948   2020-05-25   
13039  242757680   2020-05-25   
13040  313329734   2020-05-25   
13041   73118219   2020-05-25   

                                          post_body_text  
0      BCW Pittsburgh is proud to sponsor the Black E...  
1      RT @KarinesReyes87: Mayor Dinkins set the city...  
2      <p>DeRay, Kaya, Sam, and De'Ara dive into the ...  
3      FOR IMMEDIATE RELEASE PA AUDITOR GENERAL-ELECT...  
4      My question is how'd the African American vote...  
...                                                  ...  
13037  The deliberate destruction of Black Afrikans c...  
13038  This plant lover is on a quest to show Black m...  
13039  RT @NCGOP: A put .@JoeBiden on BLAST for 'You ...  
13040  Black 

In [57]:
# Take a random sample of 10,000 posts for the post_blm dataset as it is too large
random_10k = post_cleaned2 .sample(n=10000, random_state=42)  

random_10k.to_csv("post_blm_cleaned2_10k.csv", index=False)

print(f"Sample size: {len(random_10k)}")

Sample size: 10000
