In [1]:
#import libraries
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
import tqdm
import glob
import os
import ast
from itertools import cycle

In [2]:
#spacy definitions
import spacy
print(f"spaCy version: {spacy.__version__}")
print(f"CUDA available: {spacy.prefer_gpu()}")
# print(f"GPU device count: {spacy.util.get_gpu_count()}")

# Load spaCy model
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm") 

# Add your custom EntityRuler
ruler = nlp.add_pipe("entity_ruler", before="ner")

spaCy version: 3.8.5
CUDA available: True


In [3]:
#Init Variables for csv names
year="2024"
# month="01"
cwd=os.getcwd()
month=os.path.basename(cwd)
# print(f"{month}")

#special_identifier='_xfin_amt_sep_spi_ama'
special_identifier='_moving_average' #for csv output
# Construct the directory name
output_directory = f"batch{special_identifier}"

services = [
    "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
    "ER", "Youtube", "Reddit", "Netflix",
    "Xfinity", "Amtrak", "Septa", "Spirit", "American",
    "Disney"
]

complaint_patterns = [{"label": "SERVICE", "pattern": service} for service in services]
ruler.add_patterns(complaint_patterns)

moving_average_window = 30

In [4]:
# Load data
import pandas as pd
import glob
import os

# parquet_directory = f"PARQUET/batch{special_identifier}"
parquet_directory = f"PARQUET/batch_base_parquet"
plot_output_directory = f"YEAR_APPEND/batch{special_identifier}/PLOTS"

# --- Read and Concatenate Parquet Files ---
all_parquet_files = []
try:
    # Use glob to find all files ending with .parquet in the specified directory
    parquet_file_pattern = os.path.join(parquet_directory, "ner*.parquet")
    all_parquet_files = glob.glob(parquet_file_pattern)

    if not all_parquet_files:
        print(f"No .parquet files found in directory: {parquet_directory}")
    else:
        print(f"Found {len(all_parquet_files)} .parquet files in {parquet_directory}")

except Exception as e:
    print(f"Error finding parquet files: {e}")
    # You might want to exit or handle this error differently
    exit() # Exit the script if we can't find files

# List to hold DataFrames read from each file
dfs = []

# Read each parquet file and append to the list
for f in all_parquet_files:
    try:
        print(f"Reading file: {f}")
        df = pd.read_parquet(f)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading parquet file {f}: {e}")
        # Decide whether to skip the file or stop processing
        continue # Skip this file and try the next one

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.DataFrame() # Initialize an empty DataFrame
if dfs: # Check if the list of DataFrames is not empty
    try:
        print("Concatenating DataFrames...")
        combined_df = pd.concat(dfs, ignore_index=True)
        print("Concatenation complete.")
        print(f"Combined DataFrame shape: {combined_df.shape}")
        # print(combined_df.head()) # Display the head of the combined DataFrame

    except Exception as e:
        print(f"Error concatenating DataFrames: {e}")
else:
    print("No DataFrames were loaded to concatenate.")

Found 12 .parquet files in PARQUET/batch_base_parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_04_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_10_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_07_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_08_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_05_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_01_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_12_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_06_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_results_append_sum2024_03_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_base_parquet/ner_r

In [5]:
# combined_final_parquet = f'APPEND_SUM_{year}_sentiment{special_identifier}.parquet'
# combined_df.to_parquet(os.path.join(parquet_directory, combined_final_parquet))

In [5]:
combined_df.columns

Index(['id', 'date', 'title', 'author', 'url', 'content', 'post_id',
       'timestamp', 'subreddit', 'entities', 'sentiment'],
      dtype='object')

In [8]:
import pandas as pd
from scipy.signal import find_peaks

def find_sentiment_valleys_with_context(df, entity, year, month, context_window=1):
    """
    Finds the dates of sentiment valleys for a given entity in a specific month
    and retrieves the text of the valley posts and surrounding posts.

    Args:
        df (pd.DataFrame): The input dataframe with 'date', 'entities', 'sentiment', and 'content' columns.
        entity (str): The entity to analyze (e.g., "Comcast").
        year (int): The year of interest.
        month (int): The month of interest (1-12).
        context_window (int): The number of surrounding posts to include (default is 1).

    Returns:
        dict: A dictionary where keys are valley dates and values are lists of dictionaries,
              each containing the 'date', 'sentiment', and 'content' of the post.
              Returns an empty dictionary if no data is found or no valleys are detected.
    """
    df['date'] = pd.to_datetime(df['date'])
    month_data = df[(df['date'].dt.year == year) & (df['date'].dt.month == month)].copy()

    if month_data.empty:
        return {}

    entity_data = month_data[month_data['entities'].apply(lambda x: entity in x)].sort_values(by='date').copy()

    if entity_data.empty:
        return {}

    daily_sentiment = entity_data.groupby('date')['sentiment'].mean()

    if len(daily_sentiment) < 3:
        return {}

    valleys_indices = find_peaks(-daily_sentiment.values)[0]
    valley_dates = [daily_sentiment.index[i].date() for i in valleys_indices]

    valley_posts_with_context = {}
    for valley_date in valley_dates:
        valley_posts_with_context[valley_date] = []
        valley_row = entity_data[entity_data['date'].dt.date == valley_date].iloc[0] # Get one of the posts on the valley date
        valley_index_in_entity_data = entity_data.index.get_loc(valley_row.name)

        start_index = max(0, valley_index_in_entity_data - context_window)
        end_index = min(len(entity_data), valley_index_in_entity_data + context_window + 1)

        context_posts = entity_data.iloc[start_index:end_index][['date', 'sentiment', 'content']].to_dict('records')
        valley_posts_with_context[valley_date] = context_posts

    return valley_posts_with_context

services = [
    "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
    "ER", "Youtube", "Reddit", "Netflix",
    "Xfinity", "Amtrak", "Septa", "Spirit", "American",
    "Disney"
]

target_year = 2024
target_month = 1
context_window = 3

all_valleys_with_context = {}
for entity in services:
    valleys_context = find_sentiment_valleys_with_context(combined_df, entity, target_year, target_month, context_window)
    if valleys_context:
        all_valleys_with_context[entity] = valleys_context

if all_valleys_with_context:
    print(f"Sentiment Valleys with Surrounding Posts for {target_year}-{target_month:02d} (Context Window: {context_window}):")
    for entity, valley_data in all_valleys_with_context.items():
        print(f"\n- Entity: {entity}")
        for valley_date, context_posts in valley_data.items():
            print(f"  - Valley Date: {valley_date}")
            for post in context_posts:
                print(f"    - Date: {post['date'].date()}, Sentiment: {post['sentiment']:.2f}, Content: {post['content'][:50]}...")
else:
    print(f"No sentiment valleys found for any of the specified entities in {target_year}-{target_month:02d}.")

Sentiment Valleys with Surrounding Posts for 2024-01 (Context Window: 3):

- Entity: Comcast
  - Valley Date: 2024-01-10
    - Date: 2024-01-03, Sentiment: 0.38, Content: Internet down here and there for blips is to be ex...
    - Date: 2024-01-04, Sentiment: 0.56, Content: Seems like bad service she my Ziply home internet ...
    - Date: 2024-01-10, Sentiment: -0.46, Content: Hello all, back in November I moved into a new com...
    - Date: 2024-01-17, Sentiment: 0.94, Content: I'm curious if people can give me their opinions o...
    - Date: 2024-01-26, Sentiment: -0.60, Content: We have VZW and pay $101 for two lines. Wondering ...
    - Date: 2024-01-29, Sentiment: -0.84, Content: Hi i pay for 1200 mbps at comcast and just got 60 ...

- Entity: Airline
  - Valley Date: 2024-01-07
    - Date: 2024-01-04, Sentiment: 0.51, Content: So I've posted here a few times already about my u...
    - Date: 2024-01-05, Sentiment: -0.17, Content: Hi guys.
So I kind of messed up and overlooked the