In [10]:
#import libraries
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
import tqdm
import glob
import os
import ast
from itertools import cycle

In [11]:
#spacy definitions
import spacy
print(f"spaCy version: {spacy.__version__}")
print(f"CUDA available: {spacy.prefer_gpu()}")
# print(f"GPU device count: {spacy.util.get_gpu_count()}")

# Load spaCy model
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm") 

# Add your custom EntityRuler
ruler = nlp.add_pipe("entity_ruler", before="ner")

spaCy version: 3.8.5
CUDA available: True


In [12]:
#Init Variables for csv names
year="2024"
# month="01"
cwd=os.getcwd()
month=os.path.basename(cwd)
# print(f"{month}")

#special_identifier='_xfin_amt_sep_spi_ama'
special_identifier='_MULTI_PLOT' #for csv output
# Construct the directory name
output_directory = f"batch{special_identifier}"

services = [
    "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
    "ER", "Youtube", "Reddit", "Netflix",
    "Xfinity", "Amtrak", "Septa", "Spirit", "American",
    "Disney"
]

complaint_patterns = [{"label": "SERVICE", "pattern": service} for service in services]
ruler.add_patterns(complaint_patterns)

In [13]:
# Load data
import pandas as pd
import glob
import os

parquet_directory = f"PARQUET/batch{special_identifier}"
plot_output_directory = f"YEAR_APPEND/batch{special_identifier}/PLOTS"

# --- Read and Concatenate Parquet Files ---
all_parquet_files = []
try:
    # Use glob to find all files ending with .parquet in the specified directory
    parquet_file_pattern = os.path.join(parquet_directory, "ner*.parquet")
    all_parquet_files = glob.glob(parquet_file_pattern)

    if not all_parquet_files:
        print(f"No .parquet files found in directory: {parquet_directory}")
    else:
        print(f"Found {len(all_parquet_files)} .parquet files in {parquet_directory}")

except Exception as e:
    print(f"Error finding parquet files: {e}")
    # You might want to exit or handle this error differently
    exit() # Exit the script if we can't find files

# List to hold DataFrames read from each file
dfs = []

# Read each parquet file and append to the list
for f in all_parquet_files:
    try:
        print(f"Reading file: {f}")
        df = pd.read_parquet(f)
        dfs.append(df)
    except Exception as e:
        print(f"Error reading parquet file {f}: {e}")
        # Decide whether to skip the file or stop processing
        continue # Skip this file and try the next one

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.DataFrame() # Initialize an empty DataFrame
if dfs: # Check if the list of DataFrames is not empty
    try:
        print("Concatenating DataFrames...")
        combined_df = pd.concat(dfs, ignore_index=True)
        print("Concatenation complete.")
        print(f"Combined DataFrame shape: {combined_df.shape}")
        # print(combined_df.head()) # Display the head of the combined DataFrame

    except Exception as e:
        print(f"Error concatenating DataFrames: {e}")
else:
    print("No DataFrames were loaded to concatenate.")

Found 12 .parquet files in PARQUET/batch_MULTI_PLOT
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_12_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_09_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_11_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_01_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_02_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_03_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_04_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_05_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_06_sentiment_MULTI_PLOT.parquet
Reading file: PARQUET/batch_MULTI_PLOT/ner_results_append_sum2024_

In [None]:
# combined_final_parquet = f'APPEND_SUM_{year}_sentiment{special_identifier}.parquet'
# combined_df.to_parquet(os.path.join(parquet_directory, combined_final_parquet))

In [6]:
combined_df.columns

Index(['id', 'date', 'title', 'author', 'url', 'content', 'post_id',
       'timestamp', 'subreddit', 'entities', 'sentiment'],
      dtype='object')

In [14]:
# Create the PLOTS directory if it doesn't exist

# combined_final_parquet = f'APPEND_SUM_{year}_sentiment{special_identifier}.parquet'
# combined_df = pd.read_parquet(os.path.join(parquet_directory, combined_final_parquet))

os.makedirs(plot_output_directory, exist_ok=True)

In [8]:
#Plot 12 plots, monthly, all entities
import pandas as pd
import ast
import matplotlib.pyplot as plt
from itertools import cycle
import os
import calendar # To get month names
from tqdm.notebook import tqdm
import numpy as np # Import NumPy

# Assuming combined_df is already loaded with necessary columns (timestamp, entities, sentiment)
# Assuming 'services' and 'plot_output_directory' are defined

# --- Configuration ---
# Assuming your DataFrame is named 'combined_df' and has the specified columns:
# id, date, title, author, url, content, post_id, timestamp, subreddit, entities, sentiment
# Assuming 'services' and 'plot_output_directory' are defined.

# Example definitions if they are not defined elsewhere:
# services = [
#     "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
#     "ER", "Youtube", "Reddit", "Netflix",
#     "Xfinity", "Amtrak", "Septa", "Spirit", "American",
#     "Disney"
# ]

# Define your plot output directory
# special_identifier = "_your_identifier" # Example, replace with your value
# plot_output_directory = f"batch{special_identifier}/PLOTS"
# os.makedirs(plot_output_directory, exist_ok=True) # Ensure directory exists


# --- Extract relevant data and calculate daily average sentiment ---
entity_sentiments_over_time = []

# Assuming 'combined_df' is your DataFrame with columns:
# id, date, title, author, url, content, post_id, timestamp, subreddit, entities, sentiment

# Wrap the iteration over DataFrame rows with tqdm, using combined_df
for index, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Extracting Plotting Data"):
    try:
        entities_value = row['entities']
        # Handle empty, NaN, or invalid entities
        if pd.isna(entities_value) or entities_value is None or str(entities_value).strip() == "":
            ents = []
        else:
            entities_str = str(entities_value)
            try:
                ents = ast.literal_eval(entities_str)
                if not isinstance(ents, list):
                     print(f"Warning: ast.literal_eval did not return a list for row {index}. Result type: {type(ents)}. Value: {entities_value}")
                     ents = []
            except (SyntaxError, ValueError, Exception) as e:
                print(f"Error evaluating entities string for row {index}: {e}. Value: {entities_value}")
                ents = []

        # Convert timestamp to datetime and then extract date
        # Assuming 'timestamp' is a timestamp (integer or float)
        if pd.isna(row['timestamp']) or row['timestamp'] is None:
            post_date = None # Handle missing timestamp
        else:
            try:
                # Attempt to convert to numeric first for robustness before datetime conversion
                timestamp_numeric = pd.to_numeric(row['timestamp'], errors='coerce')
                if not pd.isna(timestamp_numeric):
                    post_date = pd.to_datetime(timestamp_numeric, unit='s').date() # Adjust unit if needed
                else:
                    print(f"Warning: Could not convert timestamp to numeric for row {index}: {row['timestamp']}")
                    post_date = None
            except (ValueError, TypeError) as e:
                 print(f"Error converting timestamp {row['timestamp']} on row {index}: {e}")
                 post_date = None # Handle invalid timestamp


        sentiment = row['sentiment'] # Assuming sentiment is already a number or NaN

        if post_date is not None: # Only process if date is valid
             for ent_text, ent_label in ents:
                 if isinstance(ent_text, str) and isinstance(ent_label, str) and ent_label == 'SERVICE' and ent_text in services:
                    if np.isreal(sentiment): # Corrected: Use np.isreal
                        entity_sentiments_over_time.append((ent_text, post_date, float(sentiment))) # Ensure sentiment is float
                    else:
                         print(f"Warning: Skipping non-numeric sentiment {sentiment} for row {index}.")


    except (KeyError, TypeError, Exception) as e:
        print(f"Error processing row {index}: {e}")
        continue


# Create a DataFrame from the extracted data
entity_sentiment_df = pd.DataFrame(entity_sentiments_over_time, columns=['service', 'date', 'sentiment'])

# Convert the 'date' column to datetime objects and extract month and day
if not entity_sentiment_df.empty:
    # Assuming 'date' column in combined_df is already in a format parsable by pd.to_datetime
    if not pd.api.types.is_datetime64_any_dtype(entity_sentiment_df['date']):
         try:
             entity_sentiment_df['date'] = pd.to_datetime(entity_sentiment_df['date'])
         except ValueError as e:
              print(f"Error converting 'date' column to datetime: {e}")
              print("Please ensure the 'date' column is in a recognizable format.")
              entity_sentiment_df = pd.DataFrame() # Clear if date conversion fails

if not entity_sentiment_df.empty:
    entity_sentiment_df['month'] = entity_sentiment_df['date'].dt.month
    entity_sentiment_df['day'] = entity_sentiment_df['date'].dt.day

    # Calculate daily average sentiment
    # We need to group by service, month, and day to get daily average within each month
    if all(col in entity_sentiment_df.columns for col in ['service', 'month', 'day', 'sentiment']):
         avg_sentiment_per_day_month = entity_sentiment_df.groupby(['service', 'month', 'day'])['sentiment'].mean().reset_index()
    else:
         print("Required columns for grouping ('service', 'month', 'day', 'sentiment') not found after extraction.")
         avg_sentiment_per_day_month = pd.DataFrame() # Ensure it's an empty DataFrame


    # --- Plotting 12 Plots by Month ---
    if not avg_sentiment_per_day_month.empty: # Only attempt plotting if aggregation was successful
        fig, axes = plt.subplots(4, 3, figsize=(18, 16), sharey=True) # Create a 4x3 grid of subplots
        fig.suptitle("Average Daily Sentiment per Service by Month (All Years Combined)", fontsize=16, y=1.02) # Main title for the figure

        # Flatten the 2D axes array for easy iteration
        axes = axes.flatten()

        # Create a color cycle that will be consistent across services in all plots
        colors = cycle(plt.cm.tab10.colors)

        # Wrap the loop iterating through months with tqdm
        for month_num in tqdm(range(1, 13), desc="Generating Monthly Plots"): # Iterate through months 1 to 12 with tqdm
            ax = axes[month_num - 1] # Get the correct subplot axes for the current month (0-indexed)
            month_name = calendar.month_name[month_num] # Get the month name (e.g., 'January')

            # Filter data for the current month
            # .copy() is used to avoid SettingWithCopyWarning
            monthly_data = avg_sentiment_per_day_month[avg_sentiment_per_day_month['month'] == month_num].copy()

            if not monthly_data.empty:
                 # Create a fresh color cycle for each month's plot
                 month_colors = cycle(plt.cm.tab10.colors)
                 for service in services: # Iterate through each service
                     # Filter data for the current service within the current month
                     service_monthly_data = monthly_data[monthly_data['service'] == service]
                     if not service_monthly_data.empty:
                         color = next(month_colors) # Get the color for this service
                         # Plot day of the month vs. daily average sentiment
                         ax.plot(
                             service_monthly_data['day'],
                             service_monthly_data['sentiment'],
                             label=service, # Label for the legend
                             color=color,
                             marker='o', # Use markers for clarity on daily data points
                             linestyle='-' # Use a line to connect daily points
                         )

            # --- Format the subplot for the current month ---
            ax.set_title(month_name) # Set the title for the current month's subplot
            # Only add y-label to the leftmost plots for clarity
            if month_num in [1, 4, 7, 10]: # Months in the first column
                 ax.set_ylabel("Avg Sentiment")
            ax.grid(True) # Add a grid to the subplot
            ax.set_xticks(range(0, 32, 5)) # Set x-ticks for days (e.g., 0, 5, 10, ..., 30)
            ax.set_xlim(0, 31) # Set x-axis limits from day 0 to 31

            # Only add x-label to the bottommost plots
            if month_num in [10, 11, 12]: # Months in the last row
                ax.set_xlabel("Day of Month")
            else:
                 # Hide x-axis labels for plots not in the bottom row
                 ax.tick_params(labelbottom=False)


        # --- Add a single legend for all subplots outside the figure ---
        # We need handles and labels for the legend. Since colors are cycled per month,
        # we create dummy handles to represent each service with a consistent color
        # from the colormap's cycle.
        dummy_colors = cycle(plt.cm.tab10.colors)
        dummy_handles = [plt.Line2D([0], [0], color=next(dummy_colors), lw=2) for _ in services]
        # Place the legend on the right side of the figure
        fig.legend(dummy_handles, services, title="Entity", bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0.)


        # --- Adjust layout ---
        # Use tight_layout to automatically adjust subplot parameters for a tight layout
        # rect parameter is used to leave space for the suptitle and the outside legend
        plt.tight_layout(rect=[0, 0.03, 0.95, 0.98])


        # --- Save the entire figure ---
        # Ensure plot_output_directory is defined and exists before saving
        # Example check: if 'plot_output_directory' in locals() and os.path.exists(plot_output_directory):
                
        plot_filename = os.path.join(plot_output_directory, "sentiment_by_month_daily_avg_12plots.png")
        plt.savefig(plot_filename)
        plt.close(fig) # Close the figure to prevent display if running in batch

        print(f"12-plot figure saved to: {plot_filename}")

    else:
        print("Average daily sentiment DataFrame is empty after aggregation. No plots generated.")

else:
    print("Post sentiment DataFrame is empty after initial processing. No plots or aggregations performed.")


Extracting Plotting Data:   0%|          | 0/3595301 [00:00<?, ?it/s]

Generating Monthly Plots:   0%|          | 0/12 [00:00<?, ?it/s]

12-plot figure saved to: YEAR_APPEND/batch_MULTI_PLOT/PLOTS/sentiment_by_month_daily_avg_12plots.png


In [9]:
# #36 plots, monthly plots with sub plotting via voltatilty
# import pandas as pd
# import ast
# import matplotlib.pyplot as plt
# from itertools import cycle
# import os
# import calendar # To get month names
# from tqdm.notebook import tqdm
# import numpy as np # Import NumPy

# # Assuming combined_df is already loaded with necessary columns (timestamp, entities, sentiment)
# # Assuming 'services' and 'plot_output_directory' are defined

# # --- Configuration ---
# # Assuming your DataFrame is named 'combined_df' and has the specified columns.
# # Assuming 'services' and 'plot_output_directory' are defined.

# # Example definitions if they are not defined elsewhere:
# # services = [
# #     "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
# #     "ER", "Youtube", "Reddit", "Netflix",
# #     "Xfinity", "Amtrak", "Septa", "Spirit", "American",
# #     "Disney"
# # ]

# # Define your plot output directory
# # special_identifier = "_your_identifier" # Example, replace with your value
# # plot_output_directory = f"batch{special_identifier}/PLOTS"
# # os.makedirs(plot_output_directory, exist_ok=True) # Ensure directory exists


# # --- Extract relevant data and calculate daily average sentiment ---
# entity_sentiments_over_time = []

# # Assuming combined_df has: timestamp, entities, sentiment
# for index, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Extracting Plotting Data"):
#     try:
#         entities_value = row['entities']
#         # Handle empty, NaN, or invalid entities
#         if pd.isna(entities_value) or entities_value is None or str(entities_value).strip() == "":
#             ents = []
#         else:
#             entities_str = str(entities_value)
#             try:
#                 ents = ast.literal_eval(entities_str)
#                 if not isinstance(ents, list):
#                      print(f"Warning: ast.literal_eval did not return a list for row {index}. Result type: {type(ents)}. Value: {entities_value}")
#                      ents = []
#             except (SyntaxError, ValueError, Exception) as e:
#                 print(f"Error evaluating entities string for row {index}: {e}. Value: {entities_value}")
#                 ents = []

#         # Convert timestamp to datetime and then extract date
#         # Assuming 'timestamp' is a timestamp (integer or float)
#         if pd.isna(row['timestamp']) or row['timestamp'] is None:
#             post_date = None # Handle missing timestamp
#         else:
#             try:
#                 timestamp_numeric = pd.to_numeric(row['timestamp'], errors='coerce')
#                 if not pd.isna(timestamp_numeric):
#                     post_date = pd.to_datetime(timestamp_numeric, unit='s').date() # Adjust unit if needed
#                 else:
#                     print(f"Warning: Could not convert timestamp to numeric for row {index}: {row['timestamp']}")
#                     post_date = None
#             except (ValueError, TypeError) as e:
#                  print(f"Error converting timestamp {row['timestamp']} on row {index}: {e}")
#                  post_date = None # Handle invalid timestamp


#         sentiment = row['sentiment'] # Assuming sentiment is already a number or NaN

#         if post_date is not None: # Only process if date is valid
#              for ent_text, ent_label in ents:
#                  if isinstance(ent_text, str) and isinstance(ent_label, str) and ent_label == 'SERVICE' and ent_text in services:
#                     if np.isreal(sentiment): # Corrected: Use np.isreal
#                         entity_sentiments_over_time.append((ent_text, post_date, float(sentiment))) # Ensure sentiment is float
#                     else:
#                          print(f"Warning: Skipping non-numeric sentiment {sentiment} for row {index}.")


#     except (KeyError, TypeError, Exception) as e:
#         print(f"Error processing row {index}: {e}")
#         continue


# # Create a DataFrame from the extracted data
# entity_sentiment_df = pd.DataFrame(entity_sentiments_over_time, columns=['service', 'date', 'sentiment'])

# # Convert the 'date' column to datetime objects and extract month and day
# if not entity_sentiment_df.empty:
#     if not pd.api.types.is_datetime64_any_dtype(entity_sentiment_df['date']):
#          try:
#              entity_sentiment_df['date'] = pd.to_datetime(entity_sentiment_df['date'])
#          except ValueError as e:
#               print(f"Error converting 'date' column to datetime: {e}")
#               print("Please ensure the 'date' column is in a recognizable format.")
#               entity_sentiment_df = pd.DataFrame() # Clear if date conversion fails

# if not entity_sentiment_df.empty:
#     entity_sentiment_df['month'] = entity_sentiment_df['date'].dt.month
#     entity_sentiment_df['day'] = entity_sentiment_df['date'].dt.day

#     # Calculate daily average sentiment
#     if all(col in entity_sentiment_df.columns for col in ['service', 'month', 'day', 'sentiment']):
#          # Group by service, month, and day to get daily average within each month
#          avg_sentiment_per_day_month = entity_sentiment_df.groupby(['service', 'month', 'day'])['sentiment'].mean().reset_index()
#     else:
#          print("Required columns for daily average grouping ('service', 'month', 'day', 'sentiment') not found after extraction.")
#          avg_sentiment_per_day_month = pd.DataFrame()


#     # Calculate volatility (standard deviation of sentiment) per service
#     # Need sufficient data points per service to calculate std dev
#     if not avg_sentiment_per_day_month.empty:
#         # Calculate std dev of daily average sentiment for each service across all days/months
#         sentiment_volatility = avg_sentiment_per_day_month.groupby('service')['sentiment'].std()
#         # Drop NaNs that occur if a service has only one data point (std dev is NaN)
#         sentiment_volatility = sentiment_volatility.dropna().sort_values(ascending=False)

#         # Divide services into three groups based on volatility
#         n_services_with_volatility = len(sentiment_volatility)
#         if n_services_with_volatility >= 3: # Need at least 3 services to split into 3 groups
#              third = n_services_with_volatility // 3
#              high_volatility_services = sentiment_volatility.index[:third].tolist()
#              medium_volatility_services = sentiment_volatility.index[third:2*third].tolist()
#              low_volatility_services = sentiment_volatility.index[2*third:].tolist()
#         elif n_services_with_volatility > 0:
#              # Handle cases with less than 3 services that have calculated volatility
#              print(f"Warning: Only {n_services_with_volatility} services have enough data for volatility calculation.")
#              if n_services_with_volatility == 2:
#                   high_volatility_services = [sentiment_volatility.index[0]]
#                   medium_volatility_services = [sentiment_volatility.index[1]]
#                   low_volatility_services = []
#              elif n_services_with_volatility == 1:
#                   high_volatility_services = [sentiment_volatility.index[0]]
#                   medium_volatility_services = []
#                   low_volatility_services = []
#              else: # n_services_with_volatility == 0
#                   high_volatility_services = []
#                   medium_volatility_services = []
#                   low_volatility_services = []
#         else:
#              print("No services have enough data for volatility calculation.")
#              high_volatility_services = []
#              medium_volatility_services = []
#              low_volatility_services = []

#         volatility_groups = {
#             "High Volatility": high_volatility_services,
#             "Medium Volatility": medium_volatility_services,
#             "Low Volatility": low_volatility_services
#         }

#         # --- Plotting 3x12 Grid by Volatility and Month ---
#         # 3 rows (Volatility) x 12 columns (Months) = 36 subplots
#         fig, axes = plt.subplots(3, 12, figsize=(30, 12), sharey=True) # Increased overall figure size
#         fig.suptitle("Average Daily Sentiment per Service by Volatility and Month", fontsize=16, y=1.03) # Main title

#         volatility_level_names = ["High Volatility", "Medium Volatility", "Low Volatility"]

#         # Create a single color cycle for all services across all plots
#         all_services_colors = cycle(plt.cm.tab10.colors)
#         # Map each service to a consistent color
#         service_color_map = {service: next(all_services_colors) for service in services}


#         # Wrap the loop iterating through volatility groups
#         for row_idx, (level_name, services_in_level) in tqdm(enumerate(volatility_groups.items()), total=3, desc="Plotting Volatility Levels"):

#             # Wrap the loop iterating through months (columns)
#             for month_idx, month_num in tqdm(enumerate(range(1, 13)), total=12, leave=False, desc=f" Plotting {level_name}"):
#                 ax = axes[row_idx, month_idx] # Get the correct subplot axes

#                 month_name = calendar.month_name[month_num]

#                 # Filter data for the current month and services in this volatility level
#                 monthly_level_data = avg_sentiment_per_day_month[
#                     (avg_sentiment_per_day_month['month'] == month_num) &
#                     (avg_sentiment_per_day_month['service'].isin(services_in_level))
#                 ].copy()

#                 if not monthly_level_data.empty:
#                      # Plot sentiment for each service within this subplot
#                      for service in services_in_level:
#                          service_monthly_data = monthly_level_data[monthly_level_data['service'] == service]
#                          if not service_monthly_data.empty:
#                              color = service_color_map.get(service, 'gray') # Get consistent color, default to gray
#                              ax.plot(
#                                  service_monthly_data['day'],
#                                  service_monthly_data['sentiment'],
#                                  label=service,
#                                  color=color,
#                                  marker='o',
#                                  linestyle='-'
#                              )

#                 # --- Format the subplot ---
#                 # Add month name as column title (only for top row)
#                 if row_idx == 0:
#                     ax.set_title(month_name)

#                 # Add volatility level name as row title (only for first column)
#                 if month_idx == 0:
#                     ax.set_ylabel(level_name) # Use volatility level name as y-label/row title

#                 ax.grid(True, axis='y', linestyle='--') # Add horizontal grid

#                 # Set x-axis ticks and limits for days
#                 ax.set_xticks(range(0, 32, 5))
#                 ax.set_xlim(0, 31)

#                 # Only show x-axis labels for the bottom row (Low Volatility)
#                 if row_idx != 2:
#                     ax.tick_params(labelbottom=False)
#                 else:
#                      ax.set_xlabel("Day of Month") # X-label only on bottom row

#                 # Hide y-axis labels for columns after the first one
#                 if month_idx != 0:
#                     ax.tick_params(labelleft=False)


#         # --- Add a single legend for all subplots outside the figure ---
#         # Use the service_color_map to create handles for the legend
#         legend_handles = [plt.Line2D([0], [0], color=service_color_map.get(service, 'gray'), lw=2) for service in services]
#         # Place the legend on the right side of the figure
#         fig.legend(legend_handles, services, title="Entity", bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0.)


#         # --- Adjust layout ---
#         # Use tight_layout to automatically adjust subplot parameters
#         # rect parameter leaves space for the main title and the outside legend
#         plt.tight_layout(rect=[0, 0.03, 0.95, 0.98])


#         # --- Save the entire figure ---
#         # Ensure plot_output_directory is defined and exists before saving
#         # Example check: if 'plot_output_directory' in locals() and os.path.exists(plot_output_directory):
#         plot_filename = os.path.join(plot_output_directory, "sentiment_volatility_month_grid.png")
#         plt.savefig(plot_filename)
#         plt.close(fig)

#         print(f"3x12 grid plot saved to: {plot_filename}")

#     else:
#         print("No services have enough data for volatility calculation or daily average calculation failed.")

# else:
#     print("Post sentiment DataFrame is empty after initial processing. No plots or aggregations performed.")


Extracting Plotting Data:   0%|          | 0/3595301 [00:00<?, ?it/s]

Plotting Volatility Levels:   0%|          | 0/3 [00:00<?, ?it/s]

 Plotting High Volatility:   0%|          | 0/12 [00:00<?, ?it/s]

 Plotting Medium Volatility:   0%|          | 0/12 [00:00<?, ?it/s]

 Plotting Low Volatility:   0%|          | 0/12 [00:00<?, ?it/s]

3x12 grid plot saved to: YEAR_APPEND/batch_MULTI_PLOT/PLOTS/sentiment_volatility_month_grid.png


In [15]:
# #4x3 w/ subplots
# import pandas as pd
# import ast
# import matplotlib.pyplot as plt
# from itertools import cycle
# import os
# import calendar
# from tqdm.notebook import tqdm
# import numpy as np
# import matplotlib.gridspec as gridspec # Import gridspec

# # Assuming combined_df is already loaded with necessary columns (timestamp, entities, sentiment)
# # Assuming 'services' and 'plot_output_directory' are defined

# # --- Configuration ---
# # Assuming your DataFrame is named 'combined_df' and has the specified columns.
# # Assuming 'services' and 'plot_output_directory' are defined.

# # Example definitions if they are not defined elsewhere:
# # services = [
# #     "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
# #     "ER", "Youtube", "Reddit", "Netflix",
# #     "Xfinity", "Amtrak", "Septa", "Spirit", "American",
# #     "Disney"
# # ]

# # Define your plot output directory
# # special_identifier = "_your_identifier" # Example, replace with your value
# # plot_output_directory = f"batch{special_identifier}/PLOTS"
# # os.makedirs(plot_output_directory, exist_ok=True) # Ensure directory exists


# # --- Extract relevant data and calculate daily average sentiment ---
# entity_sentiments_over_time = []

# # Assuming combined_df has: timestamp, entities, sentiment
# for index, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Extracting Plotting Data"):
#     try:
#         entities_value = row['entities']
#         # Handle empty, NaN, or invalid entities
#         if pd.isna(entities_value) or entities_value is None or str(entities_value).strip() == "":
#             ents = []
#         else:
#             entities_str = str(entities_value)
#             try:
#                 ents = ast.literal_eval(entities_str)
#                 if not isinstance(ents, list):
#                      print(f"Warning: ast.literal_eval did not return a list for row {index}. Result type: {type(ents)}. Value: {entities_value}")
#                      ents = []
#             except (SyntaxError, ValueError, Exception) as e:
#                 print(f"Error evaluating entities string for row {index}: {e}. Value: {entities_value}")
#                 ents = []

#         # Convert timestamp to datetime and then extract date
#         # Assuming 'timestamp' is a timestamp (integer or float)
#         if pd.isna(row['timestamp']) or row['timestamp'] is None:
#             post_date = None # Handle missing timestamp
#         else:
#             try:
#                 timestamp_numeric = pd.to_numeric(row['timestamp'], errors='coerce')
#                 if not pd.isna(timestamp_numeric):
#                     post_date = pd.to_datetime(timestamp_numeric, unit='s').date() # Adjust unit if needed
#                 else:
#                     print(f"Warning: Could not convert timestamp to numeric for row {index}: {row['timestamp']}")
#                     post_date = None
#             except (ValueError, TypeError) as e:
#                  print(f"Error converting timestamp {row['timestamp']} on row {index}: {e}")
#                  post_date = None # Handle invalid timestamp


#         sentiment = row['sentiment'] # Assuming sentiment is already a number or NaN

#         if post_date is not None: # Only process if date is valid
#              for ent_text, ent_label in ents:
#                  if isinstance(ent_text, str) and isinstance(ent_label, str) and ent_label == 'SERVICE' and ent_text in services:
#                     if np.isreal(sentiment): # Corrected: Use np.isreal
#                         entity_sentiments_over_time.append((ent_text, post_date, float(sentiment))) # Ensure sentiment is float
#                     else:
#                          print(f"Warning: Skipping non-numeric sentiment {sentiment} for row {index}.")


#     except (KeyError, TypeError, Exception) as e:
#         print(f"Error processing row {index}: {e}")
#         continue


# # Create a DataFrame from the extracted data
# entity_sentiment_df = pd.DataFrame(entity_sentiments_over_time, columns=['service', 'date', 'sentiment'])

# # Convert the 'date' column to datetime objects and extract month and day
# if not entity_sentiment_df.empty:
#     if not pd.api.types.is_datetime64_any_dtype(entity_sentiment_df['date']):
#          try:
#              entity_sentiment_df['date'] = pd.to_datetime(entity_sentiment_df['date'])
#          except ValueError as e:
#               print(f"Error converting 'date' column to datetime: {e}")
#               print("Please ensure the 'date' column is in a recognizable format.")
#               entity_sentiment_df = pd.DataFrame() # Clear if date conversion fails

# if not entity_sentiment_df.empty:
#     entity_sentiment_df['month'] = entity_sentiment_df['date'].dt.month
#     entity_sentiment_df['day'] = entity_sentiment_df['date'].dt.day

#     # Calculate daily average sentiment
#     if all(col in entity_sentiment_df.columns for col in ['service', 'month', 'day', 'sentiment']):
#          # Group by service, month, and day to get daily average within each month
#          avg_sentiment_per_day_month = entity_sentiment_df.groupby(['service', 'month', 'day'])['sentiment'].mean().reset_index()
#     else:
#          print("Required columns for daily average grouping ('service', 'month', 'day', 'sentiment') not found after extraction.")
#          avg_sentiment_per_day_month = pd.DataFrame() # Ensure it's an empty DataFrame


#     # Calculate volatility (standard deviation of sentiment) per service
#     # Need sufficient data points per service to calculate std dev
#     if not avg_sentiment_per_day_month.empty:
#         # Calculate std dev of daily average sentiment for each service across all days/months
#         sentiment_volatility = avg_sentiment_per_day_month.groupby('service')['sentiment'].std()
#         # Drop NaNs that occur if a service has only one data point (std dev is NaN)
#         sentiment_volatility = sentiment_volatility.dropna().sort_values(ascending=False)

#         # Divide services into three groups based on volatility
#         n_services_with_volatility = len(sentiment_volatility)
#         if n_services_with_volatility >= 3: # Need at least 3 services to split into 3 groups
#              third = n_services_with_volatility // 3
#              high_volatility_services = sentiment_volatility.index[:third].tolist()
#              medium_volatility_services = sentiment_volatility.index[third:2*third].tolist()
#              low_volatility_services = sentiment_volatility.index[2*third:].tolist()
#         elif n_services_with_volatility > 0:
#              # Handle cases with less than 3 services that have calculated volatility
#              print(f"Warning: Only {n_services_with_volatility} services have enough data for volatility calculation.")
#              if n_services_with_volatility == 2:
#                   high_volatility_services = [sentiment_volatility.index[0]]
#                   medium_volatility_services = [sentiment_volatility.index[1]]
#                   low_volatility_services = []
#              elif n_services_with_volatility == 1:
#                   high_volatility_services = [sentiment_volatility.index[0]]
#                   medium_volatility_services = []
#                   low_volatility_services = []
#              else: # n_services_with_volatility == 0
#                   high_volatility_services = []
#                   medium_volatility_services = []
#                   low_volatility_services = []
#         else:
#              print("No services have enough data for volatility calculation.")
#              high_volatility_services = []
#              medium_volatility_services = []
#              low_volatility_services = []

#         volatility_groups = {
#             "High Volatility": high_volatility_services,
#             "Medium Volatility": medium_volatility_services,
#             "Low Volatility": low_volatility_services
#         }

#         # --- Plotting Sentiment vs. Date with Nested Subplots (4x3 Monthly Grid, 3 Volatility Rows per Month) ---
#         fig = plt.figure(figsize=(20, 20)) # Adjust overall figure size

#         # Create a main grid for the months (4 rows, 3 columns)
#         # Adjust wspace and hspace to control spacing between month blocks
#         outer_grid = gridspec.GridSpec(4, 3, wspace=0.2, hspace=0.3)

#         # Create a single color cycle for all services across all plots
#         all_services_colors = cycle(plt.cm.tab10.colors)
#         # Map each service to a consistent color
#         service_color_map = {service: next(all_services_colors) for service in services}

#         # Wrap the loop iterating through months (outer grid cells)
#         for month_idx, month_num in tqdm(enumerate(range(1, 13)), total=12, desc="Generating Monthly Plots"):
#             month_name = calendar.month_name[month_num]
#             row_idx = month_idx // 3 # Determine the row in the outer grid (0-3)
#             col_idx = month_idx % 3  # Determine the column in the outer grid (0-2)

#             # Create a subgrid within the current outer grid cell (3 rows for volatility, 1 column)
#             # Adjust hspace to control vertical spacing between inner plots for this month
#             inner_grid = gridspec.GridSpecFromSubplotSpec(3, 1,
#                                                           subplot_spec=outer_grid[row_idx, col_idx],
#                                                           hspace=0.05) # Minimal vertical spacing between inner plots

#             # Filter data for the current month
#             monthly_data = avg_sentiment_per_day_month[avg_sentiment_per_day_month['month'] == month_num].copy()


#             volatility_level_names = ["High Volatility", "Medium Volatility", "Low Volatility"]

#             # Create a list to hold the axes for sharing the y-axis within the inner grid
#             inner_axes = []

#             # Wrap the loop iterating through volatility levels within the month
#             for inner_row_idx, level_name in enumerate(volatility_level_names):
#                 services_in_level = volatility_groups.get(level_name, []) # Get the services for this volatility level


#                 # Get the subplot axes for the current volatility level within the inner grid
#                 # Share y-axis among the three plots within this month's block
#                 if inner_row_idx == 0: # First inner plot (High Volatility)
#                      ax = fig.add_subplot(inner_grid[inner_row_idx, 0])
#                 else: # Subsequent inner plots - share y-axis with the first inner plot in this cell
#                      ax = fig.add_subplot(inner_grid[inner_row_idx, 0], sharey=inner_axes[0])

#                 inner_axes.append(ax) # Add the current axes to the list

#                 # Filter data for services in this volatility level for the current month
#                 # Need to ensure services_in_level is not empty before filtering
#                 if services_in_level:
#                      monthly_level_data = monthly_data[
#                          monthly_data['service'].isin(services_in_level)
#                      ].copy()
#                 else:
#                      monthly_level_data = pd.DataFrame() # Empty DataFrame if no services in level


#                 if not monthly_level_data.empty:
#                      # Plot sentiment for each service within this inner subplot
#                      for service in services_in_level:
#                          service_monthly_data = monthly_level_data[monthly_level_data['service'] == service]
#                          if not service_monthly_data.empty:
#                              color = service_color_map.get(service, 'gray') # Get consistent color, default to gray
#                              ax.plot(
#                                  service_monthly_data['day'],
#                                  service_monthly_data['sentiment'],
#                                  label=service,
#                                  color=color,
#                                  marker='o',
#                                  linestyle='-'
#                              )

#                 # --- Format the inner subplot ---
#                 ax.grid(True, axis='y', linestyle='--') # Add horizontal grid

#                 # Set x-axis ticks and limits for days
#                 ax.set_xticks(range(0, 32, 5))
#                 ax.set_xlim(0, 31)

#                 # Set title for the month (only on the top inner subplot)
#                 if inner_row_idx == 0:
#                     ax.set_title(month_name)
#                     # Add y-label for the first column's top inner plot
#                     if col_idx == 0:
#                          ax.set_ylabel("Avg Sentiment")
#                 else:
#                     # Hide x-axis labels for inner plots that are not at the bottom of the inner grid
#                      ax.tick_params(labelbottom=False)


#                 # Only show y-axis labels for the first column of outer grid
#                 if col_idx != 0:
#                      ax.tick_params(labelleft=False)

#                 # Add volatility level name as text annotation on the right side of the inner plots
#                 # You might need to adjust the coordinates (1.02, 0.5) and text alignment based on figsize
#                 # This is an alternative to row titles on the left
#                 ax.text(1.02, 0.5, level_name, transform=ax.transAxes,
#                         fontsize=9, va='center', ha='left', rotation=0)


#             # Set a common x-label for the inner grid (only on the bottom inner plot)
#             # This will be plotted *below* the bottom inner plot
#             inner_axes[-1].set_xlabel("Day of Month")


#     # --- Add a single legend for all subplots outside the figure ---
#     # Use the service_color_map to create handles for the legend
#     legend_handles = [plt.Line2D([0], [0], color=service_color_map.get(service, 'gray'), lw=2) for service in services]
#     # Place the legend on the right side of the figure
#     fig.legend(legend_handles, services, title="Entity", bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0.)


#     # --- Adjust layout ---
#     # Use tight_layout on the figure
#     # rect parameter leaves space for the main title and the outside legend
#     # Adjust rect based on overall figsize and legend position
#     plt.tight_layout(rect=[0, 0.03, 0.95, 0.98])


#     # --- Save the entire figure ---
#     # Ensure plot_output_directory is defined and exists before saving
#     # Example check: if 'plot_output_directory' in locals() and os.path.exists(plot_output_directory):
#     plot_filename = os.path.join(plot_output_directory, "sentiment_nested_volatility_month_plots.png")
#     plt.savefig(plot_filename)
#     plt.close(fig)

#     print(f"Nested 3x12 grid plot saved to: {plot_filename}")

# else:
#     print("Average daily sentiment DataFrame is empty after aggregation or no services have calculated volatility.")



Extracting Plotting Data:   0%|          | 0/3595301 [00:00<?, ?it/s]

Generating Monthly Plots:   0%|          | 0/12 [00:00<?, ?it/s]

  plt.tight_layout(rect=[0, 0.03, 0.95, 0.98])


Nested 3x12 grid plot saved to: YEAR_APPEND/batch_MULTI_PLOT/PLOTS/sentiment_nested_volatility_month_plots.png


In [18]:
import pandas as pd
import ast
import matplotlib.pyplot as plt
from itertools import cycle
import os
import calendar
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.gridspec as gridspec

# Assuming combined_df is already loaded with necessary columns
# Assuming 'services' and 'plot_output_directory' are defined

# --- Configuration ---
# Assuming your DataFrame is named 'combined_df' and has the specified columns.
# Assuming 'services' and 'plot_output_directory' are defined.

# Example definitions if they are not defined elsewhere:
# services = [
#     "Comcast", "Airline", "Healthcare", "Trains", "Banks", "United States",
#     "ER", "Youtube", "Reddit", "Netflix",
#     "Xfinity", "Amtrak", "Septa", "Spirit", "American",
#     "Disney"
# ]

# Define your plot output directory
# special_identifier = "_your_identifier" # Example, replace with your value
# plot_output_directory = f"batch{special_identifier}/PLOTS"
# os.makedirs(plot_output_directory, exist_ok=True) # Ensure directory exists


# --- Extract relevant data and calculate daily average sentiment ---
entity_sentiments_over_time = []

# Assuming combined_df has: timestamp, entities, sentiment
for index, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Extracting Plotting Data"):
    try:
        entities_value = row['entities']
        # Handle empty, NaN, or invalid entities
        if pd.isna(entities_value) or entities_value is None or str(entities_value).strip() == "":
            ents = []
        else:
            entities_str = str(entities_value)
            try:
                ents = ast.literal_eval(entities_str)
                if not isinstance(ents, list):
                     print(f"Warning: ast.literal_eval did not return a list for row {index}. Result type: {type(ents)}. Value: {entities_value}")
                     ents = []
            except (SyntaxError, ValueError, Exception) as e:
                print(f"Error evaluating entities string for row {index}: {e}. Value: {entities_value}")
                ents = []

        # Convert timestamp to datetime and then extract date
        # Assuming 'timestamp' is a timestamp (integer or float)
        if pd.isna(row['timestamp']) or row['timestamp'] is None:
            post_date = None # Handle missing timestamp
        else:
            try:
                timestamp_numeric = pd.to_numeric(row['timestamp'], errors='coerce')
                if not pd.isna(timestamp_numeric):
                    post_date = pd.to_datetime(timestamp_numeric, unit='s').date() # Adjust unit if needed
                else:
                    print(f"Warning: Could not convert timestamp to numeric for row {index}: {row['timestamp']}")
                    post_date = None
            except (ValueError, TypeError) as e:
                 print(f"Error converting timestamp {row['timestamp']} on row {index}: {e}")
                 post_date = None # Handle invalid timestamp


        sentiment = row['sentiment'] # Assuming sentiment is already a number or NaN

        if post_date is not None: # Only process if date is valid
             for ent_text, ent_label in ents:
                 if isinstance(ent_text, str) and isinstance(ent_label, str) and ent_label == 'SERVICE' and ent_text in services:
                    if np.isreal(sentiment): # Corrected: Use np.isreal
                        entity_sentiments_over_time.append((ent_text, post_date, float(sentiment))) # Ensure sentiment is float
                    else:
                         print(f"Warning: Skipping non-numeric sentiment {sentiment} for row {index}.")


    except (KeyError, TypeError, Exception) as e:
        print(f"Error processing row {index}: {e}")
        continue


# Create a DataFrame from the extracted data
entity_sentiment_df = pd.DataFrame(entity_sentiments_over_time, columns=['service', 'date', 'sentiment'])

# Convert the 'date' column to datetime objects and extract month and day
if not entity_sentiment_df.empty:
    if not pd.api.types.is_datetime64_any_dtype(entity_sentiment_df['date']):
         try:
             entity_sentiment_df['date'] = pd.to_datetime(entity_sentiment_df['date'])
         except ValueError as e:
              print(f"Error converting 'date' column to datetime: {e}")
              print("Please ensure the 'date' column is in a recognizable format.")
              entity_sentiment_df = pd.DataFrame() # Clear if date conversion fails

if not entity_sentiment_df.empty:
    entity_sentiment_df['month'] = entity_sentiment_df['date'].dt.month
    entity_sentiment_df['day'] = entity_sentiment_df['date'].dt.day

    # Calculate daily average sentiment
    if all(col in entity_sentiment_df.columns for col in ['service', 'month', 'day', 'sentiment']):
         # Group by service, month, and day to get daily average within each month
         avg_sentiment_per_day_month = entity_sentiment_df.groupby(['service', 'month', 'day'])['sentiment'].mean().reset_index()
    else:
         print("Required columns for daily average grouping ('service', 'month', 'day', 'sentiment') not found after extraction.")
         avg_sentiment_per_day_month = pd.DataFrame() # Ensure it's an empty DataFrame


    # Calculate volatility (standard deviation of sentiment) per service
    # Need sufficient data points per service to calculate std dev
    if not avg_sentiment_per_day_month.empty:
        # Calculate std dev of daily average sentiment for each service across all days/months
        sentiment_volatility = avg_sentiment_per_day_month.groupby('service')['sentiment'].std()
        # Drop NaNs that occur if a service has only one data point (std dev is NaN)
        sentiment_volatility = sentiment_volatility.dropna().sort_values(ascending=False)

        # Divide services into three groups based on volatility
        n_services_with_volatility = len(sentiment_volatility)
        if n_services_with_volatility >= 3: # Need at least 3 services to split into 3 groups
             third = n_services_with_volatility // 3
             high_volatility_services = sentiment_volatility.index[:third].tolist()
             medium_volatility_services = sentiment_volatility.index[third:2*third].tolist()
             low_volatility_services = sentiment_volatility.index[2*third:].tolist()
        elif n_services_with_volatility > 0:
             # Handle cases with less than 3 services that have calculated volatility
             print(f"Warning: Only {n_services_with_volatility} services have enough data for volatility calculation.")
             if n_services_with_volatility == 2:
                  high_volatility_services = [sentiment_volatility.index[0]]
                  medium_volatility_services = [sentiment_volatility.index[1]]
                  low_volatility_services = []
             elif n_services_with_volatility == 1:
                  high_volatility_services = [sentiment_volatility.index[0]]
                  medium_volatility_services = []
                  low_volatility_services = []
             else: # n_services_with_volatility == 0
                  high_volatility_services = []
                  medium_volatility_services = []
                  low_volatility_services = []
        else:
             print("No services have enough data for volatility calculation.")
             high_volatility_services = []
             medium_volatility_services = []
             low_volatility_services = []

        volatility_groups = {
            "High Volatility": high_volatility_services,
            "Medium Volatility": medium_volatility_services,
            "Low Volatility": low_volatility_services
        }

        # --- Plotting Sentiment vs. Date with Nested Subplots (4x3 Monthly Grid, 3 Volatility Rows per Month) ---
        # Increased figure width to provide more space for the legend
        fig = plt.figure(figsize=(22, 20))

        # Create a main grid for the months (4 rows, 3 columns)
        # Adjust wspace and hspace
        outer_grid = gridspec.GridSpec(4, 3, wspace=0.2, hspace=0.3)

        # Create a single color cycle for all services across all plots
        all_services_colors = cycle(plt.cm.tab10.colors)
        # Map each service to a consistent color
        service_color_map = {service: next(all_services_colors) for service in services}

        # Wrap the loop iterating through months (outer grid cells)
        for month_idx, month_num in tqdm(enumerate(range(1, 13)), total=12, desc="Generating Monthly Plots"):
            month_name = calendar.month_name[month_num]
            row_idx = month_idx // 3 # Determine the row in the outer grid (0-3)
            col_idx = month_idx % 3  # Determine the column in the outer grid (0-2)

            # Create a subgrid within the current outer grid cell (3 rows for volatility, 1 column)
            # Adjust hspace
            inner_grid = gridspec.GridSpecFromSubplotSpec(3, 1,
                                                          subplot_spec=outer_grid[row_idx, col_idx],
                                                          hspace=0.05) # Minimal vertical spacing

            # Filter data for the current month
            monthly_data = avg_sentiment_per_day_month[avg_sentiment_per_day_month['month'] == month_num].copy()


            volatility_level_names = ["High Volatility", "Medium Volatility", "Low Volatility"]

            # Create a list to hold the axes for sharing the y-axis within the inner grid
            inner_axes = []

            # Wrap the loop iterating through volatility levels within the month
            for inner_row_idx, level_name in enumerate(volatility_level_names):
                services_in_level = volatility_groups.get(level_name, []) # Get the services for this volatility level


                # Get the subplot axes for the current volatility level within the inner grid
                # Share y-axis among the three plots within this month's block
                if inner_row_idx == 0: # First inner plot (High Volatility)
                     ax = fig.add_subplot(inner_grid[inner_row_idx, 0])
                else: # Subsequent inner plots - share y-axis with the first inner plot in this cell
                     ax = fig.add_subplot(inner_grid[inner_row_idx, 0], sharey=inner_axes[0])

                inner_axes.append(ax) # Add the current axes to the list

                # Filter data for services in this volatility level for the current month
                if services_in_level:
                     monthly_level_data = monthly_data[
                         monthly_data['service'].isin(services_in_level)
                     ].copy()
                else:
                     monthly_level_data = pd.DataFrame()


                if not monthly_level_data.empty:
                     # Plot sentiment for each service within this inner subplot
                     for service in services_in_level:
                         service_monthly_data = monthly_level_data[monthly_level_data['service'] == service]
                         if not service_monthly_data.empty:
                             color = service_color_map.get(service, 'gray') # Get consistent color
                             # --- Removed marker='o' to remove points ---
                             ax.plot(
                                 service_monthly_data['day'],
                                 service_monthly_data['sentiment'],
                                 label=service,
                                 color=color,
                                 linestyle='-' # Use a line to connect daily points
                             )

                # --- Format the inner subplot ---
                ax.grid(True, axis='y', linestyle='--') # Add horizontal grid

                # Set x-axis ticks and limits for days
                ax.set_xticks(range(0, 32, 5))
                ax.set_xlim(0, 31)

                # Set title for the month (only on the top inner subplot)
                if inner_row_idx == 0:
                    ax.set_title(month_name)
                    # Add y-label for the first column's top inner plot
                    if col_idx == 0:
                         ax.set_ylabel("Avg Sentiment")
                else:
                    # Hide x-axis labels for inner plots that are not at the bottom of the inner grid
                     ax.tick_params(labelbottom=False)


                # Only show y-axis labels for the first column of outer grid
                if col_idx != 0:
                     ax.tick_params(labelleft=False)

                # Add volatility level name as text annotation on the right side of the inner plots
                # --- Added rotation=90 to rotate text ---
                ax.text(1.02, 0.5, level_name, transform=ax.transAxes,
                        fontsize=9, va='center', ha='left', rotation=90)


            # Set a common x-label for the inner grid (only on the bottom inner plot)
            inner_axes[-1].set_xlabel("Day of Month")


    # --- Add a single legend for all subplots outside the figure ---
    # Use the service_color_map to create handles for the legend
    # This legend already existed and shows line colors mapping to services
    legend_handles = [plt.Line2D([0], [0], color=service_color_map.get(service, 'gray'), lw=2) for service in services]
    # Place the legend on the right side of the figure
    # Adjust bbox_to_anchor and rect to ensure legend is visible
    fig.legend(legend_handles, services, title="Entity", bbox_to_anchor=(1.02, 0.5), loc='center left', borderaxespad=0.)


    # --- Adjust layout ---
    # Use tight_layout on the figure
    # rect parameter leaves space for the main title and the outside legend
    # Adjusted rect right boundary to 0.90 to allow more space for legend
    plt.tight_layout(rect=[0, 0.03, 0.90, 0.98])


    # --- Save the entire figure ---
    plot_filename = os.path.join(plot_output_directory, "sentiment_nested_volatility_month_plots2.png")
    plt.savefig(plot_filename)
    plt.close(fig)

    print(f"Nested 3x12 grid plot saved to: {plot_filename}")

else:
    print("Average daily sentiment DataFrame is empty after aggregation or no services have calculated volatility.")



Extracting Plotting Data:   0%|          | 0/3595301 [00:00<?, ?it/s]

Generating Monthly Plots:   0%|          | 0/12 [00:00<?, ?it/s]

  plt.tight_layout(rect=[0, 0.03, 0.90, 0.98])


Nested 3x12 grid plot saved to: YEAR_APPEND/batch_MULTI_PLOT/PLOTS/sentiment_nested_volatility_month_plots.png
