<a href="https://colab.research.google.com/github/drewhalfmann/drewhalfmann/blob/main/LLM_Text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Getting info about a csv file

In [None]:
## Getting info about a csv file

# pandas is imported in this cell so it runs on a fresh kernel (no earlier cell imports it)
import pandas as pd

# Read csv into dataframe
filepath = '/content/drive/MyDrive/Presidents_Paper/CSV_files/Paragraphs.csv'
df_results = pd.read_csv(filepath)

# Display number of rows
print(f"Number of rows (excluding header): {len(df_results)}")

# Display column headings
print(df_results.columns)

# Display the first 5 rows of the DataFrame
print(df_results.head())

In [None]:
# The RoBERTa classes are imported in this cell so it runs on a fresh kernel
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Directory of the saved model; the tokenizer and the model are loaded from the same place
model_dir = '/content/drive/MyDrive/Presidents_Paper/Model_1800'

# Initialize your pretrained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)

# Health-Focus Analysis:


1) Uses RoBERTa to give health confidence scores to speech titles and paragraphs
2) Combines short paragraphs with nearby ones, in part based on similar confidence scores
3) Determines the health focus of each speech in several ways, the best of which appears to be "health similarity": the degree to which the paragraphs of a given speech are similar to paragraphs with high health confidence scores

## Part 1: Split Speech Text into Paragraphs and Create a New Dataset--Done

In [None]:
## Part 1: Split Speech Text into Paragraphs and Create a New Dataset

# Import necessary libraries
from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd

## Input (one row per speech) and output (one row per paragraph) locations
dataset_path = '/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_250+NoSOTU.csv'
output_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs.csv'

# Load the CSV and take its 'train' split
loaded_splits = load_dataset('csv', data_files=dataset_path)
dataset = loaded_splits['train']

# Number the speeches 1..N when the file carries no Speech_ID column
has_speech_id = 'Speech_ID' in dataset.column_names
if not has_speech_id:
    speech_ids = range(1, len(dataset) + 1)
    dataset = dataset.add_column('Speech_ID', speech_ids)
# Function to split speeches into paragraphs and assign IDs
def split_into_paragraphs(example, idx_start=1):
    """Split one speech record into paragraph records.

    Args:
        example: Mapping with the keys 'Speech', 'Speech_ID', 'Speaker', 'Date' and 'Title'.
        idx_start: Number given to the first kept paragraph of the speech.

    Returns:
        A list of dicts, one per non-empty paragraph, each carrying the speech metadata,
        a 'Paragraph_ID' of the form '<Speech_ID>-<n>' and the stripped 'Paragraph_Text'.
        The list is empty when the speech text is missing.
    """
    speech_text = example['Speech']
    # A missing speech may arrive as None (or another non-string); there is nothing to split
    if not isinstance(speech_text, str):
        return []

    # Adjust the delimiter based on the actual paragraph delimiters of the data
    stripped_chunks = (chunk.strip() for chunk in speech_text.split('\n\n'))
    # Runs of 3+ newlines or a trailing delimiter yield blank chunks; dropping them avoids
    # writing rows with empty text. Kept paragraphs are numbered consecutively.
    kept_paragraphs = [chunk for chunk in stripped_chunks if chunk]

    return [{
        'Speech_ID': example['Speech_ID'],
        'Paragraph_ID': f"{example['Speech_ID']}-{idx}",
        'Speaker': example['Speaker'],
        'Date': example['Date'],
        'Title': example['Title'],
        'Paragraph_Text': paragraph_text
    } for idx, paragraph_text in enumerate(kept_paragraphs, start=idx_start)]

# Split every speech, keeping both the per-speech lists and one flat list of paragraph rows
paragraphs = []
flattened_paragraphs = []
for speech in dataset:
    speech_paragraphs = split_into_paragraphs(speech)
    paragraphs.append(speech_paragraphs)
    flattened_paragraphs.extend(speech_paragraphs)

# Paragraph rows -> pandas DataFrame -> Hugging Face Dataset
df = pd.DataFrame(flattened_paragraphs)
final_dataset = Dataset.from_pandas(df)

# Preview the first 5 paragraph rows
preview = df.head()
print(preview)


# Write the paragraph-level dataset to CSV
final_dataset.to_csv(output_path, index=False)

print(f"Dataset with Speech_ID and Paragraph_ID created and saved to {output_path}")


## Part 2: Split Paragraph dataset into six pieces

In [None]:
# Part 2: Split Paragraph dataset into six pieces-not done
import pandas as pd

# Load the CSV file
csv_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs.csv'
df = pd.read_csv(csv_path)

# Print the row count of the input dataset
print(f'Input dataset contains {len(df)} rows.')

# Number of pieces to write. Defined once so the part size, the loop range and the
# "last part takes the remainder" rule cannot drift apart.
num_parts = 6

# Calculate the size of each part (integer division; the remainder goes to the last part)
part_size = len(df) // num_parts

# Split the DataFrame into parts and save each to a new CSV
for i in range(num_parts):
    start_index = i * part_size
    # For the last part, include any remaining rows
    is_last_part = i == num_parts - 1
    end_index = len(df) if is_last_part else (i + 1) * part_size
    df_part = df.iloc[start_index:end_index]

    # Construct the output path for each part (parts are numbered from 1)
    output_path = f'/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_{i+1}.csv'

    # Save each part to a new CSV
    df_part.to_csv(output_path, index=False)

    # Print the row count of each output dataset
    print(f'Part {i+1} saved to {output_path} contains {len(df_part)} rows.')


Input dataset contains 618093 rows.
Part 1 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_1.csv contains 103015 rows.
Part 2 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_2.csv contains 103015 rows.
Part 3 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_3.csv contains 103015 rows.
Part 4 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_4.csv contains 103015 rows.
Part 5 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_5.csv contains 103015 rows.
Part 6 saved to /content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_6.csv contains 103018 rows.


In [None]:
# classification with distilbert
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import pandas as pd
import torch
from tqdm import tqdm
import time

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def print_cuda_memory_usage():
    """Print the CUDA memory PyTorch currently reports as allocated and reserved, in GB."""
    bytes_per_gb = 1024**3
    print("Current CUDA Memory Usage:")
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"Allocated: {allocated_gb:.2f} GB")
    # The "Cached" line reports torch.cuda.memory_reserved()
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"Cached: {reserved_gb:.2f} GB")

# Paths and candidate labels
dataset_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_1.csv'
output_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Classified_Paragraphs_and Titles_part_1.csv'

candidate_labels = ['health', 'prevention and health', 'drug and alcohol abuse', 'environmental health', 'safety and health', 'mental health', 'disability and health', 'occupational health', 'sexual health']

# Step 2: Load the dataset
dataset = load_dataset('csv', data_files=dataset_path)['train']

# Initialize the classifier with DistilBERT model
# NOTE(review): "distilbert-base-uncased" is a base checkpoint; the zero-shot-classification
# pipeline is designed for models fine-tuned on NLI, so confirm these scores are meaningful
# before relying on them.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Pass the `device` object computed earlier in this cell instead of a hard-coded GPU index 0,
# so the pipeline matches the model's device and the cell also runs on CPU-only runtimes
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=device)

# The rest of your code remains largely unchanged

# Function to perform batch classification
def classify_batch(batch, classifier, candidate_labels):
    """Score the titles and the paragraph texts of one batch against the candidate labels.

    Args:
        batch: Object indexable by 'Title' and 'Paragraph_Text'.
        classifier: Callable invoked as classifier(texts, candidate_labels=..., truncation=True).
        candidate_labels: Labels forwarded unchanged to the classifier.

    Returns:
        Tuple (title_results, paragraph_results) holding whatever the classifier
        returned for the titles and for the paragraphs, in that order.
    """
    # Fetch both columns up front, then classify titles first and paragraphs second
    title_texts, paragraph_texts = batch['Title'], batch['Paragraph_Text']
    scored = [
        classifier(texts, candidate_labels=candidate_labels, truncation=True)
        for texts in (title_texts, paragraph_texts)
    ]
    return scored[0], scored[1]

# Batch processing
batch_size = 1024 # Adjust based on your GPU's memory capacity
# Ceiling division: a final partial batch still counts as one batch
num_batches = (len(dataset) + batch_size - 1) // batch_size

# Prepare data for storing results (one dict per input row)
results = []

# Print the number of batches
print(f"Number of batches: {num_batches}")

# Step 3: Classify the dataset
start_time = time.time()  # Start time for runtime estimation

# Print CUDA memory usage
print_cuda_memory_usage()


# Wrap the range with tqdm for a progress meter.
# Each batch is classified exactly once and its rows are appended to `results` right away.
for i in tqdm(range(num_batches), desc="Processing batches"):
    # Print CUDA memory usage before processing the batch
    print(f"Before processing batch {i+1}:")
    print_cuda_memory_usage()

    batch = dataset.select(range(i*batch_size, min((i+1)*batch_size, len(dataset))))
    title_results, paragraph_results = classify_batch(batch, classifier, candidate_labels)

    for j, (title_res, paragraph_res) in enumerate(zip(title_results, paragraph_results)):
        # Extract scores for each label; column names are "<label>_Title_cs" / "<label>_Paragraph_cs"
        title_scores = {f"{label}_Title_cs": score for label, score in zip(title_res['labels'], title_res['scores'])}
        paragraph_scores = {f"{label}_Paragraph_cs": score for label, score in zip(paragraph_res['labels'], paragraph_res['scores'])}

        # Combine with original data
        result_row = {**batch[j], **title_scores, **paragraph_scores}
        results.append(result_row)

end_time = time.time()  # End time for runtime estimation
total_time = end_time - start_time  # Total runtime
print(f"Total processing time: {total_time:.2f} seconds")
print_cuda_memory_usage()

# Convert results to DataFrame
df_results = pd.DataFrame(results)

print(f"Number of rows (excluding header): {len(df_results)}")

# Display the columns
print(df_results.columns)

# Display the first 5 rows of the DataFrame
print(df_results.head())

# Step 6: Export to CSV
df_results.to_csv(output_path, index=False)
print(f"Classification completed. Results saved to {output_path}.")


ModuleNotFoundError: No module named 'datasets'

In [None]:
from transformers import AutoTokenizer, DistilBertForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("AyoubChLin/distilbert_ag_cnn")
model = DistilBertForSequenceClassification.from_pretrained("AyoubChLin/distilbert_ag_cnn")

# Replace "input_text" with the actual news article
input_text = "The stock market had a strong performance today, driven by strong earnings reports from technology companies."

# Tokenize the input text; truncation keeps a long article within the model's input limit
inputs = tokenizer(input_text, return_tensors='pt', truncation=True)

# Make a prediction. This is inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)
# Index of the highest-scoring class for each input row
predicted_label = torch.argmax(outputs.logits, dim=1)

# Print the predicted label
print(predicted_label)

tensor([1])


## Part 3: Uses Roberta to give health confidence scores to speech titles and paragraphs

In [None]:
## Part 3. Classify Title and Paragraph (Repeat 6 times)

# Step 1: Install and import necessary libraries
# !pip install transformers datasets torch tqdm

from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from tqdm import tqdm  # Import tqdm for progress metering
import time  # Import time to measure runtime

# Use CUDA when it is available; otherwise run on the CPU
device_name = "cpu"
if torch.cuda.is_available():
    device_name = "cuda"
device = torch.device(device_name)

def print_cuda_memory_usage():
    """Print a three-line report of PyTorch's allocated and reserved CUDA memory in GB."""
    gib = 1024**3
    print("Current CUDA Memory Usage:")
    # No device argument is passed to either query
    allocated = torch.cuda.memory_allocated() / gib
    print(f"Allocated: {allocated:.2f} GB")
    reserved = torch.cuda.memory_reserved() / gib
    print(f"Cached: {reserved:.2f} GB")

# Input part file and the matching output file (change the part number for each of the six runs)
dataset_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Paragraphs_part_1.csv'
output_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/All_noQ&A_250+NoSOTU_Classified_Paragraphs_and Titles_part_1.csv'


# Labels every title and paragraph is scored against
candidate_labels = [
    'health',
    'prevention and health',
    'drug and alcohol abuse',
    'environmental health',
    'safety and health',
    'mental health',
    'disability and health',
    'occupational health',
    'sexual health',
]

# Step 2: Load the dataset (the 'train' split of the CSV)
loaded_splits = load_dataset('csv', data_files=dataset_path)
dataset = loaded_splits['train']

# Build the zero-shot classifier on top of roberta-large-mnli, with model and pipeline on `device`
model_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
classifier = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Function to perform batch classification
def classify_batch(batch, classifier, candidate_labels):
    """Run the classifier over a batch's titles and then over its paragraph texts.

    Args:
        batch: Object indexable by 'Title' and 'Paragraph_Text'.
        classifier: Callable invoked as classifier(texts, candidate_labels=..., truncation=True).
        candidate_labels: Labels forwarded unchanged to the classifier.

    Returns:
        Tuple (title_results, paragraph_results) with the classifier's output for each column.
    """
    # Both columns are read before any classification happens
    columns = (batch['Title'], batch['Paragraph_Text'])
    call_options = {'candidate_labels': candidate_labels, 'truncation': True}
    title_results, paragraph_results = [classifier(texts, **call_options) for texts in columns]
    return title_results, paragraph_results

# Batch processing
batch_size = 1024 # Adjust based on your GPU's memory capacity
# Ceiling division: a final partial batch still counts as one batch
num_batches = (len(dataset) + batch_size - 1) // batch_size

# Prepare data for storing results (one dict per input row)
results = []

# Print the number of batches
print(f"Number of batches: {num_batches}")

# Step 3: Classify the dataset
start_time = time.time()  # Start time for runtime estimation

# Print CUDA memory usage
print_cuda_memory_usage()


# Wrap the range with tqdm for a progress meter.
# Each batch is classified exactly once and its rows are appended to `results` right away;
# a full pass is expensive, so no batch should ever be classified a second time.
for i in tqdm(range(num_batches), desc="Processing batches"):
    # Print CUDA memory usage before processing the batch
    print(f"Before processing batch {i+1}:")
    print_cuda_memory_usage()

    batch = dataset.select(range(i*batch_size, min((i+1)*batch_size, len(dataset))))
    title_results, paragraph_results = classify_batch(batch, classifier, candidate_labels)

    for j, (title_res, paragraph_res) in enumerate(zip(title_results, paragraph_results)):
        # Extract scores for each label; column names are "<label>_Title_cs" / "<label>_Paragraph_cs"
        title_scores = {f"{label}_Title_cs": score for label, score in zip(title_res['labels'], title_res['scores'])}
        paragraph_scores = {f"{label}_Paragraph_cs": score for label, score in zip(paragraph_res['labels'], paragraph_res['scores'])}

        # Combine with original data
        result_row = {**batch[j], **title_scores, **paragraph_scores}
        results.append(result_row)

end_time = time.time()  # End time for runtime estimation
total_time = end_time - start_time  # Total runtime
print(f"Total processing time: {total_time:.2f} seconds")
print_cuda_memory_usage()

# Convert results to DataFrame
df_results = pd.DataFrame(results)

print(f"Number of rows (excluding header): {len(df_results)}")

# Display the columns
print(df_results.columns)

# Display the first 5 rows of the DataFrame
print(df_results.head())

# Step 6: Export to CSV
df_results.to_csv(output_path, index=False)
print(f"Classification completed. Results saved to {output_path}.")


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of batches: 101
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.30 GB


Processing batches:   0%|          | 0/101 [00:00<?, ?it/s]

Before processing batch 1:
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.30 GB


Processing batches:   1%|          | 1/101 [05:46<9:37:15, 346.36s/it]

Before processing batch 2:
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.33 GB


Processing batches:   2%|▏         | 2/101 [11:34<9:33:30, 347.58s/it]

Before processing batch 3:
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.33 GB


Processing batches:   3%|▎         | 3/101 [17:24<9:29:01, 348.38s/it]

Before processing batch 4:
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.33 GB


Processing batches:   4%|▍         | 4/101 [23:10<9:22:12, 347.76s/it]

Before processing batch 5:
Current CUDA Memory Usage:
Allocated: 13.25 GB
Cached: 13.35 GB


Processing batches:   4%|▍         | 4/101 [24:07<9:44:59, 361.85s/it]


KeyboardInterrupt: 

## Part 4: Aggregate six classified paragraph files

In [None]:
#Part 4: Aggregate six classified paragraph files

import pandas as pd

# Folder holding the classified part files, and the file-name template ({} = part number)
base_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/'
file_pattern = 'All_noQ&A_250+NoSOTU_Classified_Paragraphs_and Titles_part_{}.csv'

# One DataFrame per part, collected in part order
dfs = []

# Parts are numbered 1 through 6
for part_number in range(1, 7):
    part_file_name = file_pattern.format(part_number)
    file_path = base_path + part_file_name
    # Load this part, keep it, and report how many rows it held
    df = pd.read_csv(file_path)
    dfs.append(df)
    print(f'Loaded {len(df)} rows from "{file_path}"')

# Stack the parts into one frame with a fresh 0..n-1 index
df_concatenated = pd.concat(dfs, ignore_index=True)

# Write the combined frame next to the part files
output_path = base_path + 'Aggregated_Classified_Paragraphs_and_Titles.csv'
df_concatenated.to_csv(output_path, index=False)

# Report the total row count of the combined frame
total_rows_written = len(df_concatenated)
print(f'Aggregated dataset saved to "{output_path}" contains {total_rows_written} rows.')


## Part 5: Combines small paragraphs with nearby ones. The new larger paragraph takes the confidence score of its highest-scored member

In [None]:
# Part 5: Combines small paragraphs with nearby ones. The new larger paragraph takes the confidence score of its highest-scored member
import pandas as pd

# Function to aggregate paragraphs
def aggregate_paragraphs(classified_df, confidence_threshold, length_threshold, max_lookahead=2):
    """Merge runs of short, high-confidence paragraphs into single rows.

    A paragraph row is "mergeable" when its text has fewer than ``length_threshold``
    whitespace-separated words and its score is at least ``confidence_threshold``.
    A mergeable paragraph absorbs up to ``max_lookahead`` directly following rows, as long
    as each is a mergeable 'Paragraph' row; absorbed rows are not emitted on their own.
    The merged row is a copy of the first paragraph's row with the joined text, the highest
    member score, ``Aggregated=True`` and the member texts joined by ' || '.
    Rows whose 'Type' is not 'Paragraph', and non-mergeable paragraphs, pass through unchanged.

    Args:
        classified_df: DataFrame with at least the columns 'Text', 'Score' and 'Type'.
        confidence_threshold: Minimum score for a paragraph to be mergeable.
        length_threshold: Word count below which a paragraph counts as short.
        max_lookahead: Maximum number of following rows that one paragraph may absorb.

    Returns:
        DataFrame with 'Original_Title' removed (when present) and 'SpeechID', 'Text', 'Score'
        renamed to 'Speech_ID', 'Paragraph_Or_Title', 'Confidence_Score'.
    """
    def _is_mergeable(text, score):
        # Non-string text (e.g. NaN from an empty CSV cell) has no word count and is never merged
        return (
            isinstance(text, str)
            and len(text.split()) < length_threshold
            and score >= confidence_threshold
        )

    rows_to_concat = []
    row_count = len(classified_df)
    # Position of the last row already folded into an aggregated row
    skip_until_pos = -1

    # Positions are used throughout, so the loop counter, the iloc look-ahead and the skip
    # marker all agree even when the frame's index labels are not 0..n-1.
    for pos in range(row_count):
        row = classified_df.iloc[pos]

        # Titles (and any other non-paragraph rows) are always emitted as-is
        if row['Type'] != 'Paragraph':
            rows_to_concat.append(row)
            continue

        if pos <= skip_until_pos:
            continue

        component_texts = []
        component_scores = []
        if _is_mergeable(row['Text'], row['Score']):
            component_texts.append(row['Text'])
            component_scores.append(row['Score'])

            # Absorb directly following mergeable paragraphs; stop at the first row that is not
            last_candidate = min(pos + 1 + max_lookahead, row_count)
            for next_pos in range(pos + 1, last_candidate):
                next_row = classified_df.iloc[next_pos]
                if next_row['Type'] != 'Paragraph':
                    break
                if not _is_mergeable(next_row['Text'], next_row['Score']):
                    break
                component_texts.append(next_row['Text'])
                component_scores.append(next_row['Score'])
                skip_until_pos = next_pos

        combined_text = " ".join(component_texts)
        if combined_text:
            aggregated_row = row.copy()
            aggregated_row['Text'] = combined_text
            aggregated_row['Score'] = max(component_scores)
            aggregated_row['Aggregated'] = True
            aggregated_row['Component_Paragraphs'] = ' || '.join(component_texts)
            rows_to_concat.append(aggregated_row)
        else:
            rows_to_concat.append(row)

    # Create the aggregated DataFrame using concat (pd.concat rejects an empty list)
    if rows_to_concat:
        aggregated_df = pd.concat(rows_to_concat, axis=1).T
    else:
        aggregated_df = classified_df.iloc[0:0].copy()

    # Remove and rename columns; a missing 'Original_Title' is tolerated, as rename tolerates
    # a missing 'SpeechID'
    aggregated_df = aggregated_df.drop(columns=['Original_Title'], errors='ignore')
    aggregated_df = aggregated_df.rename(columns={
        'SpeechID': 'Speech_ID',
        'Text': 'Paragraph_Or_Title',
        'Score': 'Confidence_Score'
    })

    return aggregated_df

# Load the classified text DataFrame
# NOTE(review): Part 4 writes 'Aggregated_Classified_Paragraphs_and_Titles.csv', while this cell
# reads 'Classified_Paragraphs_and_Titles.csv' — confirm this is the intended input file.
classified_csv_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/Classified_Paragraphs_and_Titles.csv'
classified_df = pd.read_csv(classified_csv_path)

# Merge criteria: score of at least confidence_threshold and fewer than length_threshold words
confidence_threshold = 0.5
length_threshold = 90

# Run the aggregation with those thresholds
aggregated_df = aggregate_paragraphs(
    classified_df,
    confidence_threshold,
    length_threshold,
)

# Write the result to a new CSV file
aggregated_paragraphs_csv_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/Aggregated_Short_Paragraphs.csv'
aggregated_df.to_csv(aggregated_paragraphs_csv_path, index=False)

# Confirm where the file went
print("Aggregated paragraphs saved to:", aggregated_paragraphs_csv_path)

# Show the column names, then the first two rows
first_rows = aggregated_df.iloc[:2]
columns = aggregated_df.columns
print(columns)
print(first_rows)

## Part 6: Health Focus Analysis--Calculates Hlth_Similarity (best one), Normalized_Hlth_Confidence, Proportion_Hlth_Graphs, and Unnormalized_Hlth_Confidence

   

 Hlth_Similarity:
        This measure represents how similar each speech is to a standard health-focused embedding.
        The standard health-focused embedding is created by averaging embeddings of paragraphs with a high confidence score (greater than 0.9) in health topics.
        Cosine similarity is used to compare the embedding of each speech with this standard embedding.
        A higher Hlth_Similarity score indicates that the speech's content is more closely aligned with typical health-related topics.

    Normalized_Hlth_Confidence:
        This is a normalized measure of the emphasis on health topics within each speech.
        It's calculated as the sum of squared confidence scores across all paragraphs in a speech, divided by the total sum of confidence scores across all paragraphs in that speech.
        The weighting is done by squaring the confidence score for each paragraph, emphasizing paragraphs with higher confidence.
        This measure accounts for both the quantity and the confidence of health-related content, providing a more nuanced view than simply counting paragraphs.

    Proportion_Hlth_Graphs:
        This measure calculates the proportion of paragraphs in each speech that are health-related.
        It's determined by dividing the number of paragraphs with a confidence score above a set threshold by the total number of paragraphs in the speech.
    

    Unnormalized_Hlth_Confidence:
        This is an unnormalized index representing the total confidence scores of health-related paragraphs in each speech.
        It sums the confidence scores of paragraphs in a speech where the score exceeds the confidence threshold, without any normalization.
        This measure indicates the overall presence of health-related content in a speech but doesn't account for the total amount of content.

In [None]:
# Health Focus Analysis--Calculates 'Hlth_Similarity','Normalized_Hlth_Confidence','Proportion_Hlth_Graphs'
import pandas as pd
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np

# Load the dataset containing paragraphs and their confidence scores
file_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/Aggregated_Short_Paragraphs.csv'
df_paragraphs = pd.read_csv(file_path)

# Set the confidence threshold (used for the index calculations, not for the embedding below)
confidence_threshold = 0.5

# Filter paragraphs with a Confidence_Score > 0.9 for health embedding calculation
health_paragraphs = df_paragraphs[df_paragraphs['Confidence_Score'] > 0.9]['Paragraph_Or_Title']

# Initialize the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the filtered health paragraphs.
# The whole list goes to encode() in one call so the library can batch the work, instead of
# one encode() call per paragraph; the result has one row per paragraph.
health_embeddings = model.encode(health_paragraphs.tolist())
# The "standard" health embedding is the element-wise mean over those paragraph embeddings
standard_health_embedding = np.mean(health_embeddings, axis=0)

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors, computed as 1 minus SciPy's cosine distance."""
    distance = cosine(vec1, vec2)
    similarity = 1 - distance
    return similarity

# Generate embeddings for each speech
# NOTE(review): every speech is encoded as one concatenated string. Sentence-embedding models
# truncate input beyond their maximum sequence length, so a long speech may be represented only
# by its opening text — confirm against the model's max_seq_length.
speech_embeddings = {}
for speech_id, group in df_paragraphs.groupby('Speech_ID'):
    # Missing text (NaN) is skipped so that join() only ever sees strings
    concatenated_text = " ".join(group['Paragraph_Or_Title'].dropna().astype(str))
    speech_embeddings[speech_id] = model.encode(concatenated_text)

# Calculate similarity of each speech to the standard health embedding.
# The similarity is computed once per speech and then spread onto the paragraph rows with map(),
# which avoids one full boolean-mask pass over the frame per speech.
similarity_by_speech = {
    speech_id: cosine_similarity(embedding, standard_health_embedding)
    for speech_id, embedding in speech_embeddings.items()
}
df_paragraphs['Hlth_Similarity'] = df_paragraphs['Speech_ID'].map(similarity_by_speech)

# Filter paragraphs based on the confidence threshold for index calculations
significant_health_paragraphs = df_paragraphs[df_paragraphs['Confidence_Score'] >= confidence_threshold]

# Calculate the proportion of health-related paragraphs.
# Speeches with no paragraph at or above the threshold come out as NaN here and are set to 0 below.
proportion_health_paragraphs = significant_health_paragraphs.groupby('Speech_ID').size() / df_paragraphs.groupby('Speech_ID').size()
proportion_health_paragraphs = proportion_health_paragraphs.reset_index(name='Proportion_Hlth_Graphs')

# Calculate the unnormalized health emphasis index (sum of scores at or above the threshold)
unnormalized_health_index = significant_health_paragraphs.groupby('Speech_ID')['Confidence_Score'].sum().reset_index(name='Unnormalized_Hlth_Confidence')

# Calculate the weighted confidence of each paragraph (the score squared)
df_paragraphs['Weighted_Confidence'] = df_paragraphs['Confidence_Score'] * df_paragraphs['Confidence_Score']

# Calculate the total sum of confidence scores for normalization
total_confidence = df_paragraphs.groupby('Speech_ID')['Confidence_Score'].sum()

# Calculate the normalized health emphasis index: sum of squared scores / sum of scores.
# This uses every paragraph of the speech, not only those above confidence_threshold.
weighted_confidence_sum = df_paragraphs.groupby('Speech_ID')['Weighted_Confidence'].sum()
normalized_health_index = (weighted_confidence_sum / total_confidence).reset_index(name='Normalized_Hlth_Confidence')

# Combine the calculated indices with the df_paragraphs at the speech level
# (the first row of each speech supplies the speech-level columns)
speech_level_df = df_paragraphs.drop_duplicates(subset='Speech_ID').copy()
speech_level_df = speech_level_df.merge(proportion_health_paragraphs, on='Speech_ID', how='left')
speech_level_df = speech_level_df.merge(unnormalized_health_index, on='Speech_ID', how='left')
speech_level_df = speech_level_df.merge(normalized_health_index, on='Speech_ID', how='left')

# Fill NaN values with 0 for speeches with no significant health content.
# The filled columns are assigned back explicitly: calling fillna(inplace=True) on a selected
# column is chained assignment and does not update the frame under pandas Copy-on-Write.
index_columns = ['Proportion_Hlth_Graphs', 'Unnormalized_Hlth_Confidence', 'Normalized_Hlth_Confidence']
speech_level_df[index_columns] = speech_level_df[index_columns].fillna(0)

# Select only the relevant columns for the final CSV
# NOTE(review): 'Speech' and 'Label' must be present in the input CSV — confirm its schema.
final_columns = ['Speech_ID', 'Speaker', 'Date', 'Title', 'Speech', 'Label', 'Hlth_Similarity','Normalized_Hlth_Confidence',
                 'Proportion_Hlth_Graphs', 'Unnormalized_Hlth_Confidence'
                ]
final_df = speech_level_df[final_columns]
final_df = final_df.round(2)

# Save the final analysis results to a CSV file
final_analysis_csv_path = '/content/drive/MyDrive/Presidents_Paper/CSV_files/Health_Focus_Combined.csv'
final_df.to_csv(final_analysis_csv_path, index=False)

print("Results saved to CSV.")
columns = final_df.columns
print(columns)


ModuleNotFoundError: No module named 'sentence_transformers'

# Program to Evaluate Fine-tuning Data
"We conducted a data quality analysis that included: (1) exploring data structure and characteristics, (2) identifying missing values, duplicates, and potential noise, (3) examining metadata like authorship, publication dates, and sources to understand dataset origins and biases, and (4) evaluating label quality by analyzing class distribution and identifying underrepresented labels."

In [None]:
# Program to Evaluate Fine-tuning Data

# Importing Necessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
import re

# Count rows in each part of dataset

def calculate_subset_rows_with_labels(dataset_name):
    """
    Calculate and print the number of rows in each subset (training, validation, and testing)
    of a Hugging Face dataset, along with the total number of rows, and list the label names.

    Only the subsets named 'train', 'validation' and 'test' are counted; subsets that the
    dataset does not provide are skipped.

    Args:
    dataset_name (str): The name of the dataset on Hugging Face.

    Returns:
    int: Total number of rows across the counted subsets.
    """
    # Load the dataset
    dataset = load_dataset(dataset_name)

    # Initialize total count
    total_rows = 0

    # Check and print the number of rows in each subset if it exists, along with label names
    for subset in ['train', 'validation', 'test']:
        if subset not in dataset:
            continue

        subset_rows = len(dataset[subset])
        total_rows += subset_rows
        print(f"\nNumber of rows in '{subset}' subset: {subset_rows}")

        # Print label names if the subset has a 'label' feature
        features = dataset[subset].features
        if 'label' in features:
            # A 'label' feature without a `names` attribute (a plain value column) used to
            # raise AttributeError here; it is now reported instead
            label_names = getattr(features['label'], 'names', None)
            if label_names is not None:
                print(f"Label names in '{subset}' subset: {label_names}")
            else:
                print(f"The 'label' feature in '{subset}' subset does not define label names.")

    # Print the total number of rows across all subsets
    print(f"\nTotal number of rows across all subsets: {total_rows}")
    return total_rows

def comprehensive_dataset_analysis(dataset_name, subset='train'):
    """
    Perform a comprehensive analysis of a text classification dataset, including EDA,
    data quality assessment, metadata and annotations analysis, and label quality checks with label names.

    Args:
    dataset_name (str): The name of the dataset on Hugging Face.
    subset (str): The subset of the dataset to analyze (e.g., 'train', 'test', 'validation').

    Returns:
    None: This function prints out the analysis and shows plots.
    """
    # Load the dataset
    dataset = load_dataset(dataset_name)
    df = pd.DataFrame(dataset[subset])

    # Retrieve label names if available. getattr() guards against a 'label' feature that
    # has no `names` attribute (a plain value column), which would otherwise raise.
    features = dataset[subset].features
    label_names = getattr(features['label'], 'names', None) if 'label' in features else None
    if label_names:
        print(f"Label names in '{subset}' subset: {label_names}")

    # Basic Exploratory Data Analysis (EDA)
    print(f"\nBasic Exploratory Data Analysis (EDA) for '{subset}' subset:")
    print("First 5 Rows:")
    print(df.head())
    print("\nBasic Statistical Summary:")
    print(df.describe(include='all'))
    print("\nInformation about Data Types and Missing Values:")
    # info() prints its report itself and returns None, so it is not wrapped in print()
    df.info()

    # Data Quality Assessment
    print("\nData Quality Assessment:")
    missing_values = df.isnull().sum()
    print(f"Missing Values in Each Column:\n{missing_values}")
    duplicates = df.duplicated().sum()
    print(f"Number of Duplicate Rows: {duplicates}")
    if 'text' in df.columns:
        # "Noise" = any character other than letters, digits, whitespace and , . ! ?
        noise_pattern = r'[^a-zA-Z0-9\s,.!?]'
        # na=False counts rows with missing text as not noisy and keeps the result boolean
        noisy_data = df['text'].str.contains(noise_pattern, na=False).sum()
        print(f"Number of Rows with Potential Noise in Text: {noisy_data}")

    # Metadata and Annotations Analysis
    print("\nMetadata and Annotations Analysis:")
    metadata_columns = ['author', 'timestamp', 'source']  # Example metadata columns
    for col in metadata_columns:
        if col in df.columns:
            print(f"\nAnalysis of '{col}' Column:")
            print(f"Unique values in '{col}':\n{df[col].unique()}")
            print(f"Value counts for '{col}':\n{df[col].value_counts()}")

    # Label Quality Checks
    print("\nLabel Quality Checks:")
    if 'label' in df.columns:
        class_counts = df['label'].value_counts()
        print(f"Class Distribution:\n{class_counts}")
        # Draw on an explicit Axes rather than relying on pyplot's implicit current figure
        fig, ax = plt.subplots(figsize=(10, 6))
        class_counts.plot(kind='bar', ax=ax)
        ax.set_xlabel('Classes')
        ax.set_ylabel('Number of Samples')
        ax.set_title('Class Distribution in the Dataset')
        plt.show()
        # A class is "rare" when it holds fewer than 1% of the rows
        rare_threshold = 0.01 * len(df)
        rare_labels = class_counts[class_counts < rare_threshold]
        if not rare_labels.empty:
            print(f"Rare Labels (less than 1% of the dataset):\n{rare_labels}")
        else:
            print("No rare labels found based on the 1% threshold.")
    else:
        print("No 'label' column found in the dataset.")

# Example usage: count the rows per subset, then analyse the 'train' subset
dataset_name = 'AyoubChLin/CNN_News_Articles_2011-2022'  # Replace with the actual dataset name
total_rows = calculate_subset_rows_with_labels(dataset_name)
comprehensive_dataset_analysis(dataset_name, subset='train')



# Create Random Sample of Data

In [None]:
# Create Random Sample of Data
# Draws a small, reproducible random subset of speeches and writes it next to the source CSV.
import pandas as pd

# Tunable settings kept together so the sample size, seed and file names cannot drift apart.
SAMPLE_SIZE = 5      # number of speeches to draw
RANDOM_SEED = 42     # fixed seed so the same rows are selected on every run
DATA_DIR = '/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab'

# Read the CSV file
df = pd.read_csv(f'{DATA_DIR}/All_noQ&A.csv')

# Randomly select SAMPLE_SIZE speeches (sampling without replacement;
# DataFrame.sample raises ValueError if the file has fewer rows than that)
sampled_df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

# Save the sampled data to a new CSV file; the suffix mirrors the sample size
output_path = f'{DATA_DIR}/All_noQ&A_{SAMPLE_SIZE}.csv'
sampled_df.to_csv(output_path, index=False)

print(f"Sampled data saved to: {output_path}")



Sampled data saved to: /content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_5.csv


# Program to assess OCR noise in dataset


In [None]:
# Program to assess OCR noise in dataset
# Importing Necessary Libraries
import pandas as pd
from datasets import load_dataset
import re
from collections import Counter

def analyze_text_noise(dataset_name, subset='train'):
    """
    Report which "noise" characters occur in a dataset's text column, and how often.

    A noise character is anything other than ASCII letters, digits, whitespace
    and the punctuation marks , . ! ?

    Args:
    dataset_name (str): The name of the dataset on Hugging Face.
    subset (str): The subset of the dataset to analyze (e.g., 'train', 'test', 'validation').

    Returns:
    None: The tally is printed, most frequent character first.
    """
    # Pull the requested subset into a DataFrame
    frame = pd.DataFrame(load_dataset(dataset_name)[subset])

    # Nothing to analyze without a text column
    if 'text' not in frame.columns:
        print("No 'text' column found in the dataset.")
        return

    # Matches every single character outside the allowed set
    noise_regex = re.compile(r'[^a-zA-Z0-9\s,.!?]')

    # Accumulate the tally row by row; non-string cells contribute nothing
    tally = Counter()
    for cell in frame['text']:
        if isinstance(cell, str):
            tally.update(noise_regex.findall(cell))

    # Display the characters ordered from most to least frequent
    print("Most Common Noise Characters and their Counts:")
    for character, occurrences in tally.most_common():
        print(f"{character}: {occurrences}")

# Example Usage: print the noise-character tally for the default 'train' subset.
dataset_name = 'AyoubChLin/CNN_News_Articles_2011-2022'  # Replace with the actual dataset name
# Called for its printed output only; analyze_text_noise returns None.
analyze_text_noise(dataset_name)



# Pip Install

In [None]:
! pip install LDA

In [None]:
! pip install gputil

In [None]:
!pip show LDA | grep Location


In [None]:
! pip install --upgrade pip


In [None]:
! pip install numpy scipy scikit-learn joblib


In [None]:
! pip install guidedlda

In [None]:
! pip install transformers


In [None]:
! pip install sentence_transformers

In [None]:
!pip install datasets

In [None]:
!watch -n 1 nvidia-smi

In [None]:
# Only the header row is needed to list the column names, so nrows=0 avoids parsing the whole file.
print(pd.read_csv('/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A.csv', nrows=0).columns)

In [None]:
! pip install GPUtil

In [None]:
!pip install transformers datasets torch

# Scrap

In [None]:
## Old version Part 1: Uses Roberta to give health confidence scores to speech titles and paragraphs
## Remember to switch to GPU, remember to ! pip install transformers, sentence transformers, and datasets.

# Import libraries
import pandas as pd
from transformers import pipeline
from datasets import Dataset
from tqdm.auto import tqdm  # for progress meter

# Build the zero-shot classifier on device 0 (the GPU, per the note at the top of this cell)
def initialize_classifier():
    """
    Create the zero-shot classification pipeline.

    Returns:
        The pipeline built from "roberta-large-mnli" with device=0, or None when
        construction raises (the error is printed instead of propagated).
    """
    try:
        return pipeline(
            "zero-shot-classification",
            model="roberta-large-mnli",
            device=0,
        )
    except Exception as init_error:
        # Best-effort: report the failure and let the caller test for None
        print(f"Error initializing the classifier: {init_error}")
    return None

# Classify one text against the candidate labels and keep only the first-ranked result
def classify_text(text, classifier, candidate_labels):
    """
    Run the classifier on a single text and return its first label/score pair.

    Args:
        text (str): The text to classify.
        classifier: Callable invoked as classifier(text, candidate_labels); its result
            must expose parallel 'labels' and 'scores' sequences.
        candidate_labels (list): Labels offered to the classifier.

    Returns:
        tuple: (label, score) taken from index 0 of the result's 'labels' and 'scores'.
    """
    outcome = classifier(text, candidate_labels)
    top_label, top_score = outcome['labels'][0], outcome['scores'][0]
    return top_label, top_score

# Nuanced candidate labels offered to the zero-shot classifier (one per line for easy editing)
candidate_labels = [
    'health',
    'prevention and health',
    'drug and alcohol abuse',
    'environmental health',
    'safety and health',
    'mental health',
    'disability and health',
    'occupational health',
    'sexual health',
]

# Classify every title and paragraph of a batch, showing a progress bar while doing so
def classify_paragraphs_and_titles(batch, classifier, candidate_labels):
    """
    Produce one classification row per title and per non-blank paragraph.

    Args:
        batch (dict): Column-oriented batch with parallel 'Speech', 'Title',
            'Speaker', 'Date' and 'SpeechID' sequences.
        classifier: The zero-shot classifier, forwarded to classify_text.
        candidate_labels (list): Labels forwarded to classify_text.

    Returns:
        pd.DataFrame: Columns SpeechID, Speaker, Date, Original_Title, Text,
        Label, Score and Type ('Title' or 'Paragraph'). A falsy title still
        yields a 'Title' row, with Label and Score left as None.
    """
    def build_row(speech_id, speaker, date, title, text, label, score, kind):
        # Key order fixes the column order of the resulting DataFrame
        return {'SpeechID': speech_id, 'Speaker': speaker, 'Date': date, 'Original_Title': title, 'Text': text, 'Label': label, 'Score': score, 'Type': kind}

    rows = []
    speech_records = zip(batch['Speech'], batch['Title'], batch['Speaker'], batch['Date'], batch['SpeechID'])
    for speech, title, speaker, date, speech_id in tqdm(speech_records, total=len(batch['Speech']), desc="Classifying Text"):
        # Title first: only classified when it is non-empty
        if title:
            title_label, title_score = classify_text(title, classifier, candidate_labels)
        else:
            title_label, title_score = None, None
        rows.append(build_row(speech_id, speaker, date, title, title, title_label, title_score, 'Title'))

        # Then each paragraph; paragraphs are separated by a blank line
        if not speech:
            continue
        for paragraph in speech.split('\n\n'):
            if not paragraph.strip():
                continue  # whitespace-only chunk
            label, score = classify_text(paragraph, classifier, candidate_labels)
            rows.append(build_row(speech_id, speaker, date, title, paragraph, label, score, 'Paragraph'))

    return pd.DataFrame(rows)


# Main execution block: classify every title/paragraph of the 100-speech sample and save the ranked results.
if __name__ == "__main__":
    classifier = initialize_classifier()

    # initialize_classifier returns None on failure, in which case nothing below runs
    if classifier:
        # Load and prepare the dataset
        #df = pd.read_csv('/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_250+NoSOTU.csv')
        df = pd.read_csv('/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_100.csv')
        # Number the speeches 1..N in file order to serve as identifiers
        df['SpeechID'] = range(1, len(df) + 1)
        selected_df = df[['SpeechID', 'Speaker', 'Date', 'Title', 'Speech']]
        dataset = Dataset.from_pandas(selected_df)

        # Classify paragraphs and titles, creating the classified dataset
        # The mapped function returns a DataFrame with one row per title/paragraph, so the
        # output has more rows than the input batch.
        # NOTE(review): confirm the installed `datasets` version accepts that from a batched map.
        classified_dataset = dataset.map(lambda batch: classify_paragraphs_and_titles(batch, classifier=classifier, candidate_labels=candidate_labels), batched=True, batch_size=48)

        # Convert the classified dataset back to a pandas DataFrame
        classified_df = classified_dataset.to_pandas()

        # Merge the original data with classification results
        # Left join on the three shared keys: every speech is kept and repeated once per classified row
        combined_df = pd.merge(selected_df, classified_df, on=['SpeechID', 'Speaker', 'Date'], how='left')

        # Filter, sort, and save the results
        # Rows without a Label or Score (e.g. title rows that were not classified) are dropped
        filtered_df = combined_df.dropna(subset=['Label', 'Score'])
        # Highest scores first
        sorted_df = filtered_df.sort_values(by='Score', ascending=False)
        sorted_df.to_csv('/content/drive/MyDrive/Presidents_Paper/CSV_files/Classified_Paragraphs_and_Titles.csv', sep=',', index=False)

        print("Classification completed and saved to CSV.")


In [None]:
# OLD Part 1: Uses Roberta to give health confidence scores to speech titles and paragraphs
## Remember to switch to GPU, remember to ! pip install transformers, sentence transformers, and datasets.

import pandas as pd
import torch
from transformers import pipeline
from tqdm.auto import tqdm

def initialize_classifier():
    """
    Create the zero-shot classification pipeline on the best available device.

    Returns:
        The "roberta-large-mnli" zero-shot pipeline, placed on "cuda" when
        torch reports CUDA as available and on "cpu" otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    return pipeline("zero-shot-classification", model="roberta-large-mnli", device=target_device)

# Candidate labels scored independently for every paragraph
candidate_labels = [
    'health',
    'prevention and health',
    'drug and alcohol abuse',
    'environmental health',
    'safety and health',
    'mental health',
    'disability and health',
    'occupational health',
    'sexual health',
]

def create_batches(dataset, batch_size=128):
    """
    Split every speech into paragraphs and group them into fixed-size batches.

    Args:
        dataset (pd.DataFrame): Frame with 'Speech', 'SpeechID', 'Speaker',
            'Date' and 'Title' columns.
        batch_size (int): Maximum number of paragraphs per batch.

    Returns:
        tuple: (paragraph_batches, metadata_batches). Two parallel lists of
        lists; metadata_batches[i][j] is the (SpeechID, Speaker, Date, Title)
        tuple of the speech that paragraph_batches[i][j] came from.
    """
    all_paragraphs = []
    all_metadata = []
    for _, row in dataset.iterrows():
        speech = row['Speech']
        # Empty CSV cells are read back as NaN (a float), which has no .split();
        # skip such rows instead of aborting the whole run.
        if not isinstance(speech, str):
            continue
        # Same metadata for every paragraph of this speech, so build it once
        speech_metadata = (row['SpeechID'], row['Speaker'], row['Date'], row['Title'])
        for paragraph in speech.split('\n'):
            if paragraph.strip():  # ignore blank / whitespace-only lines
                all_paragraphs.append(paragraph)
                all_metadata.append(speech_metadata)
    # Create batches: identical slicing on both lists keeps them aligned
    paragraph_batches = [all_paragraphs[i:i+batch_size] for i in range(0, len(all_paragraphs), batch_size)]
    metadata_batches = [all_metadata[i:i+batch_size] for i in range(0, len(all_metadata), batch_size)]
    return paragraph_batches, metadata_batches

def classify_batches(classifier, paragraph_batches, metadata_batches, candidate_labels):
    """
    Run multi-label zero-shot classification over pre-built paragraph batches.

    Args:
        classifier: Callable invoked as classifier(paragraphs, candidate_labels,
            multi_label=True); each result must expose 'labels' and 'scores'.
        paragraph_batches (list): Lists of paragraph strings.
        metadata_batches (list): Lists of (SpeechID, Speaker, Date, Title) tuples,
            aligned element-for-element with paragraph_batches.
        candidate_labels (list): Labels to score each paragraph against.

    Returns:
        pd.DataFrame: One row per (paragraph, label) pair with columns SpeechID,
        Speaker, Date, Original_Title, Paragraph, plus one column per label
        holding that label's score (other label columns are NaN in that row).
    """
    results = []
    for paragraphs, metadata in tqdm(zip(paragraph_batches, metadata_batches), total=len(paragraph_batches), desc="Classifying Batches"):
        classifications = classifier(paragraphs, candidate_labels, multi_label=True)
        # Some pipeline versions hand back a bare dict instead of a one-element
        # list when the batch holds a single paragraph; normalise so the zip
        # below always pairs one result with one paragraph.
        if isinstance(classifications, dict):
            classifications = [classifications]
        for paragraph, meta, classification in zip(paragraphs, metadata, classifications):
            speech_id, speaker, date, title = meta
            # Extract and format classification results
            # NOTE(review): each label gets its own row, as before; pivot downstream
            # if one row per paragraph is wanted.
            for label, score in zip(classification['labels'], classification['scores']):
                results.append({
                    'SpeechID': speech_id,
                    'Speaker': speaker,
                    'Date': date,
                    'Original_Title': title,
                    # The single paragraph this score belongs to, not the whole batch list
                    'Paragraph': paragraph,
                    label: score
                })
    return pd.DataFrame(results)

if __name__ == "__main__":
    # Build the classifier, then load the 100-speech sample
    classifier = initialize_classifier()
    df = pd.read_csv('/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_100.csv')
    # Number the speeches 1..N in file order to serve as identifiers
    df['SpeechID'] = range(1, len(df) + 1)
    # .copy() gives an independent frame so the assignment below cannot touch df
    selected_df = df[['SpeechID', 'Speaker', 'Date', 'Title', 'Speech']].copy()
    # Replace the column outright rather than writing through .loc[:, ...]:
    # .loc tries to store the strings inside the existing integer column, an
    # incompatible-dtype assignment that newer pandas versions deprecate.
    selected_df['SpeechID'] = selected_df['SpeechID'].astype(str)

    # Split speeches into paragraphs, batch them, and score every batch
    paragraph_batches, metadata_batches = create_batches(selected_df, batch_size=32)
    classified_df = classify_batches(classifier, paragraph_batches, metadata_batches, candidate_labels)

    classified_df.to_csv('/content/drive/MyDrive/Presidents_Paper/CSV_files/Classified_Paragraphs_and_Titles.csv', sep=',', index=False)
    print("Classification completed and saved to CSV.")





In [None]:
# OLD Part 1: Uses Roberta to give health confidence scores to speech titles and paragraphs, Remember to switch to GPU, remember to ! pip install transformers, sentence transformers, and datasets

# Import libraries
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Function to build the Zero-Shot Classifier, tolerating initialization failures
def initialize_classifier():
    """
    Set up the zero-shot classification pipeline.

    Returns:
        classifier: The "roberta-large-mnli" zero-shot pipeline created with
        device=0, or None if creating it raised (the error is printed).
    """
    classifier = None
    try:
        classifier = pipeline("zero-shot-classification", model="roberta-large-mnli", device=0)
    except Exception as e:
        # Best-effort: report and fall through with classifier still None
        print(f"Error initializing the classifier: {e}")
    return classifier

# Function to classify text

def classify_text(text, classifier, candidate_labels):
    """
    Score one text against the candidate labels and keep the first-ranked result.

    Args:
        text (str): The text to classify.
        classifier: Callable invoked as classifier(text, candidate_labels); its
            result must expose parallel 'labels' and 'scores' sequences.
        candidate_labels (list): Detailed labels linking specific issues to broader categories.

    Returns:
        tuple: (label, score) taken from index 0 of the result's 'labels' and 'scores'.
    """
    ranked = classifier(text, candidate_labels)
    labels, scores = ranked['labels'], ranked['scores']
    return labels[0], scores[0]

# Nuanced candidate labels used for every title and paragraph
candidate_labels = [
    'health',
    'prevention and health',
    'drug and alcohol abuse',
    'environmental health',
    'safety and health',
    'mental health',
    'disability and health',
    'occupational health',
    'sexual health',
]

# Function to classify paragraphs and titles in batches
def classify_paragraphs_and_titles(batch):
    """
    Classify paragraphs and titles in the given batch.

    Takes only the batch so it can be passed directly to a batched map; the
    classifier and the label list are read from the module-level names
    `classifier` and `candidate_labels`, which must exist before this is called.

    Args:
        batch (dict): A batch of data with parallel 'Speech', 'Title',
            'Speaker', 'Date' and 'SpeechID' sequences.

    Returns:
        pd.DataFrame: One row per title and per non-blank paragraph, with
        columns SpeechID, Speaker, Date, Original_Title, Text, Label, Score
        and Type ('Title' or 'Paragraph'). A falsy title still produces a
        'Title' row whose Label and Score are None.
    """
    results = []
    for speech, title, speaker, date, speech_id in zip(batch['Speech'], batch['Title'], batch['Speaker'], batch['Date'], batch['SpeechID']):
        # Classify the title
        # classify_text requires the label list as its third argument; omitting
        # it raised TypeError on the very first call.
        title_label, title_score = classify_text(title, classifier, candidate_labels) if title else (None, None)
        results.append({'SpeechID': speech_id, 'Speaker': speaker, 'Date': date, 'Original_Title': title, 'Text': title, 'Label': title_label, 'Score': title_score, 'Type': 'Title'})

        # Classify each paragraph in the speech (paragraphs are separated by a blank line)
        if speech:
            paragraphs = speech.split('\n\n')
            for paragraph in paragraphs:
                if paragraph.strip():  # Check if the paragraph is not empty
                    label, score = classify_text(paragraph, classifier, candidate_labels)
                    results.append({'SpeechID': speech_id, 'Speaker': speaker, 'Date': date, 'Original_Title': title, 'Text': paragraph, 'Label': label, 'Score': score, 'Type': 'Paragraph'})

    return pd.DataFrame(results)

# Main execution block: classify every title/paragraph of the full speech file and save the ranked results.
if __name__ == "__main__":
    # Must stay a module-level name: classify_paragraphs_and_titles reads `classifier` as a global
    classifier = initialize_classifier()

    # initialize_classifier returns None on failure, in which case nothing below runs
    if classifier:
        # Load the dataset
        df = pd.read_csv('/content/drive/MyDrive/Presidents_Paper/Data_from_JupyterLab/All_noQ&A_250+NoSOTU.csv')

        # Add a unique identifier for each speech
        # (1..N in file order)
        df['SpeechID'] = range(1, len(df) + 1)

        selected_df = df[['SpeechID', 'Speaker', 'Date', 'Title', 'Speech']]

        # Convert to a Dataset and classify
        dataset = Dataset.from_pandas(selected_df)
        # The mapped function returns a DataFrame with one row per title/paragraph.
        # NOTE(review): confirm the installed `datasets` version accepts that from a batched map.
        classified_dataset = dataset.map(classify_paragraphs_and_titles, batched=True, batch_size=48)
        classified_df = classified_dataset.to_pandas()

        # Merge the original data with the classification results
        # Left join on the three shared keys: every speech is kept and repeated once per classified row
        combined_df = pd.merge(selected_df, classified_df, on=['SpeechID', 'Speaker', 'Date'], how='left')

        # Further processing: Filter, Sort, and Save
        # Rows without a Label or Score (e.g. title rows that were not classified) are dropped
        filtered_df = combined_df.dropna(subset=['Label', 'Score'])
        # Highest scores first
        sorted_df = filtered_df.sort_values(by='Score', ascending=False)
        sorted_df.to_csv('/content/drive/MyDrive/Presidents_Paper/CSV_files/Classified_Paragraphs_and_Titles.csv', sep=',', index=False)

        print("Classified paragraphs and titles saved to CSV.")