In [None]:
!pip install requests beautifulsoup4 pandas
!pip install nltk

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape content from a given URL
def scrape_content(url):
    # Send request to the URL
    response = requests.get(url)
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Main function to process the congressional record
def main():
    base_url = "https://www.congress.gov"
    search_url = "https://www.congress.gov/quick-search/congressional-record?wordsPhrases=&wordVariants=on&congressGroups[0]=0&congresses[0]=114&dateOperator=equal&startDate=&endDate=&dateIsOption=yesterday&sectionHouse=on&representative[0]=B000589&senator[0]=&pageSort=issueAsc"
    
    # Initialize DataFrame to store results
    results_df = pd.DataFrame(columns=['Title', 'Date', 'Text', 'Remarks'])

    # Scrape the search page
    search_soup = scrape_content(search_url)

    # Find all links to Congressional Record details from the search result
    records = search_soup.find_all('a', class_='result-heading')
    
    for record in records:
        record_href = record['href']
        record_title = record.get_text(strip=True)
        record_url = base_url + record_href
        
        # Scrape the individual record page
        record_soup = scrape_content(record_url)

        # Extract text and date from the page
        date = record_soup.find('time').get_text(strip=True)
        all_text = record_soup.find_all('pre')
        text = '\n'.join([text.get_text(strip=True) for text in all_text])
        
        # Assuming the Remarks are within a section with a specific id or class
        remarks_section = record_soup.find('div', id='remarks') or record_soup.find('div', class_='remarks')
        remarks = remarks_section.get_text(strip=True) if remarks_section else 'No remarks found'
        
        # Append to DataFrame
        results_df = results_df.append({'Title': record_title, 'Date': date, 'Text': text, 'Remarks': remarks}, ignore_index=True)
    
    # Save results to CSV
    results_df.to_csv('Congressional_Record_Boehner.csv', index=False)

if __name__ == "__main__":
    main()


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Ensure nltk components are downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Example function to preprocess text
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

# Vectorization
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess_text)

# Example dataset
texts = ["I support this bill because it benefits our society.", 
         "This bill is harmful, and I oppose it.",
         "I have no comments on this matter."]
labels = [1, -1, 0]  # 1: support, -1: oppose, 0: neutral

# Transform texts
X = tfidf_vectorizer.fit_transform(texts)
y = labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

# Classify new text
# import text from sampletext.txt
with open('sampletext.txt', 'r') as file:
    new_text = file.read()
new_text = "I think this bill could be improved, but it's a good start."
new_vector = tfidf_vectorizer.transform([new_text])
prediction = model.predict(new_vector)
print(f"Prediction for new text: {prediction}")

#  Classify for Boeher
 


In [34]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re


# Sample data with possible sentiments and classification labels
data = [
    {'text': "I support this bill because it benefits our society.", 'label': 1},
    {'text': "This bill is harmful, and I oppose it.", 'label': -1},
    {'text': "I have no comments on this matter.", 'label': 0},
    {'text': "The provisions of this bill will advance our economic growth, which is why I fully back it.", 'label': 1},
    {'text': "This legislation will negatively impact our education system, and I cannot support it.", 'label': -1},
    {'text': "I am still reviewing the details of the bill and have not formed an opinion yet.", 'label': 0},
    {'text': "I enthusiastically endorse this bill due to its environmental benefits.", 'label': 1},
    {'text': "I must oppose this bill as it risks our national security.", 'label': -1},
    {'text': "At this time, I choose to reserve my judgement until further information is available.", 'label': 0},
    {'text': "After careful consideration, I believe this bill will significantly help our community, hence my support.", 'label': 1},
    {'text': "I find the measures in this bill unacceptable and detrimental to our values, so I oppose it.", 'label': -1},
    {'text': "I am undecided on this issue as more analysis is needed.", 'label': 0},
    {'text': "This bill will strengthen our infrastructure, which is why I support it.", 'label': 1},
    {'text': "Given the financial burden this bill imposes, I am against it.", 'label': -1},
    {'text': "I have not yet decided where I stand on this legislation.", 'label': 0},
    {'text': "I wholeheartedly support this bill for its positive impact on health care.", 'label': 1},
    {'text': "The bill threatens to erode our civil liberties, and I cannot support it.", 'label': -1},
    {'text': "I am currently abstaining from making a statement until further debate.", 'label': 0},
    {'text': "This bill will create jobs, which is why it has my full support.", 'label': 1},
    {'text': "I oppose this bill because it could potentially lead to environmental degradation.", 'label': -1},
    {'text': "I am on the fence about this bill and will listen to my constituents further.", 'label': 0}
]


# Splitting the dataset into texts and labels
texts = [point['text'] for point in data]
labels = [point['label'] for point in data]

# Split data into training and test sets
texts_train, texts_test, labels_train, labels_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Creating a model pipeline with TF-IDF Vectorizer and Multinomial Naive Bayes Classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Training the model
model.fit(texts_train, labels_train)

# Predicting the test set
predicted_labels = model.predict(texts_test)

# Printing the classification report
print(classification_report(labels_test, predicted_labels))

# Function to classify new texts
def predict_position(text):
    prediction = model.predict([text])
    return "Support" if prediction == 1 else "Against" if prediction == -1 else "No participation"

with open('sampletext.txt', 'r') as file:
    new_text = file.read()

# Predicting the position for a new text
prediction = predict_position(new_text)
print(f"Prediction for new text: {prediction}")


# Regular expression to find statements attributed to John Boehner
boehner_statements = re.compile(r'(Speaker Boehner|John Boehner|Boehner):? (.*)')
# Function to extract Boehner's statements from a given text file sampletext.txt
def extract_boehner_statements(file_path):
    boehner_statements = []
    with open(file_path, 'r') as file:
        for line in file:
            match = re.search(r'(Speaker Boehner|John Boehner|Boehner):? (.*)', line)
            if match:
                boehner_statements.append(match.group(2))
    return boehner_statements

# Extract Boehner's statements from the sample text file
statements = extract_boehner_statements('sampletext.txt')

# Print the extracted statements
for statement in statements:
    print(f"Boehner's statement: {statement}")

# Function to classify Boehner's statements
def classify_boehner_statements(statements):
    classified_statements = []
    for statement in statements:
        prediction = predict_position(statement)
        classified_statements.append((statement, prediction))
    return classified_statements


# Classify Boehner's statements
classified_boehner_statements = classify_boehner_statements(statements)

# Print the classified statements
for statement, prediction in classified_boehner_statements:
    print(f"Boehner's statement: {statement} - Prediction: {prediction}")


def extract_pelosi_statements_advanced(text):
    import re
    pelosi_statements = []
    in_pelosi_speech = False
    current_statement = []

    # Define regex for detecting any speaker or a new section in the text
    speaker_regex = re.compile(r'\sMr\.\s\w+\.\sMr\.\sSpeaker', re.M)
    pelosi_start_regex = re.compile(r'\sMs\.\sPELOSI\.', re.M)

    lines = text.split('\n')
    for i, line in enumerate(lines):
        # Check if a new speaker starts speaking
        if pelosi_start_regex.search(line):
            in_pelosi_speech = True
            # Skip the speaker's name part and capture the statement
            current_statement.append(line.split('.', 1)[-1].strip())
            continue
        
        if in_pelosi_speech:
            # If a new speaker section is detected and it's not Pelosi's continuation
            if speaker_regex.search(line) and not pelosi_start_regex.search(line):
                in_pelosi_speech = False
                pelosi_statements.append(' '.join(current_statement).strip())
                current_statement = []
                continue
            
            # Continue capturing Pelosi's speech
            current_statement.append(line.strip())

    # Add the last captured statement if any
    if in_pelosi_speech and current_statement:
        pelosi_statements.append(' '.join(current_statement).strip())

    return pelosi_statements

# Example usage assuming your file is properly named and located
pelosi_statements = extract_pelosi_statements('sampletext.txt')
print("Extracted Pelosi Statements:", pelosi_statements)


# classify for Kevin McCarthy
def extract_mccarthy_statements(file_path):
    mccarthy_statements = []
    pattern = re.compile(r'(Kevin McCarthy|McCarthy):?\s*(.*)')
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # print("Processing line:", line.strip())  # Debugging: show each line being processed
            match = pattern.search(line)
            if match:
                statement = match.group(2).strip()  # Capture the statement part and strip whitespace
                mccarthy_statements.append(statement)
                # print("Match found:", statement)  # Debugging: show the match found
            # else:
                # print("No match found in line.")  # Debugging: no match in this line
    return mccarthy_statements

# Example usage assuming your file is properly named and located
mccarthy_statements = extract_mccarthy_statements('sampletext.txt')

# Print the extracted statements
for statement in mccarthy_statements:
    print(f"McCarthy's statement: {statement}")

# Function to classify McCarthy's statements
def classify_mccarthy_statements(statements):
    classified_statements = []
    for statement in statements:
        prediction = predict_position(statement)
        classified_statements.append((statement, prediction))
    return classified_statements

# Classify McCarthy's statements
classified_mccarthy_statements = classify_mccarthy_statements(mccarthy_statements)

# Print the classified statements
for statement, prediction in classified_mccarthy_statements:
    print(f"McCarthy's statement: {statement} - Prediction: {prediction}")




              precision    recall  f1-score   support

          -1       0.25      1.00      0.40         1
           0       1.00      0.33      0.50         3
           1       0.00      0.00      0.00         2

    accuracy                           0.33         6
   macro avg       0.42      0.44      0.30         6
weighted avg       0.54      0.33      0.32         6

Prediction for new text: Against
Extracted Pelosi Statements: [', our distinguished', '. I thank the gentleman for yielding and for his leadership', '', '']
[', our distinguished', '. I thank the gentleman for yielding and for his leadership', '', '']
Pelosi's statement: , our distinguished
Pelosi's statement: . I thank the gentleman for yielding and for his leadership
Pelosi's statement: 
Pelosi's statement: 
McCarthy's statement: ), the distinguished House
McCarthy's statement: 
McCarthy's statement: 
McCarthy's statement: ), the distinguished House - Prediction: Against
McCarthy's statement:  - Prediction: Ag