<a href="https://colab.research.google.com/github/fedeholm/text-classifier-unified-agenda/blob/main/web_crawler_unified_agenda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
from collections import Counter
import pandas as pd

In [None]:
# Optional sanity check: confirm that the CSV of hyperlinks read by the web
# crawler can be loaded. Nothing later depends on this cell, but it is useful
# for catching path or encoding problems when dealing with convoluted file paths.

# Load the CSV file into a pandas DataFrame
links_csv_path = 'all_years_rules_links.csv'  # The name of the file matches the name of the file in the repo.
try:
    data_links = pd.read_csv(links_csv_path, encoding='latin-1')
except UnicodeDecodeError:
    print("Error: Could not decode the file using 'latin-1' encoding. Please try another encoding like 'cp1252'.")
except FileNotFoundError:
    # Report the path that was actually requested so the message cannot drift
    # away from the file name used in read_csv.
    print(f"Error: '{links_csv_path}' not found. Please make sure the file is in the correct directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
else:
    # Success path: show a preview so the column names can be checked by eye.
    print("CSV file loaded successfully.")
    print("First 5 rows of the DataFrame:")
    print(data_links.head())

In [None]:
# Webcrawler to extract designations. This chunk streamlines the reading of the data and retrieves the EO designations for each rule

# Load the CSV file into a pandas DataFrame.
# Each handler prints a hint and then re-raises: the crawl below cannot run
# without the data, and continuing would either fail with an unrelated
# NameError on `data_links` or silently reuse a `data_links` left over from an
# earlier cell.

try:
    data_links = pd.read_csv('all_years_rules_links.csv', encoding='latin-1')
except UnicodeDecodeError:
    print("Error: Could not decode the file using 'latin-1' encoding. Please try another encoding like 'cp1252'.")
    raise
except FileNotFoundError:
    print("Error: file not found. Please make sure the file is in the correct directory.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise


# DataFrame handed to the crawler below.
links_df = pd.DataFrame(data_links)


# Web crawler that captures the text following "EO 13771 Designation:" on each linked page

def crawl_and_extract_designations(df, url_column='Link', timeout=30):
    """
    Crawl each URL in a DataFrame column and record its EO 13771 designation.

    For every row, the page at ``df[url_column]`` is downloaded, reduced to
    plain text, and searched for the label "EO 13771 Designation:". The
    remainder of the first non-blank line after the label is stored in the
    'Designation_2' column. Rows whose page cannot be fetched, or whose text
    does not contain the label, keep ``None``. Progress and errors are
    printed per URL; a failure on one URL never stops the crawl.

    Args:
        df (pd.DataFrame): The input DataFrame containing URLs. It is modified
            in place: the 'Designation_2' column is added (or reset to None).
        url_column (str): The name of the column in the DataFrame containing URLs.
        timeout (float | tuple | None): Passed to ``requests.get`` as its
            ``timeout`` argument, so a server that never answers cannot hang
            the whole crawl. ``None`` disables the limit.

    Returns:
        pd.DataFrame: The same ``df`` object, with the 'Designation_2' column.

    Raises:
        ValueError: If ``url_column`` is not a column of ``df``.
    """
    if url_column not in df.columns:
        raise ValueError(f"DataFrame does not have a column named '{url_column}'")

    # Initialize column to store the extracted strings
    df['Designation_2'] = None

    # `\s*` skips any whitespace (including line breaks) after the label and
    # `(.*)` captures the rest of that line ('.' does not match a newline).
    search_pattern = re.compile(r'EO 13771 Designation:\s*(.*)')

    for index, url in df[url_column].items():
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            text = BeautifulSoup(response.content, 'html.parser').get_text()

            # Search for the pattern in the text
            match = search_pattern.search(text)

            if match:
                designation = match.group(1).strip()
                df.loc[index, 'Designation_2'] = designation
                print(f"Processed {url}: EO Designation: '{designation}'")
            else:
                print(f"Processed {url}: 'EO 13771 Designation' not found or pattern mismatch.")

        except requests.exceptions.RequestException as e:
            # Network/HTTP problems (including timeouts): report and move on.
            print(f"Error processing {url}: {e}")
        except Exception as e:
            # Deliberately broad: one malformed page must not abort a long crawl.
            print(f"An unexpected error occurred while processing {url}: {e}")

    return df

# Run the designation crawler on links_df, reading the URLs from its 'Link' column.
processed_df_designations = crawl_and_extract_designations(links_df, url_column='Link')

#print("\nDataFrame with extracted designations:")
#processed_df_designations

In [None]:
# Export the DataFrame with the extracted designations to CSV (index=False
# leaves the row index out of the file).
# NOTE(review): 'FILENAME.csv' looks like a placeholder - confirm the intended
# output name before running.
processed_df_designations.to_csv('FILENAME.csv', index=False)

In [None]:
############# IF NEEDED - LOAD A NEW VERSION OF THE DF

# Load the CSV file into a pandas DataFrame.
# Each handler prints a hint and then re-raises: without the data the next
# cell cannot crawl anything, and continuing would either fail with an
# unrelated NameError on `data_links` or silently reuse a stale `data_links`.

links_csv_path = 'all_years_rules_links.csv'  # The name of the file matches the name of the file in the repo.
try:
    data_links = pd.read_csv(links_csv_path, encoding='latin-1')
except UnicodeDecodeError:
    print("Error: Could not decode the file using 'latin-1' encoding. Please try another encoding like 'cp1252'.")
    raise
except FileNotFoundError:
    # Report the path that was actually requested so the message cannot drift
    # away from the file name used in read_csv.
    print(f"Error: '{links_csv_path}' not found. Please make sure the file is in the correct directory.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise


# Fresh DataFrame handed to the abstract crawler.
links_df = pd.DataFrame(data_links)

In [None]:
# Webcrawler to extract abstracts

# Web crawler that captures the text following "Abstract:" on each linked
# page

def crawl_and_extract_designations(df, url_column='Link', timeout=30):
    """
    Crawl each URL in a DataFrame column and record the rule's abstract.

    NOTE: despite its name, this function extracts abstracts. It redefines the
    designation crawler of the same name defined earlier in this notebook; the
    name is kept because the call right after this definition uses it.

    For every row, the page at ``df[url_column]`` is downloaded, reduced to
    plain text, and searched for the label "Abstract:". The remainder of the
    first non-blank line after the label is stored in the 'Abstract' column.
    Rows whose page cannot be fetched, or whose text does not contain the
    label, keep ``None``. Progress and errors are printed per URL; a failure
    on one URL never stops the crawl.

    Args:
        df (pd.DataFrame): The input DataFrame containing URLs. It is modified
            in place: the 'Abstract' column is added (or reset to None).
        url_column (str): The name of the column in the DataFrame containing URLs.
        timeout (float | tuple | None): Passed to ``requests.get`` as its
            ``timeout`` argument, so a server that never answers cannot hang
            the whole crawl. ``None`` disables the limit.

    Returns:
        pd.DataFrame: The same ``df`` object, with the 'Abstract' column.

    Raises:
        ValueError: If ``url_column`` is not a column of ``df``.
    """
    if url_column not in df.columns:
        raise ValueError(f"DataFrame does not have a column named '{url_column}'")

    # Initialize column to store the extracted strings
    df['Abstract'] = None

    # `\s*` skips any whitespace (including line breaks) after the label and
    # `(.*)` captures the rest of that line ('.' does not match a newline).
    search_pattern = re.compile(r'Abstract:\s*(.*)')

    for index, url in df[url_column].items():
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
            text = BeautifulSoup(response.content, 'html.parser').get_text()

            # Search for the pattern in the text
            match = search_pattern.search(text)

            if match:
                abstract_text = match.group(1).strip()
                df.loc[index, 'Abstract'] = abstract_text
                print(f"Processed {url}: Abstract: '{abstract_text}'")
            else:
                print(f"Processed {url}: 'Abstract' not found or pattern mismatch.")

        except requests.exceptions.RequestException as e:
            # Network/HTTP problems (including timeouts): report and move on.
            print(f"Error processing {url}: {e}")
        except Exception as e:
            # Deliberately broad: one malformed page must not abort a long crawl.
            print(f"An unexpected error occurred while processing {url}: {e}")

    return df

# Run the abstract crawler on links_df, reading the URLs from its 'Link' column.
processed_df_abstracts = crawl_and_extract_designations(links_df, url_column='Link')



In [None]:
# Export the DataFrame with the extracted abstracts to CSV (index=False leaves
# the row index out of the file).
# NOTE(review): 'FILENAME.csv' looks like a placeholder, and it is the same
# name the designations export writes to - confirm the intended output name so
# one export does not overwrite the other.
processed_df_abstracts.to_csv('FILENAME.csv', index=False)

Next we import the necessary packages and start coding our text classifier.

In [None]:
# !pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the labelled dataset: the 'Abstract' column is the input text (X) and
# the 'Designation' column is the label to predict (y).
df = pd.read_csv("TrainTest_designations_all_years.csv")
X, y = df['Abstract'], df['Designation']

In [None]:
# Count how many rows carry each 'Designation' label.
df['Designation'].value_counts()

In [None]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out for testing, and random_state is fixed so the split is the same on every
# run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=57,
)


In [None]:
# Each classifier is a scikit-learn Pipeline, i.e. a fixed sequence of steps.
# Step 'tfidf' converts the text to a numerical format with TfidfVectorizer;
# step 'clf' is the classifier that is trained on that representation.

pipeMNB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
pipeCNB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', ComplementNB()),
])
pipeSVC = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [None]:
# Try to improve the models with additional TfidfVectorizer arguments: English
# stop words are removed and n-grams of 1 to 3 words are used. Running this
# cell rebinds pipeMNB, pipeCNB and pipeSVC to the new pipelines.

pipeMNB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('clf', MultinomialNB()),
])
pipeCNB = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('clf', ComplementNB()),
])
pipeSVC = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 3))),
    ('clf', LinearSVC()),
])

In [None]:
# Train (fit) the MultinomialNB and ComplementNB pipelines on the training
# data, predict the labels of the TEST data, and print each model's accuracy,
# i.e. the share of test predictions that match the correct labels.
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.

predictMNB = pipeMNB.fit(X_train, y_train).predict(X_test)
print(f"MNB: {accuracy_score(y_test, predictMNB):.2f}")

predictCNB = pipeCNB.fit(X_train, y_train).predict(X_test)
print(f"CNB: {accuracy_score(y_test, predictCNB):.2f}")


In [None]:
# LinearSVC pipeline - best performing so far. Fit on the training data,
# predict the TEST labels, and print the accuracy.
predictSVC = pipeSVC.fit(X_train, y_train).predict(X_test)
print(f"SVC: {accuracy_score(y_test, predictSVC):.2f}")

In [None]:
# We can see how the classifier performs on a new piece of text

# noticed that 'new' 'updated' is attached to 'dereg'.

abstract = "This rule will safeguard health of outdoor workers"
# predict expects a collection of documents, so the single abstract is wrapped
# in a list and the first (only) prediction is printed.
result = pipeSVC.predict([abstract])
print("Result: ", result[0])