In [71]:
import pandas as pd
from gnews import GNews
import logging
import os
import time
import re
from collections import Counter

In [53]:
# Start the timer
start_time = time.time()

# Import the consolidated data set.
# os.path.join yields the same "data\3_intermediate\..." path on Windows and a
# valid path on other platforms, where a backslash is not a path separator.
file_path = os.path.join("data", "3_intermediate", "int_texas_border_report.csv")

border_report = pd.read_csv(file_path)

# From border_report, create a list of the unique agency_names.
# Missing values are dropped so that NaN never ends up in the list of names
# that is later interpolated into search queries.
agency_names = border_report['agency_name'].dropna().unique().tolist()

# Check to see if there are 85 unique agency_names in the list
print(len(agency_names))
# Inspect the list
print(agency_names[:5])

85
['ALAMO PD', 'ALPINE PD', 'ALTON PD', 'ANTHONY PD', 'BREWSTER CO SO']


In [79]:
# --- Logging setup ---------------------------------------------------------
# Directory for the log file, relative to the working directory like the data
# paths used in this notebook (a leading backslash would point at the root of
# the current drive instead of the project).
log_directory = "logs"
log_filename = "01_news_fetch_log.txt"
log_file_path = os.path.join(log_directory, log_filename)

# Ensure the directory exists
os.makedirs(log_directory, exist_ok=True)

# logging.basicConfig() does nothing when the root logger already has handlers
# (for example when this cell has been run before in the same kernel). Remove
# and close any existing handlers first so the format below really applies and
# no previously opened log file handle is leaked.
root_logger = logging.getLogger()
for existing_handler in list(root_logger.handlers):
    root_logger.removeHandler(existing_handler)
    existing_handler.close()

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(log_file_path),
                        logging.StreamHandler()
                    ])

# --- Search inputs ---------------------------------------------------------
# Test case:
# agency_names = ['Alamo PD']

# From border_report, create a list of the unique agency_names.
# Missing values are dropped so that "nan" is never used as a search term.
agency_names = border_report['agency_name'].dropna().unique().tolist()

corruption_keywords = [
    'officer misconduct',
    'corruption investigation',
    'bribery',
    'administrative leave',
]

# Initialize GNews
google_news = GNews()
google_news.max_results = 10

# List to hold all news results, plus the queries that could not be fetched
all_news = []
failed_queries = []

# Every (agency name, keyword) pair that will be searched
search_pairs = [
    (agency_name, corruption_keyword)
    for agency_name in agency_names
    for corruption_keyword in corruption_keywords
]
total_combinations = len(search_pairs)

# Start the timer
start_time = time.time()

# --- Fetch -----------------------------------------------------------------
for current_combination, (agency_name, corruption_keyword) in enumerate(search_pairs, start=1):
    # Generate search query
    search_query = f'{agency_name} {corruption_keyword}'
    logging.info(f'Fetching news for: {search_query} ({current_combination}/{total_combinations})')

    # Get news for the current search query. A failure of a single request is
    # logged and skipped so that it does not discard the results collected so
    # far in this long-running loop.
    try:
        news = google_news.get_news(search_query)
    except Exception:
        logging.exception(f'Failed to fetch news for: {search_query}')
        failed_queries.append(search_query)
        continue

    # Tag each news item with the agency name and keyword that produced it.
    # `news or []` tolerates a None / empty response.
    for item in news or []:
        all_news.append({**item, 'agency_name': agency_name, 'corruption_keyword': corruption_keyword})

# --- Assemble the DataFrame ------------------------------------------------
news_df = pd.DataFrame(all_news)

# Rearrange columns to have agency_name and corruption_keyword first.
# reindex() also works when no article was returned at all, in which case the
# frame has no columns yet and plain column selection would raise a KeyError.
lead_columns = ['agency_name', 'corruption_keyword']
cols = lead_columns + [col for col in news_df.columns if col not in lead_columns]
news_df = news_df.reindex(columns=cols)

# Log the completion of the process
logging.info("News fetching completed. DataFrame created.")
if failed_queries:
    logging.warning(f'{len(failed_queries)} of {total_combinations} queries failed and were skipped.')

# End the timer and calculate total runtime
end_time = time.time()
total_runtime = end_time - start_time

# Convert the total runtime to minutes and seconds
minutes, seconds = divmod(total_runtime, 60)
print(f"Total Run Time: {minutes:.0f} minutes, {seconds:.2f} seconds.")

# --- Persist ---------------------------------------------------------------
# Save the DataFrame to a CSV file for future reference. The path is built
# with os.path.join so it is valid on every platform, and the directory is
# created up front so the fetched data is not lost to a missing folder.
output_path = os.path.join("data", "2_staging", "stg_police_news.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
news_df.to_csv(output_path, index=False)
logging.info("DataFrame saved to stg_police_news.csv.")


03/04/2024 12:49:06 PM - Fetching news for: ALAMO PD officer misconduct (1/340)
03/04/2024 12:49:13 PM - Fetching news for: ALAMO PD corruption investigation (2/340)
03/04/2024 12:49:19 PM - Fetching news for: ALAMO PD bribery (3/340)
03/04/2024 12:49:24 PM - Fetching news for: ALAMO PD administrative leave (4/340)
03/04/2024 12:49:29 PM - Fetching news for: ALPINE PD officer misconduct (5/340)
03/04/2024 12:49:33 PM - Fetching news for: ALPINE PD corruption investigation (6/340)
03/04/2024 12:49:37 PM - Fetching news for: ALPINE PD bribery (7/340)
03/04/2024 12:49:38 PM - Fetching news for: ALPINE PD administrative leave (8/340)
03/04/2024 12:49:42 PM - Fetching news for: ALTON PD officer misconduct (9/340)
03/04/2024 12:49:48 PM - Fetching news for: ALTON PD corruption investigation (10/340)
03/04/2024 12:49:52 PM - Fetching news for: ALTON PD bribery (11/340)
03/04/2024 12:49:57 PM - Fetching news for: ALTON PD administrative leave (12/340)
03/04/2024 12:50:01 PM - Fetching news for

In [80]:
# Preview the first five rows of news_df.
news_df.head()

Total Run Time: 18 minutes, 11.59 seconds.


Unnamed: 0,agency_name,corruption_keyword,title,description,published date,url,publisher
0,ALAMO PD,officer misconduct,Black jogger's lawsuit against San Antonio hea...,Black jogger's lawsuit against San Antonio hea...,"Tue, 02 May 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiWWh0d...,"{'href': 'https://sanantonioreport.org', 'titl..."
1,ALAMO PD,officer misconduct,Mission Police address arrest of two officers ...,Mission Police address arrest of two officers ...,"Sat, 25 Mar 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiX2h0d...,"{'href': 'https://www.valleycentral.com', 'tit..."
2,ALAMO PD,officer misconduct,Stephen Clare bail hearing: SAPD details crime...,Stephen Clare bail hearing: SAPD details crime...,"Thu, 18 May 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiWWh0d...,"{'href': 'https://www.expressnews.com', 'title..."
3,ALAMO PD,officer misconduct,The FBI Used an Undercover Cop With Pink Hair ...,The FBI Used an Undercover Cop With Pink Hair ...,"Tue, 21 Mar 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiRmh0d...,"{'href': 'https://theintercept.com', 'title': ..."
4,ALAMO PD,officer misconduct,“Systemic failures” in Uvalde shooting went fa...,“Systemic failures” in Uvalde shooting went fa...,"Sun, 17 Jul 2022 07:00:00 GMT",https://news.google.com/rss/articles/CBMiXmh0d...,"{'href': 'https://www.texastribune.org', 'titl..."


In [72]:
def normalize_text(text):
    """
    Normalize text by making it lower case and removing non-alphanumeric characters.

    Every run of characters that is not a letter or digit (underscores
    included) is collapsed into a single space, and leading/trailing
    whitespace is stripped.

    Parameters:
    - text (str): The text to normalize. Non-string values (e.g. None or a
      NaN from a missing DataFrame cell) are treated as empty text.

    Returns:
    - str: The normalized text, or an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # \W alone keeps underscores, so they are listed explicitly.
    return re.sub(r'[\W_]+', ' ', text.lower()).strip()


def _count_phrase(tokens, phrase_tokens):
    """Count how often phrase_tokens occurs as a contiguous run inside tokens."""
    width = len(phrase_tokens)
    if width == 0 or width > len(tokens):
        return 0
    return sum(
        tokens[start:start + width] == phrase_tokens
        for start in range(len(tokens) - width + 1)
    )


def calculate_relevance_score(title, agency_name, corruption_keyword):
    """
    Calculate the relevance score based on the presence of the agency name and
    the corruption keyword in the title.

    The score is the sum of:
    - for each word of the agency name, the number of times that word occurs
      in the title (partial matches of the name therefore count), and
    - the number of times the corruption keyword occurs in the title as a
      whole phrase.

    Parameters:
    - title (str): The title of the article.
    - agency_name (str): The name of the agency.
    - corruption_keyword (str): The keyword related to corruption. May consist
      of several words, e.g. 'officer misconduct'.

    Returns:
    - int: The calculated relevance score.
    """
    # Normalize and tokenize all inputs for consistent comparison
    title_tokens = normalize_text(title).split()
    agency_tokens = normalize_text(agency_name).split()
    keyword_tokens = normalize_text(corruption_keyword).split()

    # Word frequencies of the title, used for the per-word agency name match
    word_frequencies = Counter(title_tokens)

    # Agency name - considering partial matches as valid
    score = sum(word_frequencies[part] for part in agency_tokens)

    # Corruption keyword - matched as a phrase against the token sequence.
    # Looking a multi-word keyword up in the per-word frequencies can never
    # match, because every key there is a single word.
    score += _count_phrase(title_tokens, keyword_tokens)

    return score


In [78]:
# Assuming 'news_df' is your actual dataframe with 'agency_name', 'corruption_keyword', and 'title' columns
# Score each article row from its title, agency name and corruption keyword,
# and store the result in a new 'relevance_score' column.
relevance_scores = news_df.apply(
    lambda article: calculate_relevance_score(
        article['title'],
        article['agency_name'],
        article['corruption_keyword'],
    ),
    axis=1,
)
news_df['relevance_score'] = relevance_scores

# Put the identifying columns first, with relevance_score in fourth position;
# every other column keeps its existing relative order.
front_columns = ['agency_name', 'corruption_keyword', 'title', 'relevance_score']
cols = front_columns + [name for name in news_df.columns if name not in front_columns]
news_df = news_df[cols]

# Show the first 5 rows to inspect the result
news_df.head()


Unnamed: 0,agency_name,corruption_keyword,title,relevance_score,description,published date,url,publisher
0,Alamo PD,officer misconduct,Black jogger's lawsuit against San Antonio hea...,0,Black jogger's lawsuit against San Antonio hea...,"Tue, 02 May 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiWWh0d...,"{'href': 'https://sanantonioreport.org', 'titl..."
1,Alamo PD,officer misconduct,Mission Police address arrest of two officers ...,0,Mission Police address arrest of two officers ...,"Sat, 25 Mar 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiX2h0d...,"{'href': 'https://www.valleycentral.com', 'tit..."
2,Alamo PD,officer misconduct,Stephen Clare bail hearing: SAPD details crime...,0,Stephen Clare bail hearing: SAPD details crime...,"Thu, 18 May 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiWWh0d...,"{'href': 'https://www.expressnews.com', 'title..."
3,Alamo PD,officer misconduct,The FBI Used an Undercover Cop With Pink Hair ...,0,The FBI Used an Undercover Cop With Pink Hair ...,"Tue, 21 Mar 2023 07:00:00 GMT",https://news.google.com/rss/articles/CBMiRmh0d...,"{'href': 'https://theintercept.com', 'title': ..."
4,Alamo PD,officer misconduct,“Systemic failures” in Uvalde shooting went fa...,0,“Systemic failures” in Uvalde shooting went fa...,"Sun, 17 Jul 2022 07:00:00 GMT",https://news.google.com/rss/articles/CBMiXmh0d...,"{'href': 'https://www.texastribune.org', 'titl..."


In [None]:
# TODO: Talk to Polina to see how far I can push this analysis.
# Once I have commented on the human trafficking part, I think I have a good first draft.