# 509 Final Project

## Rsolve dependencies

In [1]:
! pip install newsapi-python



## Globally import libraries

In [132]:
import numpy as np
import pandas as pd
import pymysql as mysql
import matplotlib.pyplot as plt
import os
import shutil
import re
import logging
import time
import zipfile
import requests
from bs4 import BeautifulSoup
import datetime
import re
import regex as rex
from collections import defaultdict, Counter
import random
import requests
from bs4 import BeautifulSoup

import sqlite3
import nltk
from string import punctuation
from nltk.corpus import stopwords
import re
import emoji
from nltk.metrics import ConfusionMatrix
#import mysql.connector

# Set pandas global options
pd.options.display.max_rows = 17

Data Preprocessing Functions:

In [98]:
punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation - {"#"}

# Stopwords - added the 'nan' to this to remove nulls:
# next step could be to add pronouns like she/her, he/him, etc.

sw = stopwords.words("english")
sw = sw + ['nan']

# Two useful regex
whitespace_pattern = re.compile(r"\s+")
hashtag_pattern = re.compile(r"^#[0-9a-zA-Z]+")

def remove_stop(tokens) :
    # modify this function to remove stopwords

    return[t for t in tokens if t not in sw]
 
def remove_punctuation(text, punct_set=tw_punct) : 
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    """ Splitting on whitespace rather than the book's tokenize function. That 
        function will drop tokens like '#hashtag' or '2A', which we need for Twitter. """
    
    return([item.lower() for item in whitespace_pattern.split(text)])
    

# two pipelines to either tokenize or simply remove punctuation
# and lowercase as we will need to extract feature words:

full_pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]
first_pipeline = [str.lower, remove_punctuation]


def prepare(text, pipeline) : 
    tokens = str(text)
    
    for transform in pipeline : 
        tokens = transform(tokens)
        
    return(tokens)

In [99]:
def conv_features(text,fw) :
     feature_set=dict()
     for word in text.split():
          if word in fw:
               feature_set[word]=True
     return(feature_set)

In [2]:
today = datetime.date.today()
print(today)
print(type(today))

2023-06-03
<class 'datetime.date'>


## Connect to NewsAPI client

In [3]:
from newsapi import NewsApiClient

api_key = os.environ['NewsAPIKey']

# Init
newsapi = NewsApiClient(api_key=api_key)

## Pull article info from API

# /v2/top-headlines/sources
sources = newsapi.get_sources()
print(sources)

In [6]:
def news_api_urls(q=None,
                  s=None,
                  d_from='2023-05-01',
                  d_to='2023-05-31',
                  api_lst=[]):
    all_articles = newsapi.get_everything(q=q,
                                          sources=s,
                                          from_param=d_from,
                                          to=d_to,
                                          language='en',
                                          sort_by='relevancy',
                                          page=1)

    print(type(all_articles))
    print(all_articles)
    #print('Article list: ', all_articles['articles'])
    for article in all_articles['articles']:
        print('Source ID:', article['source']['id'])
        print('Source name:', article['source']['name'])
        print('Author:', article['author'])
        print('Title:', article['title'])
        print('URL:', article['url'])
        print('Publish date:', article['publishedAt'])
        print('Article text:', article['content'], '\n')

    # Create a list of tuples from the dictionary data
    source_data01 = [(a['source']['name'],
                      a['author'],
                      a['title'],
                      a['url'],
                      a['publishedAt'],
                      a['content'])
                     for a in all_articles['articles']]

    api_lst.extend(source_data01)
    #print(api_lst)
    print(len(api_lst))

## Connect to API to access URLs

### Set API filter parameters

In [157]:
# Citation comes from Allsides Media Bias Chart
#source_lst = ['cnn', 'reuters', 'fox-news']
#source_lst = ['cnn', 'newsweek', 'fox-news']
#source_lst = ['newsweek']
#source_lst = ['axios']
#source_lst = ['cnn']
#source_lst = ['cnn','fox-news', 'msnbc']
source_lst = ['cnn','fox-news','breitbart-news']

#date_lst = ['2023-06-01', '2023-05-31']
#date_lst = ['2023-05-30', '2023-05-29']
#date_lst = ['2023-05-30']
date_lst = ['2023-05-30']

# Comes from academci text (fill in citation from DF)
#q_word_lst = ['gender OR male OR female OR transgender', 'security AND (social OR national)',
#              'justice OR surveillance', 'healthcare OR "health care"',
#              '(political AND (bias OR party)) OR republican OR democrat OR election', '(policy AND (drug OR "affirmative action")) OR regulate OR regulation']

q_word_lst = ['justice OR surveillance', 'healthcare OR "health care"',
              '(political AND (bias OR party)) OR republican OR democrat OR election',
              'security AND (social OR national)']

### Access API

In [158]:
api_record_lst01 = []
for s in source_lst:
    print(f'Source: {s}')
    for d in date_lst:
        print(f'Date: {d}')
        for q in q_word_lst:
            print(f'Query word: {q}')
            time.sleep(5 + 11 * random.random())
            news_api_urls(q=q,
                          s=s,
                          d_from=d,
                          d_to=d,
                          api_lst=api_record_lst01)
            print(s, d, q)
    time.sleep(10 + 13 * random.random())

print(api_record_lst01)
print(len(api_record_lst01))

Source: cnn
Date: 2023-05-30
Query word: justice OR surveillance
<class 'dict'>
{'status': 'ok', 'totalResults': 10, 'articles': [{'source': {'id': 'cnn', 'name': 'CNN'}, 'author': 'Hannah Rabinowitz', 'title': 'US expatriate arrested after allegedly threatening US senators and Marines', 'description': 'An American living abroad was arrested last week on a charge of threatening US senators and political groups over the number of unsolicited political emails he was receiving, the Justice Department announced Tuesday.', 'url': 'https://www.cnn.com/2023/05/30/politics/us-expatriate-arrested-allegedly-threatening-us-senators/index.html', 'urlToImage': 'https://media.cnn.com/api/v1/images/stellar/prod/210209035327-doj-seal.jpg?c=16x9&q=w_800,c_fill', 'publishedAt': '2023-05-30T22:48:22Z', 'content': 'An American living abroad was arrested last week on a charge of threatening US senators and political groups over the number of unsolicited political emails he was receiving, the Justice Depart

Article data saved into dataframe:

In [159]:
api_record_df = pd.DataFrame (list (api_record_lst01),
                                   columns = ['Source','Author','Title','URL','date','content'])

CNN only:

In [93]:
#import requests
#from bs4 import BeautifulSoup
#article=[]

#for index, rows in api_record_df.iterrows():
#    article=[]
#    url = api_record_df.at[index,'URL']
#    response = requests.get(url)
#    time.sleep(5 + 10*random.random())

#    soup = BeautifulSoup(response.text, 'html.parser')
#    article_body = soup.find('div', class_='article__content-container')
    
#    if article_body is not None:
#        article_text = article_body.get_text()
#        api_record_df.at[index,'full_text']=article_text


#    #article_body = soup.find('div', {'class': 'article__body-content'})
#    #article_body = soup.find('div', class_='paragraph inline-placeholder')
#    #article_text = soup.find('div', {'class': 'zn-body__paragraph'})




FOX, CNN, Reuters, and WSJ:

In [160]:
article=[]

for index, rows in api_record_df.iterrows():
    article=[]
    url = api_record_df.at[index,'URL']
    response = requests.get(url)
    time.sleep(5 + 10*random.random())

    soup = BeautifulSoup(response.text, 'html.parser')
    #forcnn
    article_body = soup.find('div', class_='article__content-container')
    
    if article_body is None: #forfoxandreuters
        article_body = soup.find('p', class_="speakable")
        #if article_body is None:
            #article_body = soup.find('div', class_='')
        if article_body is None: #forWSJ
            article_body = soup.find('div',re.compile('^article-container *'))
            #else:
                #break
                                         

    if article_body is not None:
        article_text = article_body.get_text()
        api_record_df.at[index,'full_text']=article_text



In [153]:
# Debug
#soup.get_text()

In [161]:
api_record_df['Source'].value_counts()

Fox News                   84
Breitbart News             53
CNN                        37
The Wall Street Journal     1
Name: Source, dtype: int64

In [162]:
api_record_df.isnull().sum()

Source        0
Author        0
Title         0
URL           0
date          0
content       0
full_text    56
dtype: int64

In [165]:
api_record_df=api_record_df.dropna()

Save dataframe to csv file:

In [93]:
api_record_df.to_csv("News_API_FOX_CNN_Reuters.csv", sep=',')

In [89]:
#api_record_df_fox_msnbc.isna().sum()

In [166]:
api_record_df.tail(100)

Unnamed: 0,Source,Author,Title,URL,date,content,full_text
19,CNN,Christian Edwards,Why did ethnic Serbs attack NATO peacekeepers ...,https://www.cnn.com/2023/05/30/europe/serbia-k...,2023-05-30T17:01:42Z,Dozens of NATO peacekeepers were injured after...,\n\n\n\n\nCNN\n — \n \n\n\n Do...
20,CNN,"David Wright,Steve Contorno",Watchdog group files FEC complaint over planne...,https://www.cnn.com/2023/05/30/politics/super-...,2023-05-30T20:38:24Z,A watchdog group on Tuesday filed a complaint ...,\n\n\n\n\nCNN\n — \n \n\n\n Fl...
21,CNN,"Clare Foran,Lauren Fox,Haley Talbot,Melanie Za...",McCarthy faces key test ahead of House vote on...,https://www.cnn.com/2023/05/30/politics/house-...,2023-05-30T16:29:31Z,House Speaker Kevin McCarthy helped secure a d...,\n\n\n\n\nCNN\n — \n \n\n\n Ho...
22,CNN,AJ Willingham,"5 things to know for May 30: Debt limit, Turke...",https://www.cnn.com/2023/05/30/us/five-things-...,2023-05-30T10:37:53Z,Going on a cruise soon? You may want to skip t...,\n\n\n\n\nCNN\n — \n \n\n\n\n Ge...
24,CNN,Hannah Rabinowitz,US expatriate arrested after allegedly threate...,https://www.cnn.com/2023/05/30/politics/us-exp...,2023-05-30T22:48:22Z,An American living abroad was arrested last we...,\n\n\n\n\nCNN\n — \n \n\n\n An...
...,...,...,...,...,...,...,...
118,Fox News,Yael Halon,DeSantis scorches 'totally inadequate' debt ce...,https://www.foxnews.com/media/desantis-scorche...,2023-05-30T21:50:51Z,Florida Governor and GOP presidential candidat...,Florida Governor and GOP presidential candidat...
119,Fox News,Danielle Wallace,"Biden, Cruz condemn Uganda law allowing death ...",https://www.foxnews.com/politics/biden-cruz-co...,2023-05-30T13:51:30Z,"Sen. Ted Cruz, R-Texas, joined President Biden...","Sen. Ted Cruz, R-Texas, joined President Biden..."
120,Fox News,"Bret Baier, Amy Munneke",'Special Report' Spotlight: Private companies ...,https://www.foxnews.com/media/special-report-s...,2023-05-30T23:01:07Z,Adversaries like Russia and China are increasi...,Adversaries like Russia and China are increasi...
127,Breitbart News,"Joel B. Pollak, Joel B. Pollak",Report: Man Let Go by Gascon After Stabbing Co...,https://www.breitbart.com/crime/2023/05/30/rep...,2023-05-30T22:32:00Z,"A man who avoided jail time, thanks to Los Ang...","Stefen Sutherland, 31, was charged with assaul..."


In [10]:
#api_record_lst01 = [(1, 1, 1), (2, 2, 2), (3, 3, 3), (1, 1, 1), (4, 4, 4), (3, 3, 3)]
#print(api_record_lst01)
api_record_set01 = set(api_record_lst01)
print(api_record_set01)
api_record_lst02 = list(api_record_set01)
print(api_record_lst02)
print(len(api_record_lst02))

109


Create feature word set based on common words in dataset:

In [167]:
# Tokenize text:

api_record_df['tokens']= api_record_df['full_text'].apply(prepare,
                                                     pipeline=full_pipeline)

In [168]:
# Clean data into lowercase/no punctuation:

api_record_df['cleaner_text']= api_record_df['full_text'].apply(prepare,
                                                     pipeline=first_pipeline)

In [169]:
# Add target variable values:

api_record_df['Political_Lean'] = np.where((api_record_df['Source'] == "CNN"), 'Left', 'Right')

In [170]:
api_record_df.head(100)

Unnamed: 0,Source,Author,Title,URL,date,content,full_text,tokens,cleaner_text,Political_Lean
0,CNN,Hannah Rabinowitz,US expatriate arrested after allegedly threate...,https://www.cnn.com/2023/05/30/politics/us-exp...,2023-05-30T22:48:22Z,An American living abroad was arrested last we...,\n\n\n\n\nCNN\n — \n \n\n\n An...,"[, cnn, —, american, living, abroad, arrested,...",\n\n\n\n\ncnn\n — \n \n\n\n an...,Left
1,CNN,"Nick Valencia,Christina Maxouris,Devon M. Sayers",The 11-year-old boy shot in the chest by Missi...,https://www.cnn.com/2023/05/30/us/mississippi-...,2023-05-30T21:15:44Z,The 11-year-old boy who was shot in the chest ...,\n\n\n\n\nCNN\n — \n \n\n\n Th...,"[, cnn, —, 11yearold, boy, shot, chest, missis...",\n\n\n\n\ncnn\n — \n \n\n\n th...,Left
2,CNN,Eric Levenson,Opening statements set for today in Pittsburgh...,https://www.cnn.com/2023/05/30/us/pittsburgh-t...,2023-05-30T12:51:47Z,Opening statements are set for Tuesday in the ...,\n\n\n\n\nCNN\n — \n \n\n\n Th...,"[, cnn, —, federal, death, penalty, trial, man...",\n\n\n\n\ncnn\n — \n \n\n\n th...,Left
3,CNN,Nicki Brown,Colleges will soon have option to hide student...,https://www.cnn.com/2023/05/30/us/college-comm...,2023-05-30T17:37:28Z,Colleges will soon have the option to hide stu...,\n\n\n\n\nCNN\n — \n \n\n\n Co...,"[, cnn, —, colleges, soon, option, hide, stude...",\n\n\n\n\ncnn\n — \n \n\n\n co...,Left
4,CNN,Rob Frehse,Man arrested and charged with first-degree mur...,https://www.cnn.com/2023/05/30/us/new-jersey-c...,2023-05-30T22:34:22Z,A man in Virginia has been arrested and charge...,\n\n\n\n\nCNN\n — \n \n\n\n A ...,"[, cnn, —, man, virginia, arrested, charged, f...",\n\n\n\n\ncnn\n — \n \n\n\n a ...,Left
...,...,...,...,...,...,...,...,...,...,...
99,Fox News,Lawrence Richard,Texas legislature's first special session focu...,https://www.foxnews.com/politics/texas-legisla...,2023-05-30T06:00:33Z,Texas Gov. Greg Abbott revealed agenda items f...,Texas Gov. Greg Abbott revealed agenda items f...,"[texas, gov, greg, abbott, revealed, agenda, i...",texas gov greg abbott revealed agenda items fo...,Right
100,Fox News,Brianna Herlihy,Debt deal with Biden a 'betrayal' that could c...,https://www.foxnews.com/politics/debt-deal-bid...,2023-05-30T17:51:11Z,"Rep. Chip Roy, R-Texas, a key member the House...","Rep. Chip Roy, R-Texas, a key member the House...","[rep, chip, roy, rtexas, key, member, house, f...",rep chip roy rtexas a key member the house fre...,Right
101,Fox News,Hanna Panreck,Biden team using 'public staging' to boost Kam...,https://www.foxnews.com/media/biden-team-using...,2023-05-30T20:03:19Z,President Biden and the White House are report...,President Biden and the White House are report...,"[president, biden, white, house, reportedly, p...",president biden and the white house are report...,Right
102,Fox News,"Sen. Chuck Grassley, Bill Cassidy",Biden's student loan handout: Senate should ju...,https://www.foxnews.com/opinion/bidens-student...,2023-05-30T14:00:20Z,The quickly approaching debt limit gives Congr...,The quickly approaching debt limit gives Congr...,"[quickly, approaching, debt, limit, gives, con...",the quickly approaching debt limit gives congr...,Right


In [171]:
# Obtain total counts of words in the entire corpus:

word_dist=Counter([item for sublist in api_record_df['tokens'] 
                   for item in sublist])

Preparing data for classifier ingestion (as dictionary)

In [172]:
# generate the dict data to then be filtered through
# the filtered_words list:

news_data=dict()

    
news_data=[(api_record_df.at[index,'cleaner_text'],
            api_record_df.at[index,'Political_Lean']) for (index,row) in api_record_df.iterrows()]

Determine featureword set:

In [173]:
word_cutoff=5
feature_words2=[]
for word, count in word_dist.items() :
    #for convention_data_df['word_dist']
    if count > word_cutoff :
        feature_words2.append(word)
            
print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words2)} as features in the model.")

With a word cutoff of 5, we have 953 as features in the model.


In [155]:
#feature_words2

In [175]:
# remove cnn and fox-news from the key featurewords:

feature_words2.remove("cnn")
feature_words2.remove("fox")
#feature_words2.remove("breitbart")


In [176]:
# Filter the data through the feature words set
# determined above:

featuresets=dict()
featuresets = [(conv_features(text,feature_words2), 
                lean) for (text, lean) in news_data]

In [156]:
#featuresets

Rough Naive-Bayes Classifier setup and run-through:

In [177]:
random.seed(20220507)
random.shuffle(featuresets)
test_size = 20


In [178]:
test_set=dict()
train_set=dict()
test_set, train_set = featuresets[:test_size], featuresets[test_size:]

In [179]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.3


In [180]:
# Confusion Matrix:

predicted_labels = [classifier.classify(features) for features, 
                    label in test_set]
gold_labels = [label for features, label in test_set]

cm = ConfusionMatrix(gold_labels, predicted_labels)
print(cm.pretty_format(sort_by_count=True, show_percents=True))

      |      R        |
      |      i      L |
      |      g      e |
      |      h      f |
      |      t      t |
------+---------------+
Right |     <.> 70.0% |
 Left |      . <30.0%>|
------+---------------+
(row = reference; col = test)



## Initiate MySQL connection

In [14]:
'''Set local environment variables to hide user name & password citation:
https://www.geeksforgeeks.org/how-to-hide-sensitive-credentials-using-python/'''

user_name = "sqluser"
user_pass = "password"

#user_name = os.environ['MySQLUSRAC']
#user_pass = os.environ['MySQLPWDAC']

# Instantiate connection
db_conn = mysql.connect(host='localhost',
                        port=int(3306),
                        user=user_name,
                        passwd=user_pass,
                        db='509_final_proj')

# Create a cursor object
cursor = db_conn.cursor()

In [15]:
tbl_names = pd.read_sql('SHOW TABLES', db_conn)

display(tbl_names)
print(type(tbl_names))



Unnamed: 0,Tables_in_509_final_proj


<class 'pandas.core.frame.DataFrame'>


### Establish logging policy

In [20]:
'''Logging citations (see additional code in following code blocks:
OpenAI. (2021). ChatGPT [Computer software]. https://openai.com/;
https://docs.python.org/3/howto/logging.html#logging-basic-example;
https://docs.python.org/3/howto/logging.html#logging-to-a-file;
https://docs.python.org/3/howto/logging-cookbook.html#using-a-rotating-log-file-handler;
https://docs.python.org/3/howto/logging-cookbook.html#using-a-timed-rotating-file-handler'''

# Set up logging
logging.basicConfig(level=logging.INFO,
                    filename='pymysql.log',
                    filemode='a',
                    format='>>>>>>>>>>>>>><<<<<<<<<<<<<<\n%(asctime)s - %(levelname)s - %(message)s')

### Update individual tables

#### Update `news_articles` table from API

In [21]:
nat_tbl_name = 'nar_temp'
nwa_tbl_name = 'news_articles'

In [22]:
'''Using cursor and loading into temp file:
OpenAI. (2021). ChatGPT [Computer software]. https://openai.com/;
https://pynative.com/python-mysql-insert-data-into-database-table/'''

# Execute query and measure execution time
start_time = time.time()

# Wipe temp table
try:
    nat_dlt_tble_stmnt = f"""DELETE FROM {nat_tbl_name}"""
    cursor.execute(nat_dlt_tble_stmnt)
    logging.info(f'Successfully executed query:\n{nat_dlt_tble_stmnt}\n\nRecords scanned: {cursor.rowcount}')
except mysql.Error as e:
    logging.error(f'Error executing query:\n{nat_dlt_tble_stmnt}\n\n{e}')
finally:
    end_time = time.time()
    logging.info(f'Time taken: {end_time - start_time:.3f} seconds\n>>>>>>>>>>>>>><<<<<<<<<<<<<<\n\n')

# Execute query and measure execution time
start_time = time.time()

# Load data from CSV file into a temporary table
try:
    nat_csv_load_stmnt = f"""
    INSERT INTO {nat_tbl_name}
    (
    source_name,
    author,
    title,
    url,
    publish_date,
    content
    )
    VALUES (%s, %s, %s, %s, %s, %s)
    """

    # Execute the query with multiple values
    cursor.executemany(nat_csv_load_stmnt, api_record_lst02)
    #cursor.execute(nat_csv_load_stmnt)
    logging.info(f'Successfully executed query:\n{nat_csv_load_stmnt}\n\nRecords scanned: {cursor.rowcount}')
except mysql.Error as e:
    logging.error(f'Error executing query:\n{nat_csv_load_stmnt}\n\n{e}')
finally:
    end_time = time.time()
    logging.info(f'Time taken: {end_time - start_time:.3f} seconds\n>>>>>>>>>>>>>><<<<<<<<<<<<<<\n\n')

# Execute query and measure execution time
start_time = time.time()

# Insert new records into main table
try:
    nwa_load_stmnt = f"""
    INSERT INTO {nwa_tbl_name}
    (
    source_name,
    author,
    title,
    url,
    publish_date,
    content
    )
    SELECT
        tp.source_name,
        tp.author,
        tp.title,
        tp.url,
        tp.publish_date,
        tp.content
    FROM {nat_tbl_name} AS tp
    LEFT JOIN {nwa_tbl_name} AS mn
        ON tp.title = mn.title
            AND CAST(LEFT(tp.publish_date, 10) AS DATE) = CAST(LEFT(mn.publish_date, 10) AS DATE)
            AND tp.author = mn.author
    """
    cursor.execute(nwa_load_stmnt)
    logging.info(f'Successfully executed query:\n{nwa_load_stmnt}\n\nRecords scanned: {cursor.rowcount}')
except mysql.Error as e:
    logging.error(f'Error executing query:\n{nwa_load_stmnt}\n\n{e}')
finally:
    end_time = time.time()
    logging.info(f'Time taken: {end_time - start_time:.3f} seconds\n>>>>>>>>>>>>>><<<<<<<<<<<<<<\n\n')

# Execute query and measure execution time
start_time = time.time()

# Wipe temp table
try:
    cursor.execute(nat_dlt_tble_stmnt)
    logging.info(f'Successfully executed query:\n{nat_dlt_tble_stmnt}\n\nRecords scanned: {cursor.rowcount}')
except mysql.Error as e:
    logging.error(f'Error executing query:\n{nat_dlt_tble_stmnt}\n\n{e}')
finally:
    end_time = time.time()
    logging.info(f'Time taken: {end_time - start_time:.3f} seconds\n>>>>>>>>>>>>>><<<<<<<<<<<<<<\n\n')

### Commit changes and close cursor and connection instances

In [23]:
# Commit the changes to the database
db_conn.commit()

# Close the cursor and database connection
cursor.close()
db_conn.close()