# News Bias analysis notebook
#### Authors: Alexander Lambert, Casey Mathews, and Shivam Patel
#### Githubs: alambe22, cmathew9, and spatel90
## Description: 
The goal of this notebook is to analyze a given set of articles (found in datasets/articles.txt) with known biases and leanings to see if patterns can be determined in their writing. 
We analyze the following parts of the articles:
- Buzzwords and phrases count
- Emotional word count
- Average word length
- Use of words with negative connotations
- Use of words with positive connotations
- Use of words that indicate opinion (I think, I believe, etc.)
- Use of words that indicate fact (We know, research indicates, etc.)
- First person pronoun usage (Does the author present this as their perspective, or as information)

Using the data we gather, the hope is to find patterns that could be used to analyze new articles for bias or factuality.
 


In [28]:
#Article class definition and reading in
class article:
    """Container for one article and the metrics computed for it.

    Only ``link`` and ``bias`` are set at construction; every other slot is
    left unassigned until something stores a value in it.
    """

    # link/bias are supplied by the constructor; the remaining names are
    # reserved for the article content and per-article metric values.
    __slots__ = [
        "link", "bias", "cont", "buzz", "emo", "neg", "pos", "avgLen",
        "opi", "fac", "fPro",
    ]

    def __init__(self, link, bias):
        self.link, self.bias = link, bias
            
# Build the list of articles from datasets/articles.txt. On each line the
# first whitespace-separated field is used as the bias and the second as
# the link.
articles = []
with open("datasets/articles.txt") as fin:
    for line in fin:
        # split() with no argument tolerates repeated spaces, tabs and the
        # trailing newline, unlike split(" ").
        fields = line.split()
        # Skip blank or incomplete lines rather than failing with IndexError.
        if len(fields) < 2:
            continue
        articles.append(article(fields[1], fields[0]))


In [29]:
from bs4 import BeautifulSoup
import requests, re

# Return a list of all words in <img> alt text
def get_alt_text(soup):
    """Collect the lowercase words from the alt text of every <img> element.

    Args:
        soup: Parsed document whose ``find_all("img")`` yields the image
            tags (a BeautifulSoup object in this notebook).

    Returns:
        A list of the words found in all non-empty ``alt`` attributes, in
        document order, after cleaning with ``string_cleaner`` and
        lowercasing.
    """
    alt_text_words = []
    for img_element in soup.find_all("img"):
        # ``'alt' in tag`` checks the tag's children, not its attributes, so
        # it is never true for an <img>; fetch the attribute explicitly.
        img_text = img_element.get("alt")
        if not img_text:
            continue
        cleaned = string_cleaner(img_text).lower()
        # Drop the empty strings that splitting on single spaces can leave.
        alt_text_words += [word for word in cleaned.split(" ") if word]
    return alt_text_words

# Clean up a string for splitting on space
def string_cleaner(paragraph):
    """Reduce *paragraph* to ASCII letters separated by single spaces.

    Straight and curly apostrophes are deleted outright, so a contraction
    stays one word. Every remaining run of characters other than A-Z / a-z
    (digits included) becomes a single space.

    Returns:
        The cleaned string; it may start or end with a space.
    """
    cleaned = paragraph
    for apostrophe in ("'", "’"):
        cleaned = cleaned.replace(apostrophe, "")
    return re.sub('[^A-Za-z]+', ' ', cleaned)

# Returns all words in an article as one space-separated string
def get_article(url, timeout=30):
    """Download *url* and return its text as lowercase words joined by spaces.

    Words are taken from every <p> and <li> element, followed by the words
    returned by ``get_alt_text`` for the same document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the run indefinitely.

    Returns:
        A single string of cleaned, lowercase words separated by one space.

    Raises:
        Whatever ``requests.get`` raises (connection errors and timeouts
        propagate unchanged).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Words gathered from all <p> and <li> elements, in document order.
    article_words = []
    for element in soup.find_all(["p", "li"]):
        text = string_cleaner(element.getText()).lower()
        # Drop the empty strings that splitting on single spaces can leave.
        article_words += [word for word in text.split(" ") if word]

    article_words += get_alt_text(soup)
    return " ".join(article_words)



In [None]:
def parsePhraseFile(file, phrases):
    """Append each non-blank line of *file* to *phrases*, whitespace-stripped.

    Blank lines are skipped: an empty phrase handed to ``filterWords`` would
    compile to the pattern ``\\b\\b``, which matches at every word boundary
    and inflates the count.

    Args:
        file: Path of the phrase file, one phrase per line.
        phrases: List that is extended in place.

    Returns:
        None; the result is delivered through *phrases*.
    """
    with open(file) as fin:
        phrases.extend(entry for entry in map(str.strip, fin) if entry)

def filterWords(file, words):
    """Count the occurrences in *words* of the phrases listed in *file*.

    Each phrase loaded by ``parsePhraseFile`` is matched literally and must
    be delimited by word boundaries; the per-phrase match counts are added
    together.

    Args:
        file: Path of the phrase file.
        words: Text to search.

    Returns:
        Total number of matches across all phrases.
    """
    entries = []
    parsePhraseFile(file, entries)
    return sum(
        len(re.findall(r'\b' + re.escape(entry) + r'\b', words))
        for entry in entries
    )
    
def wordLength(words):
    """Return the average length of the whitespace-separated words in *words*.

    Splitting on any run of whitespace means repeated, leading or trailing
    spaces do not produce empty "words" that would drag the average down.

    Args:
        words: Text whose words are separated by whitespace.

    Returns:
        The mean word length as a float, or 0.0 when there are no words.
    """
    word_list = words.split()
    if not word_list:
        return 0.0
    return sum(len(word) for word in word_list) / len(word_list)

# Fetch every article and store its metrics on the object.
# The loop variable is deliberately not called ``article``: that would rebind
# the class name to an instance and make the class unusable afterwards.
for entry in articles:
    print("Processing article:" + entry.link)
    entry.cont = get_article(entry.link)
    entry.buzz = filterWords("./datasets/buzzwords.txt", entry.cont)
    entry.emo = filterWords("./datasets/emotional_words.txt", entry.cont)
    entry.neg = filterWords("./datasets/negative-words.txt", entry.cont)
    entry.pos = filterWords("./datasets/positive-words.txt", entry.cont)
    entry.avgLen = wordLength(entry.cont)
    entry.opi = filterWords("./datasets/opinion.txt", entry.cont)
    entry.fac = filterWords("./datasets/fact_phrases.txt", entry.cont)
    entry.fPro = filterWords("./datasets/first_person.txt", entry.cont)
    print("Processed article:" + entry.link +" avg_len: " + str(entry.avgLen) + " negative:" + str(entry.neg) + " positive:" + str(entry.pos))