In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import nltk

In [2]:
from data_pipline_and_eda_helper_class import *
data_pipeline = DataPipeline()

In [11]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('breitbart')
stopwords.append('follow')
stopwords.append('facebook')
stopwords.append('twitter')
stopwords.append('email')
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,stop_words=stopwords,max_features=500)
tfidf = vectorizer.fit_transform(articles['content'])
X_train, X_test, y_train, y_test = train_test_split(tfidf, articles['source'],test_size=0.2, random_state=0)

In [12]:
model1 = MultinomialNB(0.1)
model1.fit(X_train,y_train)
model1.score(X_test,y_test)

0.913730255164034

In [13]:
model2 = RandomForestClassifier(n_estimators=60)
model2.fit(X_train, y_train)
model2.score(X_test,y_test)

0.9586877278250304

In [14]:
def test_bias(text,model):
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english',max_features=500)
    X = vectorizer.fit_transform(text)
    yhat= model.predict_proba(X)
    breit = [i[0] for i in yhat]
    occ = [i[1] for i in yhat]
    yhat2 = model.predict(X)
    count=0
    for pred in yhat2:
        if pred=='breitbart':
            count+=1
    breit_freq = (count/31)*100
    return f'Predicted as Breitbart {round(breit_freq,2)}% of the time.'




In [15]:
reuters = pipeline('reuters')
fox_news = pipeline('fox')

In [16]:
test_bias(fox_news['content'],model2)

'Predicted as Breitbart 74.19% of the time.'

In [17]:
test_bias(reuters['content'],model2)

'Predicted as Breitbart 58.06% of the time.'

In [19]:
def get_most_important_words(model,num_of_words):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    feature_names = vectorizer.get_feature_names()
    top_words = []

    for i in range(num_of_words):
        top_words.append(feature_names[indices[i]])
    return top_words

In [24]:
get_most_important_words(model2,20)

['coronavirus',
 'trump',
 'said',
 'pandemic',
 'added',
 'april',
 '2020',
 'virus',
 'thursday',
 'covid',
 'republican',
 'tuesday',
 'chinese',
 'add',
 'president',
 'biden',
 '2017',
 'friday',
 '19',
 'donald']

In [21]:
articles['content'].iloc[0]

'In a historic victory for gay and human rights advocates of America and abroad, the Boy Scouts of America announced today that it will propose a measure at May 20th’s National Council meeting to lift the long-standing ban on gay youth members. The move is a response to unprecedented progressive pressure on the BSA exerted by social networks, which helped sway public opinion to the point where the Boy Scouts of America folded under the weight of the public’s ever increasing disapproval of their bigoted and discriminatory practices. Along with their announcement, The BSA revealed that it previously sent out a survey to it’s over 1 million members, to which respondents answered that they supported the current policy of excluding gays by a margin of 61% to 34%, while a majority of younger parents and teens opposed the policy. For the first time in the 103 year history of the Boy Scouts of America, American gays and lesbians will be able to take part in an organization that is as American 

In [23]:
articles['content'].iloc[7307]

"Singer-songwriter Meredith Brooks called out Alyssa Milano’s endorsement of Vice President Joe Biden’s presidential campaign, accusing the actress of hypocrisy over her claims to support all victims of sexual misconduct. Brooks specifically addressed Milano, who led an unsuccessful crusade against President Donald Trump’s Supreme Court nominee Brett Kavanaugh over unsubstantiated claims of past sexual misconduct. The singer, wrote a huge hit called “Bitch” about female empowerment, said that as a victim of child molestation, seeing photos of Biden touching children made her “blood curl.” “[Alyssa Milano]\u2069 as a child victim of molestation this curdles my blood I’ve never spoken about this before but I have no choice now,” Brooks said, alongside a collage of Biden touching and kissing various children. “You need to wake up.” Brooks’ collage of Biden is accompanied by the caption: “Stop pretending this is normal.” \u2066\u2066@Alyssa_Milano\u2069 as a child victim of molestation thi