In [1]:
import pandas as pd
import numpy as np
import boto3
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [2]:
#Copy Data From S3
s3 = boto3.client('s3')
file = 'full_articles.csv'
#s3.download_file('media-bais', file, file)

In [3]:
df = pd.read_csv(file)
df.head()

Unnamed: 0,text,url_match,agree,agree_ratio,agreeance_text,allsides_page,bias,disagree,name,total_votes
0,"男模的大鸟,避孕膜,杜达雄\nHome\nHome\nBlogs\nPopular Twee...",aj,263,2.481132,Strongly Agrees,https://www.allsides.com/news-source/aj-media-...,left,106,AJ+,369
1,"Meet Elizabeth347, 26 (United States, Aixàs) |...",bet,131,0.803681,Somewhat Disagrees,https://www.allsides.com/news-source/bet-media...,left-center,163,BET,294
2,Trajectory Path Planning of Cable Driven Paral...,aj,263,2.481132,Strongly Agrees,https://www.allsides.com/news-source/aj-media-...,left,106,AJ+,369
3,Adverse drug reaction | Asian Journal of Pharm...,aj,263,2.481132,Strongly Agrees,https://www.allsides.com/news-source/aj-media-...,left,106,AJ+,369
4,Artificial Light at Night (ALAN) Research Lite...,mit,4,1.333333,Somewhat Agrees,https://www.allsides.com/news-source/mit-media...,left,3,Mitú,7


In [4]:
print(df.shape)
df['name'].unique()

(509387, 10)


array(['AJ+', 'BET', 'Mitú', 'The Atlantic', 'FAIR', 'Quartz',
       'Bloomberg', 'The Hill', 'CNN (Web News)', 'Idaho Statesman',
       'NPR Online News', 'The Advocate', 'Forbes', 'Salon', 'Slate',
       'The Justice', 'CNET', 'KSL', 'The Dispatch', 'MarketWatch',
       'Reason', 'ABC News (Online)', 'The Guardian',
       'Scientific American', 'Boing Boing', 'The Appeal', 'Newsweek',
       'Detroit News', 'The Daily Caller', 'The Daily Dot',
       'Foreign Policy', 'Gizmodo', 'Politico', 'SFGate', 'Mashable',
       'Fox News (Online)', 'New York Times (Online News)',
       'The Economist', 'Reuters', 'Splinter', 'Jubilee Media',
       'The Flip Side', 'Al Jazeera', 'Axios', 'Business Insider', 'CNBC',
       'Eurek Alert', 'The Fulcrum', 'InfoWars', 'New Republic',
       'Psychology Today', 'RollingStone.com', 'Teen Vogue',
       'Daily Beast', 'The Epoch Times', 'Breitbart News',
       'The Gateway Pundit', 'AZ Central', 'Des Moines Register',
       'Miami Herald', 'M

In [17]:
#'The Daily Show (humor)'
#'Babylon Bee (Humor)'
#https://en.wikipedia.org/wiki/List_of_satirical_news_websites

left            234144
left-center     148592
center          102539
right-center     17495
right             6101
allsides           516
Name: bias, dtype: int64

In [5]:
def convert_to_num(target):
    if target == 'The Daily Show (humor)':
        return 1
    elif target == 'Babylon Bee (Humor)':
        return 1
    else:
        return 0

In [6]:
df = df[df['bias'] != 'allsides']
df['target'] = df['name'].apply(convert_to_num)

In [7]:
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)
print(train_data['target'].unique())
print(test_data['target'].unique())

[0 1]
[0 1]


In [8]:
#create count object
count_vectorizer = CountVectorizer(stop_words='english')

#Transform the training data using only the 'text' column values
count_X_train = count_vectorizer.fit_transform(train_data['text'])

#Transform the test data using only the 'text' column values
count_X_test = count_vectorizer.transform(test_data['text'])

In [9]:
y_train = train_data['target']
y_test = test_data['target']

In [10]:
svm_obj = SGDClassifier()
svm = svm_obj.fit(count_X_train, y_train)

In [11]:
print('Testing Model.')
y_pred = svm.predict(count_X_test)

Testing Model.


In [12]:
acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
perc = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#model results
print('Model Results')
print('Accuracy: {}'.format(acc))
print('Recall: {}'.format(recall))
print('Precision: {}'.format(perc))
print('F1-Score: {}'.format(f1))

Model Results
Accuracy: 0.9998192079737144
Recall: 0.9998192079737144
Precision: 0.9998877324569507
F1-Score: 0.9998443942640383
