In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

Load Data

In [39]:
fake = pd.read_csv('politifact_fake.csv')
trueNew = pd.read_csv('politifact_real.csv')

fake.columns

Index(['id', 'news_url', 'title', 'tweet_ids'], dtype='object')

Make a new column for fake and true data sets


In [40]:
fake['Truth'] = 0
trueNew['Truth'] = 1

Combine fake and true datasets

In [41]:
# news = pd.concat([fake,trueNew])
news = pd.read_csv('Fake_news_content_detection.csv')
news.columns

Index(['Unnamed: 0', 'Text', 'Text_Tag', 'Barely-True', 'False', 'Half-True',
       'Mostly-True', 'Not-Known', 'True'],
      dtype='object')

Clean data

In [42]:
news.dropna()

#news['text'] = news['text'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', 'replacement_string', str(x)))



Unnamed: 0.1,Unnamed: 0,Text,Text_Tag,Barely-True,False,Half-True,Mostly-True,Not-Known,True
0,0,Says the Annies List political group supports ...,abortion,0,1,0,0,0,0
1,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",0,0,1,0,0,0
2,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,0,0,0,1,0,0
3,3,Health care reform legislation is likely to ma...,health-care,0,1,0,0,0,0
4,4,The economic turnaround started at the end of ...,"economy,jobs",0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
10235,10235,There are a larger number of shark attacks in ...,"animals,elections",0,0,0,1,0,0
10236,10236,Democrats have now become the party of the [At...,elections,0,0,0,1,0,0
10237,10237,Says an alternative to Social Security that op...,"retirement,social-security",0,0,1,0,0,0
10238,10238,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",0,1,0,0,0,0


Get labels

In [43]:
news['truth'] = 0

for index, row in news.iterrows():
    if row['Barely-True'] == 1:
        news.at[index, 'truth'] = 1
    elif row['False'] == 1:
        news.at[index, 'truth'] = 2
    elif row['Half-True'] == 1:
        news.at[index, 'truth'] = 3
    elif row['Mostly-True'] == 1:
        news.at[index, 'truth'] = 4
    elif row['Not-Known'] == 1:
        news.at[index, 'truth'] = 5
    elif row['True'] == 1:
        news.at[index, 'truth'] = 6

# news['truth']
#news['target'] = news[['Barely-True', 'False', 'Half-True', 'Mostly-True', 'Not-Known', 'True']].idxmax(axis=1)

news.shape

(10240, 10)

Split dataset

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(news['Text'], news['truth'] , test_size=0.2, random_state=42)

Use the TfidfVectorizer

In [45]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

Initialize PassiveAggressiveClassifier

In [46]:
pac = PassiveAggressiveClassifier(max_iter=100)
pac.fit(tfidf_train,Y_train)

Set Accuracy 

In [47]:
y_pred = pac.predict(tfidf_test)
score = accuracy_score(Y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 20.85%


Confusion Matrix

In [48]:
confusion_matrix(Y_test,y_pred, labels = [1,2,3,4,5,6])

array([[ 57,  63,  81,  73,  22,  43],
       [ 56,  96,  77,  81,  36,  55],
       [ 66,  95, 111,  80,  21,  65],
       [ 57,  69,  86,  87,  18,  65],
       [ 29,  31,  24,  22,  21,  21],
       [ 49,  56,  73,  89,  18,  55]], dtype=int64)