<a href="https://colab.research.google.com/github/chernobylx/Fake-News-Identification/blob/EDA/Fake_Real_News_Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import kagglehub

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/fake-and-real-news-dataset


In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

dfs = {}
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        dfs[filename] = pd.read_csv(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


In [6]:
dfs.keys()

dict_keys(['True.csv', 'Fake.csv'])

In [7]:
dfs['True.csv'].sample()

Unnamed: 0,title,text,subject,date
5379,"Trump on Twitter (Feb 17) - Tom Price, Boeing,...",The following statements were posted to the ve...,politicsNews,"February 17, 2017"


In [8]:
dfs['Fake.csv'].sample()

Unnamed: 0,title,text,subject,date
11720,DINGBAT DEMOCRAT Maxine Waters Hopes Trump Won...,When will the Democrats realize they lost the ...,politics,"Feb 6, 2017"


In [141]:
from sklearn.base import TransformerMixin
class Cleaner(TransformerMixin):
  def __init__(self):
    pass

  def __repr__(self):
    return str(self.data.shape)

  def __str__(self):
    return self.data.__str__()

  def fit(self, dfs, y=None):
    self.dfs = dfs

    self.real = dfs['True.csv']
    self.real['fake'] = False
    self.fake = dfs['Fake.csv']
    self.fake['fake'] = False

    self.data = pd.concat([self.real, self.fake])
    self.data['word_count'] = self.data.text.str.split().apply(len)

    return self

  def transform(self, X=None, y=None):
    self.clean()
    return self.data

  def clean(self):
      self.clean_date()
      self.convert_date()
      self.deduplicate()
      self.remove_outliers()
      self.map_subjects()
      return self

  def sample(self, n):
    return self.data.sample(n)

  def clean_date(self):
    if self.data.date.dtype == 'object':
      self.data.date = self.data.date.apply(lambda date: '01/01/2000' if date.find('htt') == 0 or date.find('MSN') ==0 else date)
    return self


  def convert_date(self):
    if not (self.data.date.dtype == 'object'):
      self.data.date = pd.to_datetime(self.data.date, format = 'mixed')
    return self

  def deduplicate(self):
    self.data.drop(self.data[self.data.duplicated()].index, inplace=True)
    self.data.drop(self.data[self.data.duplicated(subset='text')].index, inplace=True)
    self.data.drop(self.data[self.data.duplicated(subset='title')].index, inplace=True)
    return self

  def remove_outliers(self):
    self.data = self.data[self.data.word_count > 50]
    return self

  def map_subjects(self):
    self.data.subject = self.data.subject.map({'politicsNews': 'politics',
                                               'News': 'news',
                                               'politics': 'politics',
                                               'worldnews': 'news',
                                               'US_News': 'news',
                                               'left-news': 'news',
                                               'Government News': 'news'})
    return self

In [142]:
from sklearn.pipeline import Pipeline
pl = Pipeline([('clean', Cleaner())])
pl.fit_transform(dfs)

Unnamed: 0,title,text,subject,date,fake,word_count
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politics,"December 31, 2017",False,749
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politics,"December 29, 2017",False,624
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politics,"December 31, 2017",False,457
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politics,"December 30, 2017",False,376
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politics,"December 29, 2017",False,852
...,...,...,...,...,...,...
22698,The White House and The Theatrics of ‘Gun Cont...,21st Century Wire says All the world s a stage...,news,"January 7, 2016",False,1226
22699,Activists or Terrorists? How Media Controls an...,Randy Johnson 21st Century WireThe majority ...,news,"January 7, 2016",False,4257
22700,"BOILER ROOM – No Surrender, No Retreat, Heads ...",Tune in to the Alternate Current Radio Network...,news,"January 6, 2016",False,183
22701,Federal Showdown Looms in Oregon After BLM Abu...,21st Century Wire says A new front has just op...,news,"January 4, 2016",False,3480
