# Pré-processamento

In [1]:
# Importando as bibliotecas e construindo o dataframe
import pandas as pd

In [2]:
# Load the two source datasets: one with fake news, one with true news.
DATA_URL = 'https://raw.githubusercontent.com/guilopesrbc/fake_news_detector/main'
fake_df = pd.read_csv(DATA_URL + '/Fake.csv')
true_df = pd.read_csv(DATA_URL + '/True.csv')

# Add a `label` column encoding veracity: 1 for true news, 0 for fake news.
fake_df['label'] = 0
true_df['label'] = 1

# Stack both frames. ignore_index=True already produces a fresh 0..n-1 index,
# so no separate reset_index call is needed afterwards.
combined_df = pd.concat([fake_df, true_df], ignore_index=True)

# Persist the merged dataset (relative path, i.e. the current working directory).
combined_df.to_csv('Combined.csv', index=False)

In [3]:
# Show the combined DataFrame as the cell output.
combined_df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


# Setando campos como categóricos

In [18]:
# Inspect the dtype of every column.
combined_df.dtypes

title      object
text       object
subject    object
date       object
label       int64
dtype: object

In [19]:
# Convert the `subject` column to pandas' categorical dtype.
subject_as_category = combined_df['subject'].astype('category')
combined_df['subject'] = subject_as_category

In [20]:
# List the column dtypes again, after the categorical conversion of `subject`.
combined_df.dtypes

title        object
text         object
subject    category
date         object
label         int64
dtype: object

In [23]:
# Replace each subject with its integer category code.
# The `.cat` accessor only exists on categorical columns, so an unguarded
# `.cat.codes` raises when this cell is executed a second time (the column is
# already int8 by then). The dtype guard makes the cell safe to re-run.
if not pd.api.types.is_integer_dtype(combined_df['subject']):
    subject_categorical = combined_df['subject'].astype('category')
    # Keep the code -> original subject name lookup; the codes alone are opaque.
    subject_code_to_name = dict(enumerate(subject_categorical.cat.categories))
    combined_df['subject'] = subject_categorical.cat.codes

In [25]:
# List the distinct values present in the `subject` column.
subject_values = combined_df['subject']
subject_values.unique()

array([2, 5, 0, 4, 3, 1, 6, 7], dtype=int8)

In [26]:
# List the column dtypes after replacing `subject` with its category codes.
combined_df.dtypes

title      object
text       object
subject      int8
date       object
label       int64
dtype: object

# Verificando valores nulos e vazios

In [27]:
# Count null entries per column.
null_flags = combined_df.isnull()
null_flags.sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [28]:
def remove_spaces(x):
    """Return `x` with all whitespace removed when it is a str; otherwise return `x` unchanged.

    Still defined here because other cells of the notebook may reference it by name.
    """
    return ''.join(x.split()) if isinstance(x, str) else x


# Count titles that are empty or whitespace-only.
# Only the `title` column is inspected, so the whole frame is not rewritten
# cell by cell; `.str.strip()` reduces a whitespace-only string to ''.
blank_title_mask = combined_df['title'].str.strip() == ''
int(blank_title_mask.sum())

0

In [29]:
# Count rows whose text is empty or whitespace-only.
# Only the `text` column is inspected, so the whole frame is not rewritten
# cell by cell; `.str.strip()` reduces a whitespace-only string to ''.
blank_text_mask = combined_df['text'].str.strip() == ''
int(blank_text_mask.sum())

631

In [30]:
# Count rows with a missing or blank subject (content type).
# Comparing integer category codes against '' can never match, so a plain
# blank-string test is vacuous once `subject` has been encoded. For an
# integer-coded column this looks for -1, the code pandas uses for a missing
# category; for a string column it falls back to the whitespace-only test.
subject_column = combined_df['subject']
if pd.api.types.is_integer_dtype(subject_column):
    blank_subject_mask = subject_column == -1
else:
    blank_subject_mask = subject_column.astype(str).str.strip() == ''
int(blank_subject_mask.sum())

0

# Removendo valores vazios

In [31]:
# Drop the rows whose `text` is empty or whitespace-only.
# The mask is computed on a stripped view of the column but applied to the
# original frame, so the surviving rows keep their title/text/date intact.
# (Filtering a copy of the frame that had all whitespace removed would leave
# every remaining string with its words glued together.)
print('total de linhas ', len(combined_df))
has_text = combined_df['text'].str.strip() != ''
dt = combined_df[has_text]

print('total de linhas após remoção', len(dt))
print('total de linhas removidas ', len(combined_df) - len(dt))

total de linhas  44898
total de linhas após remoção 44267
total de linhas removidas  631
