In [454]:
import os
import json
import re
import string
import warnings
import numpy
import pandas
import matplotlib.pyplot as pyplot
import seaborn
import nltk
import gensim

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize



In [455]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [456]:
data = pandas.DataFrame()

for dir in os.listdir('dataset'):
    for datadir in os.listdir('dataset/' + dir + '/fake'):
        try:
            with open('dataset/politifact/fake/' + datadir + '/news content.json') as json_file:
                json_obj = json.load(json_file)
                json_obj['real'] = 0
                data = data.append(json_obj, ignore_index=True)
        finally:
            continue

    for datadir in os.listdir('dataset/' + dir + '/real'):
        try:
            with open('dataset/politifact/real/' + datadir + '/news content.json') as json_file:
                json_obj = json.load(json_file)
                json_obj['real'] = 1
                data = data.append(json_obj, ignore_index=True)
        finally:
            continue

data.columns

Index(['url', 'text', 'images', 'top_img', 'keywords', 'authors',
       'canonical_link', 'title', 'meta_data', 'movies', 'publish_date',
       'source', 'summary', 'real'],
      dtype='object')

In [457]:
special_characters = '!?@#$%^&*()-+_=,<>/'

#this is probably can be done during import

data['text_character_cnt'] = data['text'].str.len()
data['text_word_cnt'] = data['text'].str.split().str.len()
data['text_character_per_word'] = data['text_character_cnt'].combine(data['text_word_cnt'], lambda x, y: x / y if y else 0)

data['text_special_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))
data['text_special_per_char'] = data['text'].combine(data['text_character_cnt'], lambda x, y: (len([x for x in x.split() if any(char in special_characters for char in x)]) / y) if y else 0)
data['text_special_per_word'] = data['text'].combine(data['text_word_cnt'], lambda x, y: (len([x for x in x.split() if any(char in special_characters for char in x)]) / y) if y else 0)

for char in special_characters:
    data['text_' + char + '_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if char in x]))
    data['text_' + char + '_per_char'] = data['text'].combine(data['text_character_cnt'], lambda x, y: (len([x for x in x.split() if char in x]) / y) if y else 0)
    data['text_' + char + '_per_word'] = data['text'].combine(data['text_word_cnt'], lambda x, y: (len([x for x in x.split() if char in x]) / y if y else 0))

data['text_http_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['text_www_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
data['text_number_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


data['title_character_cnt'] = data['title'].str.len()
# if(data['title_character_cnt'] > 0):
data['title_word_cnt'] = data['title'].str.split().str.len()
# data['title_character_per_word'] = data['title_character_cnt'] / data['title_word_cnt']
data['title_character_per_word'] = data['title_character_cnt'].combine(data['title_word_cnt'], lambda x, y: x / y if y else 0)

data['special_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

for char in special_characters:
    data['title_' + char + '_per_char'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))
    data['title_' + char + '_per_word'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))

data['title_http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['title_www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
data['title_number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

  data['title_' + char + '_per_char'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))
  data['title_' + char + '_per_word'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))
  data['title_http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
  data['title_www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
  data['title_number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


In [458]:
data['autor_cnt'] = data['authors'].apply(lambda x: len(x))

authors_dict = gensim.corpora.Dictionary(data['authors'])

for author in authors_dict:
    data[authors_dict[author]] = data['authors'].apply(lambda x: 1 if authors_dict[author] in x else 0)

  data['autor_cnt'] = data['authors'].apply(lambda x: len(x))
  data[authors_dict[author]] = data['authors'].apply(lambda x: 1 if authors_dict[author] in x else 0)


In [459]:
source_dict = gensim.corpora.Dictionary([data['source'].unique()])
data['source_id'] = data['source'].apply(lambda x: list(source_dict.values()).index(x))

  data['source_id'] = data['source'].apply(lambda x: list(source_dict.values()).index(x))


In [460]:
X = data.drop(columns=['url', 'text', 'images', 'top_img', 'keywords', 'authors',
       'canonical_link', 'title', 'meta_data', 'movies', 'publish_date',
       'source', 'summary', 'real'])
Y = data['real']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

X_train

Unnamed: 0,text_character_cnt,text_word_cnt,text_character_per_word,text_special_cnt,text_special_per_char,text_special_per_word,text_!_cnt,text_!_per_char,text_!_per_word,text_?_cnt,...,Topics.Nytimes.Com Top Reference Timestopics People S Katharine_Q_Seelye Index.Html,Republican National Committee,Written On September,David Espo,Paul Ryan,Chris Wallace,Fox News Sunday,Published June,Carly Fiorina,source_id
219,683,106,6.443396,6,0.008785,0.056604,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,202
121,1702,253,6.727273,11,0.006463,0.043478,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,248
855,348,59,5.898305,2,0.005747,0.033898,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,79
551,252,40,6.300000,0,0.000000,0.000000,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,246
435,53429,9317,5.734571,781,0.014618,0.083825,1,0.000019,0.000107,65,...,0,0,0,0,0,0,0,0,0,123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557,1382,224,6.169643,9,0.006512,0.040179,1,0.000724,0.004464,0,...,0,0,0,0,0,0,0,0,0,248
748,0,0,0.000000,0,0.000000,0.000000,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,212
810,82,13,6.307692,1,0.012195,0.076923,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,248
187,1959,325,6.027692,23,0.011741,0.070769,0,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,237


In [461]:
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

0.7447916666666666

In [404]:
# real_data = pandas.concat([
#     pandas.read_csv('dataset/gossipcop_real.csv'),
#     pandas.read_csv('dataset/politifact_fake.csv')
# ], ignore_index=True)
# real_data['real'] = 1

In [46]:
# data = pandas.concat([fake_data, real_data], ignore_index=True)
# data = data.sample(frac=1)
# data = data.reset_index()

In [47]:
data['character_cnt'] = data['title'].str.len()
data['word_cnt'] = data['title'].str.split().str.len()
data['character_per_word'] = data['character_cnt'] / data['word_cnt']

special_characters = '!?@#$%^&*()-+_=,<>/'
data['special_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

data['hashtag_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '#' in x]))
data['at_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '@' in x]))
data['explanation_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '!' in x]))
data['question_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?' in x]))
data['interrobang_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?!' in x]))
data['ellipsis_cnt'] = data['title'].apply(lambda x: len([x for datax in x.split() if '...' in x]))
data['semicolon_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if ';' in x]))

data['http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))

data['number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [48]:


data['title_words'] = data['title'].apply(lambda x: word_tokenize(x))
data['title_sentences'] = data['title'].apply(lambda x: sent_tokenize(x))
title_word_dict = gensim.corpora.Dictionary(data['title_words'])
title_sentence_dict = gensim.corpora.Dictionary(data['title_sentences'])
# title_word_dict = data['title_words'].apply(lambda x: gensim.corpora.Dictionary(x))
data['title_words'] = data['title_words'].apply(lambda x: [title_word_dict.doc2bow(x) for word in x])
data['title_sentences'] = data['title_sentences'].apply(lambda x: [title_sentence_dict.doc2bow(x) for word in x])

data['title_words']

# data['title_corpus']

# corpus = [title_word_dict.doc2bow(data['title_words']) for word in data['title_words']]


# data['processed_title'] = data['title'].str.replace('[^\w\s]', '')
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# stemmer = PorterStemmer()
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))



0      [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...
1      [[(2, 1), (7, 1), (11, 1), (12, 1), (13, 1), (...
2      [[(9, 1), (10, 1), (31, 1), (32, 1), (33, 1), ...
3      [[(28, 1), (40, 1), (41, 1), (42, 1), (43, 1),...
4      [[(47, 1), (48, 1), (49, 1), (50, 1), (51, 1)]...
                             ...                        
954    [[(11, 1), (51, 1), (2084, 1), (2106, 1), (236...
955    [[(245, 1), (1220, 1), (3483, 1), (3484, 1), (...
956                                          [[(71, 1)]]
957    [[(105, 1), (275, 1), (276, 1), (1029, 1), (34...
958    [[(31, 1), (468, 1), (469, 1), (831, 1), (2315...
Name: title_words, Length: 959, dtype: object

In [49]:
# tfidvector = TfidfVectorizer(max_features=500, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1, 1))

# data_vector = tfidvector.fit_transform(data['processed_title'])

# for name, value in zip(tfidvector.get_feature_names_out(), tfidvector.idf_):
#     print(name, ': ', value)

In [50]:
data = data.drop(columns=['title', 'index', 'id', 'news_url', 'tweet_ids'])
data

KeyError: "['index' 'id' 'news_url' 'tweet_ids'] not found in axis"

In [None]:
X = data.drop(columns=['real'])
Y = data['real']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

ValueError: setting an array element with a sequence.