In [None]:
import os
import json
import pandas
import nltk
import gensim

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Fetch the NLTK resources used by this notebook: the stop-word corpus read
# just below, and 'punkt' (presumably required by the word_tokenize /
# sent_tokenize calls in a later cell -- confirm against the NLTK version used).
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Imported after the download so the corpus is available when first accessed.
from nltk.corpus import stopwords

# English stop-word list.
stop = stopwords.words('english')

In [None]:
# Load every scraped article under ../data/politifact into one DataFrame.
# Layout read here: <root>/<label>/<article>/news content.json, where the
# <label> folder name ("real" / "fake") is mapped to the 1 / 0 target stored
# in the 'real' column (None for any other folder name).
realorfake = {"real":1,"fake":0}
politifact_root = '../data/politifact'

# Collect plain dicts and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
records = []
# sorted() makes the row order independent of the file system's listing order.
for label_dir in sorted(os.listdir(politifact_root)):
    label_path = os.path.join(politifact_root, label_dir)
    for datadir in sorted(os.listdir(label_path)):
        content_path = os.path.join(label_path, datadir, 'news content.json')
        try:
            with open(content_path) as json_file:
                json_obj = json.load(json_file)
        except (OSError, ValueError):
            # Best effort: skip articles whose content file is missing,
            # unreadable, or not valid JSON (JSONDecodeError and
            # UnicodeDecodeError are both ValueError subclasses).
            continue
        if not isinstance(json_obj, dict):
            # A payload that is not a JSON object cannot become a row.
            continue
        json_obj['real'] = realorfake.get(label_dir)
        records.append(json_obj)

data = pandas.DataFrame(records)

In [None]:
# Surface statistics of the article body ('text') and of the headline ('title').
# Assumes every 'text' / 'title' entry is a str (a missing value would fail on
# .split()) -- TODO confirm against the loaded data.
special_characters = '!?@#$%^&*()-+_=,<>/'


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is falsy (zero)."""
    return numerator / denominator if denominator else 0


def _token_count(texts, keep):
    """Count, for each string in `texts`, the whitespace-separated tokens for which `keep(token)` is true."""
    return texts.apply(lambda text: sum(1 for token in text.split() if keep(token)))


def _has_special(token):
    """Return True when `token` contains at least one of `special_characters`."""
    return any(symbol in special_characters for symbol in token)


data['text_character_cnt'] = data['text'].str.len()
data['text_word_cnt'] = data['text'].str.split().str.len()
data['text_character_per_word'] = data['text_character_cnt'].combine(data['text_word_cnt'], _safe_ratio)

# Tokens containing any special character, absolute and relative to text size.
data['text_special_cnt'] = _token_count(data['text'], _has_special)
data['text_special_per_char'] = data['text_special_cnt'].combine(data['text_character_cnt'], _safe_ratio)
data['text_special_per_word'] = data['text_special_cnt'].combine(data['text_word_cnt'], _safe_ratio)

# The same three statistics for each special character on its own.
for char in special_characters:
    column_prefix = 'text_' + char
    # char=char freezes the current character inside the predicate.
    char_cnt = _token_count(data['text'], lambda token, char=char: char in token)
    data[column_prefix + '_cnt'] = char_cnt
    data[column_prefix + '_per_char'] = char_cnt.combine(data['text_character_cnt'], _safe_ratio)
    data[column_prefix + '_per_word'] = char_cnt.combine(data['text_word_cnt'], _safe_ratio)

data['text_http_cnt'] = _token_count(data['text'], lambda token: 'http' in token)
data['text_www_cnt'] = _token_count(data['text'], lambda token: 'www' in token)
data['text_number_cnt'] = _token_count(data['text'], str.isdigit)

# Derive the headline length columns here so that the ratio below does not
# depend on columns created outside this cell (a fresh kernel would otherwise
# raise KeyError on 'title_character_cnt' / 'title_word_cnt').
data['title_character_cnt'] = data['title'].str.len()
data['title_word_cnt'] = data['title'].str.split().str.len()
data['title_character_per_word'] = data['title_character_cnt'].combine(data['title_word_cnt'], _safe_ratio)

In [None]:
# Author features: the number of entries in each article's 'authors' value,
# plus one 0/1 indicator column per distinct author token.
# Assumes each 'authors' entry is a list of name strings -- TODO confirm.
data['autor_cnt'] = data['authors'].apply(len)

authors_dict = gensim.corpora.Dictionary(data['authors'])

for author in authors_dict:
    # Iterating the dictionary yields ids; resolve the id to its token once.
    author_name = authors_dict[author]
    data[author_name] = data['authors'].apply(
        lambda listed, name=author_name: int(name in listed)
    )

In [None]:
# Encode each article's 'source' as an integer: the position of that source
# among the values of a gensim Dictionary built from the distinct sources.
source_dict = gensim.corpora.Dictionary([data['source'].unique()])

# Build the source -> position lookup once instead of materialising and
# linearly scanning the value list for every row.
source_positions = {source: position for position, source in enumerate(source_dict.values())}
data['source_id'] = data['source'].apply(lambda x: source_positions[x])

In [None]:
# Feature matrix: every engineered column, i.e. everything except the raw
# article fields and the label itself.
X = data.drop(columns=['url', 'text', 'images', 'top_img', 'keywords', 'authors',
       'canonical_link', 'title', 'meta_data', 'movies', 'publish_date',
       'source', 'summary', 'real'])
Y = data['real']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore the reported score, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Fit a decision tree with default hyper-parameters on the training split and
# report its accuracy on the held-out split. random_state pins the tree's
# internal randomness so that re-running the cell reproduces the same model.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

In [None]:
# Surface statistics of the headline ('title').
# Assumes every 'title' entry is a str -- TODO confirm against the loaded data.
special_characters = '!?@#$%^&*()-+_=,<>/'


def _count_title_tokens(keep):
    """Count, for each title, the whitespace-separated tokens for which `keep(token)` is true."""
    return data['title'].apply(lambda title: sum(1 for token in title.split() if keep(token)))


data['character_cnt'] = data['title'].str.len()
data['word_cnt'] = data['title'].str.split().str.len()
# Guarded ratio: a title without words yields 0 instead of the inf / NaN that
# plain Series division produces for a zero denominator.
data['character_per_word'] = data['character_cnt'].combine(
    data['word_cnt'], lambda chars, words: chars / words if words else 0
)

data['special_cnt'] = _count_title_tokens(
    lambda token: any(symbol in special_characters for symbol in token)
)

# One column per marker: the number of tokens that contain the marker.
# The membership test is applied to each token (not to the whole title), so
# e.g. 'ellipsis_cnt' counts only the tokens that actually contain '...'.
title_markers = (
    ('hashtag_cnt', '#'),
    ('at_cnt', '@'),
    ('explanation_cnt', '!'),
    ('question_cnt', '?'),
    ('interrobang_cnt', '?!'),
    ('ellipsis_cnt', '...'),
    ('semicolon_cnt', ';'),
    ('http_cnt', 'http'),
    ('www_cnt', 'www'),
)
for column_name, marker in title_markers:
    # marker=marker freezes the current marker inside the predicate.
    data[column_name] = _count_title_tokens(lambda token, marker=marker: marker in token)

# Tokens consisting solely of digits.
data['number_cnt'] = _count_title_tokens(str.isdigit)

In [None]:


# Tokenise every title into words and into sentences, build one gensim
# Dictionary per granularity, then replace the token lists by their
# bag-of-words representation.
data['title_words'] = data['title'].apply(word_tokenize)
data['title_sentences'] = data['title'].apply(sent_tokenize)
title_word_dict = gensim.corpora.Dictionary(data['title_words'])
title_sentence_dict = gensim.corpora.Dictionary(data['title_sentences'])
# One doc2bow vector per title. Calling doc2bow once per row avoids repeating
# the identical whole-title vector once for every token of that title.
data['title_words'] = data['title_words'].apply(title_word_dict.doc2bow)
data['title_sentences'] = data['title_sentences'].apply(title_sentence_dict.doc2bow)

In [None]:
# Remove the raw title and the identifier columns before modelling.
# errors='ignore' keeps the cell re-runnable (the columns are already gone on
# a second run) and tolerates frames that never had the identifier columns.
data = data.drop(columns=['title', 'index', 'id', 'news_url', 'tweet_ids'], errors='ignore')

In [None]:
# Second experiment: train on every remaining column except the label.
# NOTE(review): any list- or string-valued column still present in `data`
# (e.g. 'title_words' / 'title_sentences') is passed to the tree as-is --
# confirm that only numeric feature columns remain at this point.
X = data.drop(columns=['real'])
Y = data['real']

# Fixed seeds make both the 80/20 split and the fitted tree reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
# Display the accuracy as the cell's output.
score