In [408]:
import re
import string
import warnings
import numpy
import pandas
import matplotlib.pyplot as pyplot
import seaborn
import nltk
import gensim

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize



In [409]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [410]:
fake_data = pandas.concat([
    pandas.read_csv('dataset/gossipcop_fake.csv'), 
    pandas.read_csv('dataset/politifact_fake.csv')
], ignore_index=True)
fake_data['real'] = 0

In [411]:
real_data = pandas.concat([
    pandas.read_csv('dataset/gossipcop_real.csv'),
    pandas.read_csv('dataset/politifact_fake.csv')
], ignore_index=True)
real_data['real'] = 1

In [412]:
data = pandas.concat([fake_data, real_data], ignore_index=True)
data = data.sample(frac=1)
data = data.reset_index()

In [413]:
data['character_cnt'] = data['title'].str.len()
data['word_cnt'] = data['title'].str.split().str.len()
data['character_per_word'] = data['character_cnt'] / data['word_cnt']

special_characters = '!?@#$%^&*()-+_=,<>/'
data['special_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

data['hashtag_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '#' in x]))
data['at_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '@' in x]))
data['explanation_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '!' in x]))
data['question_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?' in x]))
data['interrobang_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?!' in x]))
data['ellipsis_cnt'] = data['title'].apply(lambda x: len([x for datax in x.split() if '...' in x]))
data['semicolon_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if ';' in x]))

data['http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))

data['number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [414]:


data['title_words'] = data['title'].apply(lambda x: word_tokenize(x))
data['title_sentences'] = data['title'].apply(lambda x: sent_tokenize(x))
title_word_dict = gensim.corpora.Dictionary(data['title_words'])
title_sentence_dict = gensim.corpora.Dictionary(data['title_sentences'])
# title_word_dict = data['title_words'].apply(lambda x: gensim.corpora.Dictionary(x))
data['title_words'] = data['title_words'].apply(lambda x: [title_word_dict.doc2bow(x) for word in x])
data['title_sentences'] = data['title_sentences'].apply(lambda x: [title_sentence_dict.doc2bow(x) for word in x])
# data['title_corpus']

# corpus = [title_word_dict.doc2bow(data['title_words']) for word in data['title_words']]


# data['processed_title'] = data['title'].str.replace('[^\w\s]', '')
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# stemmer = PorterStemmer()
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))



In [415]:
# tfidvector = TfidfVectorizer(max_features=500, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1, 1))

# data_vector = tfidvector.fit_transform(data['processed_title'])

# for name, value in zip(tfidvector.get_feature_names_out(), tfidvector.idf_):
#     print(name, ': ', value)

In [416]:
data = data.drop(columns=['title', 'index', 'id', 'news_url', 'tweet_ids'])
data

Unnamed: 0,real,character_cnt,word_cnt,character_per_word,special_cnt,hashtag_cnt,at_cnt,explanation_cnt,question_cnt,interrobang_cnt,ellipsis_cnt,semicolon_cnt,http_cnt,www_cnt,number_cnt,title_words,title_sentences
0,0,14,2,7.000000,0,0,0,0,0,0,0,0,0,0,0,"[[(0, 1), (1, 1)], [(0, 1), (1, 1)]]","[[(0, 1)]]"
1,1,56,8,7.000000,1,0,0,1,0,0,0,0,0,0,0,"[[(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, ...","[[(1, 1), (2, 1)], [(1, 1), (2, 1)]]"
2,1,73,10,7.300000,0,0,0,0,0,0,0,0,0,0,0,"[[(12, 1), (13, 1), (14, 1), (15, 1), (16, 1),...","[[(3, 1)]]"
3,0,77,16,4.812500,0,0,0,0,0,0,0,0,0,0,0,"[[(22, 1), (23, 1), (24, 1), (25, 1), (26, 1),...","[[(4, 1), (5, 1)], [(4, 1), (5, 1)]]"
4,1,156,26,6.000000,0,0,0,0,0,0,0,0,0,0,0,"[[(38, 1), (39, 1), (41, 1), (42, 1), (43, 2),...","[[(6, 1)]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22999,1,67,9,7.444444,2,0,0,0,0,0,0,0,0,0,0,"[[(41, 1), (45, 1), (93, 2), (809, 1), (986, 1...","[[(22341, 1)]]"
23000,1,38,7,5.428571,0,0,0,0,0,0,0,0,0,0,0,"[[(76, 1), (316, 1), (485, 1), (936, 1), (2247...","[[(22342, 1)]]"
23001,1,43,6,7.166667,0,0,0,0,0,0,0,0,0,0,0,"[[(64, 1), (80, 1), (5766, 1), (6042, 1), (640...","[[(22343, 1)]]"
23002,1,80,12,6.666667,0,0,0,0,0,0,0,0,0,0,0,"[[(10, 1), (62, 1), (451, 1), (715, 1), (876, ...","[[(22344, 1)]]"


In [419]:
X = data.drop(columns=['real'])
Y = data['real']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

ValueError: setting an array element with a sequence.