In [3]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 3.5 MB/s 
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 4.0 MB/s 
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [84]:
import os
import json
import re
import string
import warnings
import numpy
import pandas
import matplotlib.pyplot as pyplot
import seaborn
import nltk
import gensim
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.pipeline import Pipeline

In [85]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/tristan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/tristan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
data = pandas.DataFrame()
realorfake = {"real":1,"fake":0}

for dir in os.listdir('../data/politifact'):
    for datadir in os.listdir('../data/politifact/' + dir):
        try:
            with open('../data/politifact/' + dir + '/' + datadir + '/news content.json') as json_file:
                json_obj = json.load(json_file)
                json_obj['real'] = realorfake.get(dir)
                data = data.append(json_obj, ignore_index=True)
        except Exception:
            None
        finally:
            None



data.columns



Index(['authors', 'canonical_link', 'images', 'keywords', 'meta_data',
       'movies', 'publish_date', 'real', 'source', 'summary', 'text', 'title',
       'top_img', 'url'],
      dtype='object')

In [87]:
data.head()

Unnamed: 0,authors,canonical_link,images,keywords,meta_data,movies,publish_date,real,source,summary,text,title,top_img,url
0,[February],https://www.politifact.com/factchecks/2010/feb...,[https://static.politifact.com/CACHE/images/po...,[],"{'viewport': 'width=device-width, initial-scal...",[],1265238000.0,1.0,http://politifact.com,,The recent Massachusetts Senate election capti...,Krugman calls Senate health care bill similar ...,https://static.politifact.com/politifact/rulin...,http://politifact.com/truth-o-meter/statements...
1,[],https://www.guttmacher.org/laws-affecting-repr...,[https://www.guttmacher.org/sites/default/file...,[],"{'MobileOptimized': 'width', 'HandheldFriendly...",[],1388513502.0,1.0,http://www.guttmacher.org,,"by Elizabeth Nash, Rachel Benson Gold, Andrea ...",Laws Affecting Reproductive Health and Rights:...,https://www.guttmacher.org/sites/default/files...,http://www.guttmacher.org/statecenter/updates/...
2,"[Abc News, January]",https://abcnews.go.com/ThisWeek/week-transcrip...,"[data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP/...",[],{'description': 'Transcript of 'This Week' int...,[],,1.0,http://abcnews.go.com,,"August 8, 2010  -- AMANPOUR: Good morning. I'...",'This Week' Transcript: Odierno and Chiarelli,http://abcnews.go.com/ThisWeek/week-transcript...,http://abcnews.go.com/ThisWeek/week-transcript...
3,[],https://ohioart.com/etch-a-sketch-sold-to-spin...,[https://ohioart.com/wp-content/uploads/2016/0...,[],"{'viewport': 'width=device-width, initial-scal...",[],,1.0,http://www.ohioart.com,,"In February of 2016, we announced the sale of ...",Sale of the Etch A Sketch Brand,https://ohioart.com/wp-content/uploads/2016/03...,http://www.ohioart.com/etch/
4,[Gardiner Harris],https://www.nytimes.com/2009/02/03/health/poli...,[https://static01.nyt.com/newsgraphics/images/...,[],"{'robots': 'noarchive', 'description': 'Federa...",[],1233615600.0,1.0,http://www.nytimes.com,,WASHINGTON  Even though federal health offici...,Peanut Product Recall Took Company Approval,https://static01.nyt.com/newsgraphics/images/i...,http://www.nytimes.com/2009/02/03/health/polic...


In [88]:
special_characters = '!?@#$%^&*()-+_=,<>/'

data['text_character_cnt'] = data['text'].str.len()
# if(data['text_character_cnt'] > 0):
data['text_word_cnt'] = data['text'].str.split().str.len()
# data['text_character_per_word'] = data['text_character_cnt'] / data['text_word_cnt']

data['text_special_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

for char in special_characters:
    data['text_' + char + '_per_char'] = data['text'].apply(lambda x: len([x for x in x.split() if char in x]))
    data['text_' + char + '_per_word'] = data['text'].apply(lambda x: len([x for x in x.split() if char in x]))

data['text_http_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['text_www_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
data['text_number_cnt'] = data['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))


#data['title_character_cnt'] = data['title'].str.len()
# if(data['title_character_cnt'] > 0):
#data['title_word_cnt'] = data['title'].str.split().str.len()
# data['title_character_per_word'] = data['title_character_cnt'] / data['title_word_cnt']

#data['special_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

#for char in special_characters:
#    data['title_' + char + '_per_char'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))
#    data['title_' + char + '_per_word'] = data['title'].apply(lambda x: len([x for x in x.split() if char in x]))

#data['title_http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
#data['title_www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
#data['title_number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [89]:
def prepare_test(text):
    data_small = pandas.DataFrame(numpy.array([text]),columns=['text'])



    special_characters = '!?@#$%^&*()-+_=,<>/'

    data_small['text_character_cnt'] = data_small['text'].str.len()
    # if(data['text_character_cnt'] > 0):
    data_small['text_word_cnt'] = data_small['text'].str.split().str.len()
    # data['text_character_per_word'] = data['text_character_cnt'] / data['text_word_cnt']

    data_small['text_special_cnt'] = data_small['text'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

    for char in special_characters:
        data_small['text_' + char + '_per_char'] = data_small['text'].apply(lambda x: len([x for x in x.split() if char in x]))
        data_small['text_' + char + '_per_word'] = data_small['text'].apply(lambda x: len([x for x in x.split() if char in x]))

    data_small['text_http_cnt'] = data_small['text'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
    data_small['text_www_cnt'] = data_small['text'].apply(lambda x: len([x for x in x.split() if 'www' in x]))
    data_small['text_number_cnt'] = data_small['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

    
    return data_small

In [90]:
X = data.drop(columns=['url', 'text', 'images', 'top_img', 'keywords', 'authors',
       'canonical_link', 'title', 'meta_data', 'movies', 'publish_date',
       'source', 'summary', 'real'])
Y = data['real']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

X_train

Unnamed: 0,text_character_cnt,text_word_cnt,text_special_cnt,text_!_per_char,text_!_per_word,text_?_per_char,text_?_per_word,text_@_per_char,text_@_per_word,text_#_per_char,...,"text_,_per_word",text_<_per_char,text_<_per_word,text_>_per_char,text_>_per_word,text_/_per_char,text_/_per_word,text_http_cnt,text_www_cnt,text_number_cnt
675,1299,238,11,0,0,2,2,0,0,0,...,8,0,0,0,0,0,0,0,0,0
610,5451,902,51,0,0,0,0,0,0,0,...,44,0,0,0,0,1,1,0,0,4
766,2197,371,22,0,0,0,0,0,0,0,...,20,0,0,0,0,0,0,0,0,2
651,37,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
883,3316,559,40,0,0,1,1,0,0,0,...,22,0,0,0,0,1,1,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,17517,2808,244,0,0,2,2,0,0,0,...,166,0,0,0,0,1,1,0,0,26
768,2065,348,22,0,0,1,1,0,0,0,...,18,0,0,0,0,0,0,0,0,1
418,11053,1652,94,0,0,6,6,0,0,0,...,68,0,0,0,0,0,0,0,0,18
474,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

0.6927083333333334

In [92]:
with open("decisionTreeModel.pkl", 'wb') as file:
    pickle.dump(model, file)

In [94]:
model.predict(prepare_test("Donald Trump is President").drop(columns=['text']))

array([1.])

In [77]:
class model1_News:
    def __init__(self):
        self.model = pickle.load(open("decisionTreeModel.pkl",'rb'))
    
    def preditWithText(self, text):
        self.model.predit(prepare_test(text).drop(columns=['text']))
    

Unnamed: 0,text,text_character_cnt,text_word_cnt,text_special_cnt,text_!_per_char,text_!_per_word,text_?_per_char,text_?_per_word,text_@_per_char,text_@_per_word,...,"text_,_per_word",text_<_per_char,text_<_per_word,text_>_per_char,text_>_per_word,text_/_per_char,text_/_per_word,text_http_cnt,text_www_cnt,text_number_cnt
0,Donald Trump is President,25,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
X = data.drop(columns=['url', 'text', 'images', 'top_img', 'keywords', 'authors',
       'canonical_link', 'title', 'meta_data', 'movies', 'publish_date',
       'source', 'summary', 'real', "title_character_cnt",  "title_word_cnt",       "special_cnt",          "title_!_per_char",    
"title_!_per_word",    "title_?_per_char",    "title_?_per_word",    "title_@_per_char",    
"title_@_per_word",    "title_#_per_char",    "title_#_per_word",    "title_$_per_char",    
"title_$_per_word",    "title_%_per_char",    "title_%_per_word",    "title_^_per_char",    
"title_^_per_word",    "title_&_per_char",    "title_&_per_word",    "title_*_per_char",    
"title_*_per_word",    "title_(_per_char",    "title_(_per_word",    "title_)_per_char",    
"title_)_per_word",    "title_-_per_char",    "title_-_per_word",    "title_+_per_char",    
"title_+_per_word",    "title___per_char",    "title___per_word",    "title_=_per_char",    
"title_=_per_word",    "title_,_per_char",    "title_,_per_word",    "title_<_per_char",    
"title_<_per_word",    "title_>_per_char",    "title_>_per_word",    "title_/_per_char",    
"title_/_per_word",    "title_http_cnt",    "title_www_cnt",    "title_number_cnt"    ])

Y = data['real']

X_train_limited, X_test_limited, Y_train_limited, Y_test_limited = train_test_split(X, Y, test_size=0.2)


In [99]:
X.head()

Unnamed: 0,text_character_cnt,text_word_cnt,text_special_cnt,text_!_per_char,text_!_per_word,text_?_per_char,text_?_per_word,text_@_per_char,text_@_per_word,text_#_per_char,...,"text_,_per_word",text_<_per_char,text_<_per_word,text_>_per_char,text_>_per_word,text_/_per_char,text_/_per_word,text_http_cnt,text_www_cnt,text_number_cnt
0,7892,1258,102,0,0,3,3,0,0,0,...,71,0,0,0,0,0,0,0,0,10
1,23903,3627,241,0,0,0,0,0,0,0,...,141,0,0,0,0,0,0,0,0,50
2,44882,7805,666,0,0,59,59,0,0,0,...,474,0,0,0,0,1,1,0,0,23
3,2437,395,19,0,0,0,0,0,0,0,...,18,0,0,0,0,0,0,0,0,3
4,975,162,4,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,0,0,0


In [100]:
model = DecisionTreeClassifier()
model.fit(X_train_limited, Y_train_limited)

prediction = model.predict(X_test_limited)
score = accuracy_score(Y_test_limited, prediction)
score

0.734375

In [101]:
with open("decisionTreeModel_noTitle.pkl", 'wb') as file:
    pickle.dump(model, file)

In [45]:
# real_data = pandas.concat([
#     pandas.read_csv('dataset/gossipcop_real.csv'),
#     pandas.read_csv('dataset/politifact_fake.csv')
# ], ignore_index=True)
# real_data['real'] = 1

In [46]:
# data = pandas.concat([fake_data, real_data], ignore_index=True)
# data = data.sample(frac=1)
# data = data.reset_index()

In [75]:
data['character_cnt'] = data['title'].str.len()
data['word_cnt'] = data['title'].str.split().str.len()
data['character_per_word'] = data['character_cnt'] / data['word_cnt']

special_characters = '!?@#$%^&*()-+_=,<>/'
data['special_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if any(char in special_characters for char in x)]))

data['hashtag_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '#' in x]))
data['at_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '@' in x]))
data['explanation_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '!' in x]))
data['question_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?' in x]))
data['interrobang_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if '?!' in x]))
data['ellipsis_cnt'] = data['title'].apply(lambda x: len([x for datax in x.split() if '...' in x]))
data['semicolon_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if ';' in x]))

data['http_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'http' in x]))
data['www_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if 'www' in x]))

data['number_cnt'] = data['title'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

In [76]:


data['title_words'] = data['title'].apply(lambda x: word_tokenize(x))
data['title_sentences'] = data['title'].apply(lambda x: sent_tokenize(x))
title_word_dict = gensim.corpora.Dictionary(data['title_words'])
title_sentence_dict = gensim.corpora.Dictionary(data['title_sentences'])
# title_word_dict = data['title_words'].apply(lambda x: gensim.corpora.Dictionary(x))
data['title_words'] = data['title_words'].apply(lambda x: [title_word_dict.doc2bow(x) for word in x])
data['title_sentences'] = data['title_sentences'].apply(lambda x: [title_sentence_dict.doc2bow(x) for word in x])

data['title_words']

# data['title_corpus']

# corpus = [title_word_dict.doc2bow(data['title_words']) for word in data['title_words']]


# data['processed_title'] = data['title'].str.replace('[^\w\s]', '')
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join(x.lower() for x in x.split()))

# stemmer = PorterStemmer()
# data['processed_title'] = data['processed_title'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))



0      [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...
1      [[(11, 1), (12, 1), (13, 1), (14, 1), (15, 1),...
2      [[(12, 1), (21, 1), (22, 1), (23, 1), (24, 1),...
3      [[(28, 1), (29, 1), (30, 1), (31, 1), (32, 1),...
4      [[(35, 1), (36, 1), (37, 1), (38, 1), (39, 1),...
                             ...                        
954    [[(21, 1), (33, 1), (3184, 1), (3469, 1), (347...
955    [[(135, 1), (140, 1), (339, 1), (500, 1), (540...
956    [[(10, 1), (33, 1), (47, 1), (117, 1), (217, 1...
957    [[(160, 1), (433, 1), (485, 1), (528, 1), (802...
958    [[(33, 1), (34, 1), (97, 1), (135, 1), (140, 1...
Name: title_words, Length: 959, dtype: object

In [49]:
# tfidvector = TfidfVectorizer(max_features=500, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1, 1))

# data_vector = tfidvector.fit_transform(data['processed_title'])

# for name, value in zip(tfidvector.get_feature_names_out(), tfidvector.idf_):
#     print(name, ': ', value)

In [50]:
data = data.drop(columns=['title', 'index', 'id', 'news_url', 'tweet_ids'])
data

KeyError: "['index' 'id' 'news_url' 'tweet_ids'] not found in axis"

In [77]:
X = data.drop(columns=['real'])
Y = data['real']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_test)
score = accuracy_score(Y_test, prediction)
score

ValueError: setting an array element with a sequence.