In [1]:
import pandas as pd

true_df = pd.read_csv("data/DataSet_Misinfo_TRUE.csv", encoding='latin-1')
true_df.insert(2, "Classifier", 0)

false_df = pd.read_csv("data/DataSet_Misinfo_FAKE.csv", encoding='latin-1')
false_df.insert(2, "Classifier", 1)

data = [true_df, false_df]
combo_df = pd.concat(data, ignore_index=True)

In [2]:
import numpy as np
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/melannienimocks/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/melannienimocks/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/melannienimocks/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/melannienimocks/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/melannienimocks/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-

In [3]:
np.random.seed(333)


In [4]:
#remove any blank rows
combo_df['text'].dropna(inplace=True)
#convert all to lowercase
#combo_df['text'] = combo_df['text'].astype("string")

In [5]:
combo_df['text'] = combo_df['text'].astype(str)
print(combo_df.head(10))
combo_df['text'] = combo_df['text'].str.lower()

#tokenization
#combo_df['text'] = combo_df['text'].apply(word_tokenize) 
combo_df['tokenized_text'] = combo_df.apply(lambda row: nltk.word_tokenize(row["text"]), axis=1)

   Unnamed: 0                                               text  Classifier
0           0  The head of a conservative Republican faction ...           0
1           1  Transgender people will be allowed for the fir...           0
2           2  The special counsel investigation of links bet...           0
3           3  Trump campaign adviser George Papadopoulos tol...           0
4           4  President Donald Trump called on the U.S. Post...           0
5           5  The White House said on Friday it was set to k...           0
6           6  President Donald Trump said on Thursday he bel...           0
7           7  While the Fake News loves to talk about my so-...           0
8           8  Together, we are MAKING AMERICA GREAT AGAIN! b...           0
9           9  Alabama Secretary of State John Merrill said h...           0


In [6]:
#WordNetLemmatizer
pos_map = defaultdict(lambda : wn.NOUN)
pos_map['J'] = wn.ADJ
pos_map['V'] = wn.VERB
pos_map['R'] = wn.ADV

In [7]:
for i, text in enumerate(combo_df['tokenized_text']):
    final_words = []
    word_lem = WordNetLemmatizer()
    for word, tag in pos_tag(text):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lem.lemmatize(word, pos=pos_map[tag[0]])
            final_words.append(word_final)
    combo_df.loc[i, 'tokenized_text'] = str(final_words)

In [8]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(combo_df['tokenized_text'], combo_df['Classifier'], test_size=0.3)

In [9]:
#encoding to numerical values that the model can understand
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

In [10]:
#word vectorization--turn collection of text into numerical feature vectors using
# term frequency -- inverse document (TF-IDF)

Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(combo_df['tokenized_text'])

train_x_Tfidf = Tfidf_vect.transform(train_x)
test_x_Tfidf = Tfidf_vect.transform(test_x)

print(Tfidf_vect.vocabulary_)
print(train_x_Tfidf)

  (0, 4895)	0.2482517509720427
  (0, 4658)	0.18168333527923383
  (0, 4539)	0.23090153540928474
  (0, 3887)	0.19709033587240485
  (0, 3833)	0.33218378144001914
  (0, 3817)	0.39204980065819184
  (0, 3635)	0.26249508624459067
  (0, 3267)	0.11726143660929737
  (0, 2102)	0.20794579196220317
  (0, 1918)	0.3219191710766633
  (0, 1664)	0.2618609728036976
  (0, 1035)	0.14206994270133202
  (0, 871)	0.2480060270861397
  (0, 544)	0.20872729302000423
  (0, 484)	0.3546781265957541
  (1, 4978)	0.028314211303735883
  (1, 4963)	0.050968970552629084
  (1, 4951)	0.033290305004925094
  (1, 4903)	0.08578700531866641
  (1, 4881)	0.044300000808075585
  (1, 4869)	0.03467096929529958
  (1, 4863)	0.32444789338521357
  (1, 4801)	0.0448817327374432
  (1, 4796)	0.041489108639577395
  (1, 4670)	0.07851578469294761
  :	:
  (55030, 4513)	0.19587701843647723
  (55030, 4410)	0.2412648471812401
  (55030, 4209)	0.19277148380477097
  (55030, 3950)	0.272599571578628
  (55030, 3413)	0.12205386780429647
  (55030, 3357)	0.221

In [11]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x_Tfidf, train_y)
prediction_SVM = SVM.predict(test_x_Tfidf)

print("SVM Accuracy Score -> ", accuracy_score(prediction_SVM, test_y)*100)

SVM Accuracy Score ->  93.50462138556772


In [None]:
#todo: import the data :D 
import math
#calculate avg from the dataset 
mean_retweet_count = 0
mean_follower_count = 0
total_tweet_count = 0
def cred_score(retweets, followers, user_tweet_count):
    utility = math.abs(((retweets * followers)/user_tweet_count) - ((mean_retweet_count * mean_follower_count)/total_tweet_count))
    return utility
monkeypox_df['credibility'] = cred_score(monkeypox_df['retweet_count'],
                                         monkeypox_df['followers count'],
                                         monkeypox_df['tweet count'])