In [None]:
pip install stanza

In [None]:
pip install tweet-preprocessor

In [None]:
from __future__ import unicode_literals, print_function
import stanza
import pandas as pd
stanza.download('en')
import pandas as pd
from sklearn.model_selection import train_test_split
import preprocessor as p

In [None]:
# Shared English stanza pipeline, built once and reused by the cells below.
# Only tokenization, multi-word-token expansion and POS tagging are requested.
nlpStanza = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos',
)

In [None]:
# Tag sentences that contain a verb and have at least 4 non-punctuation words.
def tag_verb(sent):
  """Return True if `sent` has at least 4 non-punctuation words and a verb.

  The text is parsed with the module-level `nlpStanza` pipeline. A word counts
  as a verb when its universal POS tag (`upos`) is "VERB" or "AUX".

  Args:
    sent: The sentence text. Non-string values (e.g. None or NaN from a
      DataFrame column) and blank strings are treated as "no sentence".

  Returns:
    bool: True when the sentence passes both checks, otherwise False.
  """
  # Missing or blank input cannot contain a verb; return early instead of
  # handing a non-string value to the pipeline.
  if not isinstance(sent, str) or not sent.strip():
    return False
  doc = nlpStanza(sent)
  # Flatten the words of every sentence stanza detected in the input.
  words = [word for sentence in doc.sentences for word in sentence.words]
  # Punctuation does not count towards the minimum length.
  non_punct = [word for word in words if word.upos != "PUNCT"]
  if len(non_punct) < 4:
    return False
  return any(word.upos in ("VERB", "AUX") for word in words)

In [None]:
# Split each abstract from the cord19 dataset into sentences using stanza and
# create a new row for each sentence.
df = pd.read_csv("initial_datasets/cord19_metadata.csv")

# Only the abstract column is used below. Rows whose abstract is missing are
# read as NaN (a float) and cannot be passed to the stanza pipeline, so they
# are dropped first.
df = df[["abstract"]].dropna(subset=["abstract"])
df["abstract"] = df["abstract"].apply(lambda x: [y.text for y in nlpStanza(x).sentences])
# Explode only the list column; an abstract that yielded no sentences becomes
# NaN after the explode and is removed as well.
df = df.explode("abstract").dropna(subset=["abstract"])
# Only keep the sentences from the cord dataset which contain a verb and are
# more than 3 words.
df = df.drop_duplicates(subset="abstract")
df = df[df["abstract"].apply(tag_verb)]
df = df.rename(columns={"abstract": "text"}).assign(labels=0)
df = df[["text", "labels"]]
# Get a random subset with the same size as the fake news dataset; the seed is
# fixed so the saved subset is reproducible across runs.
subsetdf = df.sample(n=7908, random_state=42)
subsetdf.to_csv("preprocessed_datasets/real_sentences.csv", index=False)

In [None]:
# Reduce both fake news sources to a single "text" column holding the sentence.
df1 = pd.read_csv("initial_datasets/fake_covid_sentences.csv")
df2 = pd.read_csv("https://raw.githubusercontent.com/cuilimeng/CoAID/master/11-01-2020/NewsFakeCOVID-19.csv")
df1 = df1.rename(columns={"sentence": "text"})[["text"]]
df2 = df2.rename(columns={"title": "text"})[["text"]]
# Stack the two sources, label every row as fake (1) and drop repeated texts.
result = (
    pd.concat([df1, df2])
    .assign(labels=1)
    .drop_duplicates(subset="text")
)
result.to_csv("preprocessed_datasets/fake_sentences.csv", index=False)

In [None]:
# Concatenate the real and the fake sentence files into one dataset.
df1, df2 = map(pd.read_csv, (
    "preprocessed_datasets/real_sentences.csv",
    "preprocessed_datasets/fake_sentences.csv",
))
result = pd.concat([df1, df2])
result.to_csv("preprocessed_datasets/covid_sentences_all.csv", index=False)

In [None]:
# Split the merged dataset into a train (80%) and a test (20%) dataset.
df = pd.read_csv("preprocessed_datasets/covid_sentences_all.csv")
# Fix the seed so that re-running the notebook produces the same split;
# without it the saved train/test files change on every run.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train.to_csv("preprocessed_datasets/train_dataset_1.csv", index=False)
df_test.to_csv("preprocessed_datasets/test_dataset_1.csv", index=False)

In [None]:
# Configure the tweet preprocessor with the URL and EMOJI options only.
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess(row):
  """Return the row's "tweet" text after cleaning it with `p.clean`.

  With the options set above, cleaning removes URLs and emojis.
  """
  return p.clean(row["tweet"])

In [None]:
# Map the textual label to a numerical value.
def map_label(row):
  """Return 0 when the row's "label" is "real", and 1 for any other value."""
  if row["label"] == "real":
    return 0
  return 1

In [None]:
# Second (additional) train dataset: encode the labels numerically, clean the
# tweet text and keep only the "text" and "labels" columns.
df = pd.read_csv("https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv")
df = df.assign(
    labels=df.apply(map_label, axis=1),
    tweet=df.apply(preprocess, axis=1),
)
df = df.rename(columns={"tweet": "text"})[["text", "labels"]]
df.to_csv("preprocessed_datasets/train_dataset_2.csv", index=False)

In [None]:
# Second (additional) test dataset: encode the labels numerically, clean the
# tweet text and keep only the "text" and "labels" columns.
df = pd.read_csv("https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Val.csv")
df = df.assign(
    labels=df.apply(map_label, axis=1),
    tweet=df.apply(preprocess, axis=1),
)
df = df.rename(columns={"tweet": "text"})[["text", "labels"]]
df.to_csv("preprocessed_datasets/test_dataset_2.csv", index=False)

In [None]:
# Append the additional train dataset to the original one and save the result.
df1, df2 = map(pd.read_csv, (
    "preprocessed_datasets/train_dataset_1.csv",
    "preprocessed_datasets/train_dataset_2.csv",
))
result = pd.concat([df1, df2])
result.to_csv("preprocessed_datasets/train_dataset_final.csv", index=False)

In [None]:
# Append the additional test dataset to the original one and save the result.
df1, df2 = map(pd.read_csv, (
    "preprocessed_datasets/test_dataset_1.csv",
    "preprocessed_datasets/test_dataset_2.csv",
))
result = pd.concat([df1, df2])
result.to_csv("preprocessed_datasets/test_dataset_final.csv", index=False)