In [None]:
import os
from tqdm import tqdm
import json
import spacy
from textblob import TextBlob
import numpy as np
import re
import pandas as pd
import pickle as pk
import copy
from sklearn import preprocessing
import datetime
import emoji
from pprint import pprint
from nltk.tree import Tree 
from stanfordcorenlp.corenlp import StanfordCoreNLP

In [None]:
# Root directory of the Zubiaga dataset, i.e. the folder whose immediate
# sub-directories are the topics. End it with a path separator, e.g. "data/zubiaga/".
zubiagaPath = ""

In [None]:
# Root directory of the Kwon dataset, i.e. the folder whose immediate
# sub-directories are the topics. End it with a path separator, e.g. "data/kwon/".
kwonPath = ""

In [None]:
# Loading zubiaga dataset into a multi-level dictionary
#
# Resulting structures (category is "rumour" or "nonRumour"):
#   zubiagaTweets[topic][category][tweetCode] = {"source": <tweet>, "reactions": [<tweet>, ...]}
#   zubiagaSourceTweets[topic][category]      = [<source tweet>, ...]


def _loadTweetJson(path):
    """Parse one tweet JSON file and close the file handle afterwards.

    Returns the decoded object, or None when the file cannot be read or
    decoded. The failing path and the error are printed so that a single bad
    file is visible and skipped instead of silently ending the whole load.
    """
    try:
        with open(path, "rb") as handle:
            return json.load(handle)
    except (OSError, ValueError) as error:  # JSONDecodeError / UnicodeDecodeError are ValueErrors
        print(f"Skipping unreadable tweet file {path}: {error}")
        return None


zubiagaRumours = next(os.walk(zubiagaPath))[1]
zubiagaTweets = {}
zubiagaSourceTweets = {}

# (dictionary key, directory name on disk) for the two classes of threads
_zubiagaCategories = (("rumour", "rumours"), ("nonRumour", "non-rumours"))

for topic in tqdm(zubiagaRumours):
    zubiagaTweets[topic] = {"rumour": {}, "nonRumour": {}}
    zubiagaSourceTweets[topic] = {"rumour": [], "nonRumour": []}
    for category, folder in _zubiagaCategories:
        categoryDir = os.path.join(zubiagaPath, topic, folder)
        # Every sub-directory of the category folder is one conversation thread
        for tweetCode in next(os.walk(categoryDir))[1]:
            thread = {"source": [], "reactions": []}
            zubiagaTweets[topic][category][tweetCode] = thread

            sourceDir = os.path.join(categoryDir, tweetCode, "source-tweets")
            for tweet in os.listdir(sourceDir):
                if tweet.startswith("."):  # hidden files such as .DS_Store
                    continue
                sourceTweet = _loadTweetJson(os.path.join(sourceDir, tweet))
                if sourceTweet is None:
                    continue
                thread["source"] = sourceTweet
                # Independent copy, so the two structures never share a mutable tweet
                zubiagaSourceTweets[topic][category].append(copy.deepcopy(sourceTweet))

            reactionsDir = os.path.join(categoryDir, tweetCode, "reactions")
            for tweet in os.listdir(reactionsDir):
                if tweet.startswith("."):
                    continue
                reaction = _loadTweetJson(os.path.join(reactionsDir, tweet))
                if reaction is not None:
                    thread["reactions"].append(reaction)

In [None]:
# Tally the rumour and non-rumour threads over every Zubiaga topic
r = sum(
    len(byCategory["rumour"])
    for byCategory in zubiagaTweets.values()
    if "rumour" in byCategory
)
n = sum(
    len(byCategory["nonRumour"])
    for byCategory in zubiagaTweets.values()
    if "nonRumour" in byCategory
)

#The correct number is rumour=2402 and non-rumour=4023
print(f'r={r}, n={n}')

In [None]:
# Loading Kwon dataset into a multi-level dictionary:
#   kwonSourceTweets[topic][category] = [<tweet>, ...]
# The first letter of a topic directory decides its category:
# "N" -> "nonRumour", "R" -> "rumour". Other directories are ignored.
kwonRumours = next(os.walk(kwonPath))[1]
kwonTweets = {}
kwonSourceTweets = {}

_kwonCategoryByPrefix = {"N": "nonRumour", "R": "rumour"}

for topic in tqdm(kwonRumours):
    category = _kwonCategoryByPrefix.get(topic[0])
    if category is None:
        continue
    kwonSourceTweets[topic] = {category: []}
    topicDir = os.path.join(kwonPath, topic)
    for tweetCode in os.listdir(topicDir):
        # `with` closes each handle right away; the dataset has far too many
        # files to leave that to the garbage collector.
        with open(os.path.join(topicDir, tweetCode), "rb") as handle:
            kwonSourceTweets[topic][category].append(json.load(handle))

In [None]:
# Tally Kwon tweets per class; the first letter of the topic name decides the class
p = sum(
    len(kwonSourceTweets[top]["nonRumour"])
    for top in kwonSourceTweets
    if top[0] == "N"
)
q = sum(
    len(kwonSourceTweets[top]["rumour"])
    for top in kwonSourceTweets
    if top[0] == "R"
)
# The correct number is rumour=44394, nonRumour=96516
print(f'rumour={q}, nonRumour={p}')

In [None]:
# Writing each tweet of both datasets in a separate file for cognitive analysis (LIWC Scores)


def _writeTweetTexts(sourceTweets, outputDir, textKey):
    """Write the text of every tweet to its own UTF-8 file.

    sourceTweets -- mapping topic -> category -> list of tweet dicts
    outputDir    -- directory receiving the files; created when missing
    textKey      -- key of the tweet dict holding the text to write

    Files are named "<topic>#<category>#<tweet id>.txt".
    """
    os.makedirs(outputDir, exist_ok=True)
    for topic in tqdm(sourceTweets):
        for category in sourceTweets[topic]:
            for tweet in sourceTweets[topic][category]:
                # `with` guarantees the handle is closed even if the write fails
                with open(f'{outputDir}/{topic}#{category}#{tweet["id"]}.txt', encoding="utf-8", mode="w") as handle:
                    handle.write(tweet[textKey])


# Zubiaga tweets carry their text under "text", Kwon tweets under "full_text"
_writeTweetTexts(zubiagaSourceTweets, './LIWC/Zubiaga', "text")
_writeTweetTexts(kwonSourceTweets, './LIWC/Kwon', "full_text")

In [None]:
#Loading the results of LIWC for both KWON and Zubiaga dataset


def _readLiwcResults(path):
    """Read one LIWC result file in a single pass.

    Tabs are turned into spaces and every line is split on single spaces.
    Returns (roles, rows): `roles` is the list of column names taken from
    the first line, `rows` is one list of string fields per remaining line.
    """
    with open(path) as handle:
        lines = [line.replace("\t", " ") for line in handle.readlines()]
    roles = [name.strip() for name in lines[0].split(" ")]
    rows = [line.strip().split(" ") for line in lines[1:]]
    return roles, rows


#Loading LIWC Roles (column names) and LIWC Scores (one row per tweet file)
kLIWCroles, kLIWC_raw = _readLiwcResults("./LIWC/LIWC2015 Results (Kwon (140910 files)).txt")
zLIWCroles, zLIWC_raw = _readLiwcResults("./LIWC/LIWC2015 Results (Zubiaga (6425 files)).txt")

In [None]:
# Turn the raw LIWC rows into nested dictionaries:
#   <z|k>LIWC[topic][category][tweet id] = {column name: value}
zLIWC = {topicName: {"rumour": {}, "nonRumour": {}} for topicName in zubiagaSourceTweets}
kLIWC = {topicName: {"rumour": {}, "nonRumour": {}} for topicName in kwonSourceTweets}

for roles, rawRows, scores in ((zLIWCroles, zLIWC_raw, zLIWC), (kLIWCroles, kLIWC_raw, kLIWC)):
    for row in rawRows:
        pairs = list(zip(roles, row))
        # The first column is the file name "<topic>#<category>#<id>.txt"
        nameParts = pairs[0][1].replace(".txt", "").split("#")
        topicName, label, tweetId = nameParts[0], nameParts[1], nameParts[2]
        scores[topicName][label][tweetId] = dict(pairs)

In [None]:
# Loading abbreviations, vulgar terms and emoticons for feature extraction


def _readWordList(path):
    """Return the stripped, non-empty lines of a one-entry-per-line file.

    Whitespace-only lines are dropped: an empty entry would match the empty
    tokens that `text.split(" ")` produces for consecutive spaces and would
    inflate the counts computed during feature extraction.
    """
    with open(path) as handle:
        return [entry for entry in (line.strip() for line in handle) if entry]


abbrAdr = "./Abbreviations/abbr.txt"
abbrList = _readWordList(abbrAdr)

emotiAdr = "./Emoticon/emoticons.txt"
emotiList = _readWordList(emotiAdr)

vuglarAdr = "./Vuglar terms/vuglarTerms.txt"
vuglarList = _readWordList(vuglarAdr)

In [None]:
# Feature extraction from Zubiaga dataset
#
# Builds zubiagafeatures[topic][category][tweet id] = {feature name: value}
# for every source tweet, combining linguistic, user, meta-message and
# compound features.

nlp = spacy.load("en")
#Path to CoreNLP installation directory
nlp2 = StanfordCoreNLP('')

# Pronoun inventories, matched against the lower-cased, space-separated words
firstPersonPronouns = frozenset([
    "i", "me", "my", "mine", "we", "us", "our", "ours",
    "i'm", "we're", "i've", "we've", "i'd", "we'd",
])
secondPersonPronouns = frozenset([
    "you", "your", "yours", "you're", "you've", "you'd",
])
thirdPersonPronouns = frozenset([
    "he", "she", "it", "his", "her", "its", "him", "hers",
    "they", "them", "their", "theirs", "they're", "he's", "she's", "it's",
    "they've", "they'd", "he'd", "she'd", "it'd",
])

# (feature name, spaCy coarse part-of-speech tag), in output order
posFeatures = [
    ("posAdjectiveCount", "ADJ"),
    ("posAdpositionCount", "ADP"),
    ("posAdverbCount", "ADV"),
    ("posAuxiliaryCount", "AUX"),
    ("posConjunctionCount", "CONJ"),
    ("posCoordinatingConjunctionCount", "CCONJ"),
    ("posDeterminerCount", "DET"),
    ("posInterjectionCount", "INTJ"),
    ("posNounCount", "NOUN"),
    ("posNumeralCount", "NUM"),
    ("posParticleCount", "PART"),
    ("posPronounCount", "PRON"),
    ("posProperNounCount", "PROPN"),
    ("posPunctuationCount", "PUNCT"),
    ("posSubordinatingConjunctionCount", "SCONJ"),
    ("posSymbolCount", "SYM"),
    ("posVerbCount", "VERB"),
    ("posOtherCount", "X"),
    ("posSpaceCount", "SPACE"),
]

# (feature name, spaCy entity label), in output order.
# Entity labels are upper-case: the former "Person" and "Cardinal" spellings
# never matched, so those two features were always 0.
# NOTE(review): apply the same two labels in the Kwon extraction cell so both
# datasets stay comparable.
nerFeatures = [
    ("nerPersonCount", "PERSON"),
    ("nerNationalityCount", "NORP"),
    ("nerBuildingCount", "FAC"),
    ("nerOrganizationCount", "ORG"),
    ("nerCountriesCount", "GPE"),
    ("nerLocationCount", "LOC"),
    ("nerProductCount", "PRODUCT"),
    ("nerEventCount", "EVENT"),
    ("nerArtCount", "WORK_OF_ART"),
    ("nerLawCount", "LAW"),
    ("nerLanguageCount", "LANGUAGE"),
    ("nerDateCount", "DATE"),
    ("nerTimeCount", "TIME"),
    ("nerMoneyCount", "MONEY"),
    ("nerQuantityCount", "QUANTITY"),
    ("nerOrdinalCount", "ORDINAL"),
    ("nerCardinalCount", "CARDINAL"),
]

# (feature name, LIWC column name), in output order
liwcFeatures = [
    ("insight", "insight"),
    ("tentative", "tentat"),
    ("positiveEmotion", "posemo"),
    ("negativeEmotion", "negemo"),
    ("anxiety", "anx"),
    ("certainty", "certain"),
    ("tone", "Tone"),
]

# Sets give constant-time membership tests inside the per-tweet loop
vuglarTerms = set(vuglarList)
emoticons = set(emotiList)
abbreviations = set(abbrList)

# Loop-invariant helpers, hoisted out of the per-tweet loop
pattern = re.compile('[>].*[<]')  # text between '>' and '<' of the "source" HTML anchor
# NOTE(review): account age is measured against the run date, so age-based
# features change from run to run - confirm this is intended.
today = datetime.datetime.now()

zubiagafeatures = {}
for topic in tqdm(zubiagaSourceTweets):
    zubiagafeatures[topic] = {}
    for category in zubiagaSourceTweets[topic]:
        zubiagafeatures[topic][category] = {}
        for tweet in zubiagaSourceTweets[topic][category]:
            text = tweet["text"]
            user = tweet["user"]
            features = {}
            zubiagafeatures[topic][category][tweet["id"]] = features

            features["category"] = category
            features["topic"] = topic
            features["id"] = tweet["id"]
            features["screenName"] = user["screen_name"]
            features["text"] = text
            features["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweet["id"])

            # ---- Linguistic features ----

            spacyTweetText = nlp(text)
            words = text.split(" ")
            lowerWords = [word.lower() for word in words]
            sentiment = TextBlob(text).sentiment  # computed once, read twice

            features["exclamationMarkCount"] = text.count("!")
            features["questionMarkCount"] = text.count("?")
            features["characterCount"] = len(text)
            features["tokenCount"] = len(spacyTweetText)
            features["subjectivity"] = sentiment.subjectivity
            features["polarity"] = sentiment.polarity
            features["uppercaseCount"] = sum(1 for ch in text if ch.isupper())
            features["lowerCaseCount"] = sum(1 for ch in text if ch.islower())
            features["firstPersonPronounCount"] = sum(1 for word in lowerWords if word in firstPersonPronouns)
            features["secondPersonPronounCount"] = sum(1 for word in lowerWords if word in secondPersonPronouns)
            features["thirdPersonPronounCount"] = sum(1 for word in lowerWords if word in thirdPersonPronouns)
            features["capitalWordsCount"] = sum(1 for word in words if word.isupper())
            features["averageWordComplexity"] = np.average([len(word) for word in words])
            features["vuglarTermsCount"] = sum(1 for word in lowerWords if word in vuglarTerms)
            features["emoticonCount"] = sum(1 for word in lowerWords if word in emoticons)
            features["abbreviationCount"] = sum(1 for word in lowerWords if word in abbreviations)
            features["emojiCount"] = len([x for x in words if x in emoji.EMOJI_UNICODE.keys() or x in emoji.EMOJI_UNICODE.values()])

            # One pass over the tokens collects every POS / entity tally
            posCounts = {}
            nerCounts = {}
            for token in spacyTweetText:
                posCounts[token.pos_] = posCounts.get(token.pos_, 0) + 1
                nerCounts[token.ent_type_] = nerCounts.get(token.ent_type_, 0) + 1
            for featureName, posTag in posFeatures:
                features[featureName] = posCounts.get(posTag, 0)
            for featureName, entityLabel in nerFeatures:
                features[featureName] = nerCounts.get(entityLabel, 0)

            # LIWC scores are keyed by the tweet id as a string
            liwcScores = zLIWC[topic][category][str(tweet["id"])]
            for featureName, liwcKey in liwcFeatures:
                features[featureName] = liwcScores[liwcKey]

            # Height of the constituency parse tree
            parser = nlp2.parse(text)
            tree = Tree.fromstring(str(parser))
            features["sentenceComplexity"] = tree.height()

            # ---- User features ----

            features["hasProfileDescription"] = user["description"] != ''
            features["isVerifiedAccount"] = user["verified"]
            features["statusCount"] = user["statuses_count"]
            features["followingCount"] = user["friends_count"]
            features["influnece"] = user["followers_count"]
            try:
                features["userRole"] = user["followers_count"] / user["friends_count"]
            except ZeroDivisionError:
                # the account follows nobody
                features["userRole"] = float("inf")
            features["totalProfileLikesCount"] = user["favourites_count"]
            accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
            accountAge = (today.date() - accountCreationTime.date()).days
            features["accountAge"] = accountAge
            features["protectedProfile"] = user["protected"]
            features["hasProfileLocation"] = user["location"] != ''
            features["profileLocation"] = user["location"]
            features["hasProfilePicture"] = user["profile_image_url"] != ''
            features["geoEnabled"] = user["geo_enabled"]
            # NOTE(review): stores the URL value itself, not a boolean - confirm intent.
            features["hasProfileUrl"] = user["url"]
            # NOTE(review): "averageFollowSpeed" is derived from followers_count and
            # "averageBeingFollowedSpeed" from friends_count; the names look swapped
            # relative to "followingCount" above - confirm before relying on them.
            features["averageFollowSpeed"] = user["followers_count"] / accountAge
            features["averageBeingFollowedSpeed"] = user["friends_count"] / accountAge
            features["averageLikeSpeed"] = user["favourites_count"] / accountAge
            features["averageStatusSpeed"] = user["statuses_count"] / accountAge
            features["screenNameLength"] = len(user["screen_name"])
            features["screenNameDigitCount"] = sum(1 for ch in user["screen_name"] if ch in "0123456789")

            # ---- Meta message features ----

            entities = tweet["entities"]
            features["hashtagCount"] = len(entities["hashtags"])
            features["mentionCount"] = len(entities["user_mentions"])
            features["hasUrl"] = ("http://" in text or "https://" in text or len(entities["urls"]) > 0)
            features["multimediaCounter"] = len(entities["media"]) if "media" in entities else 0
            features["likeCount"] = tweet["favorite_count"]
            features["retweetCount"] = tweet["retweet_count"]
            tweetCreationTime = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
            # Seconds elapsed since midnight at posting time
            features["tweetPostTime"] = tweetCreationTime.hour*3600 + tweetCreationTime.minute*60 + tweetCreationTime.second
            features["place"] = tweet["place"]

            # ---- Compound score ----

            # Multiplying by the boolean `verified` zeroes the score for unverified accounts
            features["credibilityScore"] = user["verified"] * (
                accountAge + user["followers_count"] + user["statuses_count"] + tweet["favorite_count"]
            )
            features["engagementScore"] = (user["statuses_count"] + tweet["favorite_count"]) / accountAge

            # ---- Other features ----

            try:
                features["source"] = pattern.findall(tweet["source"])[0][1:-1]
            except (IndexError, TypeError):
                # no '>...<' part in the value, or the value is not a string: keep it raw
                features["source"] = tweet["source"]


In [None]:
# Feature extraction from Kwon dataset
#
# Builds kwonfeatures[topic][category][tweet id] -> {feature name: value} for every
# source tweet in kwonSourceTweets. Tweets whose constituency parse failed, or for
# which a fallback value had to be stored, are collected in `excepts`.
# kwonSourceTweets, kLIWC, vuglarList, emotiList and abbrList are not defined in this
# cell and must already exist in the kernel.

nlp = spacy.load("en")
# NOTE(review): machine-specific absolute path to the CoreNLP distribution.
nlp2 = StanfordCoreNLP('D:/Program Files (x86)/StanfordCoreNLP/stanford-corenlp-full-2018-02-27')
excepts=[]

# Lower-cased pronoun forms, matched against the space-separated words of the tweet.
firstPersonPronouns = {"i", "me", "my", "mine", "we", "us", "our", "ours",
                       "i'm", "we're", "i've", "we've", "i'd", "we'd"}
secondPersonPronouns = {"you", "your", "yours", "you're", "you've", "you'd"}
thirdPersonPronouns = {"he", "she", "it", "his", "her", "its", "him", "hers",
                       "they", "them", "their", "theirs", "they're", "he's", "she's",
                       "it's", "they've", "they'd", "he'd", "she'd", "it'd"}

# (feature name, spaCy coarse POS tag), in the order the features are stored.
posFeatureTags = [
    ("posAdjectiveCount", "ADJ"),
    ("posAdpositionCount", "ADP"),
    ("posAdverbCount", "ADV"),
    ("posAuxiliaryCount", "AUX"),
    ("posConjunctionCount", "CONJ"),
    ("posCoordinatingConjunctionCount", "CCONJ"),
    ("posDeterminerCount", "DET"),
    ("posInterjectionCount", "INTJ"),
    ("posNounCount", "NOUN"),
    ("posNumeralCount", "NUM"),
    ("posParticleCount", "PART"),
    ("posPronounCount", "PRON"),
    ("posProperNounCount", "PROPN"),
    ("posPunctuationCount", "PUNCT"),
    ("posSubordinatingConjunctionCount", "SCONJ"),
    ("posSymbolCount", "SYM"),
    ("posVerbCount", "VERB"),
    ("posOtherCount", "X"),
    ("posSpaceCount", "SPACE"),
]

# (feature name, spaCy entity label). The comparison with ent_type_ is case-sensitive
# and spaCy's labels are upper-case, so "PERSON" / "CARDINAL" must be spelled that way
# (the previous "Person" / "Cardinal" spellings never matched and always counted 0).
nerFeatureLabels = [
    ("nerPersonCount", "PERSON"),
    ("nerNationalityCount", "NORP"),
    ("nerBuildingCount", "FAC"),
    ("nerOrganizationCount", "ORG"),
    ("nerCountriesCount", "GPE"),
    ("nerLocationCount", "LOC"),
    ("nerProductCount", "PRODUCT"),
    ("nerEventCount", "EVENT"),
    ("nerArtCount", "WORK_OF_ART"),
    ("nerLawCount", "LAW"),
    ("nerLanguageCount", "LANGUAGE"),
    ("nerDateCount", "DATE"),
    ("nerTimeCount", "TIME"),
    ("nerMoneyCount", "MONEY"),
    ("nerQuantityCount", "QUANTITY"),
    ("nerOrdinalCount", "ORDINAL"),
    ("nerCardinalCount", "CARDINAL"),
]

# (feature name, key in the kLIWC record of the tweet).
liwcFeatureKeys = [
    ("insight", "insight"),
    ("tentative", "tentat"),
    ("positiveEmotion", "posemo"),
    ("negativeEmotion", "negemo"),
    ("anxiety", "anx"),
    ("certainty", "certain"),
    ("tone", "Tone"),
]

# Extracts the text between the first '>' and the last '<' of the "source" field.
sourcePattern = re.compile('[>].*[<]')
# Reference date for accountAge; taken once so every tweet is measured against the same day.
# NOTE(review): accountAge (and every feature divided by it) depends on the day the cell is run.
today = datetime.datetime.now()

kwonfeatures = {}
for topic in tqdm(kwonSourceTweets):
    kwonfeatures[topic] = {}
    for category in kwonSourceTweets[topic]:
        kwonfeatures[topic][category] = {}
        for tweet in kwonSourceTweets[topic][category]:
            tweetId = tweet["id"]
            user = tweet["user"]
            text = tweet["full_text"]
            entities = tweet["entities"]
            # `feats` is the very dict stored in kwonfeatures, so writes below land there.
            feats = kwonfeatures[topic][category][tweetId] = {}
            feats["category"] = category
            feats["topic"] = topic
            feats["id"] = tweetId
            feats["screenName"] = user["screen_name"]
            feats["text"] = text
            feats["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweetId)

            # Linguistic features

            spacyTweetText = nlp(text)
            sentiment = TextBlob(text).sentiment
            words = text.split(" ")
            lowerWords = text.lower().split(" ")

            feats["exclamationMarkCount"] = text.count("!")
            feats["questionMarkCount"] = text.count("?")
            feats["characterCount"] = len(text)
            feats["tokenCount"] = len(spacyTweetText)
            feats["subjectivity"] = sentiment.subjectivity
            feats["polarity"] = sentiment.polarity
            feats["uppercaseCount"] = sum(1 for ch in text if ch.isupper())
            feats["lowerCaseCount"] = sum(1 for ch in text if ch.islower())
            feats["firstPersonPronounCount"] = sum(1 for w in lowerWords if w in firstPersonPronouns)
            feats["secondPersonPronounCount"] = sum(1 for w in lowerWords if w in secondPersonPronouns)
            feats["thirdPersonPronounCount"] = sum(1 for w in lowerWords if w in thirdPersonPronouns)
            feats["capitalWordsCount"] = sum(1 for w in words if w.isupper())
            feats["averageWordComplexity"] = np.average([len(w) for w in words])
            feats["vuglarTermsCount"] = sum(1 for w in words if w.lower() in vuglarList)
            feats["emoticonCount"] = sum(1 for w in words if w.lower() in emotiList)
            feats["abbreviationCount"] = sum(1 for w in words if w.lower() in abbrList)
            feats["emojiCount"] = sum(1 for w in words if w in emoji.EMOJI_UNICODE.keys() or w in emoji.EMOJI_UNICODE.values())

            # Number of tokens carrying each POS tag / entity label.
            for featureName, posTag in posFeatureTags:
                feats[featureName] = sum(1 for token in spacyTweetText if token.pos_ == posTag)
            for featureName, entityLabel in nerFeatureLabels:
                feats[featureName] = sum(1 for token in spacyTweetText if token.ent_type_ == entityLabel)

            liwcRecord = kLIWC[topic][category][str(tweetId)]
            for featureName, liwcKey in liwcFeatureKeys:
                feats[featureName] = liwcRecord[liwcKey]

            # Height of the CoreNLP parse tree. When parsing fails the key is left
            # unset for this tweet and the tweet is recorded in `excepts`.
            try:
                parser = nlp2.parse(text)
                tree = Tree.fromstring(str(parser))
                feats["sentenceComplexity"] = tree.height()
            except Exception:
                excepts.append(tweet)

            # User features

            feats["hasProfileDescription"] = user["description"] != ''
            feats["isVerifiedAccount"] = user["verified"]
            feats["statusCount"] = user["statuses_count"]
            feats["followingCount"] = user["friends_count"]
            feats["influnece"] = user["followers_count"]
            try:
                feats["userRole"] = user["followers_count"] / user["friends_count"]
            except ZeroDivisionError:
                # Account follows nobody: the ratio is stored as +inf.
                feats["userRole"] = float("inf")
                excepts.append(tweet)
            feats["totalProfileLikesCount"] = user["favourites_count"]
            accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
            accountAge = (today.date() - accountCreationTime.date()).days
            feats["accountAge"] = accountAge
            feats["protectedProfile"] = user["protected"]
            feats["hasProfileLocation"] = user["location"] != ''
            feats["profileLocation"] = user["location"]
            feats["hasProfilePicture"] = user["profile_image_url"] != ''
            feats["geoEnabled"] = user["geo_enabled"]
            # Stores the URL value itself, not a boolean.
            feats["hasProfileUrl"] = user["url"]
            feats["averageFollowSpeed"] = user["followers_count"] / accountAge
            feats["averageBeingFollowedSpeed"] = user["friends_count"] / accountAge
            feats["averageLikeSpeed"] = user["favourites_count"] / accountAge
            feats["averageStatusSpeed"] = user["statuses_count"] / accountAge
            feats["screenNameLength"] = len(user["screen_name"])
            feats["screenNameDigitCount"] = sum(1 for ch in user["screen_name"] if ch in "0123456789")

            # Meta message features

            feats["hashtagCount"] = len(entities["hashtags"])
            feats["mentionCount"] = len(entities["user_mentions"])
            feats["hasUrl"] = ("http://" in text or "https://" in text or len(entities["urls"]) > 0)
            feats["multimediaCounter"] = len(entities["media"]) if "media" in entities else 0
            feats["likeCount"] = tweet["favorite_count"]
            feats["retweetCount"] = tweet["retweet_count"]
            tweetCreationTime = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
            # Seconds elapsed since midnight at posting time.
            feats["tweetPostTime"] = tweetCreationTime.hour*3600 + tweetCreationTime.minute*60 + tweetCreationTime.second
            feats["place"] = tweet["place"]

            # Compound score

            # Multiplying by `verified` zeroes the score for unverified accounts.
            feats["credibilityScore"] = user["verified"] * (accountAge + user["followers_count"]
                                                            + user["statuses_count"] + tweet["favorite_count"])
            feats["engagementScore"] = (user["statuses_count"] + tweet["favorite_count"]) / accountAge

            # Other features

            try:
                feats["source"] = sourcePattern.findall(tweet["source"])[0][1:-1]
            except (IndexError, TypeError):
                # No '>...<' span (or a non-string source): keep the raw value.
                feats["source"] = tweet["source"]
                excepts.append(tweet)

In [None]:
# Flatten the topic -> category -> tweet id nesting of each feature dictionary into a
# single {tweet id: feature dict} mapping. A tweet id seen again later overwrites the
# earlier entry.
zubiagafeaturesForDataframe = {}
for topic in tqdm(zubiagafeatures):
    for tweetsById in zubiagafeatures[topic].values():
        zubiagafeaturesForDataframe.update(tweetsById)

kwonfeaturesForDataframe = {}
for topic in tqdm(kwonfeatures):
    for tweetsById in kwonfeatures[topic].values():
        kwonfeaturesForDataframe.update(tweetsById)

In [None]:
# One row per tweet id (the dictionary keys become the index), one column per feature.
df_zubiagaFeatures_raw, df_kwonFeatures_raw = (
    pd.DataFrame.from_dict(featuresById, orient="index")
    for featuresById in (zubiagafeaturesForDataframe, kwonfeaturesForDataframe)
)

In [None]:
# Storing the raw dataframes on the hard drive.
# Each file is opened in a `with` block so its handle is flushed and closed as soon as
# the dump finishes instead of staying open for the rest of the kernel session.
with open("./df_zubiagaFeatures_raw","wb") as zubiagaFile:
    pk.dump(df_zubiagaFeatures_raw, zubiagaFile)
with open("./df_kwonFeatures_raw","wb") as kwonFile:
    pk.dump(df_kwonFeatures_raw, kwonFile)

In [None]:
# Leaving out the non-numerical features and non-early features
nonNumericColumns = ['topic', 'id', 'text', 'screenName', 'tweetUrl', 'source', 'hasProfileUrl']
nonEarlyColumns = ['likeCount', 'retweetCount']

df_zubiagaFeatures_processed_v1 = df_zubiagaFeatures_raw.drop(columns=nonNumericColumns + nonEarlyColumns)
# NOTE(review): likeCount / retweetCount are dropped for Zubiaga only; the Kwon frame
# keeps them - confirm this asymmetry is intended.
df_kwonFeatures_processed_v1 = df_kwonFeatures_raw.drop(columns=nonNumericColumns)

In [None]:
# Turning all boolean columns to integer
# The list of boolean columns is read from the Zubiaga frame and applied to both frames.
df_zubiagaFeatures_processed_v2 = copy.deepcopy(df_zubiagaFeatures_processed_v1)
df_kwonFeatures_processed_v2 = copy.deepcopy(df_kwonFeatures_processed_v1)
boolCol = [name for name, dtype in df_zubiagaFeatures_processed_v1.dtypes.items() if dtype == 'bool']
framePairs = (
    (df_zubiagaFeatures_processed_v1, df_zubiagaFeatures_processed_v2),
    (df_kwonFeatures_processed_v1, df_kwonFeatures_processed_v2),
)
for col in boolCol:
    for framePair in framePairs:
        # framePair = (v1 frame to read from, v2 frame to write into)
        framePair[1][col] = framePair[0][col].astype(int)

In [None]:
# Turning all commas to dots!
# Every ',' inside a string cell becomes '.'; the column list comes from the Zubiaga
# frame and is applied to both frames.
for col in df_zubiagaFeatures_processed_v2.columns:
    for frame in (df_zubiagaFeatures_processed_v2, df_kwonFeatures_processed_v2):
        frame[col] = frame[col].replace(to_replace=",", value=".", regex=True)

In [None]:
# Cast the LIWC-derived columns (object dtype at this point) to float64 in both frames.
liwcColumns = ('insight', 'tentative', 'positiveEmotion', 'negativeEmotion', 'anxiety', 'certainty', 'tone')
for col in liwcColumns:
    for frame in (df_zubiagaFeatures_processed_v2, df_kwonFeatures_processed_v2):
        frame[col] = frame[col].astype(np.float64)

In [None]:
# Eliminating all the object data types
# Object-typed columns are identified on the Zubiaga frame ('category' is kept because
# it holds the label) and the same columns are removed from both frames.
objectCol = [
    name
    for name, dtype in df_zubiagaFeatures_processed_v2.dtypes.items()
    if dtype == 'object' and name != 'category'
]
df_zubiagaFeatures_processed_v2 = df_zubiagaFeatures_processed_v2.drop(columns=objectCol)
df_kwonFeatures_processed_v2 = df_kwonFeatures_processed_v2.drop(columns=objectCol)

In [None]:
# Infinite values are turned into NaN, then every row containing at least one NaN is
# dropped (so this removes rows, it does not merely check).
infiniteValues = [np.inf, -np.inf]

df_zubiagaFeatures_processed_v2 = df_zubiagaFeatures_processed_v2.replace(infiniteValues, np.nan)
df_zubiagaFeatures_processed_v2 = df_zubiagaFeatures_processed_v2.dropna()

df_kwonFeatures_processed_v2 = df_kwonFeatures_processed_v2.replace(infiniteValues, np.nan)
df_kwonFeatures_processed_v2 = df_kwonFeatures_processed_v2.dropna()

In [None]:
# Sanity check: for each column, print its name and whether it holds any null value
for col, hasNull in df_zubiagaFeatures_processed_v2.isnull().any().items():
    print(f'{col}: {hasNull}')
print("------------")
for col, hasNull in df_kwonFeatures_processed_v2.isnull().any().items():
    print(f'{col}: {hasNull}')

In [None]:
# More sanity check: one True/False per column telling whether it contains a null value
for col, hasNull in df_zubiagaFeatures_processed_v2.isnull().any().items():
    print(hasNull)
print("-----------")
for col, hasNull in df_kwonFeatures_processed_v2.isnull().any().items():
    print(hasNull)

In [None]:
# More sanity check: number of null/missing values in each column
for col, nullCount in df_zubiagaFeatures_processed_v2.isnull().sum().items():
    print(nullCount)
print("-----------")
for col, nullCount in df_kwonFeatures_processed_v2.isnull().sum().items():
    print(nullCount)

In [None]:
# More sanity check: total number of null/missing cells in each data frame.
# The count is taken over the whole frame; it no longer indexes by `col`, a loop
# variable left over from another cell that limited the check to a single column.
print(df_zubiagaFeatures_processed_v2.isnull().values.sum())
print("-----------")
print(df_kwonFeatures_processed_v2.isnull().values.sum())

In [None]:
# Separating rumour from nonRumour
# Rows are selected by their 'category' label, then the label column itself is removed
# so only feature columns remain.
zubiagaCategory = df_zubiagaFeatures_processed_v2.category
df_zubiagaFeatures_processed_v2_rumour = df_zubiagaFeatures_processed_v2[zubiagaCategory == "rumour"].drop(columns=["category"])
df_zubiagaFeatures_processed_v2_nonRumour = df_zubiagaFeatures_processed_v2[zubiagaCategory == "nonRumour"].drop(columns=["category"])

kwonCategory = df_kwonFeatures_processed_v2.category
df_kwonFeatures_processed_v2_rumour = df_kwonFeatures_processed_v2[kwonCategory == "rumour"].drop(columns=["category"])
df_kwonFeatures_processed_v2_nonRumour = df_kwonFeatures_processed_v2[kwonCategory == "nonRumour"].drop(columns=["category"])

In [None]:
# More sanity check about null/missing values in the data frame:
# prints a single True/False per frame (True if any cell in the frame is null).
# `axis` is passed by keyword because recent pandas versions no longer accept it
# positionally.
for frame in (
    df_zubiagaFeatures_processed_v2_rumour,
    df_zubiagaFeatures_processed_v2_nonRumour,
    df_kwonFeatures_processed_v2_rumour,
    df_kwonFeatures_processed_v2_nonRumour,
):
    print(frame.isnull().any(axis=None))

In [None]:
# df_zubiagaFeatures_processed_v2_rumour
# df_zubiagaFeatures_processed_v2_nonRumour
# df_kwonFeatures_processed_v2_rumour
# df_kwonFeatures_processed_v2_nonRumour

In [None]:
#Normalization
def _minMaxNormalize(frame):
    """Min-max scale every column of `frame` with a scaler fitted on `frame` itself.

    Returns a 4-tuple: the raw value array, the fitted MinMaxScaler, the scaled
    array, and a DataFrame of the scaled values with the same column names.
    The returned DataFrame is built from the bare array, so it gets a default
    integer index rather than the index of `frame`.
    """
    rawValues = frame.values
    scaler = preprocessing.MinMaxScaler()
    scaledValues = scaler.fit_transform(rawValues)
    return rawValues, scaler, scaledValues, pd.DataFrame(scaledValues, columns=frame.columns)

# NOTE(review): each of the four frames gets its own scaler, so rumour and nonRumour
# rows are scaled against different minima/maxima - confirm this is intended.
(x1, min_max_scaler1, x_scaled1,
 df_zubiagaFeatures_processed_v2_rumour_normalized) = _minMaxNormalize(df_zubiagaFeatures_processed_v2_rumour)

(x2, min_max_scaler2, x_scaled2,
 df_zubiagaFeatures_processed_v2_nonRumour_normalized) = _minMaxNormalize(df_zubiagaFeatures_processed_v2_nonRumour)

(x3, min_max_scaler3, x_scaled3,
 df_kwonFeatures_processed_v2_rumour_normalized) = _minMaxNormalize(df_kwonFeatures_processed_v2_rumour)

(x4, min_max_scaler4, x_scaled4,
 df_kwonFeatures_processed_v2_nonRumour_normalized) = _minMaxNormalize(df_kwonFeatures_processed_v2_nonRumour)


In [None]:
# Write each normalized frame to its own CSV file in the working directory.
csvTargets = [
    ("./zubiagaFeatures_rumours.csv", df_zubiagaFeatures_processed_v2_rumour_normalized),
    ("./zubiagaFeatures_nonRumours.csv", df_zubiagaFeatures_processed_v2_nonRumour_normalized),
    ("./kwonFeatures_rumours.csv", df_kwonFeatures_processed_v2_rumour_normalized),
    ("./kwonFeatures_nonRumours.csv", df_kwonFeatures_processed_v2_nonRumour_normalized),
]
for csvPath, normalizedFrame in csvTargets:
    normalizedFrame.to_csv(csvPath)