In [None]:
import os
from tqdm import tqdm
import json
import spacy
import en_core_web_sm
import numpy as np
import re
import pandas as pd
import pickle as pk
import copy
import datetime
from sklearn import preprocessing
from textblob import TextBlob
import emoji
from pprint import pprint
from nltk.tree import Tree
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
from stanfordcorenlp import StanfordCoreNLP
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from hatesonar import Sonar
from pprint import pprint
from urlextract import URLExtract
import syllables
import textstat

In [None]:
# # model download
# stanfordnlp.download('en')   # This downloads the English models for the neural pipeline
# models_dir_adr = ""

In [None]:
# Serialization directories used throughout the notebook.
# Pickled tweet batches are read from the tweets folder; two names point at it.
tweetSerializationAdr = "./Serialization/Tweets/"
tweetsAdr = tweetSerializationAdr
# Extracted feature dictionaries and frames are pickled into this folder.
featureSerializationAdr = "./Serialization/Features/"

In [None]:
# Libraries setup: shared NLP / sentiment objects reused by the feature extractors below.

# spaCy pipeline; its tokens supply the token count, POS tags and entity types.
nlp = spacy.load("en_core_web_sm")
# CoreNLP wrapper used for the parse-tree height ("tweetComplexity").
# NOTE(review): the argument is left blank here; presumably the CoreNLP path/URL must be filled in before running.
nlp2 = StanfordCoreNLP('')
# VADER analyser; polarity_scores() is read in emotions().
analyser = SentimentIntensityAnalyzer()
# Hate-speech classifier; ping() is read in hateSpeech().
sonar = Sonar()
# URL finder; find_urls() is used by textProcessor() to strip links from the tweet text.
extractor = URLExtract()
# Punkt sentence splitter used for the sentence count / sentence complexity features.
sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Credible sources - USA
# The ';'-separated sheet is read with positional column names 1..9; only
# column 2 (source name), 3 (Twitter id) and 4 (website) are kept.

adr1 = ""
usaSources = (
    pd.read_csv(adr1, delimiter=";", skiprows=1, names=[1, 2, 3, 4, 5, 6, 7, 8, 9], encoding="ISO-8859-1")
    .drop([1, 5, 6, 7, 8, 9], axis="columns")
    .rename({2: "source", 3: "id", 4: "website"}, axis="columns")
)

# regex=False makes the "?" removal literal on every pandas version
# (as a regular expression a lone "?" is not a valid pattern).
usaAccountIds = usaSources["id"].str.lower().str.replace("?", "", regex=False).str.strip()

credibleWebsitesUsa = [site for site in usaSources["website"].tolist() if str(site) != "nan"]
# Ids that end up empty are dropped too: these values are later used in
# substring checks, and "" is a substring of every description.
credibleAccountsUsa = [account for account in usaAccountIds.tolist() if str(account) != "nan" and account != ""]

In [None]:
# Credible sources - INDIA
# Positional columns: 2 is the Twitter handle, 3 is the website.

adr2 = ""
df = pd.read_csv(adr2, names=[1, 2, 3], skiprows=1, delimiter=";")

df1 = df.drop(columns=[1, 2]).rename(columns={3: "site"})
df2 = df.drop(columns=[1, 3]).rename(columns={2: "twitter"})

# Handles are normalised to lower case without surrounding blanks.
df2["twitter"] = df2["twitter"].str.strip().str.lower().str.strip()

# The Indian lists are extended with the USA ones built earlier in the notebook.
credibleAccountsIndia = [
    handle for handle in df2["twitter"].tolist() if str(handle) != "nan"
] + credibleAccountsUsa
credibleWebsitesIndia = [
    site for site in df1["site"].tolist() if str(site) != "nan"
] + credibleWebsitesUsa

In [None]:
# Notorious sources
# The sheet's header row yields the column names "test", "Unnamed: 1" and
# "Unnamed: 2"; they are renamed to title / id / website.

notoriousSources = ""
dfn = pd.read_csv(notoriousSources, delimiter=";").rename(
    {"test": "title", "Unnamed: 1": "id", "Unnamed: 2": "website"}, axis="columns"
)

# Missing cells (NaN) are skipped; everything else is lower-cased and trimmed.
notoriousWebsites = [
    website.lower().strip() for website in dfn["website"].tolist() if str(website) != "nan"
]
notoriousId = [
    account.lower().strip() for account in dfn["id"].tolist() if str(account) != "nan"
]

In [None]:
# Loading abbreviations, vulgar terms and emoticons for feature extraction

def _loadTermList(path):
    """Return the stripped, non-blank lines of the text file at ``path``.

    The file is closed as soon as it has been read. Lines that contain only
    whitespace are skipped so the resulting list never holds empty strings.
    """
    with open(path) as termFile:
        return [line.strip() for line in termFile if line.strip()]


abbrAdr = ""
abbrList = _loadTermList(abbrAdr)

emotiAdr = ""
emotiList = _loadTermList(emotiAdr)

vuglarAdr = ""
vuglarList = _loadTermList(vuglarAdr)

In [None]:
# Creating NRC dictionary for different feelings
# Each usable line is "<lemma>\t<sentiment>\t<score>"; the result maps
# lemma -> {sentiment: int(score)}.

adr = ""

nrcDic = {}
with open(adr) as nrcFile:
    for line in nrcFile:
        fields = line.strip().split("\t")
        if len(fields) < 3:
            # Blank or malformed line: skip it instead of failing on a missing field.
            continue
        lemma, sentiment, score = fields[0], fields[1], fields[2]
        nrcDic.setdefault(lemma, {})[sentiment] = int(score)

In [None]:
# Creating Emotion dictionary
# Result: emotionDic[word] -> {"valence": ..., "arousal": ..., "dominance": ...}

adr = ""

# pd.DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 reads the file the same way (first column becomes the index).
# The frame gets its own name so it does not clash with the emotions()
# function defined later in the notebook.
emotionsFrame = pd.read_csv(adr, index_col=0)[["Word", "V.Mean.Sum", "A.Mean.Sum", "D.Mean.Sum"]]
emotionsFrame.columns = ["word", "valence", "arousal", "dominance"]

# Words become the columns of the transposed frame, so to_dict() is keyed by word.
emotionDic = emotionsFrame.set_index("word").T.to_dict()

In [None]:
def _textAndEntities(status):
    """Return deep copies of the (text, entities) pair of one status dict.

    A truncated status carries its complete text and entities under
    "extended_tweet"; when that payload is absent the short "text" and
    "entities" fields are used instead of raising KeyError.
    """
    if status.get("truncated") and "extended_tweet" in status:
        extended = status["extended_tweet"]
        return copy.deepcopy(extended["full_text"]), copy.deepcopy(extended["entities"])
    return copy.deepcopy(status["text"]), copy.deepcopy(status["entities"])


def tweetElements(tweet):
    """Split a tweet dict into the pieces used for feature extraction.

    Parameters
    ----------
    tweet : dict
        Tweet object with at least "text", "entities" and "user", plus
        optional "retweeted_status" / "quoted_status" sub-objects.

    Returns
    -------
    list
        ``[rtFlag, qtFlag, text, entities, user]`` followed by
        ``qtText, qtEntities, qtUser`` when the tweet quotes another status
        and finally ``rtUser`` when it is a retweet. For retweets, ``text``
        and ``entities`` are those of the retweeted status. Every element is
        a deep copy, so callers may modify it freely.
    """
    rtFlag = "retweeted_status" in tweet
    qtFlag = "quoted_status" in tweet

    # For a retweet the text/entities of the retweeted status are reported.
    text, entities = _textAndEntities(tweet["retweeted_status"] if rtFlag else tweet)
    elements = [rtFlag, qtFlag, text, entities, copy.deepcopy(tweet["user"])]

    if qtFlag:
        quoted = tweet["quoted_status"]
        qtText, qtEntities = _textAndEntities(quoted)
        elements += [qtText, qtEntities, copy.deepcopy(quoted["user"])]

    if rtFlag:
        # The retweeted author always comes last in the list.
        elements.append(copy.deepcopy(tweet["retweeted_status"]["user"]))

    return elements

In [None]:
### ALSO PROVIDE ONE VERSION WITHOUT PUNCTUATION
### AND ONE VERSION THAT USER CAN DETERMINE WHICH PUNCTUATION SHE WANTS OR DOESN'T WANT 
def textProcessor(twtTxt):
    """Produce four progressively cleaned versions of a tweet text.

    Parameters
    ----------
    twtTxt : str
        The tweet text as delivered.

    Returns
    -------
    tuple
        ``(twtTxt, semiRaw, processed, overProcessed)`` where
        ``semiRaw`` has every URL reported by ``extractor.find_urls`` removed
        and whitespace collapsed, ``processed`` additionally replaces "#" and
        "@" by blanks, and ``overProcessed`` is ``processed`` in lower case.
    """
    semiRaw = twtTxt
    for url in extractor.find_urls(twtTxt):
        semiRaw = semiRaw.replace(url, "")

    # Collapse every whitespace run (blanks, tabs, newlines) in a single step.
    # Replacing double blanks first and tabs/newlines afterwards left double
    # blanks behind, e.g. for "a\n\nb".
    semiRaw = re.sub(r"\s+", " ", semiRaw).strip()
    processed = re.sub(r"\s+", " ", semiRaw.replace("#", " ").replace("@", " ")).strip()
    overProcessed = processed.lower()

    return twtTxt, semiRaw, processed, overProcessed

In [None]:
def firstPersonPronounCount(myTxt):
    """Count first-person pronoun tokens (singular and plural) in ``myTxt``.

    Tokens are compared exactly against lower-case pronouns, so the caller
    is expected to pass lower-cased text. Contractions such as "i've" or
    "i'm" need no special handling because the tokenizer separates the
    pronoun from the clitic.
    """
    firstPersonPronouns = {"i", "my", "mine", "me", "we", "our", "ours", "us"}
    # Tokenize once instead of once per pronoun.
    return sum(1 for token in word_tokenize(myTxt) if token in firstPersonPronouns)

In [None]:
def secondPersonPronounCount(myText):
    """Count the tokens "you", "your" and "yours" in ``myText``.

    Tokens are compared exactly against lower-case pronouns, so the caller
    is expected to pass lower-cased text. Contractions such as "you've" or
    "you're" need no special handling because the tokenizer separates "you"
    from the clitic.
    """
    secondPersonPronouns = {"you", "your", "yours"}
    # Tokenize once instead of once per pronoun.
    return sum(1 for token in word_tokenize(myText) if token in secondPersonPronouns)

In [None]:
def thirdPersonPronounCount(myTweetTxt):
    """Count third-person pronoun tokens (singular and plural) in ``myTweetTxt``.

    Tokens are compared exactly against lower-case pronouns, so the caller
    is expected to pass lower-cased text. Contractions ("he's", "they're")
    need no special handling because the tokenizer separates the pronoun
    from the clitic.
    """
    # Each pronoun is listed exactly once, so every matching token counts once
    # ("him" used to be added twice).
    thirdPersonPronouns = {
        "he", "she", "it", "his", "her", "its", "him", "hers",
        "they", "their", "theirs", "them",
    }
    # Tokenize once instead of once per pronoun.
    return sum(1 for token in word_tokenize(myTweetTxt) if token in thirdPersonPronouns)

In [None]:
def nrcEmotions(nrcTxt):
    """Sum the NRC emotion scores of all tokens of ``nrcTxt``.

    Every token found in ``nrcDic`` contributes its score for each of the
    eight emotions; an emotion missing from a lemma's entry counts as 0.

    Returns
    -------
    tuple
        ``(anger, anticipation, disgust, fear, joy, sadness, surprise, trust)``
        totals, in that order.
    """
    emotionNames = ("anger", "anticipation", "disgust", "fear",
                    "joy", "sadness", "surprise", "trust")
    totals = dict.fromkeys(emotionNames, 0)

    # Single pass over the tokens instead of one pass per emotion.
    for term in word_tokenize(nrcTxt):
        termScores = nrcDic.get(term)
        if termScores is None:
            continue
        for emotionName in emotionNames:
            totals[emotionName] += termScores.get(emotionName, 0)

    return tuple(totals[emotionName] for emotionName in emotionNames)

In [None]:
def emotions(semiRaw, overProc):
    """Return sentiment and arousal/dominance scores for one tweet.

    Parameters
    ----------
    semiRaw : str
        Text passed to ``analyser.polarity_scores``.
    overProc : str
        Text whose tokens are looked up in ``emotionDic``.

    Returns
    -------
    tuple
        ``(pos, neg, neu, compound, arousalScore, dominanceScore)``; the
        first four come from the polarity scores, the last two are sums of
        the "arousal" / "dominance" values of all tokens present in
        ``emotionDic``.
    """
    # One analyser call provides all four polarity values.
    polarity = analyser.polarity_scores(semiRaw)

    arousalScore = 0
    dominanceScore = 0
    for term in word_tokenize(overProc):
        termEntry = emotionDic.get(term)
        if termEntry is not None:
            arousalScore += termEntry["arousal"]
            dominanceScore += termEntry["dominance"]

    return (polarity["pos"], polarity["neg"], polarity["neu"], polarity["compound"],
            arousalScore, dominanceScore)

In [None]:
def hateSpeech(tweetTxt):
    """Return the confidences of the first three classes reported by ``sonar.ping``.

    The values are returned in the order of the "classes" list; the caller
    stores them as hateSpeech, offensiveLanguage and neitherClasses.
    """
    classEntries = sonar.ping(tweetTxt)["classes"]
    firstConfidence = classEntries[0]["confidence"]
    secondConfidence = classEntries[1]["confidence"]
    thirdConfidence = classEntries[2]["confidence"]
    return firstConfidence, secondConfidence, thirdConfidence

In [None]:
def singleTruth(first, second, third):
    """Return True when exactly one argument equals True and the other two equal False."""
    flags = (first, second, third)
    trueCount = sum(1 for flag in flags if flag == True)
    falseCount = sum(1 for flag in flags if flag == False)
    return trueCount == 1 and falseCount == 2

In [None]:
def doubleTruth(first, second, third):
    """Return True when exactly two arguments equal True and the remaining one equals False."""
    flags = (first, second, third)
    trueCount = [flag == True for flag in flags].count(True)
    falseCount = [flag == False for flag in flags].count(True)
    return trueCount == 2 and falseCount == 1

# Kerala

### Rumour

In [None]:
def _containsAny(haystack, needles):
    """Return True when ``haystack`` is not None and contains one of ``needles`` as a substring."""
    return haystack is not None and any(needle in haystack for needle in needles)


# textstat functions whose result is stored under the function's own name.
readabilityMetrics = (
    "flesch_reading_ease", "smog_index", "flesch_kincaid_grade", "coleman_liau_index",
    "automated_readability_index", "dale_chall_readability_score", "difficult_words",
    "linsear_write_formula", "gunning_fog",
)

# Emoji lookup table. Newer emoji releases expose EMOJI_DATA; older ones expose
# UNICODE_EMOJI, which in some releases is nested by language code.
emojiTable = getattr(emoji, "EMOJI_DATA", None)
if emojiTable is None:
    emojiTable = emoji.UNICODE_EMOJI
    emojiTable = emojiTable.get("en", emojiTable)

# Extracts the client name from the HTML anchor stored in tweet["source"].
pattern = re.compile('[>].*[<]')

keralaRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "keralaRumour" in i]
counter = 0
keralaRumourId = {}
keralaRumoursFeatures = {}

for file in keralaRumourFolder:
    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(tweetSerializationAdr + file, "rb") as tweetFile:
        keralaRumours = pk.load(tweetFile)

    for tweet in tqdm(keralaRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets; they are counted and skipped
        # before any of the expensive NLP work is done.
        if tweetId in keralaRumourId:
            keralaRumourId[tweetId] += 1
            continue
        keralaRumourId[tweetId] = 1

        # tweetElements already returns deep copies, so no further copying is needed.
        elements = tweetElements(tweet)
        rtFlag, qtFlag, text, entities, user = elements[:5]
        if qtFlag:
            qtEntities, qtUser = elements[6], elements[7]
        else:
            qtEntities, qtUser = None, None

        rawText, semiRawText, processedText, overProcessedText = textProcessor(text)
        name = user["name"].lower().strip()

        spacyText = nlp(overProcessedText)
        spacyName = nlp(name)
        wordTokens = word_tokenize(overProcessedText)

        # All features of this tweet are collected in one dict.
        feats = {}
        keralaRumoursFeatures[tweetId] = feats

        ################################################################ Rumour General Info ################################################################
        feats["id"] = tweetId
        feats["screenName"] = user["screen_name"]
        feats["text"] = text
        feats["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweetId)

        try:
            feats["source"] = pattern.findall(tweet["source"])[0][1:-1]
        except (IndexError, TypeError):
            # No anchor text found (or no source at all): keep the raw value.
            feats["source"] = tweet["source"]

        feats["tweetPostTime"] = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        feats["place"] = tweet["place"]
        feats["profileLocation"] = user["location"]

        ################################################################ Syntactical Features ################################################################
        feats["characterCount"] = len(rawText)
        feats["tokenCount"] = len(spacyText)
        sentences = sentenceTokenizer.tokenize(overProcessedText)
        feats["sentenceCount"] = len(sentences)
        feats["averageWordComplexity"] = np.average([len(token) for token in wordTokens])
        feats["averageSentenceComplexity"] = np.average([len(word_tokenize(sentence)) for sentence in sentences])

        ### Twitter special tree for future
        try:
            parser = nlp2.parse(overProcessedText)
            tree = Tree.fromstring(parser.__str__())
            feats["tweetComplexity"] = tree.height()
        except Exception:
            # Best effort: a failed parse is recorded as complexity 0.
            feats["tweetComplexity"] = 0

        # One feature per POS tag / entity type, holding its number of occurrences.
        tags = [x.tag_ for x in spacyText]
        for tag in dict.fromkeys(tags):
            feats[tag] = tags.count(tag)

        ners = [x.ent_type_ for x in spacyText]
        for ner in dict.fromkeys(ners):
            if ner != '':
                feats[ner] = ners.count(ner)

        ############################################################### Rumour Language Features ################################################################
        # Either phrase marks the tweet; previously the "i hear" test overwrote the "i see" result.
        feats["witnessPhrases"] = ("i see" in overProcessedText) or ("i hear" in overProcessedText)

        uppercaseCharCount = sum(1 for ch in semiRawText if ch.isupper())
        lowercaseCharCount = sum(1 for ch in semiRawText if ch.islower())
        feats["upperCaseCount"] = uppercaseCharCount
        feats["lowerCaseCount"] = lowercaseCharCount
        casedCharCount = uppercaseCharCount + lowercaseCharCount
        feats["upperCaseCharFrac"] = uppercaseCharCount / casedCharCount if casedCharCount else 0

        capitalWordsCount = sum(1 for token in word_tokenize(processedText) if token.isupper())
        feats["capitalWordsCount"] = capitalWordsCount
        feats["capitalWordFrac"] = capitalWordsCount / len(spacyText) if len(spacyText) else 0
        feats["exclamationMarkCount"] = overProcessedText.count("!")
        feats["questionMarkCount"] = overProcessedText.count("?")

        feats["firstPersonPronounCount"] = firstPersonPronounCount(overProcessedText)
        feats["secondPersonPronounCount"] = secondPersonPronounCount(overProcessedText)
        feats["thirdPersonPronounCount"] = thirdPersonPronounCount(overProcessedText)

        feats["vuglarTermsCount"] = len([a for a in wordTokens if a in vuglarList])
        feats["emoticonCount"] = len([a for a in wordTokens if a in emotiList])
        feats["abbreviationCount"] = len([a for a in wordTokens if a in abbrList])
        feats["emojiCount"] = len([x for x in overProcessedText if x in emojiTable])

        ############################################################### Rumour Psycholinguistic Features ################################################################
        blobSentiment = TextBlob(semiRawText).sentiment
        feats["subjectivityScore"] = blobSentiment.subjectivity
        feats["polarityScore"] = blobSentiment.polarity

        # We have "angry" feature from LIWC, that is why this feature is called angerEmotion instead of anger
        (feats["angerEmotion"], feats["anticipation"], feats["disgust"], feats["fear"],
         feats["joy"], feats["sadness"], feats["surprise"], feats["trust"]) = nrcEmotions(overProcessedText)

        (feats["positiveEmotion"], feats["negativeEmotion"], feats["neutralEmotion"],
         feats["compoundEmotion"], feats["arousalScore"], feats["dominanceScore"]) = emotions(semiRawText, overProcessedText)

        feats["hateSpeech"], feats["offensiveLanguage"], feats["neitherClasses"] = hateSpeech(semiRawText)

        for metric in readabilityMetrics:
            feats[metric] = getattr(textstat, metric)(overProcessedText)

        # A text without tokens gets 0 (averaging an empty list would yield NaN);
        # a failing syllable estimate is also recorded as 0.
        try:
            feats["averageWordsyllables"] = np.average([syllables.estimate(token) for token in wordTokens]) if wordTokens else 0
        except Exception:
            feats["averageWordsyllables"] = 0

        ################################################################ Rumourmonger Features ################################################################

        ### popularity/seclusion features ###
        feats["followingCount"] = user["friends_count"]
        feats["influnece"] = user["followers_count"]
        feats["userRole"] = (user["followers_count"] + 1) / (user["friends_count"] + 1)

        ### activity features ###
        # NOTE(review): the account age is measured against the run date, so it changes between runs.
        today = datetime.datetime.now()
        accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
        accountAge = (today.date() - accountCreationTime.date()).days
        feats["accountAge"] = accountAge
        feats["totalProfileLikesCount"] = user["favourites_count"]
        feats["statusCount"] = user["statuses_count"]
        # Accounts younger than one day are treated as one day old to avoid dividing by zero.
        ageDivisor = accountAge if accountAge > 0 else 1
        feats["averageFollowSpeed"] = user["followers_count"] / ageDivisor
        feats["averageBeingFollowedSpeed"] = user["friends_count"] / ageDivisor
        feats["averageLikeSpeed"] = user["favourites_count"] / ageDivisor
        feats["averageStatusSpeed"] = user["statuses_count"] / ageDivisor

        ### profile reputation ###
        feats["isVerifiedAccount"] = user["verified"]

        # NOTE(review): the USA lists are used here although credibleAccountsIndia /
        # credibleWebsitesIndia (India + USA) are built above -- confirm which is intended.
        userDescription = user["description"].lower() if user["description"] != None else None
        userUrl = user["url"].lower() if user["url"] != None else None
        feats["userDescriptionReputation"] = _containsAny(userDescription, credibleAccountsUsa)
        feats["userDescriptionNotoriety"] = _containsAny(userDescription, notoriousId)
        feats["userUrlReputation"] = _containsAny(userUrl, credibleWebsitesUsa)
        feats["userUrlNotoriety"] = _containsAny(userUrl, notoriousWebsites)

        ### Reticency features ###
        feats["hasProfileLocation"] = user["location"] != None
        feats["hasProfilePicture"] = user["profile_image_url"] != None
        # The flag's own value is stored; "!= None" was True for both True and False.
        feats["geoEnabled"] = bool(user["geo_enabled"])
        feats["hasProfileUrl"] = user["url"] != None
        feats["hasProfileDescription"] = user["description"] != None

        ### fake identity ###
        feats["screenNameLength"] = len(user["screen_name"])
        feats["screenNameDigitCount"] = sum(1 for ch in user["screen_name"] if ch in "0123456789")
        # Same fix as geoEnabled: store the flag's value, not its mere presence.
        feats["protectedProfile"] = bool(user["protected"])
        feats["personNameInProfile"] = any(x.ent_type_ == "PERSON" for x in spacyName)
        feats["organizationInProfile"] = any(x.ent_type_ == "ORG" for x in spacyName)
        feats["locationInProfile"] = any(x.ent_type_ == "LOC" or x.ent_type_ == "GPE" for x in spacyName)

        ### evidence availability ###
        urlCount = len(entities["urls"]) if "urls" in entities else 0
        mediaCount = len(entities["media"]) if "media" in entities else 0

        feats["urlAvailability"] = urlCount > 0
        feats["mediaAvailability"] = mediaCount > 0
        feats["quoteAvailability"] = qtFlag

        ### evidence diversity ###
        feats["monoSource"] = singleTruth(urlCount > 0, mediaCount > 0, qtFlag)
        feats["doubleSource"] = doubleTruth(urlCount > 0, mediaCount > 0, qtFlag)
        feats["trippleSource"] = (urlCount > 0) and (mediaCount > 0) and qtFlag

        ### evidence quality ## First hand / second hand
        feats["firstHandSourceCount"] = urlCount + mediaCount
        feats["secondHandSourceCount"] = 1 if qtFlag else 0

        ### evidence credibility
        # Only the first URL of the tweet is inspected.
        firstUrl = entities["urls"][0]["expanded_url"] if urlCount > 0 else None
        feats["urlNotoriety"] = _containsAny(firstUrl, notoriousWebsites)
        feats["urlReputation"] = _containsAny(firstUrl, credibleWebsitesUsa)

        if qtFlag:
            qtFirstUrl = qtEntities["urls"][0]["expanded_url"] if len(qtEntities["urls"]) > 0 else None
            # Lower-cased like the author's own profile fields above.
            qtProfileUrl = qtUser["url"].lower() if qtUser["url"] != None else None
            qtDescription = qtUser["description"].lower() if qtUser["description"] != None else None
        else:
            qtFirstUrl, qtProfileUrl, qtDescription = None, None, None

        feats["quoteReputation"] = (
            _containsAny(qtFirstUrl, credibleWebsitesUsa)
            or _containsAny(qtProfileUrl, credibleWebsitesUsa)
            or _containsAny(qtDescription, credibleAccountsUsa)
        )
        feats["quoteNotoriety"] = (
            _containsAny(qtFirstUrl, notoriousWebsites)
            or _containsAny(qtProfileUrl, notoriousWebsites)
            or _containsAny(qtDescription, notoriousId)
        )

        ################################################################ Reach Features ################################################################
        feats["likeCount"] = tweet["favorite_count"]
        feats["retweetCount"] = tweet["retweet_count"]

        ################################################################ Miscellaneous Features ################################################################
        feats["hashtagCount"] = len(entities["hashtags"])
        feats["mentionCount"] = len(entities["user_mentions"])
        # NOTE(review): "freshness" stores the retweet flag as-is -- confirm the intended polarity.
        feats["freshness"] = rtFlag

    # One feature pickle per input file; the running tweet counter keeps the names distinct.
    with open(f'{featureSerializationAdr}keralaRumoursFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(keralaRumoursFeatures, featureFile)
    keralaRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# The per-file feature pickles are read in sorted order so the row order of the
# combined frame is the same on every run (os.listdir order is arbitrary).
keralaRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "keralaRumoursFeatures_" in i)
keralaRumoursFeaturesList = []
for featureFileName in keralaRumoursFeaturesFolder:
    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    with open(featureSerializationAdr + featureFileName, "rb") as featureFile:
        # Each pickle maps tweet id -> feature dict; transposing gives one row per tweet.
        keralaRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(featureFile)).T)
keralaRumoursFeatures = pd.concat(keralaRumoursFeaturesList)
keralaRumoursFeatures["id"] = keralaRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/keralaRumoursFeaturesWithoutLIWC.pk', "wb") as combinedFile:
    pk.dump(keralaRumoursFeatures, combinedFile)

In [None]:
# Adding LIWC features
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(featureSerializationAdr + "keralaRumoursLIWC.pk", "rb") as liwcFile:
    keralaRumoursLIWC = pk.load(liwcFile)

if "id" in keralaRumoursLIWC.columns:
    # The merge key stays int64: float64 cannot represent integers above 2**53
    # exactly, so converting tweet ids to float would corrupt the key.
    # Rows without an id could never match in the merge and are dropped up front.
    liwcColumnOrder = list(keralaRumoursLIWC.columns)
    keralaRumoursLIWC = keralaRumoursLIWC.dropna(subset=["id"])
    liwcIds = keralaRumoursLIWC["id"].astype("int64")
    # Decimal commas are turned into points before the numeric conversion.
    keralaRumoursLIWC = (
        keralaRumoursLIWC.drop(columns=["id"])
        .fillna(0)
        .replace(',', '.', regex=True)
        .astype("float64")
        .assign(id=liwcIds)[liwcColumnOrder]
    )
else:
    # "id" is not a regular column (presumably an index level): convert everything as before.
    keralaRumoursLIWC = keralaRumoursLIWC.fillna(0).replace(',', '.', regex=True).astype("float64")

keralaRumoursFeaturesWithLIWC = pd.merge(keralaRumoursFeatures, keralaRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/keralaRumoursFeaturesWithLIWC.pk', "wb") as mergedFile:
    pk.dump(keralaRumoursFeaturesWithLIWC, mergedFile)

### Non-Rumour

In [None]:
# Build one feature dictionary per unique Kerala non-rumour tweet.
# Every serialized tweet file found in tweetSerializationAdr is processed in turn; the
# features collected for that file are pickled to featureSerializationAdr (the file name
# carries the running tweet counter) and the in-memory dictionary is reset afterwards.
keralaNonRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "keralaNonRumour" in i]
counter = 0
keralaNonRumourId = {}         # tweet id -> how many times that id has been encountered
keralaNonRumoursFeatures = {}  # tweet id -> feature dictionary (for the current input file)

# Matches everything between the first '>' and the last '<' of tweet["source"];
# the two delimiters themselves are stripped with [1:-1] below.
pattern = re.compile('[>].*[<]')

for file in keralaNonRumourFolder:
    # NOTE: pickle can execute arbitrary code while loading; only load files produced by this project.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        keralaNonRumours = pk.load(tweetFile)

    for tweet in tqdm(keralaNonRumours):
        ################################ Basic Setup ################################
        counter += 1
        qtText, qtEntities, qtUser, rtUser, tweetText = None, None, None, None, None

        tweetId = tweet["id"]

        # There are some duplicated tweets (I don't know why!)
        # The check runs before any text processing so duplicates cost nothing.
        if tweetId in keralaNonRumourId:
            keralaNonRumourId[tweetId] += 1
            continue
        keralaNonRumourId[tweetId] = 1

        # tweetElements layout: [0] retweet flag, [1] quote flag, [2:5] text/entities/user,
        # then the quoted tweet's text/entities/user when the quote flag is set, and finally
        # the retweeted user when the retweet flag is set (index 5 without a quote, 8 with one).
        elements = tweetElements(tweet)
        rtFlag = copy.deepcopy(elements[0])
        qtFlag = copy.deepcopy(elements[1])
        text, entities, user = [copy.deepcopy(elements[i]) for i in (2, 3, 4)]
        if qtFlag == True:
            qtText, qtEntities, qtUser = [copy.deepcopy(elements[i]) for i in (5, 6, 7)]
        if rtFlag == True:
            rtUser = copy.deepcopy(elements[8 if qtFlag == True else 5])

        rawText, semiRawText, processedText, overProcessedText = textProcessor(text)

        name = user["name"].lower().strip()

        spacyText = nlp(overProcessedText)
        spacyName = nlp(name)

        # Tokenized once and reused by every word-level feature below.
        words = word_tokenize(overProcessedText)

        # All features of this tweet are written through this alias (same dict object).
        tweetFeatures = keralaNonRumoursFeatures[tweetId] = {}

        ################################ NonRumour General Info ################################
        tweetFeatures["id"] = tweetId
        tweetFeatures["screenName"] = user["screen_name"]
        tweetFeatures["text"] = text
        tweetFeatures["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweetId)

        # Fall back to the unmodified source field when the pattern does not match.
        try:
            tweetFeatures["source"] = pattern.findall(tweet["source"])[0][1:-1]
        except Exception:
            tweetFeatures["source"] = tweet["source"]

        tweetFeatures["tweetPostTime"] = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweetFeatures["place"] = tweet["place"]
        tweetFeatures["profileLocation"] = user["location"]

        ################################ Syntactical Features ################################
        tweetFeatures["characterCount"] = len(rawText)
        tweetFeatures["tokenCount"] = len(spacyText)
        sentences = sentenceTokenizer.tokenize(overProcessedText)
        tweetFeatures["sentenceCount"] = len(sentences)
        # NOTE(review): np.average of an empty list appears to yield nan (with a RuntimeWarning)
        # rather than raise, so empty texts produce nan here — confirm this is acceptable downstream.
        tweetFeatures["averageWordComplexity"] = np.average([len(i) for i in words])
        tweetFeatures["averageSentenceComplexity"] = np.average([len(word_tokenize(i)) for i in sentences])

        ### Twitter special tree for future
        # Height of the constituency parse tree; 0 when parsing fails for any reason.
        try:
            parser = nlp2.parse(overProcessedText)
            tree = Tree.fromstring(parser.__str__())
            tweetFeatures["tweetComplexity"] = tree.height()
        except Exception:
            tweetFeatures["tweetComplexity"] = 0

        # One column per POS tag / named-entity type, holding its frequency in the tweet.
        tags = [x.tag_ for x in spacyText]
        for tag in tags:
            tweetFeatures[tag] = tags.count(tag)

        ners = [x.ent_type_ for x in spacyText]
        for ner in ners:
            if ner != '':
                tweetFeatures[ner] = ners.count(ner)

        ################################ NonRumour Language Features ################################
        # Fixed: the "i see" result used to be overwritten by the "i hear" check; either phrase
        # now sets the flag. Apply the same fix to the sibling extraction cells so the feature
        # stays comparable across datasets.
        tweetFeatures["witnessPhrases"] = ("i see" in overProcessedText) or ("i hear" in overProcessedText)

        uppercaseCharCount = sum(1 for i in semiRawText if i.isupper())
        lowercaseCharCount = sum(1 for i in semiRawText if i.islower())
        tweetFeatures["upperCaseCount"] = uppercaseCharCount
        tweetFeatures["lowerCaseCount"] = lowercaseCharCount
        try:  # text without any cased character
            tweetFeatures["upperCaseCharFrac"] = uppercaseCharCount / (uppercaseCharCount + lowercaseCharCount)
        except ZeroDivisionError:
            tweetFeatures["upperCaseCharFrac"] = 0

        capitalWordsCount = len([b for b in word_tokenize(processedText) if b.isupper()])
        tweetFeatures["capitalWordsCount"] = capitalWordsCount
        try:  # text without any token
            tweetFeatures["capitalWordFrac"] = capitalWordsCount / len(spacyText)
        except ZeroDivisionError:
            tweetFeatures["capitalWordFrac"] = 0
        tweetFeatures["exclamationMarkCount"] = overProcessedText.count("!")
        tweetFeatures["questionMarkCount"] = overProcessedText.count("?")

        tweetFeatures["firstPersonPronounCount"] = firstPersonPronounCount(overProcessedText)
        tweetFeatures["secondPersonPronounCount"] = secondPersonPronounCount(overProcessedText)
        tweetFeatures["thirdPersonPronounCount"] = thirdPersonPronounCount(overProcessedText)

        tweetFeatures["vuglarTermsCount"] = len([a for a in words if a in vuglarList])
        tweetFeatures["emoticonCount"] = len([a for a in words if a in emotiList])
        tweetFeatures["abbreviationCount"] = len([a for a in words if a in abbrList])
        # NOTE(review): the layout of emoji.UNICODE_EMOJI depends on the installed emoji
        # version — confirm that single characters are valid keys in the version in use.
        tweetFeatures["emojiCount"] = len([x for x in overProcessedText if x in emoji.UNICODE_EMOJI])

        ################################ NonRumour Psycholinguistic Features ################################
        sentiment = TextBlob(semiRawText).sentiment
        tweetFeatures["subjectivityScore"] = sentiment.subjectivity
        tweetFeatures["polarityScore"] = sentiment.polarity

        (tweetFeatures["angerEmotion"], tweetFeatures["anticipation"], tweetFeatures["disgust"],
         tweetFeatures["fear"], tweetFeatures["joy"], tweetFeatures["sadness"],
         tweetFeatures["surprise"], tweetFeatures["trust"]) = nrcEmotions(overProcessedText)

        (tweetFeatures["positiveEmotion"], tweetFeatures["negativeEmotion"], tweetFeatures["neutralEmotion"],
         tweetFeatures["compoundEmotion"], tweetFeatures["arousalScore"],
         tweetFeatures["dominanceScore"]) = emotions(semiRawText, overProcessedText)

        (tweetFeatures["hateSpeech"], tweetFeatures["offensiveLanguage"],
         tweetFeatures["neitherClasses"]) = hateSpeech(semiRawText)

        tweetFeatures["flesch_reading_ease"] = textstat.flesch_reading_ease(overProcessedText)
        tweetFeatures["smog_index"] = textstat.smog_index(overProcessedText)
        tweetFeatures["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(overProcessedText)
        tweetFeatures["coleman_liau_index"] = textstat.coleman_liau_index(overProcessedText)
        tweetFeatures["automated_readability_index"] = textstat.automated_readability_index(overProcessedText)
        tweetFeatures["dale_chall_readability_score"] = textstat.dale_chall_readability_score(overProcessedText)
        tweetFeatures["difficult_words"] = textstat.difficult_words(overProcessedText)
        tweetFeatures["linsear_write_formula"] = textstat.linsear_write_formula(overProcessedText)
        tweetFeatures["gunning_fog"] = textstat.gunning_fog(overProcessedText)

        # One pitfall of the syllables library is that an empty string counts as one syllable.
        # Any failure while estimating falls back to 0.
        try:
            tweetFeatures["averageWordsyllables"] = np.average([syllables.estimate(i) for i in words])
        except Exception:
            tweetFeatures["averageWordsyllables"] = 0

        ################################ NonRumourmonger Features ################################

        ### popularity/seclusion features ###
        tweetFeatures["followingCount"] = user["friends_count"]
        tweetFeatures["influnece"] = user["followers_count"]
        # +1 on both sides keeps the ratio defined for accounts that follow nobody.
        tweetFeatures["userRole"] = (user["followers_count"]+1)/(user["friends_count"]+1)

        ### activity features ###
        # NOTE(review): account age is measured against the moment this cell runs, not the
        # tweet's posting time, so the value changes between runs — confirm this is intended.
        today = datetime.datetime.now()
        accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
        accountAge = (today.date() - accountCreationTime.date()).days
        tweetFeatures["accountAge"] = accountAge
        tweetFeatures["totalProfileLikesCount"] = user["favourites_count"]
        tweetFeatures["statusCount"] = user["statuses_count"]
        tweetFeatures["averageFollowSpeed"] = user["followers_count"] / accountAge
        tweetFeatures["averageBeingFollowedSpeed"] = user["friends_count"] / accountAge
        tweetFeatures["averageLikeSpeed"] = user["favourites_count"] / accountAge
        tweetFeatures["averageStatusSpeed"] = user["statuses_count"] / accountAge

        ### profile reputation ###
        tweetFeatures["isVerifiedAccount"] = user["verified"]

        # NOTE(review): the USA credibility lists are used for the Kerala data set — confirm
        # that this, rather than an India-specific list, is intended.
        userDescription = user["description"].lower() if user["description"] is not None else None
        tweetFeatures["userDescriptionReputation"] = userDescription is not None and any(i in userDescription for i in credibleAccountsUsa)
        tweetFeatures["userDescriptionNotoriety"] = userDescription is not None and any(i in userDescription for i in notoriousId)
        userUrl = user["url"].lower() if user["url"] is not None else None
        tweetFeatures["userUrlReputation"] = userUrl is not None and any(i in userUrl for i in credibleWebsitesUsa)
        tweetFeatures["userUrlNotoriety"] = userUrl is not None and any(i in userUrl for i in notoriousWebsites)

        ### Reticency features ###
        # NOTE(review): these flags (and protectedProfile below) only test for None; if the
        # underlying field is a boolean or an empty string the flag is still True — confirm.
        tweetFeatures["hasProfileLocation"] = user["location"] is not None
        tweetFeatures["hasProfilePicture"] = user["profile_image_url"] is not None
        tweetFeatures["geoEnabled"] = user["geo_enabled"] is not None
        tweetFeatures["hasProfileUrl"] = user["url"] is not None
        tweetFeatures["hasProfileDescription"] = user["description"] is not None

        ### fake identity ###
        tweetFeatures["screenNameLength"] = len(user["screen_name"])
        tweetFeatures["screenNameDigitCount"] = sum(1 for ch in user["screen_name"] if ch in "0123456789")
        tweetFeatures["protectedProfile"] = user["protected"] is not None
        tweetFeatures["personNameInProfile"] = any(x.ent_type_ == "PERSON" for x in spacyName)
        tweetFeatures["organizationInProfile"] = any(x.ent_type_ == "ORG" for x in spacyName)
        tweetFeatures["locationInProfile"] = any(x.ent_type_ in ("LOC", "GPE") for x in spacyName)

        ### evidence availability ###
        urlCount = len(entities["urls"]) if "urls" in entities.keys() else 0
        mediaCount = len(entities["media"]) if "media" in entities.keys() else 0
        hasUrl = urlCount > 0
        hasMedia = mediaCount > 0

        tweetFeatures["urlAvailability"] = hasUrl
        tweetFeatures["mediaAvailability"] = hasMedia
        tweetFeatures["quoteAvailability"] = qtFlag

        ### evidence diversity ###
        tweetFeatures["monoSource"] = singleTruth(hasUrl, hasMedia, qtFlag)
        tweetFeatures["doubleSource"] = doubleTruth(hasUrl, hasMedia, qtFlag)
        tweetFeatures["trippleSource"] = hasUrl and hasMedia and qtFlag

        ### evidence quality ## First hand / second hand
        tweetFeatures["firstHandSourceCount"] = urlCount + mediaCount
        tweetFeatures["secondHandSourceCount"] = 1 if qtFlag == True else 0

        ### evidence credibility
        # Only the first URL attached to the tweet is inspected.
        firstUrl = entities["urls"][0]["expanded_url"] if hasUrl else None
        tweetFeatures["urlNotoriety"] = hasUrl and any(i in firstUrl for i in notoriousWebsites)
        tweetFeatures["urlReputation"] = hasUrl and any(i in firstUrl for i in credibleWebsitesUsa)

        # Credibility of the quoted tweet: its first URL, and its author's profile URL and description.
        quoteUrlReputation = quoteProfileUrlReputation = quoteProfileDescReputation = False
        quoteUrlNotoriety = quoteProfileUrlNotoriety = quoteProfileDescNotoriety = False
        if qtFlag:
            if len(qtEntities["urls"]) > 0:
                quotedUrl = qtEntities["urls"][0]["expanded_url"]
                quoteUrlReputation = any(i in quotedUrl for i in credibleWebsitesUsa)
                quoteUrlNotoriety = any(i in quotedUrl for i in notoriousWebsites)
            if qtUser["url"] is not None:
                quoteProfileUrlReputation = any(i in qtUser["url"] for i in credibleWebsitesUsa)
                quoteProfileUrlNotoriety = any(i in qtUser["url"] for i in notoriousWebsites)
            if qtUser["description"] is not None:
                quoteProfileDescReputation = any(i in qtUser["description"] for i in credibleAccountsUsa)
                quoteProfileDescNotoriety = any(i in qtUser["description"] for i in notoriousId)

        tweetFeatures["quoteReputation"] = quoteUrlReputation or quoteProfileUrlReputation or quoteProfileDescReputation
        tweetFeatures["quoteNotoriety"] = quoteUrlNotoriety or quoteProfileUrlNotoriety or quoteProfileDescNotoriety

        ################################ Reach Features ################################
        tweetFeatures["likeCount"] = tweet["favorite_count"]
        tweetFeatures["retweetCount"] = tweet["retweet_count"]

        ################################ Miscellaneous Features ################################
        tweetFeatures["hashtagCount"] = len(entities["hashtags"])
        tweetFeatures["mentionCount"] = len(entities["user_mentions"])
        tweetFeatures["freshness"] = rtFlag

    # Persist the features of this input file, then start a fresh dictionary for the next one.
    with open(f'{featureSerializationAdr}keralaNonRumoursFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(keralaNonRumoursFeatures, featureFile)
    keralaNonRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# Each per-file pickle holds {tweetId: featureDict}; transposing yields one row per tweet.
# The listing is sorted because os.listdir returns names in arbitrary order, which would
# otherwise make the row order differ between runs.
keralaNonRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "keralaNonRumoursFeatures_" in i)
keralaNonRumoursFeaturesList = []
for i in keralaNonRumoursFeaturesFolder:
    # NOTE: pickle can execute arbitrary code while loading; only load files produced by this project.
    with open(featureSerializationAdr+i, "rb") as featureFile:
        keralaNonRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(featureFile)).T)
keralaNonRumoursFeatures = pd.concat(keralaNonRumoursFeaturesList)
# The transposed frame has object columns; the id must be an integer for the later merge on "id".
keralaNonRumoursFeatures["id"] = keralaNonRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/keralaNonRumoursFeaturesWithoutLIWC.pk', "wb") as featureFile:
    pk.dump(keralaNonRumoursFeatures, featureFile)

In [None]:
# Adding LIWC features
# NOTE: pickle can execute arbitrary code while loading; only load files produced by this project.
with open(featureSerializationAdr+"keralaNonRumoursLIWC.pk", "rb") as liwcFile:
    keralaNonRumoursLIWC = pk.load(liwcFile)
# Missing values become 0 and commas are replaced by dots before every column is cast to float.
# NOTE(review): the cast also covers the merge key "id"; float64 cannot represent every
# 64-bit integer exactly, so very large tweet ids may be rounded — confirm the ids survive.
keralaNonRumoursLIWC = keralaNonRumoursLIWC.fillna(0).replace(',','.', regex=True).astype("float64")
# pd.merge defaults to an inner join: tweets lacking a LIWC row (or vice versa) are dropped.
keralaNonRumoursFeaturesWithLIWC = pd.merge(keralaNonRumoursFeatures, keralaNonRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/keralaNonRumoursFeaturesWithLIWC.pk', "wb") as featureFile:
    pk.dump(keralaNonRumoursFeaturesWithLIWC, featureFile)

# Florence

### Rumour

In [None]:
# Build one feature dictionary per unique Florence rumour tweet.
# Every serialized tweet file found in tweetSerializationAdr is processed in turn; the
# features collected for that file are pickled to featureSerializationAdr (the file name
# carries the running tweet counter) and the in-memory dictionary is reset afterwards.
florenceRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "florenceRumour" in i]
counter = 0
florenceRumourId = {}         # tweet id -> how many times that id has been encountered
florenceRumoursFeatures = {}  # tweet id -> feature dictionary (for the current input file)

# Matches everything between the first '>' and the last '<' of tweet["source"];
# the two delimiters themselves are stripped with [1:-1] below.
pattern = re.compile('[>].*[<]')

for file in florenceRumourFolder:
    # NOTE: pickle can execute arbitrary code while loading; only load files produced by this project.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        florenceRumours = pk.load(tweetFile)

    for tweet in tqdm(florenceRumours):
        ################################ Basic Setup ################################
        counter += 1
        qtText, qtEntities, qtUser, rtUser, tweetText = None, None, None, None, None

        tweetId = tweet["id"]

        # There are some duplicated tweets (I don't know why!)
        # The check runs before any text processing so duplicates cost nothing.
        if tweetId in florenceRumourId:
            florenceRumourId[tweetId] += 1
            continue
        florenceRumourId[tweetId] = 1

        # tweetElements layout: [0] retweet flag, [1] quote flag, [2:5] text/entities/user,
        # then the quoted tweet's text/entities/user when the quote flag is set, and finally
        # the retweeted user when the retweet flag is set (index 5 without a quote, 8 with one).
        elements = tweetElements(tweet)
        rtFlag = copy.deepcopy(elements[0])
        qtFlag = copy.deepcopy(elements[1])
        text, entities, user = [copy.deepcopy(elements[i]) for i in (2, 3, 4)]
        if qtFlag == True:
            qtText, qtEntities, qtUser = [copy.deepcopy(elements[i]) for i in (5, 6, 7)]
        if rtFlag == True:
            rtUser = copy.deepcopy(elements[8 if qtFlag == True else 5])

        rawText, semiRawText, processedText, overProcessedText = textProcessor(text)

        name = user["name"].lower().strip()

        spacyText = nlp(overProcessedText)
        spacyName = nlp(name)

        # Tokenized once and reused by every word-level feature below.
        words = word_tokenize(overProcessedText)

        # All features of this tweet are written through this alias (same dict object).
        tweetFeatures = florenceRumoursFeatures[tweetId] = {}

        ################################ Rumour General Info ################################
        tweetFeatures["id"] = tweetId
        tweetFeatures["screenName"] = user["screen_name"]
        tweetFeatures["text"] = text
        tweetFeatures["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweetId)

        # Fall back to the unmodified source field when the pattern does not match.
        try:
            tweetFeatures["source"] = pattern.findall(tweet["source"])[0][1:-1]
        except Exception:
            tweetFeatures["source"] = tweet["source"]

        tweetFeatures["tweetPostTime"] = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        tweetFeatures["place"] = tweet["place"]
        tweetFeatures["profileLocation"] = user["location"]

        ################################ Syntactical Features ################################
        tweetFeatures["characterCount"] = len(rawText)
        tweetFeatures["tokenCount"] = len(spacyText)
        sentences = sentenceTokenizer.tokenize(overProcessedText)
        tweetFeatures["sentenceCount"] = len(sentences)
        # NOTE(review): np.average of an empty list appears to yield nan (with a RuntimeWarning)
        # rather than raise, so empty texts produce nan here — confirm this is acceptable downstream.
        tweetFeatures["averageWordComplexity"] = np.average([len(i) for i in words])
        tweetFeatures["averageSentenceComplexity"] = np.average([len(word_tokenize(i)) for i in sentences])

        ### Twitter special tree for future
        # Height of the constituency parse tree; 0 when parsing fails for any reason.
        try:
            parser = nlp2.parse(overProcessedText)
            tree = Tree.fromstring(parser.__str__())
            tweetFeatures["tweetComplexity"] = tree.height()
        except Exception:
            tweetFeatures["tweetComplexity"] = 0

        # One column per POS tag / named-entity type, holding its frequency in the tweet.
        tags = [x.tag_ for x in spacyText]
        for tag in tags:
            tweetFeatures[tag] = tags.count(tag)

        ners = [x.ent_type_ for x in spacyText]
        for ner in ners:
            if ner != '':
                tweetFeatures[ner] = ners.count(ner)

        ################################ Rumour Language Features ################################
        # Fixed: the "i see" result used to be overwritten by the "i hear" check; either phrase
        # now sets the flag. Apply the same fix to the sibling extraction cells so the feature
        # stays comparable across datasets.
        tweetFeatures["witnessPhrases"] = ("i see" in overProcessedText) or ("i hear" in overProcessedText)

        uppercaseCharCount = sum(1 for i in semiRawText if i.isupper())
        lowercaseCharCount = sum(1 for i in semiRawText if i.islower())
        tweetFeatures["upperCaseCount"] = uppercaseCharCount
        tweetFeatures["lowerCaseCount"] = lowercaseCharCount
        try:  # text without any cased character
            tweetFeatures["upperCaseCharFrac"] = uppercaseCharCount / (uppercaseCharCount + lowercaseCharCount)
        except ZeroDivisionError:
            tweetFeatures["upperCaseCharFrac"] = 0

        capitalWordsCount = len([b for b in word_tokenize(processedText) if b.isupper()])
        tweetFeatures["capitalWordsCount"] = capitalWordsCount
        try:  # text without any token
            tweetFeatures["capitalWordFrac"] = capitalWordsCount / len(spacyText)
        except ZeroDivisionError:
            tweetFeatures["capitalWordFrac"] = 0
        tweetFeatures["exclamationMarkCount"] = overProcessedText.count("!")
        tweetFeatures["questionMarkCount"] = overProcessedText.count("?")

        tweetFeatures["firstPersonPronounCount"] = firstPersonPronounCount(overProcessedText)
        tweetFeatures["secondPersonPronounCount"] = secondPersonPronounCount(overProcessedText)
        tweetFeatures["thirdPersonPronounCount"] = thirdPersonPronounCount(overProcessedText)

        tweetFeatures["vuglarTermsCount"] = len([a for a in words if a in vuglarList])
        tweetFeatures["emoticonCount"] = len([a for a in words if a in emotiList])
        tweetFeatures["abbreviationCount"] = len([a for a in words if a in abbrList])
        # NOTE(review): the layout of emoji.UNICODE_EMOJI depends on the installed emoji
        # version — confirm that single characters are valid keys in the version in use.
        tweetFeatures["emojiCount"] = len([x for x in overProcessedText if x in emoji.UNICODE_EMOJI])

        ################################ Rumour Psycholinguistic Features ################################
        sentiment = TextBlob(semiRawText).sentiment
        tweetFeatures["subjectivityScore"] = sentiment.subjectivity
        tweetFeatures["polarityScore"] = sentiment.polarity

        (tweetFeatures["angerEmotion"], tweetFeatures["anticipation"], tweetFeatures["disgust"],
         tweetFeatures["fear"], tweetFeatures["joy"], tweetFeatures["sadness"],
         tweetFeatures["surprise"], tweetFeatures["trust"]) = nrcEmotions(overProcessedText)

        (tweetFeatures["positiveEmotion"], tweetFeatures["negativeEmotion"], tweetFeatures["neutralEmotion"],
         tweetFeatures["compoundEmotion"], tweetFeatures["arousalScore"],
         tweetFeatures["dominanceScore"]) = emotions(semiRawText, overProcessedText)

        (tweetFeatures["hateSpeech"], tweetFeatures["offensiveLanguage"],
         tweetFeatures["neitherClasses"]) = hateSpeech(semiRawText)

        tweetFeatures["flesch_reading_ease"] = textstat.flesch_reading_ease(overProcessedText)
        tweetFeatures["smog_index"] = textstat.smog_index(overProcessedText)
        tweetFeatures["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(overProcessedText)
        tweetFeatures["coleman_liau_index"] = textstat.coleman_liau_index(overProcessedText)
        tweetFeatures["automated_readability_index"] = textstat.automated_readability_index(overProcessedText)
        tweetFeatures["dale_chall_readability_score"] = textstat.dale_chall_readability_score(overProcessedText)
        tweetFeatures["difficult_words"] = textstat.difficult_words(overProcessedText)
        tweetFeatures["linsear_write_formula"] = textstat.linsear_write_formula(overProcessedText)
        tweetFeatures["gunning_fog"] = textstat.gunning_fog(overProcessedText)

        # One pitfall of the syllables library is that an empty string counts as one syllable.
        # Any failure while estimating falls back to 0.
        try:
            tweetFeatures["averageWordsyllables"] = np.average([syllables.estimate(i) for i in words])
        except Exception:
            tweetFeatures["averageWordsyllables"] = 0

        ################################ Rumourmonger Features ################################

        ### popularity/seclusion features ###
        tweetFeatures["followingCount"] = user["friends_count"]
        tweetFeatures["influnece"] = user["followers_count"]
        # +1 on both sides keeps the ratio defined for accounts that follow nobody.
        tweetFeatures["userRole"] = (user["followers_count"]+1)/(user["friends_count"]+1)

        ### activity features ###
        # NOTE(review): account age is measured against the moment this cell runs, not the
        # tweet's posting time, so the value changes between runs — confirm this is intended.
        today = datetime.datetime.now()
        accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
        accountAge = (today.date() - accountCreationTime.date()).days
        tweetFeatures["accountAge"] = accountAge
        tweetFeatures["totalProfileLikesCount"] = user["favourites_count"]
        tweetFeatures["statusCount"] = user["statuses_count"]
        tweetFeatures["averageFollowSpeed"] = user["followers_count"] / accountAge
        tweetFeatures["averageBeingFollowedSpeed"] = user["friends_count"] / accountAge
        tweetFeatures["averageLikeSpeed"] = user["favourites_count"] / accountAge
        tweetFeatures["averageStatusSpeed"] = user["statuses_count"] / accountAge

        ### profile reputation ###
        tweetFeatures["isVerifiedAccount"] = user["verified"]

        userDescription = user["description"].lower() if user["description"] is not None else None
        tweetFeatures["userDescriptionReputation"] = userDescription is not None and any(i in userDescription for i in credibleAccountsUsa)
        tweetFeatures["userDescriptionNotoriety"] = userDescription is not None and any(i in userDescription for i in notoriousId)
        userUrl = user["url"].lower() if user["url"] is not None else None
        tweetFeatures["userUrlReputation"] = userUrl is not None and any(i in userUrl for i in credibleWebsitesUsa)
        tweetFeatures["userUrlNotoriety"] = userUrl is not None and any(i in userUrl for i in notoriousWebsites)

        ### Reticency features ###
        # NOTE(review): these flags (and protectedProfile below) only test for None; if the
        # underlying field is a boolean or an empty string the flag is still True — confirm.
        tweetFeatures["hasProfileLocation"] = user["location"] is not None
        tweetFeatures["hasProfilePicture"] = user["profile_image_url"] is not None
        tweetFeatures["geoEnabled"] = user["geo_enabled"] is not None
        tweetFeatures["hasProfileUrl"] = user["url"] is not None
        tweetFeatures["hasProfileDescription"] = user["description"] is not None

        ### fake identity ###
        tweetFeatures["screenNameLength"] = len(user["screen_name"])
        tweetFeatures["screenNameDigitCount"] = sum(1 for ch in user["screen_name"] if ch in "0123456789")
        tweetFeatures["protectedProfile"] = user["protected"] is not None
        tweetFeatures["personNameInProfile"] = any(x.ent_type_ == "PERSON" for x in spacyName)
        tweetFeatures["organizationInProfile"] = any(x.ent_type_ == "ORG" for x in spacyName)
        tweetFeatures["locationInProfile"] = any(x.ent_type_ in ("LOC", "GPE") for x in spacyName)

        ### evidence availability ###
        urlCount = len(entities["urls"]) if "urls" in entities.keys() else 0
        mediaCount = len(entities["media"]) if "media" in entities.keys() else 0
        hasUrl = urlCount > 0
        hasMedia = mediaCount > 0

        tweetFeatures["urlAvailability"] = hasUrl
        tweetFeatures["mediaAvailability"] = hasMedia
        tweetFeatures["quoteAvailability"] = qtFlag

        ### evidence diversity ###
        tweetFeatures["monoSource"] = singleTruth(hasUrl, hasMedia, qtFlag)
        tweetFeatures["doubleSource"] = doubleTruth(hasUrl, hasMedia, qtFlag)
        tweetFeatures["trippleSource"] = hasUrl and hasMedia and qtFlag

        ### evidence quality ## First hand / second hand
        tweetFeatures["firstHandSourceCount"] = urlCount + mediaCount
        tweetFeatures["secondHandSourceCount"] = 1 if qtFlag == True else 0

        ### evidence credibility
        # Only the first URL attached to the tweet is inspected.
        firstUrl = entities["urls"][0]["expanded_url"] if hasUrl else None
        tweetFeatures["urlNotoriety"] = hasUrl and any(i in firstUrl for i in notoriousWebsites)
        tweetFeatures["urlReputation"] = hasUrl and any(i in firstUrl for i in credibleWebsitesUsa)

        # Credibility of the quoted tweet: its first URL, and its author's profile URL and description.
        quoteUrlReputation = quoteProfileUrlReputation = quoteProfileDescReputation = False
        quoteUrlNotoriety = quoteProfileUrlNotoriety = quoteProfileDescNotoriety = False
        if qtFlag:
            if len(qtEntities["urls"]) > 0:
                quotedUrl = qtEntities["urls"][0]["expanded_url"]
                quoteUrlReputation = any(i in quotedUrl for i in credibleWebsitesUsa)
                quoteUrlNotoriety = any(i in quotedUrl for i in notoriousWebsites)
            if qtUser["url"] is not None:
                quoteProfileUrlReputation = any(i in qtUser["url"] for i in credibleWebsitesUsa)
                quoteProfileUrlNotoriety = any(i in qtUser["url"] for i in notoriousWebsites)
            if qtUser["description"] is not None:
                quoteProfileDescReputation = any(i in qtUser["description"] for i in credibleAccountsUsa)
                quoteProfileDescNotoriety = any(i in qtUser["description"] for i in notoriousId)

        tweetFeatures["quoteReputation"] = quoteUrlReputation or quoteProfileUrlReputation or quoteProfileDescReputation
        tweetFeatures["quoteNotoriety"] = quoteUrlNotoriety or quoteProfileUrlNotoriety or quoteProfileDescNotoriety

        ################################ Reach Features ################################
        tweetFeatures["likeCount"] = tweet["favorite_count"]
        tweetFeatures["retweetCount"] = tweet["retweet_count"]

        ################################ Miscellaneous Features ################################
        tweetFeatures["hashtagCount"] = len(entities["hashtags"])
        tweetFeatures["mentionCount"] = len(entities["user_mentions"])
        tweetFeatures["freshness"] = rtFlag

    # Persist the features of this input file, then start a fresh dictionary for the next one.
    with open(f'{featureSerializationAdr}florenceRumoursFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(florenceRumoursFeatures, featureFile)
    florenceRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# Collect every per-batch pickle written by the rumour feature-extraction loop. The
# listing is sorted because os.listdir returns names in arbitrary order, which would
# otherwise make the row order of the combined frame differ between runs.
florenceRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "florenceRumoursFeatures_" in i)
florenceRumoursFeaturesList = []
for i in florenceRumoursFeaturesFolder:
    # Each batch is a dict keyed by tweet id; transposing turns every tweet into a row.
    # `with` closes the handle again (the bare open() call used to leak it).
    with open(featureSerializationAdr+i, "rb") as batchFile:
        florenceRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(batchFile)).T)
florenceRumoursFeatures = pd.concat(florenceRumoursFeaturesList)
# The transposed frames are object-typed; make the merge key a proper 64-bit integer.
florenceRumoursFeatures["id"] = florenceRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/florenceRumoursFeaturesWithoutLIWC.pk', "wb") as outFile:
    pk.dump(florenceRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Load the separately computed LIWC table, normalise it to floats and join it to the
# rumour feature frame on the tweet id. Files are opened with `with` so the handles
# are closed again (the bare open() calls used to leak them).
with open(featureSerializationAdr+"florenceRumoursLIWC.pk", "rb") as liwcFile:
    florenceRumoursLIWC = pk.load(liwcFile)
# Missing values become 0 and decimal commas become decimal points before the cast.
# NOTE(review): the cast is applied to every column, including the "id" join key.
# float64 cannot represent integers above 2**53 exactly, so large tweet ids may be
# rounded before the merge -- confirm the id dtype of the LIWC frame and exclude the
# key from the float cast if it is affected.
florenceRumoursLIWC = florenceRumoursLIWC.fillna(0).replace(',','.', regex=True).astype("float64")
florenceRumoursFeaturesWithLIWC = pd.merge(florenceRumoursFeatures, florenceRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/florenceRumoursFeaturesWithLIWC.pk', "wb") as outFile:
    pk.dump(florenceRumoursFeaturesWithLIWC, outFile)

### Non-Rumour

In [None]:
# Feature extraction for the Florence non-rumour tweets.
#
# For every serialized tweet file, one feature dictionary is built per unique tweet id
# and the dictionaries of that file are pickled to
# `<featureSerializationAdr>florenceNonRumoursFeatures_<counter>.pk`.
# Feature keys, their insertion order and their values are kept exactly as before so
# the output stays comparable with the rumour feature files produced by the sibling loop.


def _containsAny(haystack, needles):
    """Return True when `haystack` is not None and at least one of `needles` occurs in it."""
    return haystack is not None and any(needle in haystack for needle in needles)


florenceNonRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "florenceNonRumour" in i]
counter = 0                      # number of tweets seen so far (duplicates included)
florenceNonRumourId = {}         # tweet id -> number of times it was encountered
florenceNonRumoursFeatures = {}  # tweet id -> feature dict, reset after every file

# Loop invariants, built once instead of once per tweet.
pattern = re.compile('[>].*[<]')   # text between '>' and '<' of the tweet "source" markup
today = datetime.datetime.now()    # single reference date for the account-age features

for file in florenceNonRumourFolder:
    # NOTE: pickle.load can execute arbitrary code; only load files this pipeline produced.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        florenceNonRumours = pk.load(tweetFile)

    for tweet in tqdm(florenceNonRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets. Skip them before any of the expensive
        # text processing below runs; only the occurrence count is updated.
        if tweetId in florenceNonRumourId:
            florenceNonRumourId[tweetId] += 1
            continue
        florenceNonRumourId[tweetId] = 1

        qtText, qtEntities, qtUser, rtUser, tweetText = None, None, None, None, None

        # elements[0] / elements[1] are the retweet / quote flags and elements[2:5] are
        # always text, entities and user. The remaining slots depend on the flags:
        # quote data sits in 5..7 and the retweeting user in 8 (quote) or 5 (no quote).
        elements = tweetElements(tweet)
        rtFlag = copy.deepcopy(elements[0])
        qtFlag = copy.deepcopy(elements[1])

        text = copy.deepcopy(elements[2])
        entities = copy.deepcopy(elements[3])
        user = copy.deepcopy(elements[4])
        if qtFlag:
            qtText = copy.deepcopy(elements[5])
            qtEntities = copy.deepcopy(elements[6])
            qtUser = copy.deepcopy(elements[7])
            if rtFlag:
                rtUser = copy.deepcopy(elements[8])
        elif rtFlag:
            rtUser = copy.deepcopy(elements[5])

        rawText, semiRawText, processedText, overProcessedText = textProcessor(text)

        name = user["name"].lower().strip()

        spacyText = nlp(overProcessedText)
        spacyName = nlp(name)

        # Tokenized once and reused by every word-level feature below.
        words = word_tokenize(overProcessedText)

        ################################################################ NonRumour General Info ################################################################

        # `feats` is the very dict stored under the tweet id, so writes go straight into the result.
        feats = {}
        florenceNonRumoursFeatures[tweetId] = feats
        feats["id"] = tweetId
        feats["screenName"] = user["screen_name"]
        feats["text"] = text
        feats["tweetUrl"] = "https://twitter.com/" + user["screen_name"] + "/status/" + str(tweetId)

        # Keep only the client name from the source markup; fall back to the raw value
        # when the pattern does not match.
        try:
            feats["source"] = pattern.findall(tweet["source"])[0][1:-1]
        except Exception:
            feats["source"] = tweet["source"]

        feats["tweetPostTime"] = datetime.datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
        feats["place"] = tweet["place"]
        feats["profileLocation"] = user["location"]

        ################################################################ Syntactical Features ################################################################

        feats["characterCount"] = len(rawText)
        feats["tokenCount"] = len(spacyText)
        sentences = sentenceTokenizer.tokenize(overProcessedText)
        feats["sentenceCount"] = len(sentences)
        feats["averageWordComplexity"] = np.average([len(i) for i in words])
        feats["averageSentenceComplexity"] = np.average([len(word_tokenize(i)) for i in sentences])

        ### Twitter special tree for future
        # Height of the CoreNLP parse tree; 0 when parsing fails for any reason.
        try:
            parser = nlp2.parse(overProcessedText)
            tree = Tree.fromstring(str(parser))
            feats["tweetComplexity"] = tree.height()
        except Exception:
            feats["tweetComplexity"] = 0

        # One count column per part-of-speech tag that occurs in the tweet.
        tags = [x.tag_ for x in spacyText]
        for tag in tags:
            feats[tag] = tags.count(tag)

        # One count column per named-entity type that occurs in the tweet.
        ners = [x.ent_type_ for x in spacyText]
        for ner in ners:
            if ner != '':
                feats[ner] = ners.count(ner)

        ################################################################ NonRumour Language Features ################################################################

        # NOTE(review): the original assigned "witnessPhrases" twice ("i see", then
        # "i hear"), so the first check was always overwritten and only "i hear" counts.
        # The effective behaviour is kept so the values stay comparable with the rumour
        # feature files; presumably both loops should test for either phrase -- fix them together.
        feats["witnessPhrases"] = "i hear" in overProcessedText

        uppercaseCharCount = sum(1 for i in semiRawText if i.isupper())
        lowercaseCharCount = sum(1 for i in semiRawText if i.islower())
        feats["upperCaseCount"] = uppercaseCharCount
        feats["lowerCaseCount"] = lowercaseCharCount
        # Texts without any cased character get a fraction of 0 instead of dividing by zero.
        casedCharCount = uppercaseCharCount + lowercaseCharCount
        feats["upperCaseCharFrac"] = uppercaseCharCount / casedCharCount if casedCharCount else 0

        capitalWordsCount = len([b for b in word_tokenize(processedText) if b.isupper()])
        feats["capitalWordsCount"] = capitalWordsCount
        # Empty documents get a fraction of 0 instead of dividing by zero.
        feats["capitalWordFrac"] = capitalWordsCount / len(spacyText) if len(spacyText) else 0
        feats["exclamationMarkCount"] = overProcessedText.count("!")
        feats["questionMarkCount"] = overProcessedText.count("?")

        feats["firstPersonPronounCount"] = firstPersonPronounCount(overProcessedText)
        feats["secondPersonPronounCount"] = secondPersonPronounCount(overProcessedText)
        feats["thirdPersonPronounCount"] = thirdPersonPronounCount(overProcessedText)

        feats["vuglarTermsCount"] = len([a for a in words if a in vuglarList])
        feats["emoticonCount"] = len([a for a in words if a in emotiList])
        feats["abbreviationCount"] = len([a for a in words if a in abbrList])
        feats["emojiCount"] = len([x for x in overProcessedText if x in emoji.UNICODE_EMOJI])

        ################################################################ NonRumour Psycholinguistic Features ################################################################

        sentiment = TextBlob(semiRawText).sentiment
        feats["subjectivityScore"] = sentiment.subjectivity
        feats["polarityScore"] = sentiment.polarity

        (feats["angerEmotion"], feats["anticipation"], feats["disgust"],
         feats["fear"], feats["joy"], feats["sadness"],
         feats["surprise"], feats["trust"]) = nrcEmotions(overProcessedText)

        (feats["positiveEmotion"], feats["negativeEmotion"], feats["neutralEmotion"],
         feats["compoundEmotion"], feats["arousalScore"], feats["dominanceScore"]) = emotions(semiRawText, overProcessedText)

        feats["hateSpeech"], feats["offensiveLanguage"], feats["neitherClasses"] = hateSpeech(semiRawText)

        # Readability indices computed by textstat on the fully processed text.
        feats["flesch_reading_ease"] = textstat.flesch_reading_ease(overProcessedText)
        feats["smog_index"] = textstat.smog_index(overProcessedText)
        feats["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(overProcessedText)
        feats["coleman_liau_index"] = textstat.coleman_liau_index(overProcessedText)
        feats["automated_readability_index"] = textstat.automated_readability_index(overProcessedText)
        feats["dale_chall_readability_score"] = textstat.dale_chall_readability_score(overProcessedText)
        feats["difficult_words"] = textstat.difficult_words(overProcessedText)
        feats["linsear_write_formula"] = textstat.linsear_write_formula(overProcessedText)
        feats["gunning_fog"] = textstat.gunning_fog(overProcessedText)

        # One pitfall of the syllables library is that an empty sentence counts as one
        # syllable; any failure while averaging falls back to 0.
        try:
            feats["averageWordsyllables"] = np.average([syllables.estimate(i) for i in words])
        except Exception:
            feats["averageWordsyllables"] = 0

        ################################################################ NonRumourmonger Features ################################################################

        ### popularity/seclusion features ###
        feats["followingCount"] = user["friends_count"]
        feats["influnece"] = user["followers_count"]
        # +1 on both sides keeps the ratio defined for accounts that follow nobody.
        feats["userRole"] = (user["followers_count"]+1)/(user["friends_count"]+1)

        ### activity features ###
        accountCreationTime = datetime.datetime.strptime(user["created_at"], "%a %b %d %H:%M:%S %z %Y")
        # Account age in whole days, measured against the run date (not the tweet date).
        feats["accountAge"] = (today.date() - accountCreationTime.date()).days
        feats["totalProfileLikesCount"] = user["favourites_count"]
        feats["statusCount"] = user["statuses_count"]
        feats["averageFollowSpeed"] = user["followers_count"] / feats["accountAge"]
        feats["averageBeingFollowedSpeed"] = user["friends_count"] / feats["accountAge"]
        feats["averageLikeSpeed"] = user["favourites_count"] / feats["accountAge"]
        feats["averageStatusSpeed"] = user["statuses_count"] / feats["accountAge"]

        ### profile reputation ###
        feats["isVerifiedAccount"] = user["verified"]

        # Profile description / url are matched case-insensitively against the source lists.
        description = user["description"]
        profileUrl = user["url"]
        descriptionLower = description.lower() if description is not None else None
        profileUrlLower = profileUrl.lower() if profileUrl is not None else None

        feats["userDescriptionReputation"] = _containsAny(descriptionLower, credibleAccountsUsa)
        feats["userDescriptionNotoriety"] = _containsAny(descriptionLower, notoriousId)
        feats["userUrlReputation"] = _containsAny(profileUrlLower, credibleWebsitesUsa)
        feats["userUrlNotoriety"] = _containsAny(profileUrlLower, notoriousWebsites)

        ### Reticency features ###
        feats["hasProfileLocation"] = user["location"] is not None
        feats["hasProfilePicture"] = user["profile_image_url"] is not None
        # NOTE(review): this tests for presence, not for the value. If "geo_enabled" (and
        # "protected" below) are booleans -- presumably, per the Twitter user object;
        # confirm -- both features are always True. Left unchanged for parity with the
        # rumour feature files.
        feats["geoEnabled"] = user["geo_enabled"] is not None
        feats["hasProfileUrl"] = profileUrl is not None
        feats["hasProfileDescription"] = description is not None

        ### fake identity ###
        feats["screenNameLength"] = len(user["screen_name"])
        feats["screenNameDigitCount"] = len([i for i in user["screen_name"] if i in "0123456789"])
        feats["protectedProfile"] = user["protected"] is not None
        feats["personNameInProfile"] = any(x.ent_type_ == "PERSON" for x in spacyName)
        feats["organizationInProfile"] = any(x.ent_type_ == "ORG" for x in spacyName)
        feats["locationInProfile"] = any(x.ent_type_ in ("LOC", "GPE") for x in spacyName)

        ### evidence availability ###
        urlCount = len(entities["urls"]) if "urls" in entities else 0
        mediaCount = len(entities["media"]) if "media" in entities else 0
        hasUrl = urlCount > 0
        hasMedia = mediaCount > 0

        feats["urlAvailability"] = hasUrl
        feats["mediaAvailability"] = hasMedia
        feats["quoteAvailability"] = qtFlag

        ### evidence diversity ###
        feats["monoSource"] = singleTruth(hasUrl, hasMedia, qtFlag)
        feats["doubleSource"] = doubleTruth(hasUrl, hasMedia, qtFlag)
        feats["trippleSource"] = hasUrl and hasMedia and qtFlag

        ### evidence quality ## First hand / second hand
        feats["firstHandSourceCount"] = urlCount + mediaCount
        feats["secondHandSourceCount"] = 1 if qtFlag == True else 0

        ### evidence credibility
        # Only the first URL of the tweet is checked against the website lists.
        firstUrl = entities["urls"][0]["expanded_url"] if hasUrl else None
        feats["urlNotoriety"] = _containsAny(firstUrl, notoriousWebsites)
        feats["urlReputation"] = _containsAny(firstUrl, credibleWebsitesUsa)

        # Quote evidence exists only for quote tweets; otherwise everything stays None
        # and all quote checks below evaluate to False.
        quoteUrl, quoteProfileUrl, quoteDescription = None, None, None
        if qtFlag:
            if len(qtEntities["urls"]) > 0:
                quoteUrl = qtEntities["urls"][0]["expanded_url"]
            quoteProfileUrl = qtUser["url"]
            # NOTE(review): unlike the tweeting user's description above, the quoted
            # user's description is not lower-cased before matching -- confirm intent.
            quoteDescription = qtUser["description"]

        quoteUrlReputation = _containsAny(quoteUrl, credibleWebsitesUsa)
        quoteProfileUrlReputation = _containsAny(quoteProfileUrl, credibleWebsitesUsa)
        quoteProfileDescReputation = _containsAny(quoteDescription, credibleAccountsUsa)

        quoteUrlNotoriety = _containsAny(quoteUrl, notoriousWebsites)
        quoteProfileUrlNotoriety = _containsAny(quoteProfileUrl, notoriousWebsites)
        quoteProfileDescNotoriety = _containsAny(quoteDescription, notoriousId)

        feats["quoteReputation"] = quoteUrlReputation or quoteProfileUrlReputation or quoteProfileDescReputation
        feats["quoteNotoriety"] = quoteUrlNotoriety or quoteProfileUrlNotoriety or quoteProfileDescNotoriety

        ################################################################ Reach Features ################################################################
        feats["likeCount"] = tweet["favorite_count"]
        feats["retweetCount"] = tweet["retweet_count"]

        ################################################################ Miscellaneous Features ################################################################
        feats["hashtagCount"] = len(entities["hashtags"])
        feats["mentionCount"] = len(entities["user_mentions"])
        feats["freshness"] = rtFlag

    # One pickle per input file, named after the running tweet counter; the handle is
    # closed by `with` (the bare open() call used to leak it).
    with open(f'{featureSerializationAdr}florenceNonRumoursFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(florenceNonRumoursFeatures, featureFile)
    florenceNonRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# Collect every per-batch pickle written by the non-rumour feature-extraction loop. The
# listing is sorted because os.listdir returns names in arbitrary order, which would
# otherwise make the row order of the combined frame differ between runs.
florenceNonRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "florenceNonRumoursFeatures_" in i)
florenceNonRumoursFeaturesList = []
for i in florenceNonRumoursFeaturesFolder:
    # Each batch is a dict keyed by tweet id; transposing turns every tweet into a row.
    # `with` closes the handle again (the bare open() call used to leak it).
    with open(featureSerializationAdr+i, "rb") as batchFile:
        florenceNonRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(batchFile)).T)
florenceNonRumoursFeatures = pd.concat(florenceNonRumoursFeaturesList)
# The transposed frames are object-typed; make the merge key a proper 64-bit integer.
florenceNonRumoursFeatures["id"] = florenceNonRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/florenceNonRumoursFeaturesWithoutLIWC.pk', "wb") as outFile:
    pk.dump(florenceNonRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Load the separately computed LIWC table, normalise it to floats and join it to the
# non-rumour feature frame on the tweet id. Files are opened with `with` so the handles
# are closed again (the bare open() calls used to leak them).
with open(featureSerializationAdr+"florenceNonRumoursLIWC.pk", "rb") as liwcFile:
    florenceNonRumoursLIWC = pk.load(liwcFile)
# Missing values become 0 and decimal commas become decimal points before the cast.
# NOTE(review): the cast is applied to every column, including the "id" join key.
# float64 cannot represent integers above 2**53 exactly, so large tweet ids may be
# rounded before the merge -- confirm the id dtype of the LIWC frame and exclude the
# key from the float cast if it is affected.
florenceNonRumoursLIWC = florenceNonRumoursLIWC.fillna(0).replace(',','.', regex=True).astype("float64")
florenceNonRumoursFeaturesWithLIWC = pd.merge(florenceNonRumoursFeatures, florenceNonRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/florenceNonRumoursFeaturesWithLIWC.pk', "wb") as outFile:
    pk.dump(florenceNonRumoursFeaturesWithLIWC, outFile)

In [None]:
# Load four serialized rumour feature batches (one row per tweet id) and stack them.
# NOTE(review): these paths have no ".pk" extension, while the extraction loop writes
# "florenceRumoursFeatures_<counter>.pk" -- confirm the files exist under these names.
florenceRumourBatchPaths = [
    './Serialization/Features/florenceRumoursFeatures_50000',
    './Serialization/Features/florenceRumoursFeatures_100000',
    './Serialization/Features/florenceRumoursFeatures_119889',
    './Serialization/Features/florenceRumoursFeatures_169889',
]
florenceRumourBatches = []
for batchPath in florenceRumourBatchPaths:
    # `with` closes each handle again (the bare open() calls used to leak them).
    with open(batchPath, "rb") as batchFile:
        florenceRumourBatches.append(pd.DataFrame.from_dict(pk.load(batchFile), orient="index"))
# Keep the individual frames available under their original names.
df1, df2, df3, df4 = florenceRumourBatches
df = pd.concat(florenceRumourBatches)