In [None]:
import os
from tqdm import tqdm
import json
import spacy
import en_core_web_sm
import numpy as np
import re
import pandas as pd
import pickle as pk
import copy
import datetime
from sklearn import preprocessing
from textblob import TextBlob
import emoji
from pprint import pprint
from nltk.tree import Tree
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
from stanfordcorenlp import StanfordCoreNLP
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from hatesonar import Sonar
from pprint import pprint
from urlextract import URLExtract
import syllables
import textstat

In [None]:
# Addresses
# Folder holding the pickled tweet batches; both names below refer to this same folder.
tweetSerializationAdr = "./Serialization/Tweets/"
tweetsAdr = tweetSerializationAdr
# Folder where the extracted feature pickles are written and later read back.
featureSerializationAdr = "./Serialization/Features/"

In [None]:
# Libraries setup: shared NLP objects created once and reused by the cells below.

# spaCy English pipeline; called on the processed tweet text and the user name in the extraction loops.
nlp = spacy.load("en_core_web_sm")
# Stanford CoreNLP client; the path/URL argument is empty here and has to be filled in before running.
nlp2 = StanfordCoreNLP('')
# VADER analyser; emotions() reads its "pos"/"neg"/"neu"/"compound" scores.
analyser = SentimentIntensityAnalyzer()
# hatesonar classifier; hateSpeech() reads the per-class confidences it returns.
sonar = Sonar()
# URL finder used by textProcessor() to strip links from the tweet text.
extractor = URLExtract()
# Punkt sentence splitter used for the sentence-count / sentence-complexity features.
sentenceTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Credible sources - USA
# Reads a ';'-separated CSV (first row skipped, columns numbered 1..9) and keeps
# columns 2, 3 and 4 as "source", "id" (account handle) and "website".

adr1 = ""
df = pd.read_csv(adr1, delimiter=";", skiprows=1 , names = [1,2,3,4,5,6,7,8,9], encoding = "ISO-8859-1")
df = df.drop([1,5,6,7,8,9], axis="columns")
df = df.rename({2:"source", 3:"id", 4:"website"}, axis="columns")

df1 = df.drop(columns=["id"])       # source + website
df2 = df.drop(columns=["website"])  # source + id

# regex=False makes it explicit that "?" is a literal character to remove and not a
# regex quantifier; the default of `regex` differs between pandas versions.
df2["id"] = df2["id"].str.lower().str.replace("?", "", regex=False).str.strip()

# Missing cells come back from pandas as NaN, whose string form is "nan"; drop them.
credibleWebsitesUsa = [site for site in df1["website"].tolist() if str(site) != "nan"]
credibleAccountsUsa = [account for account in df2["id"].tolist() if str(account) != "nan"]

In [None]:
# Credible sources - INDIA
# Reads a ';'-separated CSV (first row skipped, columns numbered 1..3): column 2 holds the
# Twitter handle and column 3 the website.

adr2 = ""
df = pd.read_csv(adr2, delimiter=";", skiprows=1 , names=[1,2,3])
columnNames = {2:"twitter", 3:"site"}
df1 = df.drop(columns=[1,2]).rename(columnNames, axis="columns")  # only "site" is left
df2 = df.drop(columns=[1,3]).rename(columnNames, axis="columns")  # only "twitter" is left

# Handles are normalised to lower case without surrounding blanks.
# (A variant that prefixed every handle with "@" was disabled in the original notebook.)
df2["twitter"] = df2["twitter"].str.lower().str.strip()

# Empty cells are NaN, whose string form is "nan"; leave those out.
credibleWebsitesIndia = [site for site in df1["site"].tolist() if str(site) != "nan"]
credibleAccountsIndia = [handle for handle in df2["twitter"].tolist() if str(handle) != "nan"]

# The Indian lists also contain every USA entry.
credibleAccountsIndia.extend(credibleAccountsUsa)
credibleWebsitesIndia.extend(credibleWebsitesUsa)

In [None]:
# Notorious sources
# Reads a ';'-separated CSV and exposes its "website" and "id" columns as
# lower-cased, stripped lists (empty cells, which read back as NaN, are skipped).

notoriousSources = ""
dfn = pd.read_csv(notoriousSources, delimiter=";").rename(
    {"test":"title", "Unnamed: 1":"id", "Unnamed: 2":"website"}, axis="columns"
)
notoriousColumns = dfn.to_dict(orient="list")
notoriousWebsites = [site.lower().strip() for site in notoriousColumns["website"] if str(site) != "nan"]
notoriousId = [account.lower().strip() for account in notoriousColumns["id"] if str(account) != "nan"]

In [None]:
# Loading abbreviations, vulgar terms and emoticons for feature extraction

def _readTermList(path):
    """Return the stripped lines of the text file at `path`, skipping lines that are just a newline.

    The file is opened in a `with` block so the handle is closed as soon as it has been read.
    """
    with open(path) as termFile:
        return [w.strip() for w in termFile if w != "\n"]

abbrAdr = ""
abbrList = _readTermList(abbrAdr)

emotiAdr = "./Lists/Emoticon/emoticons.txt"
emotiList = _readTermList(emotiAdr)

vuglarAdr = "./Lists/Vuglar terms/vuglarTerms.txt"
vuglarList = _readTermList(vuglarAdr)

In [None]:
# Creating NRC dictionary for different feelings
# Each line of the lexicon is "<lemma>\t<sentiment>\t<score>"; the result is
# nrcDic[lemma][sentiment] = int(score).

adr = ""

nrcDic = {}
with open(adr) as nrcFile:  # closed automatically once the lexicon has been read
    for line in nrcFile:
        fields = line.strip().split("\t")
        if fields == [""]:
            # blank line (e.g. at the start or end of the file): nothing to record
            continue
        lemma, sentiment, score = fields[0], fields[1], fields[2]
        nrcDic.setdefault(lemma, {})[sentiment] = int(score)

In [None]:
# Creating Emotion dictionary
# emotionDic maps every word of the ratings file to
# {"valence": ..., "arousal": ..., "dominance": ...}.

adr = ""

# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 is its
# replacement (the first column is only a row index and is discarded below).
# The frame is deliberately NOT called `emotions`: a function with that name is defined
# later in this notebook and re-running this cell would silently overwrite it.
emotionTable = pd.read_csv(adr, index_col=0)
emotionTable = emotionTable[["Word", "V.Mean.Sum","A.Mean.Sum","D.Mean.Sum"]]
emotionTable.columns = ["word","valence","arousal","dominance"]
# Transpose so that every word becomes a column, then use the "word" row as column labels.
emotionTable = emotionTable.T
emotionTable.columns = emotionTable.loc["word"]
emotionTable = emotionTable.drop(["word"], axis="index")
emotionDic = emotionTable.to_dict()

In [None]:
def _statusTextAndEntities(status):
    """Return deep copies of (text, entities) for one status dict.

    A truncated status keeps its complete text and entities under "extended_tweet";
    otherwise the top-level "text" and "entities" are used.
    """
    if status["truncated"]:
        extended = status["extended_tweet"]
        return copy.deepcopy(extended["full_text"]), copy.deepcopy(extended["entities"])
    return copy.deepcopy(status["text"]), copy.deepcopy(status["entities"])


def tweetElements(tweet):
    """Pull the parts of a tweet dict that the feature extraction needs.

    Parameters
    ----------
    tweet : dict
        Tweet object with at least "truncated", "text", "entities" and "user"
        (plus "extended_tweet" when truncated), and optionally "retweeted_status"
        and/or "quoted_status" with the same layout.

    Returns
    -------
    list
        Always starts with [rtFlag, qtFlag, text, entities, user]; for a retweet,
        text/entities are those of the retweeted status while user is the retweeter.
        Then, depending on the flags:
          * quote:            + [qtText, qtEntities, qtUser]
          * retweet:          + [rtUser]            (always the last element)
          * retweet + quote:  + [qtText, qtEntities, qtUser, rtUser]
        Every element taken from `tweet` is a deep copy, so callers may modify it freely.

    Raises
    ------
    KeyError
        If one of the expected keys is missing from the tweet or a nested status.
    """
    rtFlag = "retweeted_status" in tweet
    qtFlag = "quoted_status" in tweet

    # For a retweet the content comes from the original (retweeted) status.
    contentStatus = tweet["retweeted_status"] if rtFlag else tweet
    text, entities = _statusTextAndEntities(contentStatus)
    elements = [rtFlag, qtFlag, text, entities, copy.deepcopy(tweet["user"])]

    if qtFlag:
        quoted = tweet["quoted_status"]
        qtText, qtEntities = _statusTextAndEntities(quoted)
        elements += [qtText, qtEntities, copy.deepcopy(quoted["user"])]

    if rtFlag:
        elements.append(copy.deepcopy(tweet["retweeted_status"]["user"]))

    return elements

In [None]:
def textProcessor(twtTxt):
    """Produce four progressively cleaned versions of a tweet's text.

    Parameters
    ----------
    twtTxt : str
        The tweet text as returned by tweetElements.

    Returns
    -------
    tuple of str
        (raw, semiRaw, processed, overProcessed) where
          * raw           is twtTxt unchanged,
          * semiRaw       has every URL found by `extractor` removed and whitespace collapsed,
          * processed     additionally has "#" and "@" replaced by blanks,
          * overProcessed is `processed` in lower case.
    """
    semiRaw = twtTxt
    for url in extractor.find_urls(twtTxt):
        semiRaw = semiRaw.replace(url, "")

    # Collapse every run of whitespace (blanks, tabs, newlines) into one blank.
    # Replacing "  " before turning tabs/newlines into blanks, as was done previously,
    # left runs of blanks in the text.
    semiRaw = re.sub(r"\s+", " ", semiRaw).strip()
    processed = re.sub(r"\s+", " ", semiRaw.replace("#", " ").replace("@", " ")).strip()
    overProcessed = processed.lower()

    return twtTxt, semiRaw, processed, overProcessed

In [None]:
def firstPersonPronounCount(myTxt):
    """Count the first-person pronoun tokens (singular and plural) in myTxt.

    The text is tokenised once with word_tokenize and tokens are compared exactly
    (case-sensitively) with the lower-case pronouns below, so pass lower-cased text.
    The tokenizer separates contractions such as "i've", "i'd" and "i'm" into the
    pronoun and the clitic, so those need no special handling.
    """
    firstPerson = {"i", "my", "mine", "me", "we", "our", "ours", "us"}
    return sum(1 for token in word_tokenize(myTxt) if token in firstPerson)

In [None]:
def secondPersonPronounCount(myText):
    """Count the second-person pronoun tokens ("you", "your", "yours") in myText.

    The text is tokenised once with word_tokenize and tokens are compared exactly
    (case-sensitively), so pass lower-cased text. The tokenizer separates
    "you've", "you'd" and "you're" into "you" plus the clitic, so contractions
    need no special handling.
    """
    secondPerson = {"you", "your", "yours"}
    return sum(1 for token in word_tokenize(myText) if token in secondPerson)

In [None]:
def thirdPersonPronounCount(myTweetTxt):
    """Count the third-person pronoun tokens (singular and plural) in myTweetTxt.

    The text is tokenised once with word_tokenize and tokens are compared exactly
    (case-sensitively), so pass lower-cased text. As for the first/second person
    counters, contractions of he, she, it and they are split by the tokenizer.

    Every pronoun occurrence counts once; "him" used to be added twice.
    """
    thirdPerson = {
        "he", "she", "it", "his", "her", "its", "him", "hers",  # singular
        "they", "their", "theirs", "them",                      # plural
    }
    return sum(1 for token in word_tokenize(myTweetTxt) if token in thirdPerson)

In [None]:
def nrcEmotions(nrcTxt):
    """Sum the NRC lexicon scores of the tokens of nrcTxt, one total per emotion.

    Tokens that are not keys of nrcDic are ignored. The totals are returned as a tuple in
    the order anger, anticipation, disgust, fear, joy, sadness, surprise, trust.
    """
    # lexicon entries of the tokens that are present in the NRC dictionary
    matched = [nrcDic[token] for token in word_tokenize(nrcTxt) if token in nrcDic]
    emotionOrder = ("anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust")
    return tuple(sum(entry[emotion] for entry in matched) for emotion in emotionOrder)

In [None]:
def emotions(semiRaw, overProc):
    """Return sentiment and affect scores for one tweet.

    Parameters
    ----------
    semiRaw : str
        Text passed to the VADER analyser.
    overProc : str
        Text that is tokenised and looked up in emotionDic.

    Returns
    -------
    tuple
        (pos, neg, neu, compound, arousal, dominance): the four VADER polarity scores
        followed by the summed "arousal" and "dominance" values of all tokens that
        are keys of emotionDic.
    """
    # One VADER pass yields all four scores; it used to be run once per score.
    polarity = analyser.polarity_scores(semiRaw)

    known = [emotionDic[term] for term in word_tokenize(overProc) if term in emotionDic]
    arousalScore = sum(entry["arousal"] for entry in known)
    dominanceScore = sum(entry["dominance"] for entry in known)

    return polarity["pos"], polarity["neg"], polarity["neu"], polarity["compound"], arousalScore, dominanceScore

In [None]:
def hateSpeech(tweetTxt):
    """Return the confidences of the first three classes that sonar reports for tweetTxt.

    The values are returned as a 3-tuple in the order in which sonar lists its classes.
    NOTE(review): which label each position stands for is not visible here - confirm
    against the hatesonar output before naming the features.
    """
    classes = sonar.ping(tweetTxt)["classes"]
    return tuple(classes[position]["confidence"] for position in range(3))

In [None]:
def singleTruth(first, second, third):
    """Return True when exactly one argument equals True and the other two equal False.

    Values that equal neither True nor False (e.g. None) make the result False.
    """
    flags = (first, second, third)
    trueCount = sum(1 for flag in flags if flag == True)
    falseCount = sum(1 for flag in flags if flag == False)
    return trueCount == 1 and falseCount == 2

In [None]:
def doubleTruth(first, second, third):
    """Return True when exactly two arguments equal True and the remaining one equals False.

    Values that equal neither True nor False (e.g. None) make the result False.
    """
    flags = (first, second, third)
    trueCount = sum(1 for flag in flags if flag == True)
    falseCount = sum(1 for flag in flags if flag == False)
    return trueCount == 2 and falseCount == 1

# Kerala

### Rumour

In [None]:
# Extract the additional ("SomeNewFeatures") per-tweet features of the Kerala rumour tweets.
# Every pickled tweet file is processed in turn and yields one feature pickle.
keralaRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "keralaRumour" in i]
counter = 0                 # number of tweets seen so far (duplicates included); used in the output file name
keralaRumourId = {}         # tweet id -> number of times it was seen; used to skip duplicates
keralaRumoursFeatures = {}  # tweet id -> feature dict, for the file currently being processed

# textstat metrics; each one is stored under its own function name, in this order
readabilityMetrics = ("flesch_reading_ease", "smog_index", "flesch_kincaid_grade", "coleman_liau_index",
                      "automated_readability_index", "dale_chall_readability_score", "difficult_words",
                      "linsear_write_formula", "gunning_fog")

for file in keralaRumourFolder:
    # NOTE: unpickling can run arbitrary code - only load files written by this project.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        keralaRumours = pk.load(tweetFile)
    for tweet in tqdm(keralaRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets; skip them before doing any expensive processing
        if tweetId in keralaRumourId:
            keralaRumourId[tweetId] += 1
            continue
        keralaRumourId[tweetId] = 1

        # tweetElements already returns deep copies, so its items can be used directly
        elements = tweetElements(tweet)
        rtFlag, qtFlag, text, entities = elements[0], elements[1], elements[2], elements[3]
        qtEntities, qtUser = (elements[6], elements[7]) if qtFlag else (None, None)

        overProcessedText = textProcessor(text)[3]
        words = word_tokenize(overProcessedText)  # tokenised once, shared by the features below
        sentences = sentenceTokenizer.tokenize(overProcessedText)

        ################################################################ Rumour General Info ################################################################
        features = {}
        keralaRumoursFeatures[tweetId] = features
        features["id"] = tweetId
        features["sentenceCount"] = len(sentences)
        features["averageWordComplexity"] = np.average([len(word) for word in words])
        features["averageSentenceComplexity"] = np.average([len(word_tokenize(sentence)) for sentence in sentences])

        # This is temporary for some new features
        features["angerEmotion"] = sum(nrcDic[term]["anger"] for term in words if term in nrcDic)

        for metric in readabilityMetrics:
            features[metric] = getattr(textstat, metric)(overProcessedText)

        # The syllables library counts an empty string as one syllable;
        # fall back to 0 when the average cannot be computed
        try:
            features["averageWordsyllables"] = np.average([syllables.estimate(word) for word in words])
        except Exception:  # deliberately broad, but no longer swallows KeyboardInterrupt/SystemExit
            features["averageWordsyllables"] = 0

        ### evidence credibility
        # NOTE(review): Kerala tweets are matched against the USA lists; credibleWebsitesIndia /
        # credibleAccountsIndia (India + USA) are built earlier but not used here - confirm this is intended.
        urlCount = len(entities["urls"]) if "urls" in entities.keys() else 0
        firstUrl = entities["urls"][0]["expanded_url"] if urlCount > 0 else None  # only the first link is checked

        features["urlNotoriety"] = firstUrl is not None and any(site in firstUrl for site in notoriousWebsites)
        features["urlReputation"] = firstUrl is not None and any(site in firstUrl for site in credibleWebsitesUsa)

        # Quoted tweet: its first link, and the profile URL / description of its author
        qtUrls = qtEntities["urls"] if qtFlag else []
        qtLink = qtUrls[0]["expanded_url"] if len(qtUrls) > 0 else None
        qtProfileUrl = qtUser["url"] if qtFlag else None
        qtDescription = qtUser["description"] if qtFlag else None

        features["quoteReputation"] = (
            (qtLink is not None and any(site in qtLink for site in credibleWebsitesUsa))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in credibleWebsitesUsa))
            or (qtDescription is not None and any(account in qtDescription for account in credibleAccountsUsa))
        )
        features["quoteNotoriety"] = (
            (qtLink is not None and any(site in qtLink for site in notoriousWebsites))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in notoriousWebsites))
            or (qtDescription is not None and any(account in qtDescription for account in notoriousId))
        )

    # This is temporary for adding new features: one pickle per processed tweet file
    with open(f'{featureSerializationAdr}keralaRumoursFeatures_SomeNewFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(keralaRumoursFeatures, featureFile)
    keralaRumoursFeatures = {}

In [None]:
# This is temporary to add some new features
# Collect the per-file feature pickles into one DataFrame with one row per tweet.
# The file names are sorted so that the row order does not depend on os.listdir.
keralaRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "keralaRumoursFeatures_SomeNewFeatures_" in i)
keralaRumoursFeaturesList = []
for featureFileName in keralaRumoursFeaturesFolder:
    # NOTE: unpickling can run arbitrary code - only load files written by this project.
    with open(featureSerializationAdr+featureFileName, "rb") as featureFile:
        # each pickle is {tweetId: {feature: value}}; transpose so that tweets become rows
        keralaRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(featureFile)).T)
keralaRumoursFeatures = pd.concat(keralaRumoursFeaturesList)
keralaRumoursFeatures["id"] = keralaRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/keralaRumoursFeaturesWithoutLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(keralaRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Join the LIWC columns to the extracted features on the tweet id (pandas' default inner join)
# and store the result. Files are opened in `with` blocks so the handles are closed again.
with open(featureSerializationAdr+"keralaRumoursLIWC.pk", "rb") as liwcFile:
    keralaRumoursLIWC = pk.load(liwcFile)
keralaRumoursFeaturesWithLIWC = pd.merge(keralaRumoursFeatures, keralaRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/keralaRumoursFeaturesWithLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(keralaRumoursFeaturesWithLIWC, outFile)

In [None]:
# Combine the previously extracted features (the "WithoutLIWC" pickle) with the new features + LIWC.
with open(featureSerializationAdr+"keralaRumoursFeaturesWithoutLIWC.pk", "rb") as oldFile:
    oldkeralaRumoursLIWC = pk.load(oldFile)

# These three columns are recomputed by the new extraction, so the old copies are dropped first.
oldkeralaRumoursLIWC = oldkeralaRumoursLIWC.drop(columns=['averageSentenceComplexity', 'averageWordComplexity','sentenceCount'])

keralaRumoursWithLIWC_NewFeatures = pd.merge(oldkeralaRumoursLIWC, keralaRumoursFeaturesWithLIWC, on="id")

with open(f'{featureSerializationAdr}/keralaRumoursWithLIWC_NewFeatures.pk', "wb") as outFile:
    pk.dump(keralaRumoursWithLIWC_NewFeatures, outFile)

### Non-Rumour

In [None]:
# Extract the additional ("SomeNewFeatures") per-tweet features of the Kerala non-rumour tweets.
# Every pickled tweet file is processed in turn and yields one feature pickle.
keralaNonRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "keralaNonRumour" in i]
counter = 0                    # number of tweets seen so far (duplicates included); used in the output file name
keralaNonRumourId = {}         # tweet id -> number of times it was seen; used to skip duplicates
keralaNonRumoursFeatures = {}  # tweet id -> feature dict, for the file currently being processed

# textstat metrics; each one is stored under its own function name, in this order
readabilityMetrics = ("flesch_reading_ease", "smog_index", "flesch_kincaid_grade", "coleman_liau_index",
                      "automated_readability_index", "dale_chall_readability_score", "difficult_words",
                      "linsear_write_formula", "gunning_fog")

for file in keralaNonRumourFolder:
    # NOTE: unpickling can run arbitrary code - only load files written by this project.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        keralaNonRumours = pk.load(tweetFile)
    for tweet in tqdm(keralaNonRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets; skip them before doing any expensive processing
        if tweetId in keralaNonRumourId:
            keralaNonRumourId[tweetId] += 1
            continue
        keralaNonRumourId[tweetId] = 1

        # tweetElements already returns deep copies, so its items can be used directly
        elements = tweetElements(tweet)
        rtFlag, qtFlag, text, entities = elements[0], elements[1], elements[2], elements[3]
        qtEntities, qtUser = (elements[6], elements[7]) if qtFlag else (None, None)

        overProcessedText = textProcessor(text)[3]
        words = word_tokenize(overProcessedText)  # tokenised once, shared by the features below
        sentences = sentenceTokenizer.tokenize(overProcessedText)

        ################################################################ NonRumour General Info ################################################################
        features = {}
        keralaNonRumoursFeatures[tweetId] = features
        features["id"] = tweetId
        features["sentenceCount"] = len(sentences)
        features["averageWordComplexity"] = np.average([len(word) for word in words])
        features["averageSentenceComplexity"] = np.average([len(word_tokenize(sentence)) for sentence in sentences])

        # This is temporary for some new features
        features["angerEmotion"] = sum(nrcDic[term]["anger"] for term in words if term in nrcDic)

        for metric in readabilityMetrics:
            features[metric] = getattr(textstat, metric)(overProcessedText)

        # The syllables library counts an empty string as one syllable;
        # fall back to 0 when the average cannot be computed
        try:
            features["averageWordsyllables"] = np.average([syllables.estimate(word) for word in words])
        except Exception:  # deliberately broad, but no longer swallows KeyboardInterrupt/SystemExit
            features["averageWordsyllables"] = 0

        ### evidence credibility
        # NOTE(review): Kerala tweets are matched against the USA lists; credibleWebsitesIndia /
        # credibleAccountsIndia (India + USA) are built earlier but not used here - confirm this is intended.
        urlCount = len(entities["urls"]) if "urls" in entities.keys() else 0
        firstUrl = entities["urls"][0]["expanded_url"] if urlCount > 0 else None  # only the first link is checked

        features["urlNotoriety"] = firstUrl is not None and any(site in firstUrl for site in notoriousWebsites)
        features["urlReputation"] = firstUrl is not None and any(site in firstUrl for site in credibleWebsitesUsa)

        # Quoted tweet: its first link, and the profile URL / description of its author
        qtUrls = qtEntities["urls"] if qtFlag else []
        qtLink = qtUrls[0]["expanded_url"] if len(qtUrls) > 0 else None
        qtProfileUrl = qtUser["url"] if qtFlag else None
        qtDescription = qtUser["description"] if qtFlag else None

        features["quoteReputation"] = (
            (qtLink is not None and any(site in qtLink for site in credibleWebsitesUsa))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in credibleWebsitesUsa))
            or (qtDescription is not None and any(account in qtDescription for account in credibleAccountsUsa))
        )
        features["quoteNotoriety"] = (
            (qtLink is not None and any(site in qtLink for site in notoriousWebsites))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in notoriousWebsites))
            or (qtDescription is not None and any(account in qtDescription for account in notoriousId))
        )

    # One feature pickle per processed tweet file
    with open(f'{featureSerializationAdr}keralaNonRumoursFeatures_SomeNewFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(keralaNonRumoursFeatures, featureFile)
    keralaNonRumoursFeatures = {}

In [None]:
# This is temporary to add some new features
# Collect the per-file feature pickles into one DataFrame with one row per tweet.
# The file names are sorted so that the row order does not depend on os.listdir.
keralaNonRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "keralaNonRumoursFeatures_SomeNewFeatures_" in i)
keralaNonRumoursFeaturesList = []
for featureFileName in keralaNonRumoursFeaturesFolder:
    # NOTE: unpickling can run arbitrary code - only load files written by this project.
    with open(featureSerializationAdr+featureFileName, "rb") as featureFile:
        # each pickle is {tweetId: {feature: value}}; transpose so that tweets become rows
        keralaNonRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(featureFile)).T)
keralaNonRumoursFeatures = pd.concat(keralaNonRumoursFeaturesList)
keralaNonRumoursFeatures["id"] = keralaNonRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/keralaNonRumoursFeaturesWithoutLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(keralaNonRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Join the LIWC columns to the extracted features on the tweet id (pandas' default inner join)
# and store the result. Files are opened in `with` blocks so the handles are closed again.
with open(featureSerializationAdr+"keralaNonRumoursLIWC.pk", "rb") as liwcFile:
    keralaNonRumoursLIWC = pk.load(liwcFile)
keralaNonRumoursFeaturesWithLIWC = pd.merge(keralaNonRumoursFeatures, keralaNonRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/keralaNonRumoursFeaturesWithLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(keralaNonRumoursFeaturesWithLIWC, outFile)

In [None]:
# Combine the previously extracted features (the "WithoutLIWC" pickle) with the new features + LIWC.
with open(featureSerializationAdr+"keralaNonRumoursFeaturesWithoutLIWC.pk", "rb") as oldFile:
    oldkeralaNonRumoursLIWC = pk.load(oldFile)

# These three columns are recomputed by the new extraction, so the old copies are dropped first.
oldkeralaNonRumoursLIWC = oldkeralaNonRumoursLIWC.drop(columns=['averageSentenceComplexity', 'averageWordComplexity','sentenceCount'])

keralaNonRumoursWithLIWC_NewFeatures = pd.merge(oldkeralaNonRumoursLIWC, keralaNonRumoursFeaturesWithLIWC, on="id")

with open(f'{featureSerializationAdr}/keralaNonRumoursWithLIWC_NewFeatures.pk', "wb") as outFile:
    pk.dump(keralaNonRumoursWithLIWC_NewFeatures, outFile)

# Florence

### Rumour

In [None]:
# Extract the per-tweet features of the Florence rumour tweets.
# Every pickled tweet file is processed in turn and yields one feature pickle.
florenceRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "florenceRumour" in i]
counter = 0                   # number of tweets seen so far (duplicates included); used in the output file name
florenceRumourId = {}         # tweet id -> number of times it was seen; used to skip duplicates
florenceRumoursFeatures = {}  # tweet id -> feature dict, for the file currently being processed

for file in florenceRumourFolder:
    # NOTE: unpickling can run arbitrary code - only load files written by this project.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        florenceRumours = pk.load(tweetFile)
    for tweet in tqdm(florenceRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets; skip them before doing any expensive processing
        if tweetId in florenceRumourId:
            florenceRumourId[tweetId] += 1
            continue
        florenceRumourId[tweetId] = 1

        # tweetElements already returns deep copies, so its items can be used directly
        elements = tweetElements(tweet)
        rtFlag, qtFlag, text, entities = elements[0], elements[1], elements[2], elements[3]
        qtEntities, qtUser = (elements[6], elements[7]) if qtFlag else (None, None)

        overProcessedText = textProcessor(text)[3]
        sentences = sentenceTokenizer.tokenize(overProcessedText)

        ################################################################ Rumour General Info ################################################################
        features = {}
        florenceRumoursFeatures[tweetId] = features
        features["id"] = tweetId
        features["sentenceCount"] = len(sentences)
        features["averageWordComplexity"] = np.average([len(word) for word in word_tokenize(overProcessedText)])
        features["averageSentenceComplexity"] = np.average([len(word_tokenize(sentence)) for sentence in sentences])

        ### evidence credibility
        urlCount = len(entities["urls"]) if "urls" in entities.keys() else 0
        firstUrl = entities["urls"][0]["expanded_url"] if urlCount > 0 else None  # only the first link is checked

        features["urlNotoriety"] = firstUrl is not None and any(site in firstUrl for site in notoriousWebsites)
        features["urlReputation"] = firstUrl is not None and any(site in firstUrl for site in credibleWebsitesUsa)

        # Quoted tweet: its first link, and the profile URL / description of its author
        qtUrls = qtEntities["urls"] if qtFlag else []
        qtLink = qtUrls[0]["expanded_url"] if len(qtUrls) > 0 else None
        qtProfileUrl = qtUser["url"] if qtFlag else None
        qtDescription = qtUser["description"] if qtFlag else None

        features["quoteReputation"] = (
            (qtLink is not None and any(site in qtLink for site in credibleWebsitesUsa))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in credibleWebsitesUsa))
            or (qtDescription is not None and any(account in qtDescription for account in credibleAccountsUsa))
        )
        features["quoteNotoriety"] = (
            (qtLink is not None and any(site in qtLink for site in notoriousWebsites))
            or (qtProfileUrl is not None and any(site in qtProfileUrl for site in notoriousWebsites))
            or (qtDescription is not None and any(account in qtDescription for account in notoriousId))
        )

    # One feature pickle per processed tweet file
    with open(f'{featureSerializationAdr}florenceRumoursFeatures_SomeNewFeatures_{counter}.pk', "wb") as featureFile:
        pk.dump(florenceRumoursFeatures, featureFile)
    florenceRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# Each chunk pickle holds a {tweetId: {featureName: value}} dict written by the
# extraction loop; transposing the frame built from it gives one row per tweet.
# The listing is sorted so the row order is reproducible (os.listdir order is arbitrary).
florenceRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "florenceRumoursFeatures_SomeNewFeatures_" in i)
florenceRumoursFeaturesList = []
for chunkName in florenceRumoursFeaturesFolder:
    # Context manager so every chunk file is closed right after it is read.
    with open(featureSerializationAdr+chunkName, "rb") as chunkFile:
        florenceRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(chunkFile)).T)
florenceRumoursFeatures = pd.concat(florenceRumoursFeaturesList)
# The transposed frame has object columns; "id" is cast to int64 before it is used as a merge key.
florenceRumoursFeatures["id"] = florenceRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/florenceRumoursFeaturesWithoutLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(florenceRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Load the LIWC frame for the Florence rumours (file is closed as soon as it is read).
with open(featureSerializationAdr+"florenceRumoursLIWC.pk", "rb") as liwcFile:
    florenceRumoursLIWC = pk.load(liwcFile)
# pd.merge defaults to an inner join: tweets whose "id" is missing from either frame are dropped.
florenceRumoursFeaturesWithLIWC = pd.merge(florenceRumoursFeatures, florenceRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/florenceRumoursFeaturesWithLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(florenceRumoursFeaturesWithLIWC, outFile)

In [None]:
# Combine the previously extracted Florence rumour features with the new ones.
# NOTE(review): this cell used to read/merge/write the *kerala* objects although it sits in the
# Florence rumour section; it now mirrors the Florence non-rumour cell. Confirm that
# "florenceRumoursFeaturesWithoutLIWC.pk" is the intended input file.
with open(featureSerializationAdr+"florenceRumoursFeaturesWithoutLIWC.pk", "rb") as oldFeaturesFile:
    oldflorenceRumoursLIWC = pk.load(oldFeaturesFile)

# These three columns are recomputed by the new extraction loop, so the old copies are
# dropped to avoid duplicated (_x/_y suffixed) columns after the merge.
oldflorenceRumoursLIWC = oldflorenceRumoursLIWC.drop(columns=['averageSentenceComplexity', 'averageWordComplexity','sentenceCount'])

# Inner join on "id" (pd.merge default): only tweets present in both frames are kept.
florenceRumoursWithLIWC_NewFeatures = pd.merge(oldflorenceRumoursLIWC, florenceRumoursFeaturesWithLIWC, on="id")

with open(f'{featureSerializationAdr}/florenceRumoursWithLIWC_NewFeatures.pk', "wb") as outFile:
    pk.dump(florenceRumoursWithLIWC_NewFeatures, outFile)

### Non-Rumour

In [None]:
# Extract the "some new features" set for every Florence non-rumour tweet.
# Input : every pickle in tweetSerializationAdr whose name contains "florenceNonRumour".
# Output: one pickle per input file in featureSerializationAdr, holding
#         {tweetId: {featureName: value}} for the tweets first seen in that file.
florenceNonRumourFolder = [i for i in os.listdir(tweetSerializationAdr) if "florenceNonRumour" in i]
counter = 0                      # tweets read so far (duplicates included); used in the chunk file name
florenceNonRumourId = {}         # tweetId -> number of times the tweet was encountered
florenceNonRumoursFeatures = {}  # features of the file currently being processed

# textstat functions whose result is stored under a key equal to the function name.
_readabilityMetrics = (
    "flesch_reading_ease",
    "smog_index",
    "flesch_kincaid_grade",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability_score",
    "difficult_words",
    "linsear_write_formula",
    "gunning_fog",
)


def _mentionsAny(text, needles):
    """Return True when `text` is not None and contains at least one entry of `needles` as a substring."""
    return text is not None and any(needle in text for needle in needles)


for file in florenceNonRumourFolder:
    # Context manager so the tweet pickle is closed as soon as it has been read.
    with open(tweetSerializationAdr+file, "rb") as tweetFile:
        florenceNonRumours = pk.load(tweetFile)

    for tweet in tqdm(florenceNonRumours):
        ################################################################ Basic Setup ################################################################
        counter += 1
        tweetId = tweet["id"]

        # There are some duplicated tweets (I don't know why!)
        # The check runs before any text processing so duplicates cost nothing.
        if tweetId in florenceNonRumourId:
            florenceNonRumourId[tweetId] += 1
            continue
        florenceNonRumourId[tweetId] = 1

        # tweetElements layout as used by this notebook: [0] retweet flag, [1] quote flag,
        # [2] text, [3] entities and, when the quote flag is set, [6] quoted-tweet entities
        # and [7] quoted-tweet user. Only the elements this cell reads are unpacked; they are
        # read-only here, so no deep copies are made.
        elements = tweetElements(tweet)
        qtFlag = elements[1]
        text, entities = elements[2], elements[3]
        qtEntities, qtUser = (elements[6], elements[7]) if qtFlag else (None, None)

        rawText, semiRawText, processedText, overProcessedText = textProcessor(text)

        ################################################################ NonRumour General Info ################################################################
        # Tokenised once and reused by every word-level feature below.
        words = word_tokenize(overProcessedText)
        sentences = sentenceTokenizer.tokenize(overProcessedText)

        features = {}
        florenceNonRumoursFeatures[tweetId] = features
        features["id"] = tweetId
        features["sentenceCount"] = len(sentences)
        features["averageWordComplexity"] = np.average([len(word) for word in words])
        features["averageSentenceComplexity"] = np.average([len(word_tokenize(sentence)) for sentence in sentences])

        # This is temporary for some new features
        # Sum of the "anger" entry of nrcDic over every token that is present in nrcDic.
        features["angerEmotion"] = sum(nrcDic[term]["anger"] for term in words if term in nrcDic)

        for metric in _readabilityMetrics:
            features[metric] = getattr(textstat, metric)(overProcessedText)

        # One pitfall of the syllables library is that an empty sentence counts as one syllable.
        # Fall back to 0 when the estimate/average raises. `except Exception` (rather than a
        # bare except) keeps Ctrl-C / kernel interrupts working inside this long loop.
        try:
            features["averageWordsyllables"] = np.average([syllables.estimate(word) for word in words])
        except Exception:
            features["averageWordsyllables"] = 0

        ################################################################ NonRumourmonger Features ################################################################

        ### evidence credibility
        # Only the first URL of the tweet / quoted tweet is inspected.
        tweetUrls = entities["urls"] if "urls" in entities else []
        firstUrl = tweetUrls[0]["expanded_url"] if len(tweetUrls) > 0 else None

        features["urlNotoriety"] = _mentionsAny(firstUrl, notoriousWebsites)
        features["urlReputation"] = _mentionsAny(firstUrl, credibleWebsitesUsa)

        if qtFlag:
            # A missing "urls" key is treated like an empty list, same as for the tweet itself.
            quoteUrls = qtEntities.get("urls") or []
            quoteUrl = quoteUrls[0]["expanded_url"] if quoteUrls else None
            quoteProfileUrl = qtUser["url"]
            quoteProfileDesc = qtUser["description"]
        else:
            quoteUrl = quoteProfileUrl = quoteProfileDesc = None

        # A quote is reputable / notorious when its first URL, the quoted user's profile URL
        # or the quoted user's profile description matches the corresponding list.
        features["quoteReputation"] = (
            _mentionsAny(quoteUrl, credibleWebsitesUsa)
            or _mentionsAny(quoteProfileUrl, credibleWebsitesUsa)
            or _mentionsAny(quoteProfileDesc, credibleAccountsUsa)
        )
        features["quoteNotoriety"] = (
            _mentionsAny(quoteUrl, notoriousWebsites)
            or _mentionsAny(quoteProfileUrl, notoriousWebsites)
            or _mentionsAny(quoteProfileDesc, notoriousId)
        )

    # Nothing is written when the file contributed no new tweet: `counter` does not advance
    # for an empty input file, so dumping here would overwrite the previous chunk with {}.
    if florenceNonRumoursFeatures:
        with open(f'{featureSerializationAdr}florenceNonRumoursFeatures_SomeNewFeatures_{counter}.pk', "wb") as chunkFile:
            pk.dump(florenceNonRumoursFeatures, chunkFile)
    florenceNonRumoursFeatures = {}

In [None]:
# Storing the features as one pandas dataframe
# Each chunk pickle holds a {tweetId: {featureName: value}} dict written by the
# extraction loop; transposing the frame built from it gives one row per tweet.
# The listing is sorted so the row order is reproducible (os.listdir order is arbitrary).
florenceNonRumoursFeaturesFolder = sorted(i for i in os.listdir(featureSerializationAdr) if "florenceNonRumoursFeatures_SomeNewFeatures_" in i)
florenceNonRumoursFeaturesList = []
for chunkName in florenceNonRumoursFeaturesFolder:
    # Context manager so every chunk file is closed right after it is read.
    with open(featureSerializationAdr+chunkName, "rb") as chunkFile:
        florenceNonRumoursFeaturesList.append(pd.DataFrame.from_dict(pk.load(chunkFile)).T)
florenceNonRumoursFeatures = pd.concat(florenceNonRumoursFeaturesList)
# The transposed frame has object columns; "id" is cast to int64 before it is used as a merge key.
florenceNonRumoursFeatures["id"] = florenceNonRumoursFeatures["id"].astype("int64")
with open(f'{featureSerializationAdr}/florenceNonRumoursFeaturesWithoutLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(florenceNonRumoursFeatures, outFile)

In [None]:
# Adding LIWC features
# Load the LIWC frame for the Florence non-rumours (file is closed as soon as it is read).
with open(featureSerializationAdr+"florenceNonRumoursLIWC.pk", "rb") as liwcFile:
    florenceNonRumoursLIWC = pk.load(liwcFile)
# pd.merge defaults to an inner join: tweets whose "id" is missing from either frame are dropped.
florenceNonRumoursFeaturesWithLIWC = pd.merge(florenceNonRumoursFeatures, florenceNonRumoursLIWC, on="id")
with open(f'{featureSerializationAdr}/florenceNonRumoursFeaturesWithLIWC_SomeNewFeatures.pk', "wb") as outFile:
    pk.dump(florenceNonRumoursFeaturesWithLIWC, outFile)

In [None]:
# Combine the previously extracted Florence non-rumour features with the new ones.
with open(featureSerializationAdr+"florenceNonRumoursFeaturesWithoutLIWC.pk", "rb") as oldFeaturesFile:
    oldflorenceNonRumoursLIWC = pk.load(oldFeaturesFile)

# These three columns are recomputed by the new extraction loop, so the old copies are
# dropped to avoid duplicated (_x/_y suffixed) columns after the merge.
oldflorenceNonRumoursLIWC = oldflorenceNonRumoursLIWC.drop(columns=['averageSentenceComplexity', 'averageWordComplexity','sentenceCount'])

# Inner join on "id" (pd.merge default): only tweets present in both frames are kept.
florenceNonRumoursWithLIWC_NewFeatures = pd.merge(oldflorenceNonRumoursLIWC, florenceNonRumoursFeaturesWithLIWC, on="id")

with open(f'{featureSerializationAdr}/florenceNonRumoursWithLIWC_NewFeatures.pk', "wb") as outFile:
    pk.dump(florenceNonRumoursWithLIWC_NewFeatures, outFile)