In [1]:
import pandas as pd
import os, re
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from string import punctuation
from random import seed
seed(1000)

In [2]:
 def removePuncs(text):
        puncs = [i for i in punctuation]
        for i in puncs:
            text = text.replace(i,"")
        return text


def preProcess(text,forumwords):
    #remove punctuations
    text = removePuncs(text)

    #split text to words
    words = [word.strip().lower() for word in text.split()]
    #remove stopwords and numbers
    stops = self.stops+forumwords
    words = [word for word in words if word not in stops and word.isalpha()]
    #pos_tag words
    tagged = pos_tag(words)
    #remove unwanted tags
    tagged = [tag for tag in tagged if tag[1] not in self.negtags]
    words = [word[0] for word in tagged]
    #stem words
    words = [stemmer.stem(word) for word in words]
    #join words to form sentence
    sentence = " ".join([word.strip() for word in words])
    return sentence

def rescale_0_100(x):
    #x is a numeric vector of >1 elements
    minimum = min(x)
    maximum = max(x)
    
    scale_factor = (100-0)/(maximum-minimum)
    scaled = [abs(((i-maximum)+100)*scale_factor) for i in x]
    return scaled

def CyberRelatedness(text, lexicon):
    #lexicon is a dataframe of terms with their corresponding APMIS scores:columns labelled "terms", "apmis"
    #text is a single string input of text to be measured for degree of cyber-relatedness
    
    #get length of text
    words = [i.strip() for i in text.split()]
    words = [stemmer.stem(i) for i in words]
    
    counter=[]
    for word in words:
        if len(word) > 2:
            if word in lexicon.terms:
                matched = lexicon.loc[lexicon['terms']==word]
                counter.append(list(matched.scaled))
        
    if counter:
        return sum([item for sublist in counter for item in sublist])/len(counter)
    else:
        return 0


In [3]:
#read in test corpus files
basedir = "../DataCollection/corpus"
files = [basedir+"/"+i for i in os.listdir(basedir)]

In [4]:
data = []
for i in files:
    try:
        with open(i, "r+", encoding="utf-8") as ff:
            data.append(ff.readlines())
        ff.close()
    except:
        pass

In [5]:
data = [i for i in data if len(i) ==5]

In [6]:
source = [i[0].strip() for i in data]
category = [i[1].strip() for i in data]
classed = [i[2].strip() for i in data]
text = [i[4].strip() for i in data]

In [7]:
data = pd.DataFrame([source, category, classed, text]).transpose()
data.columns = ["source", "category", "classed", "text"]

In [8]:
data.head()

Unnamed: 0,source,category,classed,text
0,bbc,publishing,Cyber,Butlin's guest records exposed to hackers. Up ...
1,bbc,publishing,Cyber,US warns of supply chain cyber-attacks. Nation...
2,bbc,publishing,Cyber,National Counterintelligence and Security Cent...
3,bbc,publishing,Cyber,Home security camera recordings hijacked. Rese...
4,bbc,publishing,Cyber,Why is the Daily Mail's site 'not secure'?. Go...


In [9]:
data['classed'].value_counts()

Cyber       117
NonCyber    101
Name: classed, dtype: int64

In [10]:
data['source'].unique()

array(['bbc', 'facebook', 'hackernews', 'linkedin', 'quora', 'reddit',
       'stackx', 'steemit'], dtype=object)

In [11]:
pd.crosstab(data.source, data.classed)

classed,Cyber,NonCyber
source,Unnamed: 1_level_1,Unnamed: 2_level_1
bbc,15,15
facebook,15,13
hackernews,14,0
linkedin,15,14
quora,15,15
reddit,15,15
stackx,14,14
steemit,14,15


In [12]:
#read in scores
apmis =  pd.read_csv("../../BuildingTheLexicon/Analysis/absapmis.csv")
tfidf = pd.read_csv("../../BuildingTheLexicon/Analysis/tfidf_scores.csv")
fdr = pd.read_csv("../../BuildingTheLexicon/Analysis/frequency_degree_ratio.csv")

In [13]:
top_apmis = apmis.head(420)
top_tfidf = tfidf.head(420)
top_fdr = fdr.head(420)

In [14]:
terms = list(set(list(top_apmis.terms)+list(top_tfidf.terms)+list(top_fdr.terms)))

In [15]:
top_terms = apmis.loc[apmis['terms'].isin(terms)]
top_terms = top_terms[['terms', 'apmis']]

top_terms['terms'] = [stemmer.stem(i) for i in top_terms['terms']]
top_terms["scaled"] = rescale_0_100(top_terms.apmis)
top_terms.index = top_terms.terms
top_terms.head()

Unnamed: 0_level_0,terms,apmis,scaled
terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
prize,prize,-1555.033814,93.995158
enter,enter,-1397.430351,84.53132
ticket,ticket,-1362.706916,82.446233
http,http,-1198.492768,72.585433
retweet,retweet,-1164.01659,70.515193


In [16]:
top_terms.shape

(754, 3)

In [17]:
applied = []
for txt in text:
    applied.append(CyberRelatedness(txt, top_terms))

In [18]:
md=top_terms.scaled.mean()

In [90]:
data['scaled'] = applied
data['classified'] = ["Cyber" if i>19 else "NonCyber" for i in applied]

In [91]:
md

17.96399103847172

In [92]:
data.to_csv("data.csv")

In [93]:
conf_tab=pd.crosstab( data.classified, data.classed)

In [94]:
conf_tab

classed,Cyber,NonCyber
classified,Unnamed: 1_level_1,Unnamed: 2_level_1
Cyber,91,25
NonCyber,26,76


In [95]:
tp = conf_tab.iloc[0,0]
fp = conf_tab.iloc[1,0]
fn = conf_tab.iloc[0,1]
tn = conf_tab.iloc[1,1]

In [96]:
error_rate = (fp+fn)/(fp+fn+tp+tn)
error_rate*100

23.394495412844037

In [97]:
accuracy = (tp+tn)/(fp+fn+tp+tn)
accuracy*100

76.60550458715596

In [98]:
sensitivity = (tp)/(fn+tp)
sensitivity*100

78.44827586206897

In [79]:
specificity = (tn)/(tn+fp)
specificity*100

74.50980392156863

In [80]:
precision = (tp)/(tp+fp)
precision*100

77.77777777777779

In [81]:
fpr = 1-specificity
fpr*100

25.49019607843137