In [1]:
import pandas as pd 
import numpy as np 

# Load and preprocess train data

In [2]:
df = pd.read_csv('securitytextsdetection/train.csv')

In [3]:
df[:20]

Unnamed: 0,example_id,text,label
0,140d03eabb7cb5c2558605eb8336689c,brandpost best of both worlds hybrid onsite an...,0
1,f7f1f906c9e2b76e63020f8794516185,$mention$ they shall in all cases except treas...,0
2,39f0b2ebc12e008a7a43ec318d0c3874,lifelock offers to protect you from the equifa...,0
3,ad4e57c69f00548253cb6d47b15c3ce4,skimmer adware spent two months in google play...,1
4,236bfe8f2f145dbcf17be122546946db,just want to love and be loved,0
5,f9f0419dd6ec37b9f72a8a8292a37d0b,cyber attack on barts nhs trust eloited zeroda...,1
6,3e23b538a07b92f2e27b6964dd30242b,docusigns customer email database accessed by ...,1
7,991a2ba0398b4b4bc63201141a401207,ddos real threat that big data can help combat...,1
8,ee2971e296b39717c23e070ef08f64b3,cyberattack glossary thaw are malware phassach...,0
9,a1c2ec94210674309d9896540f6894c8,malware is not going away trust your network s...,0


In [4]:
df.label.value_counts()

0    708
1    481
Name: label, dtype: int64

In [5]:
from string import punctuation

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
#from replacers import
lemmatizer = WordNetLemmatizer() 
stop_words = set(stopwords.words('english'))

In [6]:
def preprocess_text(tokenizer, lemmatizer, stop_words, punctuation, text): 
    tokens = tokenizer(text.lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return [token for token in lemmas if token not in stop_words and token not in punctuation]

df['cleaned'] = df.text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))

In [7]:
df.head()

Unnamed: 0,example_id,text,label,cleaned
0,140d03eabb7cb5c2558605eb8336689c,brandpost best of both worlds hybrid onsite an...,0,"[brandpost, best, world, hybrid, onsite, cloud..."
1,f7f1f906c9e2b76e63020f8794516185,$mention$ they shall in all cases except treas...,0,"[mention, shall, case, except, treason, felony..."
2,39f0b2ebc12e008a7a43ec318d0c3874,lifelock offers to protect you from the equifa...,0,"[lifelock, offer, protect, equifax, breach, se..."
3,ad4e57c69f00548253cb6d47b15c3ce4,skimmer adware spent two months in google play...,1,"[skimmer, adware, spent, two, month, google, p..."
4,236bfe8f2f145dbcf17be122546946db,just want to love and be loved,0,"[want, love, loved]"


In [8]:
def flat_nested(nested):
    flatten = []
    for item in nested:
        if isinstance(item, list):
            flatten.extend(item)
        else:
            flatten.append(item)
    return flatten

In [9]:
from collections import Counter, defaultdict 

cnt_vocab1 = Counter(flat_nested(df.cleaned.tolist()))

In [10]:
len(cnt_vocab1)

4145

# Load and preprocess test data

In [12]:
df_test = pd.read_csv('securitytextsdetection/test.csv')

In [13]:
df_test.head()

Unnamed: 0,example_id,text
0,24bf52cc84b75bce8ee9c0cdd0b5117f,phishing heads to the cloud forbes $url$ cloud
1,708cc462eca0c274bcff304bbc4396f8,Sports Direct Failed to Tell Staff of Data Bre...
2,c5da1d371af9525d2fc2174311e3dae4,understanding vulnerability through humanright...
3,844f5f5be464a0600891ac6b8c1a1afd,cussword manager onelogin hit by data breach $...
4,97daba48c36db68f3e84ebcbf8952aa2,vuln multiple flexense products cve20177310 bu...


In [14]:
df_test['cleaned'] = df_test.text.apply(lambda x: preprocess_text(word_tokenize, lemmatizer, stop_words, punctuation, x))

In [15]:
df_test.head()

Unnamed: 0,example_id,text,cleaned
0,24bf52cc84b75bce8ee9c0cdd0b5117f,phishing heads to the cloud forbes $url$ cloud,"[phishing, head, cloud, forbes, url, cloud]"
1,708cc462eca0c274bcff304bbc4396f8,Sports Direct Failed to Tell Staff of Data Bre...,"[sport, direct, failed, tell, staff, data, bre..."
2,c5da1d371af9525d2fc2174311e3dae4,understanding vulnerability through humanright...,"[understanding, vulnerability, humanrights, me..."
3,844f5f5be464a0600891ac6b8c1a1afd,cussword manager onelogin hit by data breach $...,"[cussword, manager, onelogin, hit, data, breac..."
4,97daba48c36db68f3e84ebcbf8952aa2,vuln multiple flexense products cve20177310 bu...,"[vuln, multiple, flexense, product, cve2017731..."


In [16]:
df_test.shape

(786, 3)

In [17]:
cnt_vocab2 = Counter(flat_nested(df_test.cleaned.tolist()))

In [18]:
len(cnt_vocab2)

3162

# Train model

In [19]:
cnt_vocab_all = {**cnt_vocab1, **cnt_vocab2}
vocabulary = set(cnt_vocab_all)

In [20]:
len(vocabulary)

5752

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = df.cleaned.apply(lambda x: ' '.join(x))
y = df.label

vectorizer = TfidfVectorizer(vocabulary = vocabulary)
X = vectorizer.fit_transform(corpus)

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver = 'lbfgs', multi_class = 'multinomial').fit(X, y)

In [23]:
clf.score(X, y)

0.9613120269133726

# sample_submission

In [24]:
corpus_test = df_test.cleaned.apply(lambda x: ' '.join(x))

In [25]:
X_test = vectorizer.fit_transform(corpus_test)

In [26]:
X_test

<786x5752 sparse matrix of type '<class 'numpy.float64'>'
	with 7944 stored elements in Compressed Sparse Row format>

In [27]:
label = clf.predict(X_test)

In [28]:
def Tobinary(arr):
    result = []
    for i in arr:
        if i:
            result.append(True)
        else:
            result.append(False)
    return result

In [29]:
label = Tobinary(label)

In [30]:
sample_submission = pd.DataFrame({'example_id' : df_test['example_id'].tolist(), 'label' : label})

In [31]:
sample_submission

Unnamed: 0,example_id,label
0,24bf52cc84b75bce8ee9c0cdd0b5117f,False
1,708cc462eca0c274bcff304bbc4396f8,False
2,c5da1d371af9525d2fc2174311e3dae4,False
3,844f5f5be464a0600891ac6b8c1a1afd,True
4,97daba48c36db68f3e84ebcbf8952aa2,True
...,...,...
781,d88c14f3513eeac07d96bf2247e590da,False
782,d86ba8e778366f29d751212248a8b117,False
783,77d311fd2f0fe6e7120e92efc00a3dc1,False
784,d1e6b79eed2ed19f35c2bbc3bf3c7940,True


In [32]:
sample_submission.to_csv(r'C:\Users\orang\Documents\submission.csv', index = False)