Project to integrate DNS domains from PiHole with ThreatCrowd by performing relationship checks between domains. The ThreatCrowd API is used for near-real-time communication between PiHole and a temporary staging database in order to update regex rules, save known IPs of listed threats, and support analysis of known malware/threats (see the API sketch after this list).
- Integrate file hashes from known threats to be saved as staging data
- Integrate geolocation for origin lookup
- Create view to perform malware analysis of known threats
- Expand the capabilities of PiHole by using a FaaS-type infrastructure to asynchronously process DNS hits and check whether each is a known threat
- Continue metadata analysis of malware and related statistics; possibly develop algorithms in the future to classify DNS lookups and process them on the fly
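A minimal sketch of the domain lookup, assuming the publicly documented ThreatCrowd v2 domain-report endpoint and the requests library; the response field names (resolutions, hashes, votes) are taken from that documentation and should be verified, and the staging-database write is left as a placeholder rather than the project's actual implementation:

import requests

THREATCROWD_DOMAIN_API = "https://www.threatcrowd.org/searchApi/v2/domain/report/"

def lookup_domain(domain):
    # Query ThreatCrowd for relationships of a PiHole-observed domain
    resp = requests.get(THREATCROWD_DOMAIN_API, params={"domain": domain}, timeout=10)
    resp.raise_for_status()
    report = resp.json()
    # Keep only the pieces the staging database cares about (assumed field names)
    return {
        "domain": domain,
        "ips": [r.get("ip_address") for r in report.get("resolutions", [])],
        "hashes": report.get("hashes", []),
        "votes": report.get("votes"),
    }

# Example: stage a domain seen in the PiHole query log
# record = lookup_domain("example.com")  # then write record to the staging database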
# Analyze and run simple models on data from malicious/non-malicious domains, IPs, and their respective geolocations
# Data collected from regular internet usage and DNS lookups via PiHole
# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
# Malicious/Non-malicious ip geolocations
mal_df = pd.read_csv('/Users/dillonmabry/Desktop/Projects/PiHoleMetadata/Malicious/ipgeos.csv')
safe_df = pd.read_csv('/Users/dillonmabry/Desktop/Projects/PiHoleMetadata/Safe/ipgeos.csv')
# Setup malicious and safe original datasets
mal_df.insert(len(mal_df.columns), "Malicious", 1)
safe_df.insert(len(safe_df.columns), "Malicious", 0)
# Merge datasets with malicious indicators
df = pd.concat([mal_df, safe_df]).drop(["id"], axis=1)
# Shuffle final frame of original merged datasets safe/malicious
df = df.sample(frac=1)
# Preserve original dataset
df_sk = df.copy()
# Clean dataset
df_sk = df_sk[(df_sk.ip_address != '127.0.0.1') & (df_sk.ip_address != 'localhost')] # remove local ips
df_sk = df_sk[df_sk.longitude != -9999] # remove outlier data
df_sk = df_sk[df_sk.latitude != -9999] # remove outlier data
df_sk = df_sk[df_sk.country_code != "N/P"] # remove not-provided
df_sk = df_sk[df_sk.continent_code != "N/P"] # remove not-provided
df_sk = df_sk.drop(["continent_code"], axis=1)
df_sk = df_sk.dropna() # drop N/A records
agg = df_sk[df_sk.Malicious == 1].groupby(['country_code'])["country_code"].count().sort_values(ascending=False).iloc[0:10]
# Top 10 malicious IP geolocations by country
ax = agg.plot(kind='bar', title="Malicious Countries Grouped", figsize=(10, 5), legend=True, fontsize=12)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()
isp_agg = df_sk[df_sk.Malicious == 1].groupby(['country_code', 'isp'])["country_code"].count().sort_values(ascending=False).iloc[0:10]
# Top 10 malicious ISPs by country
ax = isp_agg.plot(kind='bar', title="Malicious ISPs Grouped by Country", figsize=(10, 5), legend=True, fontsize=12)
ax.set_xlabel("Country, ISP", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()
mal_lngs = df_sk[df_sk.Malicious == 1]["longitude"].values
mal_lats = df_sk[df_sk.Malicious == 1]["latitude"].values
lngs = df_sk[df_sk.Malicious == 0]["longitude"].values
lats = df_sk[df_sk.Malicious == 0]["latitude"].values
plt.figure(figsize=(14, 8))
earth = Basemap()
earth.bluemarble(alpha=0.75)
plt.scatter(mal_lngs, mal_lats, c='red', alpha=0.5, zorder=10)
plt.scatter(lngs, lats, c='blue', alpha=0.5, zorder=5)
plt.xlabel("IP Geolocations - Malicious (Red), Non-Malicious (Blue)")
plt.show()
# Numeric categorical encoding and feature extraction/cleanup
# Label-encode each categorical attribute (country_code, isp, parent_domain)
for col in ["country_code", "isp", "parent_domain"]:
    df_sk[col] = LabelEncoder().fit_transform(df_sk[col])
# Drop unnecessary attributes
df_sk = df_sk.drop(["ip_address"], axis=1) # IP is too numerous/sparse, need to find a regularization pattern for subnets
# Convert to np arrays
labels = df_sk["Malicious"].values
label_names = ["Malicious", "Safe"]
df_sk = df_sk.drop(["Malicious"], axis=1) # DROP MALICIOUS/NON IDENTIFIER OF MAIN MERGED DATASET BEFORE MODELING
features = df_sk.values
feature_names = df_sk.columns[0:]
# Split our data
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=42)
# Logistic Regression
lr = LogisticRegression(random_state=0)
lr.fit(train, train_labels)
preds = lr.predict(test)
print(accuracy_score(test_labels, preds))
0.929078014184
# Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf.fit(train, train_labels)
preds = rf.predict(test)
print(accuracy_score(test_labels, preds))
print(df_sk.columns)
print(rf.feature_importances_)
0.962269503546
Index(['parent_domain', 'country_code', 'latitude', 'longitude', 'isp'], dtype='object')
[ 0.73240397 0.07076328 0.02346796 0.0510711 0.12229368]
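# Pair the Random Forest importances with their column names for readability
# (added helper output, not part of the original run); parent_domain dominates.
for name, importance in sorted(zip(df_sk.columns, rf.feature_importances_),
                               key=lambda pair: pair[1], reverse=True):
    print("%s: %.4f" % (name, importance))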
# Gaussian Naive Bayes
gnb = GaussianNB()
model = gnb.fit(train, train_labels)
preds = gnb.predict(test)
print(accuracy_score(test_labels, preds))
0.915602836879
# Decision Tree Classifier
dclf = DecisionTreeClassifier(random_state=0)
dclf.fit(train, train_labels)
preds = dclf.predict(test)
print(accuracy_score(test_labels, preds))
print(cross_val_score(dclf, test, test_labels, cv=10)) # print CV scores so the result is visible outside a notebook
print(dclf.feature_importances_)
0.996312056738
[ 9.69256958e-01 2.77148766e-04 3.02767067e-03 6.19121418e-03
2.12470083e-02]
def show_roc(model, test, test_labels):
    # Predict probabilities and compute ROC curve / AUC
    probs = model.predict_proba(test)
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(test_labels, preds)
    roc_auc = auc(fpr, tpr)
    # Chart
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Logistic Regression ROC
show_roc(lr, test, test_labels) # near-perfect curve, suspected overfitting
# Random Forest ROC
show_roc(rf, test, test_labels) # near-perfect curve, suspected overfitting
# Gaussian Naive Bayes ROC
show_roc(gnb, test, test_labels) # more reasonable ROC shape
# Decision Tree ROC
show_roc(dclf, test, test_labels) # overfitting example
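# Added diagnostic (not in the original analysis): compare train vs. test accuracy.
# A large gap supports the overfitting suspicion; near-perfect scores on both sets
# instead point at a dominant, potentially leaky feature such as parent_domain.
for name, model in [("LogisticRegression", lr), ("RandomForest", rf),
                    ("GaussianNB", gnb), ("DecisionTree", dclf)]:
    print(name,
          "train=%.3f" % model.score(train, train_labels),
          "test=%.3f" % model.score(test, test_labels))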
# TODO: Perform a second evaluation on a secondary test set built from new data
# TODO: Add more features and examine overfitting
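# Hypothetical sketch for the TODOs above: once a second capture has been cleaned
# and encoded with the same steps as df_sk, score each fitted model on it.
# new_features / new_labels are assumed inputs, not existing artifacts.
def evaluate_secondary(models, new_features, new_labels):
    for model in models:
        preds = model.predict(new_features)
        print(type(model).__name__, accuracy_score(new_labels, preds))

# evaluate_secondary([lr, rf, gnb, dclf], new_features, new_labels)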