A helper for the open source PiHole DNS ad-blocking service on Raspberry Pi. Uses the ThreatCrowd API v2 to check domain relationships and save known malicious IPs.

PiHole/ThreatCrowd Project

Project to integrate DNS domains from PiHole with ThreatCrowd by performing relationship checks between domains. It uses the ThreatCrowd API with near-real-time communication between PiHole and a temporary staging database to update regex rules, save the known IPs of listed threats, and support analysis of known malware/threats.
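
For reference, a minimal sketch of a ThreatCrowd API v2 domain report lookup (assuming the requests library and the public /searchApi/v2/domain/report/ endpoint; the response field names below reflect the typical v2 schema, and the staging-database write is omitted):

import requests

def threatcrowd_domain_report(domain):
    # Query the ThreatCrowd v2 domain report endpoint for related IPs/subdomains
    resp = requests.get(
        "https://www.threatcrowd.org/searchApi/v2/domain/report/",
        params={"domain": domain},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()

report = threatcrowd_domain_report("example.com")
# "resolutions" entries typically carry an "ip_address" field; these are candidate IPs to stage
related_ips = [r.get("ip_address") for r in report.get("resolutions", [])]
print(related_ips)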

TODO:

  • Integrate file hashes from known threats and save them as staging data
  • Integrate geolocation for origin lookups
  • Create a view to perform malware analysis of known threats
  • Expand PiHole's capabilities with a FaaS-style infrastructure to asynchronously process DNS hits and check whether each hit is a known threat (see the sketch after this list)
  • Continue metadata analysis of malware and related statistics; possibly develop algorithms in the future to detect and process DNS lookups on the fly
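
As a rough illustration of the FaaS idea above, a hypothetical handler could receive a DNS-hit event and check it against a staged set of known-bad domains (the function name, event shape, and KNOWN_BAD set are placeholders, not part of the current codebase):

KNOWN_BAD = {"malicious.example", "tracker.example"}  # would be loaded from the staging database

def handle_dns_hit(event):
    # event is assumed to look like {"domain": "...", "client": "..."}
    domain = event.get("domain", "").lower().rstrip(".")
    flagged = domain in KNOWN_BAD
    # a real handler would enqueue flagged domains for ThreatCrowd lookups / regex updates
    return {"domain": domain, "threat": flagged}

print(handle_dns_hit({"domain": "malicious.example", "client": "192.168.1.10"}))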

Analysis and Quick Models

# Analyze and run simple models on data from malicious/non-malicious domains, ips, and their respective geolocations
# Data provided from regular internet usage and DNS lookups via PiHole

# Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
# Malicious/Non-malicious ip geolocations
mal_df = pd.read_csv('/Users/dillonmabry/Desktop/Projects/PiHoleMetadata/Malicious/ipgeos.csv')
safe_df = pd.read_csv('/Users/dillonmabry/Desktop/Projects/PiHoleMetadata/Safe/ipgeos.csv')

# Setup malicious and safe original datasets
mal_df.insert(len(mal_df.columns), "Malicious", 1)
safe_df.insert(len(safe_df.columns), "Malicious", 0)

# Merge datasets with malicious indicators
df = pd.concat([mal_df, safe_df]).drop(["id"], axis=1)

# Shuffle final frame of original merged datasets safe/malicious
df = df.sample(frac=1)
# Preserve original dataset
df_sk = df.copy()

# Clean dataset
df_sk = df_sk[(df_sk.ip_address != '127.0.0.1') & (df_sk.ip_address != 'localhost')] # remove local ips
df_sk = df_sk[df_sk.longitude != -9999] # remove outlier data
df_sk = df_sk[df_sk.latitude != -9999] # remove outlier data
df_sk = df_sk[df_sk.country_code != "N/P"] # remove not-provided
df_sk = df_sk[df_sk.continent_code != "N/P"] # remove not-provided
df_sk = df_sk.drop(["continent_code"], axis=1)
df_sk = df_sk.dropna() # drop N/A records
agg = df_sk[df_sk.Malicious == 1].groupby(['country_code'])["country_code"].count().sort_values(ascending=False).iloc[0:10]

# Top malicious ipgeolocations by country
ax = agg.plot(kind='bar', title ="Malicious Countries Grouped", figsize=(10, 5), legend=True, fontsize=12)
ax.set_xlabel("Country", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()

[Figure: bar chart of top malicious IP geolocations grouped by country]

isp_agg = df_sk[df_sk.Malicious == 1].groupby(['country_code', 'isp'])["country_code"].count().sort_values(ascending=False).iloc[0:10]

# Top malicious isps by country
ax = isp_agg.plot(kind='bar', title ="Malicious ISPs Grouped by Country", figsize=(10, 5), legend=True, fontsize=12)
ax.set_xlabel("Country, ISP", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()

[Figure: bar chart of top malicious ISPs grouped by country]

mal_lngs = df_sk[df_sk.Malicious == 1]["longitude"].values
mal_lats = df_sk[df_sk.Malicious == 1]["latitude"].values

lngs = df_sk[df_sk.Malicious == 0]["longitude"].values
lats = df_sk[df_sk.Malicious == 0]["latitude"].values

plt.figure(figsize=(14, 8))
earth = Basemap()
earth.bluemarble(alpha=0.75)
plt.scatter(mal_lngs, mal_lats, c='red',alpha=0.5, zorder=10)
plt.scatter(lngs, lats, c='blue',alpha=0.5, zorder=5)
plt.xlabel("IP Geolocations - Malicious (Red), Non-Malicious (Blue)")
plt.show()

[Figure: world map of IP geolocations - malicious (red), non-malicious (blue)]
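
Cartopy is a common alternative to Basemap for this kind of map; a roughly equivalent plot could be drawn as follows (a sketch assuming cartopy is installed and reusing the coordinate arrays above; cartopy is not used elsewhere in this project):

import cartopy.crs as ccrs

plt.figure(figsize=(14, 8))
ax = plt.axes(projection=ccrs.PlateCarree())
ax.stock_img()  # satellite-style background, similar to bluemarble
ax.scatter(mal_lngs, mal_lats, c='red', alpha=0.5, transform=ccrs.PlateCarree(), zorder=10)
ax.scatter(lngs, lats, c='blue', alpha=0.5, transform=ccrs.PlateCarree(), zorder=5)
ax.set_title("IP Geolocations - Malicious (Red), Non-Malicious (Blue)")
plt.show()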

# Numeric Categorical encoding and feature extraction/cleanup
le = LabelEncoder()
# Categorical attrs
le.fit(df_sk["country_code"])
df_sk["country_code"] = le.transform(df_sk["country_code"])
le.fit(df_sk["isp"])
df_sk["isp"] = le.transform(df_sk["isp"])
le.fit(df_sk["parent_domain"])
df_sk["parent_domain"] = le.transform(df_sk["parent_domain"])

# Drop unnecessary attributes
df_sk = df_sk.drop(["ip_address"], axis=1) # IP is too numerous/sparse, need to find a regularization pattern for subnets
# Convert to np arrays
labels = df_sk["Malicious"].values
label_names = ["Malicious", "Safe"]
df_sk = df_sk.drop(["Malicious"], axis=1) # DROP MALICIOUS/NON IDENTIFIER OF MAIN MERGED DATASET BEFORE MODELING
features = df_sk.values
feature_names = df_sk.columns[0:]
# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)
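
The ip_address column is dropped above because raw IPs are too sparse to encode directly; one possible "regularization pattern for subnets" mentioned in the comment would be to bucket addresses by their /24 prefix before label encoding (a hedged sketch on the original merged frame, not part of the pipeline above):

# Illustrative only: derive a coarser /24 subnet feature from the original merged frame
subnets = df["ip_address"].astype(str).str.extract(r"^(\d+\.\d+\.\d+)\.")[0]
subnet_codes = LabelEncoder().fit_transform(subnets.fillna("unknown"))
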
# Logistic Regression
lr = LogisticRegression(random_state=0)
lr.fit(train, train_labels)
preds = lr.predict(test)
print(accuracy_score(test_labels, preds))
0.929078014184
# Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf.fit(train, train_labels)
preds = rf.predict(test)
print(accuracy_score(test_labels, preds))
print(df_sk.columns)
print(rf.feature_importances_)
0.962269503546
Index(['parent_domain', 'country_code', 'latitude', 'longitude', 'isp'], dtype='object')
[ 0.73240397  0.07076328  0.02346796  0.0510711   0.12229368]
# Gaussian
gnb = GaussianNB()
model = gnb.fit(train, train_labels)
preds = gnb.predict(test)
print(accuracy_score(test_labels, preds))
0.915602836879
# Decision Tree Classifier
dclf = DecisionTreeClassifier(random_state=0)
dclf.fit(train, train_labels)
preds = dclf.predict(test)
print(accuracy_score(test_labels, preds))
cross_val_score(dclf, test, test_labels, cv=10)
print(dclf.feature_importances_)
0.996312056738
[  9.69256958e-01   2.77148766e-04   3.02767067e-03   6.19121418e-03
   2.12470083e-02]
def show_roc(model, test, test_labels):
    # Predict
    probs = model.predict_proba(test)
    preds = probs[:,1]
    fpr, tpr, threshold = roc_curve(test_labels, preds)
    roc_auc = auc(fpr, tpr)
    # Chart
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# LR ROC
show_roc(lr, test, test_labels) # Overfitting

[Figure: ROC curve for logistic regression]

# RF ROC
show_roc(rf, test, test_labels) # Overfitting

[Figure: ROC curve for random forest]

# GNB ROC
show_roc(gnb, test, test_labels) # Appropriate ROC

[Figure: ROC curve for Gaussian naive Bayes]

show_roc(dclf, test, test_labels) # Overfitting example

[Figure: ROC curve for decision tree]

# TODO: Perform a second evaluation on a secondary test set with new data
# TODO: Add more features and examine overfitting
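
As a first pass at the overfitting check noted above, before a secondary dataset is collected, stratified cross-validation over the full feature set gives a less optimistic estimate than scoring the single held-out split (a sketch reusing the arrays and models defined above):

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
for name, model in [("LR", lr), ("RF", rf), ("GNB", gnb), ("DT", dclf)]:
    scores = cross_val_score(model, features, labels, cv=skf)  # cross_val_score refits clones of each model
    print(name, scores.mean(), scores.std())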
