In [None]:
#! pip install pycaret

In [None]:
import graphistry
import pandas as pd
import numpy as np
import pycaret

from sklearn.cluster import DBSCAN
from collections import Counter

import matplotlib.pylab as plt

from joblib import load, dump

import spacy
import warnings
warnings.filterwarnings('ignore')

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
graphistry.register(api=3, protocol="https", server="hub.graphistry.com", username="...", password="...") 

In [None]:
g = graphistry.bind()

In [None]:
# df = pd.read_csv('~/Downloads/instances.csv')
# names = df.columns
# df2 = pd.read_csv('~/Downloads/instances 2.csv', names=names)
# df3 = pd.read_csv('~/Downloads/instances 3.csv', names=names)
# df4 = pd.read_csv('~/Downloads/instances 4.csv', names=names)
# df5 = pd.read_csv('~/Downloads/instances 5.csv', names=names)
# df6 = pd.read_csv('~/Downloads/instances 6.csv', names=names)
# mdf = pd.concat([df, df2, df3, df4, df5, df6], ignore_index=True)
# mdf = mdf.drop_duplicates(ignore_index=True)
# mdf['n'] = range(len(mdf))
# mdf.to_csv('data/darkweb.csv')

In [None]:
# Now we have a total df we can load

In [None]:
df = pd.read_csv('data/darkweb.csv', index_col=0)

In [None]:
df=df.fillna('')

In [None]:
df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
df = df.sort_values(by='published_date', ascending=True, ignore_index=True)

In [None]:
df.columns

In [None]:
df_date = df.groupby('published_date').size()

In [None]:
df_date.plot(figsize=(15,10))

In [None]:
df_cols = ['event_type', 'fragment', 'document_title', 'document_authors']  # the rest are noise for featurization

In [None]:
df[df_cols] = df[df_cols].astype(str)

In [None]:
# Featurization was already run and cached to disk, so it is skipped by default (set process = True to recompute)
process = False

if process:
    g2 = g.nodes(df, 'n').featurize(use_columns=df_cols)
    # now save the features so we don't have to reprocess 
    X = g2.node_features
    dump(X, 'data/darkweb_features.pickle')
else:
    X = load('data/darkweb_features.pickle')
    g2 = g.nodes(df, 'n')
    g2.node_features = X

In [None]:
g3 = g2.umap(scale=.5, n_neighbors=7)

In [None]:
g3.nodes(df[df_cols+['n']], 'n').plot() # what a mess, but nice clusters

# Let's cluster the embedding and see how well dimensionality reduction has grouped similar records

In [None]:
emb = g3.node_embedding

In [None]:
clustering = DBSCAN(eps=3, min_samples=2).fit(emb)
labels = clustering.labels_
cnt = Counter(labels).most_common()
cnt

In [None]:
len(cnt) # lots of clusters

In [None]:
cdf = df[labels==3]  #makes sense
cdf[df_cols]

In [None]:
# `cdf` inherits df's RangeIndex (sort_values(..., ignore_index=True)), so the
# datetime column has to be named explicitly for resample to work.
cdf.resample('Q', on='published_date').size()

# Use SpaCy to see if we can extract some useful info and de-noise

In [None]:
# Dependency labels treated as "object-like" and "subject-like" respectively.
OBJECT_DEPS = {"dobj", "dative", "attr", "oprd"}
SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "agent", "expl"}
# Tags that mark whether a word is a wh- word.
WH_WORDS = {"WP", "WP$", "WRB"}


def get_svo_lemmas(doc):
    """Collect (subject, verb, object) lemma triples from a parsed document.

    Three lemma lists are built in token order:
      * verbs    -- tokens whose ``pos_`` is "VERB"
      * objects  -- tokens whose own or head dependency is in OBJECT_DEPS
      * subjects -- tokens whose own or head dependency is in SUBJECT_DEPS

    The lists are then zipped positionally, so the i-th triple pairs the i-th
    subject, verb and object found; the result is truncated to the shortest list.
    """
    tokens = list(doc)
    verbs = [tok.lemma_ for tok in tokens if tok.pos_ == "VERB"]
    objects = [
        tok.lemma_
        for tok in tokens
        if tok.dep_ in OBJECT_DEPS or tok.head.dep_ in OBJECT_DEPS
    ]
    subjects = [
        tok.lemma_
        for tok in tokens
        if tok.dep_ in SUBJECT_DEPS or tok.head.dep_ in SUBJECT_DEPS
    ]
    return list(zip(subjects, verbs, objects))

def extract_named_ents(text):
    """Extract named entities as ``(entity_text, entity_label)`` tuples.

    Parameters
    ----------
    text : str or parsed document
        Raw text is run through the module-level ``nlp`` pipeline. An object
        that already exposes ``.ents`` (callers in this notebook pass the result
        of ``nlp(...)``) is used as-is, so the pipeline is not run a second time.

    Returns
    -------
    list of (str, str)
        One tuple per entity, in the order given by ``.ents``.
    """
    doc = text if hasattr(text, "ents") else nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


In [None]:
doc = df.fragment.sample(1).values[0]
doc = nlp(doc)
get_svo_lemmas(doc) #meh

In [None]:
extract_named_ents(doc)

In [None]:
good_cols = ['n', 'fragment', 'document_title', 'document_authors', 'event_type', 'document_id']
good_cols2 = ['n', 'fragment', 'document_title']

In [None]:
# these aren't useful in this format
#ents = df.fragment.apply(lambda x: extract_named_ents(nlp(str(x))))

In [None]:
#ents.values 

In [None]:
#nents = [(k, v) for k, v in enumerate(ents.values) if v != list([])]

In [None]:
#nents

# Let's analyze using ngram analysis

In [None]:
doclength = df.document_title.apply(lambda x: len(x.split()))

In [None]:
df['title_length'] = doclength

In [None]:
doclength.plot(kind='hist', log=True)

In [None]:
df['fragment_length'] = df.fragment.apply(lambda x: len(str(x).split()))

In [None]:
df.fragment_length.hist(log=True)

# Let's prune to documents that have minimum length 
## Smaller sample for prototyping

In [None]:
# Take an explicit copy: columns are added to bdf in later cells, and assigning
# into a filtered view of df is chained assignment (SettingWithCopyWarning).
bdf = df[(df.fragment_length>5) & (df.title_length>5)].copy()

In [None]:
bdf['n'] = range(len(bdf)) # bdf will be what we use for most of subsequent analysis until we pull in other sources

# Let's compute a word-to-word similarity score
## This makes it quick to find what each seller is involved in across all the data

In [None]:
import csv
# wget https://www.sketchengine.eu/english-word-list/
# Use a context manager so the file handle is closed once the rows are read.
with open('data/english-word-list-total.csv', 'r') as word_file:
    data = [row for row in csv.reader(word_file)]
# Each row arrives as a single ';'-joined cell; the word is its second field.
remove_words = [row[0].split(';')[1] for row in data]
# Skip the first entry (the header row).
remove_words = remove_words[1:]

In [None]:
# let's encode as ngrams with usernames too
from sklearn.feature_extraction.text import CountVectorizer

cvect = CountVectorizer(min_df=3, max_df=0.35, ngram_range=(1,3), stop_words=remove_words)

# make the matrix with seller included
M = np.array(cvect.fit_transform(
    bdf.fragment.astype(str).values + 
    ' ' + bdf.document_title.astype(str).values +
    ' ' + bdf.document_authors.values).todense())

top_sims = 10
coldict = {k:v for v, k in cvect.vocabulary_.items()}
ww = np.cov(M.T)
w2w = [[coldict[k] for k in row.argsort()[::-1][:top_sims]] for row in ww]

In [None]:
def word_in_key(word, wdict):
    """Find every key of ``wdict`` that contains ``word`` as a substring.

    Returns a pair ``(wordlist, indices)``: the matching keys and their
    associated values, both in the dictionary's iteration order.
    """
    matches = [(key, value) for key, value in wdict.items() if word in key]
    wordlist = [key for key, _ in matches]
    indices = [value for _, value in matches]
    return wordlist, indices


def get_top_correlated_words(word, verbose=False):
    """Return the sorted, de-duplicated n-grams that co-vary most with ``word``.

    Every vocabulary entry of ``cvect`` that contains the lower-cased ``word``
    as a substring is looked up in ``w2w`` (the per-term list of top co-varying
    terms), and the results are pooled.

    Parameters
    ----------
    word : str
        Term to look up; matching is by substring on the lower-cased form.
    verbose : bool, default False
        When true, print ``word`` together with the resulting list.

    Returns
    -------
    list
        Sorted unique terms; empty when nothing in the vocabulary matches.
    """
    wordlist, indices = word_in_key(word.lower(), cvect.vocabulary_)
    seen = set()
    bests = []
    for w, i in zip(wordlist, indices):
        # The original tested `w not in seen` but only added to `seen` in the
        # else branch, so the guard could never fire; record the term here.
        if w in seen:
            continue
        seen.add(w)
        bests += w2w[i]
    bests = sorted(np.unique(bests))
    if verbose:
        print(f'{word} -> {bests}')
    return bests

def get_random_word():
    """Pick one entry of the fitted ``cvect`` vocabulary at random."""
    vocabulary = list(cvect.vocabulary_)
    return np.random.choice(vocabulary)
    

In [None]:
# If we want to see how well entities correlate via ngrams -- not as good as g3.umap().plot()
# cc = np.cov(M)
# rows, cols = cc.nonzero()
# edges = pd.DataFrame({'a': rows, 'b':cols, 'weight': cc[rows, cols]})
# edges_pruned = prune_weighted_edges_df(edges, scale=8)
# g = graphistry.nodes(bdf, 'n').edges(edges_pruned, 'a', 'b')
# g.plot()

In [None]:
# user 'word cloud' 
r=get_top_correlated_words('DopeValley', verbose=True)

In [None]:
def get_likely_drugs_or_region(username):
    """Get likely drugs/items/regions associated with a seller name (or any term).

    The terms most correlated with ``username`` are run through named-entity
    extraction; for every entity labelled GPE, ORG, NP or PERSON the first
    whitespace-separated word is counted, unless it equals ``username``
    (case-insensitively).

    Parameters
    ----------
    username : str
        Seller name, or any other term present in the n-gram vocabulary.

    Returns
    -------
    collections.Counter
        Counts per extracted word. A summary is printed when it is non-empty.
    """
    wanted_labels = {'GPE', 'ORG', 'NP', 'PERSON'}
    drugs = Counter()
    for term in get_top_correlated_words(username):
        # Pass the raw string: extract_named_ents runs the pipeline itself, so
        # wrapping the term in nlp(...) first would parse it twice.
        for ent_text, ent_label in extract_named_ents(str(term)):
            if ent_label not in wanted_labels:
                continue
            words = ent_text.split()
            if not words:
                # Whitespace-only entity text: nothing to count.
                continue
            first_word = words[0]
            if first_word.lower() != username.lower():
                drugs.update([first_word])
    if len(drugs):
        print(f'{username} is likely selling (item/drug/region) -- ')
        print('-'*30)
        for n, c in drugs.most_common():
            # Score = count divided by the number of distinct extracted words.
            print(f'\t{n.upper()}  ->  score {c/len(drugs):.2f}')
        print()
    return drugs

c=get_likely_drugs_or_region('DopeValley')

In [None]:
# The first whitespace-separated token of the author field is the seller name.
# Empty / whitespace-only author strings map to '' instead of raising IndexError.
bdf['seller'] = bdf.document_authors.apply(
    lambda x: x.split()[0].replace('[', '').replace(']', '') if x.split() else ''
)

In [None]:
# bdf holds text columns; restrict the mean to numeric columns so the
# aggregation does not fail on them.
bdf.groupby('seller').mean(numeric_only=True)

In [None]:
# instead of username, put in anything...
r=get_likely_drugs_or_region('cocaine')

In [None]:
# get all sellers and see what they might be selling
users = Counter(bdf.seller).most_common()
resses = []
for user, count in users:
    r = get_likely_drugs_or_region(user)
    resses.append(r)

In [None]:
from sklearn.feature_extraction import DictVectorizer
dvect = DictVectorizer()

dm = np.array(dvect.fit_transform(resses).todense())

In [None]:
dm  # users x NER tags

In [None]:
doldict = {k:v for v, k in dvect.vocabulary_.items()}

In [None]:
indices = dm.sum(0).argsort()[::-1]
drug_cols = [doldict[k] for k in indices]

In [None]:
plt.figure(figsize=(15,10))
plt.imshow(dm, aspect='auto')

topN = 20
# Label the topN most abundant columns. The label for column k is doldict[k];
# indexing the already-sorted drug_cols by `indices` again (as before) attached
# the wrong names to the ticks.
top_columns = indices[:topN]
plt.xticks(top_columns, [doldict[k] for k in top_columns], rotation=70)


plt.yticks(np.arange(0, len(dm), 5), np.array([u for u, c in users])[::5], rotation=40)
print('Users by top (items/drugs/locations)')

In [None]:
plt.figure(figsize=(15,10))
plt.plot(dm.sum(0))
topN = 20
# Tick k must carry doldict[k]; drug_cols is already sorted by `indices`, so
# re-indexing it with `indices` mislabelled the ticks.
top_columns = indices[:topN]
plt.xticks(top_columns, [doldict[k] for k in top_columns], rotation=70)

print('Abundance over time')

In [None]:
plt.figure(figsize=(12,12))
plt.imshow(np.cov(dm.T))
# Row/column k of the covariance matrix corresponds to doldict[k]; look the
# labels up directly rather than double-indexing the pre-sorted drug_cols.
top_columns = indices[:topN]
top_labels = [doldict[k] for k in top_columns]
plt.xticks(top_columns, top_labels, rotation=70)
plt.yticks(top_columns, top_labels, rotation=30)
print()

In [None]:
cc = np.cov(dm.T)
sims = [[doldict[k] for k in row.argsort()[::-1][1:5]] for row in cc]
sims

In [None]:
# some examples

In [None]:
sims = get_top_correlated_words(get_random_word(), verbose=True)

In [None]:
sims = get_top_correlated_words(get_random_word(), verbose=True)

In [None]:
sims = get_top_correlated_words(get_random_word(), verbose=True)

# Now let's see words per seller (time ordered) [Not Interesting]

In [None]:
groups = [bdf[bdf.seller==user] for user, c in users]

In [None]:
groups[1].fragment

In [None]:
len(groups)

In [None]:
mats = []
for i, gdf in enumerate(groups):
    m = np.array(cvect.transform(gdf.fragment.astype(str).values + 
    ' ' + gdf.document_title.astype(str).values +
    ' ' + gdf.document_authors.values).todense())
    mats.append(m)
    print(users[i], m.shape)


In [None]:
[coldict[k] for k in mats[0].sum(0).argsort()[::-1][:topN]] # better using above funcs

In [None]:
[coldict[k] for k in mats[1].sum(0).argsort()[::-1][:topN]]

In [None]:
sims = get_top_correlated_words(get_random_word())

# Featurize and Cluster in bdf

In [None]:
good_cols3 = ['fragment', 'document_title', 'document_authors']

In [None]:
g2 = graphistry.nodes(bdf, 'n').featurize(use_columns=good_cols3)

In [None]:
g3 = g2.umap(scale=1, n_neighbors=7)

In [None]:
g3.plot()

In [None]:
bdf['time'] = pd.to_datetime(bdf['published_date'])

In [None]:
bdf.resample('W', on='time')['seller'].count().plot() #boo

# Let's Forecast and Correlate Multiple Sources

In [None]:
#! pip install pycaret

In [None]:
import pandas as pd
from pycaret.regression import *
import pycaret, numpy as np

In [None]:
dro = pd.read_csv('~/Downloads/Drug Related Offenses 2.csv')
# get narco events
dro = dro.fillna(0)
dro.groupby('Offense Type').count()
narco = dro[dro['Offense Type'] == 'Drug/Narcotic Violations'][1:]
# standard transforms
narco['date'] = pd.to_datetime(narco['Incident Date'], errors='coerce')
narco = narco.sort_values(by='date', ascending=True, ignore_index=True)
narco['Number of Crimes'] = narco['Number of Crimes'].astype(int)
#narco.set_index('date', inplace=True)

In [None]:
def get_dateset(df, year_split, drop, keep):
    """Build calendar features from ``df['date']`` and split the rows by year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column; it is not modified.
    year_split : int
        Rows with ``Year < year_split`` go to the training set, rows with
        ``Year >= year_split`` to the test set.
    drop : list of str
        Columns removed after the features are added.
    keep : list of str
        Columns (and their order) retained in the output frames. May reference
        the generated ``Month``, ``Year`` and ``Series`` columns.

    Returns
    -------
    (train, test, data) : tuple of pandas.DataFrame
        ``data`` is the full feature frame; ``train`` and ``test`` are its
        year-based partitions.
    """
    data = df.copy()

    dates = pd.to_datetime(data['date'])
    data['Month'] = dates.dt.month
    data['Year'] = dates.dt.year
    # Monotonic 1..N counter giving the model a position in the series.
    data['Series'] = np.arange(1, len(data) + 1)
    # Drop unnecessary columns and re-arrange.
    data = data.drop(columns=drop)
    data = data[keep]

    train = data[data.Year < year_split]
    # `>=` rather than `>`: with a strict comparison the rows of the split year
    # itself ended up in neither partition.
    test = data[data.Year >= year_split]
    return train, test, data

def train_caret(df, year_split, drop, keep, target, numeric_features, session_id=123):
    """Fit regressors on a year-based split and plot predictions against actuals.

    The frame is prepared with ``get_dateset``, an experiment is configured via
    ``setup`` (time-series folds, 3 folds, no target transform), the candidate
    models are compared by MAE, and the best one is used to predict over the
    full feature frame. ``date`` and ``target`` are copied back from ``df`` onto
    the predictions, and ``target`` vs. ``'Label'`` is plotted over ``date``.

    Returns the predictions frame.
    """
    train_part, test_part, full = get_dateset(df, year_split, drop, keep)

    # Configure the experiment; the returned handle is not needed afterwards.
    setup(
        data=train_part,
        test_data=test_part,
        target=target,
        fold_strategy='timeseries',
        numeric_features=numeric_features,
        fold=3,
        transform_target=False,
        session_id=session_id,
    )

    best_model = compare_models(sort='MAE')
    # Called without `data`; presumably scores the hold-out set -- result unused.
    predict_model(best_model)

    forecast = predict_model(best_model, data=full)
    forecast['date'] = df['date']
    forecast[target] = df[target]

    forecast.plot(x='date', y=[target, 'Label'], figsize=(15, 10))
    return forecast

In [None]:
narco

In [None]:

drop = ['date', 'Offense Type', 'Incident Date']
keep = ['Series', 'Year', 'Month', 'Number of Crimes']
# get_dateset returns (train, test, data); unpacking into two names raised ValueError.
train, test, _ = get_dateset(narco, 2017, drop, keep)

In [None]:
preds = train_caret(narco, 2017, drop, keep, 'Number of Crimes', ['Series', 'Year', 'Month'])

In [None]:
odf = pd.read_csv('~/Downloads/Opioid Report.csv') #skiprows

In [None]:
odf = odf[1:]
odf = odf.fillna(0)

In [None]:
odf

In [None]:
# Convert the report counts to int. Any value whose string form contains a
# comma is replaced by 0 rather than parsed.
# NOTE(review): if those commas are thousands separators (e.g. "1,234"), this
# zeroes the largest counts -- confirm whether that is intended outlier removal
# or whether the commas should be stripped before int().
odf['Number of Drug Reports']=odf['Number of Drug Reports'].apply(lambda x: 0 if ',' in str(x) else int(x))

In [None]:
odf['date'] = pd.to_datetime(odf['Incident Date'], errors='coerce')
odf = odf.sort_values(by='date', ascending=True, ignore_index=True)
odf['Number of Drug Reports'] = odf['Number of Drug Reports'].astype(int)
odf = odf[:-5]

In [None]:
drop = ['date', 'Drug Type', 'Incident Date']
keep = ['Series', 'Year', 'Month', 'Number of Drug Reports']

## need to add outlier threshold
train, test, d = get_dateset(odf, 2017, drop, keep)

In [None]:
numeric_features = ['Year', 'Month', 'Series']
target = 'Number of Drug Reports'
# outliers don't do well here... would need to resample/prune, but scores are okay...
preds = train_caret(odf, 2017, drop, keep, target, ['Series', 'Year', 'Month'])

In [None]:
odf.resample('M', on='date').sum().plot(figsize=(15,7)) #compare

In [None]:
hdf = pd.read_csv('~/Downloads/Heroin & Fentanyl.csv', skiprows=30, names='date place drug counts'.split())

In [None]:
hdf.head()

In [None]:
hdf['counts'] = hdf['counts'].fillna(0)

In [None]:
hdf['date'] = pd.to_datetime(hdf['date'], errors='coerce')
hdf = hdf.sort_values(by='date', ascending=True, ignore_index=True)
hdf['counts'] = hdf['counts'].astype(int)

In [None]:
drug_counts = hdf.groupby('drug').resample('W', on='date').sum()

In [None]:
hdf.resample('3M', on='date').sum().plot(figsize=(15,7))

In [None]:
hhdf = hdf[hdf.drug=='Heroin']
hhdf

In [None]:
numeric_features = ['Year', 'Month', 'Series']
target = 'counts'
drop = ['date', 'place', 'drug']
keep = ['Year', 'Month', 'Series', 'counts']

#detects definite trend jumps year over year
preds = train_caret(hhdf, 2017, drop, keep, target, ['Series', 'Year', 'Month'])

## Compare datasets 

In [None]:
odf.groupby('Drug Type').sum() #all Drug Types counts

In [None]:
hodf = odf[odf['Drug Type'] == 'Heroin']

In [None]:
hodf

In [None]:
len(hodf), len(hhdf) # both datasets are similar size

In [None]:
res = pd.merge_asof(hodf, hhdf, on='date') #cool function
res

## Now we can compare heroin correlation between two datasets

In [None]:
from scipy import stats

stats.pearsonr(res['Number of Drug Reports'], res.counts)

In [None]:
np.corrcoef(res['Number of Drug Reports'], res.counts)

In [None]:
# So the heroin-to-heroin correlation between the two datasets is high

# Get news 

In [None]:
import datanews
from dateutil import parser
from pprint import pprint

def parse_datestring(datestring):
    """Parse a date string such as '2021-08-24T21:26:08+00:00'.

    Delegates to ``parser.parse`` and returns its result unchanged, i.e. the
    parsed object itself rather than a re-formatted string.
    """
    return parser.parse(datestring)

def get_unique_hits(docs):
    """Drop documents whose ``"title"`` repeats, keeping the first occurrence.

    Parameters
    ----------
    docs : iterable of dict
        Each item must have a ``"title"`` key. The iterable is consumed in a
        single pass, so generators and other one-shot iterables work too (the
        previous two-pass version returned an empty list for them).

    Returns
    -------
    list of dict
        The original dict objects, in input order, one per distinct title.
    """
    seen_titles = set()
    unique_docs = []
    for doc in docs:
        title = doc["title"]
        if title in seen_titles:
            continue
        seen_titles.add(title)
        unique_docs.append(doc)
    return unique_docs

def get_news(query, from_date=None, to_date=None, source=None, language="en"):
    """Query the datanews API and return the de-duplicated articles.

    Parameters
    ----------
    query : str
        Search query.
    from_date, to_date : str, optional
        Date bounds forwarded to the API, e.g. ``'2021-11-11'``.
    source : optional
        Forwarded unchanged to the API.
    language : str, default "en"
        Forwarded unchanged to the API.

    Returns
    -------
    list of dict
        Articles from ``response["hits"]``, with ``pubDate`` parsed by
        ``parse_datestring`` and duplicate titles removed. An empty list when
        the response has no ``hits`` entry, so callers can always iterate the
        result or take its ``len()``.

    Raises
    ------
    RuntimeError
        If no API key is configured.
    """
    import os

    # Credentials must not live in the notebook: read the key from the
    # environment, falling back to one already set on the datanews module.
    api_key = os.environ.get("DATANEWS_API_KEY") or getattr(datanews, "api_key", None)
    if not api_key:
        raise RuntimeError(
            "No datanews API key configured; set the DATANEWS_API_KEY environment variable."
        )
    datanews.api_key = api_key

    response = datanews.news(
        q=query,
        from_date=from_date,
        to_date=to_date,
        language=language,
        source=source,
        size=100,
    )
    if 'hits' not in response:
        return []

    articles = response["hits"]
    # Show the ten most repeated titles as a quick duplicate check.
    pprint(Counter([row["title"] for row in articles]).most_common(10))
    for article in articles:
        article["pubDate"] = parse_datestring(article["pubDate"])
    return get_unique_hits(articles)


In [None]:
data = []
# Spelling matters for a keyword search: "opiod" / "heroine" were typos for
# "opioid" / "heroin".
for query in ['opioid overdose in tennessee', 'opioid deaths in tennessee',
              'heroin deaths in tennessee', 'heroin overdoses in tennessee',
              'illegal drugs in tennessee', 'drug busts in tennessee', 'drug rings in tennessee']:
    articles = get_news(query, from_date='2021-01-01', to_date='2022-02-16')
    # Normalise a missing result to an empty list so later cells can call
    # len() on it and iterate over it.
    data.append([query, articles or []])

In [None]:
[len(k[1]) for k in data ]

In [None]:
data[0][1]

In [None]:
def get_tennessee_news(data):
    """Keep the articles that mention Tennessee in title, description or source.

    Parameters
    ----------
    data : iterable of (query, articles)
        ``articles`` is a list of article dicts, or None/empty when a query
        returned nothing.

    Returns
    -------
    list of dict
        Matching articles in input order. Matching is case-insensitive (the
        previous exact-case test missed "Tennessee"), and missing or None
        fields are treated as empty text. A running match count is printed.
    """
    tdata = []
    for _query, articles in data:
        for row in articles or []:
            searchable = ' '.join(
                str(row.get(field) or '') for field in ('title', 'description', 'source')
            )
            if 'tennessee' in searchable.lower():
                tdata.append(row)
                print(f'{len(tdata)}')
    return tdata
    

In [None]:
news = get_tennessee_news(data) #not many

In [None]:
news

In [None]:
allnews = [[l['title'], l['content'].split('...')[0]] for k in data for l in k[1]]

In [None]:
allnews

# However, featurization and UMAP give good clusters

In [None]:
aln = pd.DataFrame(allnews, columns = ['title', 'content'])
aln['n'] = range(len(aln))

In [None]:
g5 = g.nodes(aln, 'n').featurize()

In [None]:
g6 = g5.umap(scale=1)

In [None]:
g6.plot()