In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

#import drugLookup

In [3]:
# Read in data (from pickle file)
file = open('drug_df.dms','rb')
data = pickle.load(file)

In [4]:
data = data.sort_values(by=['dates'])

In [6]:
data.iloc[29000:28010,:]

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin,USD,product_description,dates,drug_prediction
59869,'60967,1874,dayglow,0.0,118.991,1 gr Very Good Cocaine,2018-12-16 19:39:31,cocaine
59888,'60986,1874,dayglow,0.0,118.991,1 gr Very Good Cocaine,2018-12-16 19:39:31,cocaine
59857,'60955,1874,dayglow,0.0,118.991,1 gr Very Good Cocaine,2018-12-16 19:39:31,cocaine
59867,'60965,1874,dayglow,0.0,118.991,1 gr Very Good Cocaine,2018-12-16 19:39:31,cocaine
80814,'81912,123,DDUK-NDD,0.02889,93.39,500x TEVA Diazepam Tablets ? FREE NDD,2018-12-16 19:50:39,benzodiazepines
81108,'82206,1629,fun-gee,0.00974,31.48,3.5g (1/8oz) Magic Mushrooms P. cubensis amazo...,2018-12-16 19:52:30,psilocybin
81402,'82500,137,ThePowerCartel,0.1535,496.19,28g Ketamine S-Isomer Rock Form,2018-12-16 19:56:28,ketamine
81359,'82457,137,ThePowerCartel,0.1535,496.19,28g Ketamine S-Isomer Rock Form,2018-12-16 19:56:28,ketamine
81439,'82537,592,Mr.Sandman,0.159,513.97,100 x 110ug LSD,2018-12-16 19:56:40,lsd
60194,'61292,1630,Qualitywhitee,0.0,57.0837,2.5 Gram Ketamine,2018-12-16 20:00:48,ketamine


## Split dataset into train/test set


In [27]:
# Get total number of samples, and number of train/test samples
n_samples = data.shape[0]
n_train = round(n_samples * .75)

# Create training/test set
copied_data = data.copy()

# Convert USD column from objects to integers
# copied_data['USD']= pd.to_numeric
copied_data[["USD"]] = copied_data[["USD"]].apply(pd.to_numeric)


#Create training and testing set
train_set = copied_data.iloc[:n_train,:]
test_set = copied_data.iloc[n_train:,:]

In [28]:
copied_data.dtypes

transaction_id                 object
vendor_id                       int64
vendor_name                    object
bitcoin                        object
USD                           float64
product_description            object
dates                  datetime64[ns]
drug_prediction                object
dtype: object

In [29]:
#Split testing and training set
p_train = list(train_set['USD'])
v_train = list(train_set['vendor_name'])

p_test = list(test_set['USD'])
v_test = list(test_set['vendor_name'])

## Compute P( p | v )

#### Generate list of bins/categories

In [30]:
start = 0.00 # first interval price
interval = 20 # length of interval (dollars)

# Create list of price intervals
n_intervals = int(13000/interval) # compute number of intervals
list_of_intervals = [i for i in range(n_intervals)]
slide_amount = interval # amount to slide window (if no overlap desired, set equal to interval)

#Generate the list of bins/intervals
for index in list_of_intervals:
    if list_of_intervals[index] == 0:
        list_of_intervals[index] = 0
    else:
        list_of_intervals[index] = slide_amount
        slide_amount += interval

# print (list_of_intervals)

list_of_intervals= [float(i) for i in list_of_intervals]

print (list_of_intervals)

[0.0, 20.0, 40.0, 60.0, 80.0, 100.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0, 240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0, 420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0, 600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0, 780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0, 960.0, 980.0, 1000.0, 1020.0, 1040.0, 1060.0, 1080.0, 1100.0, 1120.0, 1140.0, 1160.0, 1180.0, 1200.0, 1220.0, 1240.0, 1260.0, 1280.0, 1300.0, 1320.0, 1340.0, 1360.0, 1380.0, 1400.0, 1420.0, 1440.0, 1460.0, 1480.0, 1500.0, 1520.0, 1540.0, 1560.0, 1580.0, 1600.0, 1620.0, 1640.0, 1660.0, 1680.0, 1700.0, 1720.0, 1740.0, 1760.0, 1780.0, 1800.0, 1820.0, 1840.0, 1860.0, 1880.0, 1900.0, 1920.0, 1940.0, 1960.0, 1980.0, 2000.0, 2020.0, 2040.0, 2060.0, 2080.0, 2100.0, 2120.0, 2140.0, 2160.0, 2180.0, 2200.0, 2220.0, 2240.0, 2260.0, 2280.0, 2300.0, 2320.0, 2340.0, 2360.0, 2380.0, 2400.0, 2420.0, 2440.0, 2460.0, 2480.0, 2500.0, 2520.0, 2540.0, 2560.0, 2580.0, 2600.0, 2620.0,

#### Create functions to put price value in categories

In [31]:
#  Function to check if price is in interval
def priceInInterval(start_price, check_price, length):
    end_price = start_price + length
    cond1 = check_price >= start_price
    cond2 = check_price < end_price
    return(cond1 and cond2)

# Function to classify given price into one of bins
def binPrice(price1, start_prices, length = interval):
    price_bin = [start_price for start_price in start_prices if priceInInterval(start_price, price1,length)]
    if len(price_bin) == 0:
        return(start_prices[-1])
    else:
        return(price_bin[0])

#### Categorize each price value for each listing

In [32]:
# train_set.dtypes

In [33]:
# For given vendor, count the number of transactions in each time_bin
train_set['price_category'] = train_set['USD'].apply(binPrice, start_prices = list_of_intervals)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [34]:
train_set.head(6)

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin,USD,product_description,dates,drug_prediction,price_category
10946,'12044,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-03 19:14:02,ketamine,60.0
10911,'12009,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-03 19:14:02,ketamine,60.0
15416,'16514,438,drugpharmacist,0,145.6,Bomb Cartel Ice 7g - FREE PRIORITY SHIPPING,2018-04-03 22:19:03,methamphetamine,140.0
21600,'22698,962,trotters_stash,0,382.446,7g High Quality Pure #3 Heroin - Uncut,2018-04-04 02:00:13,heroin,380.0
10955,'12053,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-04 19:14:02,ketamine,60.0
10947,'12045,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-04 19:14:02,ketamine,60.0


#### Compute probability of vendor given price

In [35]:
# Get probability for each vendor given a specific time
def getVendorsForPrice(price_bin):
#     Args: Name of a vendor (string)
#     Returns: series, representing proportion of total for each drug
    price_subset = train_set[train_set['price_category'] == price_bin]
    tally = price_subset['vendor_name'].groupby(price_subset['vendor_name']).count()
    return(tally/np.sum(tally))

In [14]:
# Get list of vendors
_ , vendor_list = pd.factorize(v_train, sort = True)

#Create inverse emission probability dataframe
pvp = pd.DataFrame(columns = vendor_list)
#Insert vendor name column
pvp.insert(0, "price_bin", list_of_intervals)
# Apply function to dataframe
pvp.iloc[:,1:] = pvp['price_bin'].apply(getVendorsForPrice)
# Sort by Vendor name and fill NA values with 0
pvp = pvp.fillna(0)
# Set index to be time
pvp = pvp.set_index('price_bin')

#### Make Bayes Prediction

In [15]:
# Get prior probabilities (i.e., probability of each vendor)
vendor_probs = train_set['vendor_name'].value_counts()/np.sum(train_set['vendor_name'].value_counts())
vendor_probs = vendor_probs.sort_index()

In [16]:
# Create dictionary with best prediction for each drug
pvt_dict = {price_bin : list(pvp.loc[price_bin].sort_values(ascending = False)[:10].index) for price_bin in pvp.index}

In [17]:
# Function to predict class for each drug in test set
def bayesPredict(obs_seq):
    preds = [pvt_dict[obs] for obs in obs_seq]
    return(preds)

# Get times for testing
test_set['price_category'] = test_set['USD'].apply(binPrice, start_prices = list_of_intervals)
t_test = list(test_set['price_category'])

# Make predictions
test_set['vendor_pred'] = bayesPredict(t_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
pvt_dict

{0.0: ['cashcow72',
  'Gladyman',
  'gordonflour',
  'HappyDutch',
  'fun-gee',
  'densc0rp',
  'puremedicinehouse',
  'PaulReubens',
  'French_camorra',
  'deepbay'],
 20.0: ['mrs.feelgood',
  'TREES',
  'MaggieThatcher',
  'PHARMALABS',
  'bangalow',
  'clawedfeats',
  'einsteingroup',
  'dankcity',
  'blueviking',
  'thatsteroidguy'],
 40.0: ['mushroomswaterfowl17',
  'mailordermary2',
  'NDD-DMT',
  'JUNY',
  'ICE-CUBE',
  'Americansteroids',
  'muttznutz',
  'Qualitywhitee',
  'TheLollipopGuild',
  'AlanCuring420_UK'],
 60.0: ['Dr_Seuss_FR',
  'zouaves',
  'Grenouillebleu',
  'StealthPharmacy',
  'UKXAN',
  'puremedicinehouse',
  'ash_williams',
  'Cash_King',
  'TheCzechMeth',
  'hiddenhands'],
 80.0: ['BudWorld',
  'JerryGarcia',
  'Migosicegang',
  'green.goddess',
  'DOPE_CHEF',
  'aussieroidmuncherr',
  'frosties2014',
  'StealthPharmacy',
  'ScarlettsK',
  'MoonBanana'],
 100.0: ['blueviking',
  'StealthPharmacy',
  'CaliDrugCo',
  'drugpharmacist',
  'zouaves',
  'HerbinLeg

In [18]:
# Compute 'top n' accuracy
def get_top_n(df, n):
    q = zip(df['vendor_name'], df['vendor_pred'])
    if n == 1:
        ar = [samp[0] == samp[1][0] for samp in q]
    else:
        ar = [samp[0] in samp[1][:n] for samp in q]
    acc = np.sum(ar)/len(ar)
    return(round(acc,4))

In [19]:
print('Time interval for bin:',interval, 'dollar.')
print('Top 1 accuracy :',get_top_n(test_set, 1))
print('Top 2 accuracy :',get_top_n(test_set, 2))
print('Top 3 accuracy :',get_top_n(test_set, 3))
print('Top 5 accuracy :',get_top_n(test_set, 5))
print('Top 10 accuracy:',get_top_n(test_set, 10))

Time interval for bin: 20 dollar.
Top 1 accuracy : 0.0723
Top 2 accuracy : 0.1083
Top 3 accuracy : 0.1495
Top 5 accuracy : 0.1923
Top 10 accuracy: 0.2843


#### Export data to pickle file

In [84]:
#concatenate training and testing set
drug_and_price = pd.concat([train_set, test_set],sort = False)


In [85]:
# Export to file
import pickle
file = open('drug_and_price.pkl','wb')
pickle.dump(drug_and_price, file)
file.close()

file = open('drug_and_price_train.pkl','wb')
pickle.dump(train_set, file)
file.close()

file = open('drug_and_price_test.pkl','wb')
pickle.dump(test_set, file)
file.close()

#### Price plots

In [20]:
# Price plots
plot_price = data[['vendor_name', 'USD']].copy()

vendor_names = data['vendor_name'].tolist()

USD = data['USD'].tolist()

# price_graphs = pd.DataFrame({'vendor_name': vendor_names, 'USD': USD})

# price_graphs.groupby('vendor_name').hist()
# print (data['USD'].hist(by=data['vendor_name']))


