# Project Luther
## Dark Web Market Price Prediction

#### by Skip Everling



##### This notebook is three notebooks merged to include all code used for the project

In [None]:
import pprint
import urllib.parse as urlparse
import time
import random
import re

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
# functions to save and load a Python object as a JSON file
import json
def save_obj(data, name):
    '''Write `data` as pretty-printed JSON (sorted keys, 4-space indent) to "<name>.json".'''
    target_path = name + '.json'
    serialized = json.dumps(data, sort_keys=True, indent=4)
    with open(target_path, 'w') as handle:
        handle.write(serialized)

def load_obj(name):
    '''Read "<name>.json" and return the decoded Python object.'''
    source_path = name + '.json'
    with open(source_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [None]:
# Functions to help extract quantity information from product title strings 

def get_quantity(prod_string):
    '''Extract the item quantity, in grams, from a product title.

    The title is searched for a number followed by "g"/"G" first; only when
    that fails is it searched for a number followed by "k"/"K" (kilos / kg),
    in which case the value is multiplied by 1000.

    Returns a float, or np.nan when the title is not a string, contains no
    quantity, or the matched text cannot be turned into a number.
    '''
    # Non-string titles (None, NaN from a DataFrame, ...) carry no quantity;
    # treat them like raw_to_num_quant does instead of crashing in re.search.
    if not isinstance(prod_string, str):
        return np.nan

    # Raw strings: "\d" in a plain string literal is an invalid escape sequence.
    gram_pattern = r"[.\d]+[ ]?[Gg]"
    kg_pattern = r"[.\d]+[ ]?[Kk]"

    gram_match = re.search(gram_pattern, prod_string)
    if gram_match is not None:
        return raw_to_num_quant(gram_match.group())

    # if no grams match, check if it's kilos or kg
    kg_match = re.search(kg_pattern, prod_string)
    if kg_match is not None:
        return raw_to_num_quant(kg_match.group())*1000

    return np.nan

def get_perc(prod_string):
    '''Extract the percentage quality from a product title.

    Finds the first run of digits/dots followed by an optional space and "%"
    and returns it as a float. Returns np.nan when the title has no such run
    or when the run is not a valid number (e.g. "...%").
    '''
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    perc_pattern = r"[.\d]+[ ]?%"
    match = re.search(perc_pattern, prod_string)
    if match is None:
        return np.nan
    try:
        return float(match.group().replace("%",""))
    except ValueError:
        # The pattern also accepts dot-only or multi-dot runs, which float() rejects.
        return np.nan
    

def raw_to_num_quant(raw_quantity):
    '''
    Extracts the leading numeric value from a quantity string as a float,
    e.g. 14.0 from "14g" or 0.5 from ".5 g".

    Returns np.nan when `raw_quantity` is not a string or does not start
    with a number.
    '''
    # isinstance (not an exact type check) so str subclasses such as np.str_ are accepted.
    if not isinstance(raw_quantity, str):
        return np.nan

    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    pattern = r"(?:\d*\.)?\d+"
    match = re.match(pattern, raw_quantity)
    if match is None:
        return np.nan
    return float(match.group())

In [None]:
from forex_python.bitcoin import BtcConverter
def btc_to_usd(btc_val):
    '''Convert a Bitcoin amount to US dollars at the latest price reported by BtcConverter.'''
    # A new converter is created, and the price looked up, on every call.
    usd_per_btc = BtcConverter().get_latest_price('USD')
    return usd_per_btc * btc_val

### Tor (Anonymous Browsing and access to "Dark Web" pages)

In [None]:
# this code sets web proxy to use Tor at port 9050
# ip address inside this code should be different from public ip of the running computer
import socks
import socket
import requests

# changes default
#socks.setdefaultproxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.1", port=9050)
#socket.socket = socks.socksocket

In [None]:
# The default-proxy lines in the previous cell are commented out, so a plain
# requests.get() would try to reach the .onion host directly. Send it through the
# local Tor SOCKS proxy instead (socks5h = the proxy also does the DNS lookup).
tor_proxies = {'http':  'socks5h://127.0.0.1:9050',
               'https': 'socks5h://127.0.0.1:9050'}
print(requests.get("http://lchudifyeqm4ldjj.onion/?ai=1675", proxies=tor_proxies).text)

In [None]:
import requests
session = requests.session()
# Tor uses the 9050 port as the default socks port
# make sure tor is running
session.proxies = {'http':  'socks5h://127.0.0.1:9050',
                   'https': 'socks5h://127.0.0.1:9050'}

# Make a request through the Tor connection
# IP visible through Tor
print(session.get("http://httpbin.org/ip").text)
# Above should print an IP different than your public IP

# Following prints your normal public IP
print(requests.get("http://httpbin.org/ip").text)

In [None]:
# Dream Market url: http://lchudifyeqm4ldjj.onion/?ai=1675
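# Dream Market username: <not stored in the notebook - supply it at login, e.g. from an environment variable>
# Dream Market password: <not stored in the notebook - supply it at login, e.g. from an environment variable>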

In [None]:
session.get("http://lchudifyeqm4ldjj.onion/?ai=1675").text #Dream Market url

In [None]:
# above code is not necessary to run Selenium below

### Selenium to navigate sites

In [None]:
# code to make Selenium work with Tor browser
# must open TBB (Tor Browser Bundle) before running this, so that you establish a Tor circuit

import os
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium import webdriver

# path to the firefox binary inside the Tor package
binary = '/Applications/TorBrowser.app/Contents/MacOS/firefox'
if os.path.exists(binary) is False:
    raise ValueError("The binary path to Tor firefox does not exist.")
firefox_binary = FirefoxBinary(binary)

browser = None
def get_browser(binary=None):
    '''Return the shared Firefox driver, launching it with `binary` on first use.

    The driver is cached in the module-level `browser`, so repeated calls
    reuse one window (drop the global to allow multiple instances).
    '''
    global browser
    if browser:
        return browser
    # Nothing cached yet: start the browser and remember it for later calls.
    browser = webdriver.Firefox(firefox_binary=binary)
    return browser

if __name__ == "__main__":
    browser = get_browser(binary=firefox_binary)
    urls = (
        ('tor browser check', 'https://check.torproject.org/'),
        ('ip checker', 'http://icanhazip.com')
    )
    for url_name, url in urls:
        print("getting", url_name, "at", url)
        browser.get(url)

In [None]:
#browser.get("https://check.torproject.com")
browser.get("http://lchudifyeqm4ldjj.onion/?ai=1675") # Navigate to Dream Market

##### ...log in with user credentials manually (to get past bot-detection captcha)

In [None]:
# After log in, go to Cocaine listings
browser.get("http://lchudifyeqm4ldjj.onion/?category=187") 

In [None]:
# Optionally get two big Javascript variables on the page that contain data about vendors and listings displayed
#vendor_data = browser.execute_script("return proddata;")
#proddata = browser.execute_script("return proddata;")

In [None]:
from bs4 import BeautifulSoup

def make_listings_dict(html_doc, domain="http://jd6yhuwcivehvdt4.onion"):
    '''Parse a results page into a dict of listing records.

    Parameters
    ----------
    html_doc : str
        HTML source of a listings results page.
    domain : str, optional
        Scheme + host prepended to the page's relative hrefs to build absolute
        links. Defaults to the address this notebook has always used.

    Returns
    -------
    dict
        Maps each product title to a dict with the keys product_title,
        product_link, escrow, btc_price, vendor_name, vendor_link,
        successful_transactions, rating and ships_from_to. Values are the
        page's text; rating is None when the listing has no
        "userRating gold" span.

    Notes
    -----
    The result is keyed by title, so listings that share a title overwrite
    each other (the last one on the page wins). A listing that lacks one of
    the other expected elements raises AttributeError.
    '''
    soup = BeautifulSoup(html_doc, 'html.parser')

    listings_dict = {}

    # each displayed listing in the results page is wrapped in <div class="around">
    for listing in soup.find_all("div", class_="around"):

        title = listing.find("div", class_="text oTitle")
        title_text = title.find("a").get_text().strip()

        # hrefs are relative: drop their first character, then prefix the domain
        product_link = domain + title.a["href"][1:]

        body = listing.find("div", class_="oOfferBody")

        escrow = body.find("div", class_="escrowInfo").find("div").get_text()

        btc_price = body.find("div", class_="bottom oPrice").get_text().strip()

        # first a tag in the vendor div is the vendor's name and profile link
        vendor_tag = body.find("div", class_="oVendor").find("a")
        vendor_name = vendor_tag.get_text().strip()
        vendor_link = domain + vendor_tag["href"][1:]

        transactions = body.find("span", title="Successful transactions").get_text().replace("(","").replace(")","")

        # look the rating span up once; it is absent for some listings
        rating_tag = body.find("span", class_="userRating gold")
        rating = rating_tag.get_text().strip() if rating_tag else None

        ships_from_to = body.find("span", class_="osBod").get_text().strip()

        listings_dict[title_text] = {
                                "product_title": title_text,
                                "product_link" : product_link,
                                "escrow"       : str(escrow),
                                "btc_price"    : btc_price,
                                "vendor_name"  : vendor_name,
                                "vendor_link"  : vendor_link,
                                "successful_transactions" : transactions,
                                "rating" : rating,
                                "ships_from_to" : ships_from_to
                               }

    return listings_dict

In [None]:
def save_page():
    '''Parse the page currently shown in `browser` and save it as "page<N>.json".

    N is taken from the `page` query parameter of the current URL. The first
    results page is opened as "?category=187", which has no `page` parameter;
    it is saved as page 1, provided listings were actually found on it.

    Raises
    ------
    KeyError
        If the URL has no `page` parameter and the page held no listings
        (i.e. the browser is probably not on a results page).
    '''
    page = make_listings_dict(browser.page_source)

    query = urlparse.parse_qs(urlparse.urlparse(browser.current_url).query)
    if 'page' in query:
        page_num = query['page'][0]
    elif page:
        # un-numbered results page that does contain listings: the first page
        page_num = '1'
    else:
        raise KeyError('page')

    save_obj(page, "page" + page_num)
    print("Saved file: page" + page_num + ".json")
    return

Find the "Next Page" link and navigate to the page it points to

In [None]:
def go_to_next_page():
    '''Send the shared `browser` to the URL of the results pager's "Next page" link.'''
    # The pager link looks like:
    #   <a class="gPager lastPager" title="Next page" href=...> </a>
    pager_link = browser.find_element_by_class_name("lastPager")

    # Load the link's href (e.g. href="./?page=3") directly rather than clicking it.
    browser.get(pager_link.get_attribute("href"))

In [None]:
### browse and collect listings ###
for i in range(1, 3000):

    try:
        save_page()
        go_to_next_page()
        time.sleep(15 + (random.randint(0, 3000) / 1000)) # 15s plus a random 0-3s
    except Exception as err:
        # Catch Exception rather than everything, so a kernel interrupt (Ctrl-C)
        # is not swallowed, and report what actually stopped the crawl
        # (e.g. no "Next page" link on the last page).
        print("Error after {} pages.".format(i))
        print("Cause: {!r}".format(err))
        break

    # take a longer pause after every tenth page
    if i % 10 == 0:
        time.sleep(10)
        #browser.get("http://lchudifyeqm4ldjj.onion/?category=104")
        #browser.back()
    

## Collect vendor pages

In [None]:
# get a list of unique vendor_links
vendor_links = list(cocaine_listings["vendor_link"].unique())
print(len(vendor_links))

In [None]:
from bs4 import BeautifulSoup

# iterate through vendor links and collect vendor page data

# (row label searched for in the ratings table, suffix used in the vendor_dict keys)
RATING_ROWS = (
    ('Newer than 1 Month', '1mo'),
    ('Newer than 3 Months', '3mo'),
    ('Older', 'old3mos'),
)
# key prefixes for the five <td> cells that follow each row label, in page order
STAR_KEYS = ('oneStars', 'twoStars', 'threeStars', 'fourStars', 'fiveStars')


def _star_counts(rating_table, row_label, suffix):
    '''Return {"oneStars_<suffix>": text, ..., "fiveStars_<suffix>": text} for one table row.'''
    cell = rating_table.find('td', text = re.compile(row_label))
    counts = {}
    for star_key in STAR_KEYS:
        cell = cell.find_next('td')
        counts[star_key + "_" + suffix] = cell.get_text()
    return counts


def _btc_amount(cell_text):
    '''Convert the text of an "amount paid" cell (e.g. "~ ฿0.02") to a float.'''
    try:
        # Parse everything after the currency sign so amounts of 1 BTC or more
        # keep their integer part.
        return float(cell_text.split("฿")[1])
    except ValueError:
        if "." not in cell_text:
            raise
        # Fall back to the previous behaviour: keep only the digits after the first dot.
        return float("." + cell_text.split(".")[1])


# Create the result dict on a fresh kernel, but keep what an earlier
# (interrupted) run of this cell already collected.
try:
    vendor_dict
except NameError:
    vendor_dict = {}

# Iterate over a snapshot: finished links are removed from vendor_links below so an
# interrupted run can resume, and removing from the list being iterated would skip
# every other vendor.
for link in list(vendor_links):

    parsed = urlparse.urlparse(link)
    vendor_name = urlparse.parse_qs(parsed.query)['member'][0]
    vendor_dict[vendor_name] = {}

    ratings_url = link + "&tab=ratings#tabChooser"
    browser.get(ratings_url)

    time.sleep(5)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    ### Get some vendor info
    member_since = soup.find("label", text="Join date")
    vendor_dict[vendor_name]["vendorJoinDate"] = member_since.find_next("span").get_text()

    ### Scrape the vendor's ratings table (1-5 star counts per age bucket)
    rating_table = soup.find("div", id="tableOfRatings")
    for row_label, suffix in RATING_ROWS:
        vendor_dict[vendor_name].update(_star_counts(rating_table, row_label, suffix))

    ### get order totals and sum them
    total_paid_to_vendor = 0
    buyers = soup.find("table", class_="ratingTable hoverable")
    for paid_cell in buyers.find_all("td", text = re.compile('~ ฿')): #find amount paid e.g.: ~ ฿0.02
        total_paid_to_vendor += _btc_amount(paid_cell.get_text())

    vendor_dict[vendor_name]["recent_order_sum_total"] = total_paid_to_vendor
    #pprint.pprint(vendor_dict)
    print("Added info for {} to vendor dict.".format(vendor_name))

    vendor_links.remove(link)

In [None]:
# Save collected vendor information as a JSON file
save_obj(vendor_dict, "cocaine_vendors")

In [None]:
# Load collected vendor information from JSON file
vendor_dict = load_obj("cocaine_vendors")
# one row per vendor, with the vendor name as an ordinary column
vendor_df = (
    pd.DataFrame(vendor_dict)
    .transpose()
    .reset_index()
    .rename(columns={"index":"vendor_name"})
)

# The values arrive from the dict as generic objects, so give the columns real dtypes
vendor_df['vendorJoinDate'] = pd.to_datetime(vendor_df['vendorJoinDate'])
vendor_df['recent_order_sum_total'] = pd.to_numeric(vendor_df['recent_order_sum_total'])
for column in vendor_df.columns:
    if "Stars" not in column:
        continue
    vendor_df[column] = pd.to_numeric(vendor_df[column])

vendor_df.head()

In [None]:
import pprint
import urllib.parse as urlparse
import time
import random
import re

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import ml_insights as mli

#### Helper functions

In [None]:
# functions to save and load a Python object as a JSON file
import json
def save_obj(data, name):
    '''Write `data` as pretty-printed JSON (sorted keys, 4-space indent) to "<name>.json".'''
    target_path = name + '.json'
    serialized = json.dumps(data, sort_keys=True, indent=4)
    with open(target_path, 'w') as handle:
        handle.write(serialized)

def load_obj(name):
    '''Read "<name>.json" and return the decoded Python object.'''
    source_path = name + '.json'
    with open(source_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [None]:
# Functions to help extract quantity information from product title strings 

def get_quantity(prod_string):
    '''Extract the item quantity, in grams, from a product title.

    The title is searched for a number followed by "g"/"G" first; only when
    that fails is it searched for a number followed by "k"/"K" (kilos / kg),
    in which case the value is multiplied by 1000.

    Returns a float, or np.nan when the title is not a string, contains no
    quantity, or the matched text cannot be turned into a number.
    '''
    # Non-string titles (None, NaN from a DataFrame, ...) carry no quantity;
    # treat them like raw_to_num_quant does instead of crashing in re.search.
    if not isinstance(prod_string, str):
        return np.nan

    # Raw strings: "\d" in a plain string literal is an invalid escape sequence.
    gram_pattern = r"[.\d]+[ ]?[Gg]"
    kg_pattern = r"[.\d]+[ ]?[Kk]"

    gram_match = re.search(gram_pattern, prod_string)
    if gram_match is not None:
        return raw_to_num_quant(gram_match.group())

    # if no grams match, check if it's kilos or kg
    kg_match = re.search(kg_pattern, prod_string)
    if kg_match is not None:
        return raw_to_num_quant(kg_match.group())*1000

    return np.nan

def get_perc(prod_string):
    '''Extract the percentage quality from a product title.

    Finds the first run of digits/dots followed by an optional space and "%"
    and returns it as a float. Returns np.nan when the title has no such run
    or when the run is not a valid number (e.g. "...%").
    '''
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    perc_pattern = r"[.\d]+[ ]?%"
    match = re.search(perc_pattern, prod_string)
    if match is None:
        return np.nan
    try:
        return float(match.group().replace("%",""))
    except ValueError:
        # The pattern also accepts dot-only or multi-dot runs, which float() rejects.
        return np.nan
    

def raw_to_num_quant(raw_quantity):
    '''
    Extracts the leading numeric value from a quantity string as a float,
    e.g. 14.0 from "14g" or 0.5 from ".5 g".

    Returns np.nan when `raw_quantity` is not a string or does not start
    with a number.
    '''
    # isinstance (not an exact type check) so str subclasses such as np.str_ are accepted.
    if not isinstance(raw_quantity, str):
        return np.nan

    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    pattern = r"(?:\d*\.)?\d+"
    match = re.match(pattern, raw_quantity)
    if match is None:
        return np.nan
    return float(match.group())

In [None]:
from forex_python.bitcoin import BtcConverter
def btc_to_usd(btc_val):
    '''Convert a Bitcoin amount to US dollars at the latest price reported by BtcConverter.'''
    # A new converter is created, and the price looked up, on every call.
    usd_per_btc = BtcConverter().get_latest_price('USD')
    return usd_per_btc * btc_val

### Load and transform data

In [None]:
# load and combine the saved JSON files for cocaine
import os

# Folder holding the scraped page<N>.json files. Set LUTHER_COCAINE_DIR to point at
# your own copy; the default is the location the scrape was originally saved to.
folder = os.environ.get(
    "LUTHER_COCAINE_DIR",
    "/Users/davideverling/metis/sf17_ds7/student_submissions/projects/02-luther/skip/cocaine_jul-14",
)

# Merge page1.json ... page169.json into one dict. Pages are keyed by product
# title, so a title seen on a later page replaces the earlier entry.
listings_dict = {}
for i in range(1, 170):
    page_dict = load_obj(folder + "/page" + str(i))
    listings_dict.update(page_dict)

In [None]:
# transform loaded dict into a Pandas dataframe
cocaine_listings = pd.DataFrame(listings_dict).transpose()
cocaine_listings.reset_index(inplace=True)

In [None]:
##### Process the data ######

### get quantity and quality from product title strings ###

# NOTE: this cell rewrites cocaine_listings in place (btc_price goes from text to
# float), so re-run the cell that builds cocaine_listings before running it again.
cocaine_listings["grams"] = cocaine_listings["product_title"].apply(get_quantity)
cocaine_listings["quality"] = cocaine_listings["product_title"].apply(get_perc)
# get rid of unusually low quality ratings (.loc writes into the frame itself;
# chained indexing may write to a temporary copy)
cocaine_listings.loc[cocaine_listings["quality"] < 50, "quality"] = np.nan
# set a default quality of the average quality (fillna returns a new Series, so assign it back)
cocaine_listings["quality"] = cocaine_listings["quality"].fillna(cocaine_listings["quality"].mean())

### clean up Bitcoin prices -- remove btc symbol and convert to float
cocaine_listings["btc_price"] = cocaine_listings["btc_price"].map(lambda x: float(x[1:]))
cocaine_listings["cost_per_gram"] = cocaine_listings["btc_price"] / cocaine_listings["grams"]
# listings with no parsed quantity are NaN here; the frame-wide dropna below removes them
cocaine_listings["cost_per_gram_pure"] = cocaine_listings["cost_per_gram"] * 1/(cocaine_listings['quality']/100)


btc_2_usd_rate = btc_to_usd(1) # get price of 1 bitcoin
### uncomment line below to make a corresponding USD column for analysis
#cocaine_listings["usd_price_at_rate_"+str(btc_2_usd_rate)] = cocaine_listings["btc_price"].map(lambda x: x*btc_2_usd_rate)

### convert escrow string to a 1 or 0 (any other text becomes NaN and is dropped below)
escrow_map = {"NO ESCROW":0, "ESCROW":1}
cocaine_listings["escrow"] = cocaine_listings["escrow"].map(escrow_map)

### get rid of rows that don't have values
cocaine_listings.dropna(inplace=True)

### convert ratings and successful_transactions to numbers
cocaine_listings["rating"] = cocaine_listings["rating"].map(float)
cocaine_listings["successful_transactions"] = cocaine_listings["successful_transactions"].map(int)


### Ships FROM and TO ###

# split ships-from and ships-to into separate columns at the arrow (\u2192);
# partition leaves ships_to empty, instead of raising, when there is no arrow
ships_parts = cocaine_listings["ships_from_to"].str.partition("\u2192")
cocaine_listings["ships_from"] = ships_parts[0].str.strip()
cocaine_listings["ships_to"] = ships_parts[2].str.strip()

# creates dummy variable columns with True/False whether the vendor ships to that country
countries = ['US', 'NL', 'FR', 'GB', 'CA', 'DE', 'AU', 'EU', 'ES', 'N. America', 'BE', 'WW', 'SI',
 'IT', 'DK', 'S. America', 'CH', 'BR', 'CZ', 'SE', 'CO', 'CN', 'PL', 'GR']
for country in countries:
    # regex=False: match the label literally (the "." in "N. America" is not a wildcard)
    cocaine_listings['ships_to_'+ country] = cocaine_listings['ships_to'].str.contains(country, regex=False)
    cocaine_listings['ships_from_'+ country] = cocaine_listings['ships_from'].str.contains(country, regex=False)

In [None]:
cocaine_listings.sample()

In [None]:
cocaine_listings.to_excel("dream_market_cocaine_listings.xls")

### Vendor Data

In [None]:
# Load collected vendor information from JSON file
vendor_dict = load_obj("cocaine_vendors")
# one row per vendor, with the vendor name as an ordinary column
vendor_df = (
    pd.DataFrame(vendor_dict)
    .transpose()
    .reset_index()
    .rename(columns={"index":"vendor_name"})
)

# The values arrive from the dict as generic objects, so give the columns real dtypes
vendor_df['vendorJoinDate'] = pd.to_datetime(vendor_df['vendorJoinDate'])
vendor_df['recent_order_sum_total'] = pd.to_numeric(vendor_df['recent_order_sum_total'])
for column in vendor_df.columns:
    if "Stars" not in column:
        continue
    vendor_df[column] = pd.to_numeric(vendor_df[column])

vendor_df.head()

In [None]:
# Merge vendor df with listings df
merged_df = pd.merge(cocaine_listings, vendor_df, on='vendor_name')

In [None]:
merged_df.info(3)

## Exploratory Analysis of Full Merged Data Set

In [None]:
df = merged_df

In [None]:
sns.barplot(x='ships_from', y='cost_per_gram', hue='ships_to_AU',data=merged_df)

In [None]:
# number of listings from each country
# Plot from merged_df: model_df is only created further down, and both columns
# used here (ships_from, ships_to_US) are already present in merged_df.
sns.countplot(x='ships_from', data=merged_df, hue='ships_to_US')
plt.title("Listings by country of origin")

In [None]:
# Look the exchange rate up once so the histogram, the median line and the printed
# median all use the same rate (each btc_to_usd call is a separate price lookup).
usd_per_btc = btc_to_usd(1)
cost_per_gram_pure_usd = df['cost_per_gram_pure']*usd_per_btc

sns.distplot(cost_per_gram_pure_usd)
plt.suptitle("Avg. Cost per gram of 100% pure cocaine")
plt.xlabel("Cost in USD")
plt.ylabel("Proportion of Listings")
plt.axvline(cost_per_gram_pure_usd.median(), color='b', linestyle='dashed', linewidth=1)
print("Median:", np.median(cost_per_gram_pure_usd))

In [None]:
# visualize vendors and their average price per gram
# Average only the numeric columns: merged_df also holds text columns (titles,
# links, ...) that cannot be averaged.
by_vendor = merged_df.groupby(by='vendor_name').mean(numeric_only=True).reset_index()
by_vendor.head(2)

In [None]:
sns.distplot(by_vendor['recent_order_sum_total'])

In [None]:
# per-vendor mean cost of one pure gram, converted from BTC to USD at the current rate
sns.distplot(by_vendor['cost_per_gram_pure']*btc_to_usd(1))
plt.suptitle("Distribution of rates in USD for 1 gram, by vendor")

## Create dataframes for machine learning algorithms

In [None]:
# Get rid of columns we won't use for machine learning models
model_df = merged_df.drop(['index','product_title','product_link', 'ships_from_to', 'vendor_link', 'cost_per_gram', 'cost_per_gram_pure', 'recent_order_sum_total'], axis=1)

In [None]:
model_df.sample(3)

In [None]:
model_df.max()

In [None]:
simple_df = model_df[["btc_price","escrow","rating","successful_transactions","grams","quality"]]
#simple_df = model_df[["btc_price","grams"]]

## Machine Learning Modeling

In [None]:
from sklearn import linear_model,ensemble, tree, model_selection, cross_validation, metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def scatter_matrix(X):
    '''Draw an n-by-n grid of pairwise scatter plots for the n columns of X.

    The panel in row i, column j plots column j of X on the x-axis against
    column i on the y-axis, and is labelled accordingly.
    '''
    feature_count = len(X.columns)
    # squeeze=False keeps `ax` two-dimensional even when X has a single column
    fig,ax = plt.subplots(ncols=feature_count,nrows=feature_count,
                          figsize=(10*feature_count, 10*feature_count),squeeze=False)

    for i,feature_i in enumerate(X):
        for j,feature_j in enumerate(X):
            # x data is feature_j and y data is feature_i, matching the labels below
            ax[i][j].scatter(X[feature_j],X[feature_i])
            ax[i][j].set_xlabel('Feature ' + str(feature_j))
            ax[i][j].set_ylabel('Feature ' + str(feature_i))

# X is not assigned until the modelling cells below; plot the small numeric
# frame (simple_df) prepared in the previous section instead.
scatter_matrix(simple_df)

### Baseline Prediction  -- Linear Regression to predict btc_price with quantity

In [None]:
df = model_df[["btc_price","grams"]]
#df = by_vendor.drop('vendor_name', axis=1)
model = linear_model.LinearRegression()

#set predictors and target
X = df.drop('btc_price', axis=1)
y = df['btc_price']

#make splits for training and testing
# (train_test_split taken from model_selection, the module the rest of this notebook
#  already uses for cross_val_score)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,train_size=.7)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# cross-validated score on the training split only
scores = model_selection.cross_val_score(model, X_train, y_train, n_jobs=1)
score = np.mean(scores)

print("RMSE", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print("Score: ", score)
print("Intercept: ", model.intercept_)
#print("Coeff: ", model.coef_)

In [None]:
plt.scatter(y_test, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
plt.xlabel('Measured')
plt.ylabel('Predicted')

### Beyond Linear Regression models

In [None]:
#choose dataframe to operate on
df = model_df.drop(['vendorJoinDate','ships_to', 'ships_from', 'vendor_name'], axis=1)
#df = model_df[["btc_price","escrow","rating","successful_transactions","grams","quality"]]
#df = by_vendor.drop('vendor_name', axis=1)

#set predictors and target
X = df.drop('btc_price', axis=1) #predict with every column in df except btc_price
y = df['btc_price']

#make splits for training and testing
# (train_test_split taken from model_selection, the module the rest of this notebook
#  already uses for cross_val_score)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,train_size=.7)

In [None]:
models = {}
parameters = {}

parameters['normalize'] = False

models['linear_model'] = linear_model.LinearRegression()
models['ridge_model'] = linear_model.Ridge()
models['lasso_model'] = linear_model.Lasso(alpha=.5)
models['robust_regression'] = linear_model.SGDRegressor(loss='huber',n_iter=20)
models['eps_insensitive'] = linear_model.SGDRegressor(loss='epsilon_insensitive',n_iter=20)


models['cart'] = tree.DecisionTreeRegressor(max_depth=7)
models['extratrees'] = tree.ExtraTreeRegressor(max_depth=7)
models['randomForest'] = ensemble.RandomForestRegressor()
models['adaboostedTrees'] = ensemble.AdaBoostRegressor()
models['gradboostedTrees'] = ensemble.GradientBoostingRegressor(learning_rate=0.05, n_estimators=1000)

# One exchange-rate lookup for the whole comparison, so every model's test-point
# price is converted at the same rate.
usd_per_btc = btc_to_usd(1)

for name,model in models.items():
    # cross-validated score on the training split, then RMSE on the held-out split
    scores = model_selection.cross_val_score(model, X_train, y_train, n_jobs=1)
    print('Model: '+ name)
    print("Score: " + str(np.mean(scores)))
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print("RMSE", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    #print("Prediction for 1 gram:", model.predict([1.0])*btc_2_usd_rate)

    # Take one random test listing and ask for the price of 1000 g at 90% quality.
    # .copy() so the edits go to an independent row, not a slice of X_test.
    test_point = X_test.sample(1).copy()
    test_point["grams"] = 1000
    test_point["quality"] = 90.0
    #print(test_point)
    print("Test point:", model.predict(test_point)*usd_per_btc)
    print()

    # measured-vs-predicted plot for the gradient-boosted model only
    if name == 'gradboostedTrees':
        plt.scatter(y_test, y_pred)
        plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
        plt.xlabel('Measured')
        plt.ylabel('Predicted')

In [None]:
### Test different alphas for Lasso
# alpha_vec = np.logspace(-3,3,7)
# #print(alpha_vec)

# for alpha in alpha_vec:
#     model = linear_model.Lasso(alpha=alpha)

#     scores = model_selection.cross_val_score(model, X_train, y_train, n_jobs=1)
#     #print('Model: '+ str(model))
#     score = str(np.mean(scores))
#     #print("Score: " + score)
#     model.fit(X_train,y_train)
#     print(alpha, score)

    
# coef_df = pd.DataFrame(list(zip(X.columns,model.coef_)))
# coef_df = coef_df[coef_df[1] != 0]
# coef_df

In [None]:
### Alpha around 0.5 to 1.0 works best

##### Lasso selects the following features: grams, fiveStars_1mo, fiveStars_3mo, fiveStars_old3mos

In [None]:
coefs = models['lasso_model'].fit(X,y).coef_
sorted(zip(X.columns,coefs), key = lambda x:x[1], reverse=True)

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

def plot_overfit(X,y,model_obj,param_ranges,param_static=None):
    '''Plot training vs. testing score as one model parameter is varied.

    For every (parameter, values) pair in `param_ranges`, and for every value,
    a model is built as model_obj(**{parameter: value}, **param_static), fitted
    on five random 70/30 train/test splits and scored on both parts. One figure
    per parameter shows the mean score with its standard deviation as error
    bars, for the training and the testing part.
    '''
    n_splits = 5
    for parameter, parameter_range in param_ranges.items():
        train_mean, train_std = [], []
        test_mean, test_std = [], []

        for param_val in parameter_range:
            model_kwargs = {parameter: param_val}
            if param_static:
                model_kwargs.update(param_static)
            model = model_obj(**model_kwargs)

            # the same model object is re-fitted on each fresh random split
            train_scores, test_scores = [], []
            for _ in range(n_splits):
                X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = .3)
                model.fit(X_train, y_train)
                train_scores.append(model.score(X_train, y_train))
                test_scores.append(model.score(X_test, y_test))

            train_mean.append(np.mean(train_scores))
            test_mean.append(np.mean(test_scores))
            train_std.append(np.std(train_scores))
            test_std.append(np.std(test_scores))

        fig, ax = plt.subplots()
        ax.errorbar(parameter_range, train_mean, yerr=train_std, label='training score')
        ax.errorbar(parameter_range, test_mean, yerr=test_std, label='testing score')

        ax.set_xlabel(parameter)
        ax.set_ylabel('score')
        ax.legend(loc=0)

In [None]:
# 'alpha' is Lasso's regularisation strength. RandomForestRegressor does not accept
# an 'alpha' argument, so sweeping alpha needs the Lasso model.
model_obj = linear_model.Lasso
#model_obj = ensemble.RandomForestRegressor
param_ranges = {'alpha':np.logspace(-3,3,7)}

plot_overfit(X,y,model_obj,param_ranges)

In [None]:
model = models['gradboostedTrees']
model.fit(X_train,y_train)
mxr = mli.ModelXRay(model, X_test);


In [None]:
indices = mxr.feature_dependence_plots(num_pts=7)

In [None]:
mxr.feature_effect_summary()

# Project Luther

#### by Skip Everling



In [None]:
import pprint
import urllib.parse as urlparse
import time
import random

In [None]:
# functions to save and load a Python object as a JSON file
import json
def save_obj(data, name):
    '''Write `data` as pretty-printed JSON (sorted keys, 4-space indent) to "<name>.json".'''
    target_path = name + '.json'
    serialized = json.dumps(data, sort_keys=True, indent=4)
    with open(target_path, 'w') as handle:
        handle.write(serialized)

def load_obj(name):
    '''Read "<name>.json" and return the decoded Python object.'''
    source_path = name + '.json'
    with open(source_path, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

### Tor (Anonymous Browsing and access to "Dark Web" pages)

In [None]:
# this code sets web proxy to use Tor at port 9050
# ip address inside this code should be different from public ip of the running computer
import socks
import socket
import requests

# changes default
#socks.setdefaultproxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.1", port=9050)
#socket.socket = socks.socksocket

In [None]:
# The default-proxy lines in the previous cell are commented out, so a plain
# requests.get() would try to reach the .onion host directly. Send it through the
# local Tor SOCKS proxy instead (socks5h = the proxy also does the DNS lookup).
tor_proxies = {'http':  'socks5h://127.0.0.1:9050',
               'https': 'socks5h://127.0.0.1:9050'}
print(requests.get("http://lchudifyeqm4ldjj.onion/?ai=1675", proxies=tor_proxies).text)

In [None]:
import requests
session = requests.session()
# Tor uses the 9050 port as the default socks port
# make sure tor is running
session.proxies = {'http':  'socks5h://127.0.0.1:9050',
                   'https': 'socks5h://127.0.0.1:9050'}

# Make a request through the Tor connection
# IP visible through Tor
print(session.get("http://httpbin.org/ip").text)
# Above should print an IP different than your public IP

# Following prints your normal public IP
print(requests.get("http://httpbin.org/ip").text)

In [None]:
# Dream Market url: http://lchudifyeqm4ldjj.onion/?ai=1675
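# Dream Market username: <not stored in the notebook - supply it at login, e.g. from an environment variable>
# Dream Market password: <not stored in the notebook - supply it at login, e.g. from an environment variable>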

In [None]:
session.get("http://lchudifyeqm4ldjj.onion/?ai=1675").text #Dream Market url

In [None]:
# for each listing collected, go to product listing page and get Product Description and Product-specific Reviews

# for each vendor in vendor list, go to their vendor page and get stats
# e.g. http://lchudifyeqm4ldjj.onion/contactMember?member=vendor_name


# go to Ratings tab and get Ratings distribution table as additional predictors

# get avg order value from 50 most recent reviewers?

In [None]:
# above code is not necessary to run Selenium below

### Selenium to navigate sites

In [None]:
# code to make Selenium work with Tor browser
# must open TBB (Tor Browser Bundle) before running this, so that you establish a Tor circuit

import os
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium import webdriver

# path to the firefox binary inside the Tor package
binary = '/Applications/TorBrowser.app/Contents/MacOS/firefox'
if os.path.exists(binary) is False:
    raise ValueError("The binary path to Tor firefox does not exist.")
firefox_binary = FirefoxBinary(binary)

browser = None
def get_browser(binary=None):
    '''Return the shared Firefox driver, launching it with `binary` on first use.

    The driver is cached in the module-level `browser`, so repeated calls
    reuse one window (drop the global to allow multiple instances).
    '''
    global browser
    if browser:
        return browser
    # Nothing cached yet: start the browser and remember it for later calls.
    browser = webdriver.Firefox(firefox_binary=binary)
    return browser

if __name__ == "__main__":
    browser = get_browser(binary=firefox_binary)
    urls = (
        ('tor browser check', 'https://check.torproject.org/'),
        ('ip checker', 'http://icanhazip.com')
    )
    for url_name, url in urls:
        print("getting", url_name, "at", url)
        browser.get(url)

In [None]:
#browser.get("https://check.torproject.com")
browser.get("http://lchudifyeqm4ldjj.onion/?ai=1675") # Navigate to Dream Market

##### ...log in with user credentials manually (to get past bot-detection captcha)

In [None]:
# After log in, go to Cocaine listings
browser.get("http://lchudifyeqm4ldjj.onion/?category=187") 

In [None]:
# Get two big Javascript variables on the page that contain data about vendors and listings displayed
#vendor_data = browser.execute_script("return proddata;")
#proddata = browser.execute_script("return proddata;")

In [None]:
from bs4 import BeautifulSoup

def make_listings_dict(html_doc, domain="http://jd6yhuwcivehvdt4.onion"):
    '''Parse a results page into a dict of listing records.

    Parameters
    ----------
    html_doc : str
        HTML source of a listings results page.
    domain : str, optional
        Scheme + host prepended to the page's relative hrefs to build absolute
        links. Defaults to the address this notebook has always used.

    Returns
    -------
    dict
        Maps each product title to a dict with the keys product_title,
        product_link, escrow, btc_price, vendor_name, vendor_link,
        successful_transactions, rating and ships_from_to. Values are the
        page's text; rating is None when the listing has no
        "userRating gold" span.

    Notes
    -----
    The result is keyed by title, so listings that share a title overwrite
    each other (the last one on the page wins). A listing that lacks one of
    the other expected elements raises AttributeError.
    '''
    soup = BeautifulSoup(html_doc, 'html.parser')

    listings_dict = {}

    # each displayed listing in the results page is wrapped in <div class="around">
    for listing in soup.find_all("div", class_="around"):

        title = listing.find("div", class_="text oTitle")
        title_text = title.find("a").get_text().strip()

        # hrefs are relative: drop their first character, then prefix the domain
        product_link = domain + title.a["href"][1:]

        body = listing.find("div", class_="oOfferBody")

        escrow = body.find("div", class_="escrowInfo").find("div").get_text()

        btc_price = body.find("div", class_="bottom oPrice").get_text().strip()

        # first a tag in the vendor div is the vendor's name and profile link
        vendor_tag = body.find("div", class_="oVendor").find("a")
        vendor_name = vendor_tag.get_text().strip()
        vendor_link = domain + vendor_tag["href"][1:]

        transactions = body.find("span", title="Successful transactions").get_text().replace("(","").replace(")","")

        # look the rating span up once; it is absent for some listings
        rating_tag = body.find("span", class_="userRating gold")
        rating = rating_tag.get_text().strip() if rating_tag else None

        ships_from_to = body.find("span", class_="osBod").get_text().strip()

        listings_dict[title_text] = {
                                "product_title": title_text,
                                "product_link" : product_link,
                                "escrow"       : str(escrow),
                                "btc_price"    : btc_price,
                                "vendor_name"  : vendor_name,
                                "vendor_link"  : vendor_link,
                                "successful_transactions" : transactions,
                                "rating" : rating,
                                "ships_from_to" : ships_from_to
                               }

    return listings_dict

In [None]:
def save_page():
    '''Parse the page currently shown in `browser` and save it as "page<N>.json".

    N is taken from the `page` query parameter of the current URL. The first
    results page is opened as "?category=187", which has no `page` parameter;
    it is saved as page 1, provided listings were actually found on it.

    Raises
    ------
    KeyError
        If the URL has no `page` parameter and the page held no listings
        (i.e. the browser is probably not on a results page).
    '''
    page = make_listings_dict(browser.page_source)

    query = urlparse.parse_qs(urlparse.urlparse(browser.current_url).query)
    if 'page' in query:
        page_num = query['page'][0]
    elif page:
        # un-numbered results page that does contain listings: the first page
        page_num = '1'
    else:
        raise KeyError('page')

    save_obj(page, "page" + page_num)
    print("Saved file: page" + page_num + ".json")
    return

Find the "Next Page" link and navigate to the page it points to

In [None]:
def go_to_next_page():
    '''Send the shared `browser` to the URL of the results pager's "Next page" link.'''
    # The pager link looks like:
    #   <a class="gPager lastPager" title="Next page" href=...> </a>
    pager_link = browser.find_element_by_class_name("lastPager")

    # Load the link's href (e.g. href="./?page=3") directly rather than clicking it.
    browser.get(pager_link.get_attribute("href"))

In [None]:
### browse and collect listings ###
for i in range(1, 3000):

    try:
        save_page()
        go_to_next_page()
        time.sleep(15 + (random.randint(0, 3000) / 1000)) # 15s plus a random 0-3s
    except Exception as err:
        # Catch Exception rather than everything, so a kernel interrupt (Ctrl-C)
        # is not swallowed, and report what actually stopped the crawl
        # (e.g. no "Next page" link on the last page).
        print("Error after {} pages.".format(i))
        print("Cause: {!r}".format(err))
        break

    # take a longer pause after every tenth page
    if i % 10 == 0:
        time.sleep(10)
        #browser.get("http://lchudifyeqm4ldjj.onion/?category=104")
        #browser.back()
    