In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from pathlib import Path
import os
import glob

In [3]:
# Define cleanhtml function
def cleanhtml(filename):
  """Turn one saved Hydra category page (post-5/08/14 format) into a DataFrame.

  The page is flattened to a single line of text, the header text between
  'HYDRA' and 'Price' and the trailing text (from an escaped '<' or from
  '   Displaying') are removed, and what is left is split on runs of three or
  more whitespace characters. Listing fields are then read positionally from
  that token list, eight tokens per listing.

  filename: path of the html file. The scrape date is taken from the last ten
    characters of the grandparent directory of this path.

  Returns a DataFrame with the columns Title, Sellerid, PriceUSD, PriceBTC,
  Rating, Reviews, Origin, Destination, Category, Market and Date. All values
  are kept as the scraped text.
  """
  with open(filename, encoding='utf8') as handle:  # utf8 so non-ASCII listing text survives
    page = str(BeautifulSoup(handle, "html.parser"))

  # Clean-up passes, applied in this order: (pattern, replacement).
  cleanup_passes = (
      ('<.*?>', ' '),                             # html tags
      (' ?\n ?', ' '),                            # newlines (with one optional space either side)
      ('HYDRA.*?Price', ''),                      # header text / category, origin, destination options
      ('( *&lt;.*?$)|(   Displaying.*?$)', ''),   # everything from the trailing marker to the end
  )
  text = page
  for pattern, replacement in cleanup_passes:
    text = re.sub(pattern, replacement, text)

  tokens = re.split(r'\s{3,}', text)

  def column(offset):
    """Every eighth token starting at `offset`: one field across all listings."""
    return tokens[offset::8]

  title = column(3)
  n_rows = len(title)

  # tokens[1] holds the category, optionally followed by ' :: <subcategory>';
  # only the part before ' :: ' is used here.
  page_category = tokens[1].split(' :: ')[0].strip()
  scrape_date = str(Path(filename).parents[1])[-10:]

  # One inner list per output column; transposing turns them into rows.
  fields = [
      title,                       # Title of listing
      column(4),                   # Seller ID/Username
      column(7),                   # Price in American Dollars
      column(8),                   # Price in Bitcoin
      column(5),                   # Seller rating
      column(6),                   # Number of customer reviews
      column(9),                   # Origin of shipment
      column(10),                  # Where the seller is willing to ship to
      [page_category] * n_rows,    # Same category for every listing on the page
      ['Hydra'] * n_rows,          # Market name
      [scrape_date] * n_rows,      # Scrape date
  ]
  frame = pd.DataFrame(fields).transpose()
  frame.columns = ['Title','Sellerid','PriceUSD','PriceBTC','Rating','Reviews','Origin','Destination','Category','Market','Date']
  return frame

#cleanhtml('D:/Darkweb Data/dnmarchives/hydra.tar/hydra/2014-10-24/category/1120.html.html') #test

In [4]:
# Define cleanhtml2 function for listings before 2014-08-09 but after 2014-04-03
def cleanhtml2(filename):
  """Extracts relevant information from Hydra html files. Pre-5/08/14, post-2014-04-03 format.

  The page is flattened to one line of text and the header between the first
  '✧' and the first 'Price' is removed. The category is the second-to-last
  token (split on 3+ whitespace) before the next 'Price'. The text after that
  'Price', with the trailing part (from an escaped '<' or from '←') removed,
  is split on runs of two or more whitespace characters; listings are read
  from it positionally, seven tokens per listing, starting at index 2.

  filename: path of the html file. The scrape date is taken from the last ten
    characters of the grandparent directory of this path.

  Returns a DataFrame with the columns Title, Sellerid, PriceUSD, PriceBTC,
  Rating, Reviews, Origin, Destination, Category, Market and Date. Reviews is
  always empty (None) because this parser does not extract a review count.

  Raises IndexError if the text before the listing header splits into fewer
  than two tokens.
  """
  with open(filename, encoding='utf8') as infile:  # Open file to support utf8 encoding
    raw_html = str(BeautifulSoup(infile, "html.parser"))
  minustags = re.sub('<.*?>', ' ', raw_html)  # Strip html tags
  minusnline = re.sub(' ?\n ?', ' ', minustags)  # Strip newlines so the page is one line
  minuscatloc = re.sub('✧.*?Price', '', minusnline)  # Strip majority of header text, category/origin/destination options

  # Category: drop everything from the next 'Price' onwards, then take the
  # second-to-last token of what precedes it.
  minusuppercats = re.sub('Price.*?$', '', minuscatloc)
  category_name = re.split(r'\s{3,}', minusuppercats)[-2]  # The category of item

  # Listings: cut the trailing text, then everything up to and including 'Price'.
  minusend = re.sub(' *&lt;.*?$|←.*?$', '', minuscatloc)
  cleantext = re.sub('^.*?Price', '', minusend)
  attribute_list = re.split(r'\s{2,}', cleantext)

  # Keep only complete listings. A listing occupies 7 consecutive tokens and
  # the first one starts at index 2 (title at 2, ..., BTC price at 8), so the
  # usable length is 2 + 7 * n. Truncating to a plain multiple of 7 (as this
  # function used to) always cut the two price tokens off the last listing.
  n_listings = max(0, (len(attribute_list) - 2) // 7)
  attribute_list = attribute_list[:2 + 7 * n_listings]

  title = attribute_list[2::7]  # Title of listing
  sellerid = attribute_list[3::7]  # Seller ID/Username
  rating = attribute_list[4::7]  # Seller Rating. Out of 5.
  origin = attribute_list[5::7]  # Origin of shipment
  destination = attribute_list[6::7]  # Where seller is willing to ship to
  price = attribute_list[7::7]  # Price token: all price types (USD, BTC and sometimes LTC)
  priceUSD = [i.split(' ', 1)[0] for i in price]  # First element of the price token
  priceBTC = [i.split(' ', 1)[0] for i in attribute_list[8::7]]
  reviews = [None] * len(title)  # Not extracted for this format; explicit column of missing values
  category = [category_name] * len(title)  # Fill column with single category value
#  subcat = subcat_dict[int(filename[64:68])] #### NEED to iterate over whole list
  market = ['Hydra'] * len(title)  # Fill column with single market name value
  date = [str(Path(filename).parents[1])[-10:]] * len(title)  # Fill column with single date value
  df = pd.DataFrame([title,sellerid,priceUSD,priceBTC,rating,reviews,origin,destination,category,market,date]).transpose()  # Create df and transpose
  df.columns = ['Title','Sellerid','PriceUSD','PriceBTC','Rating','Reviews','Origin','Destination','Category','Market','Date']  # Assign column names to df
  return df

#cleanhtml2('D:/Darkweb Data/dnmarchives/hydra.tar/hydra/2014-04-12/category/1130.html.html') #test

In [5]:
subcat_dict = {1100: 'Cannabis', 1110: 'Concentrates', 1120: 'Hash', 1130: 'Weed', 1140: 'Synthetics', 1190: 'Other',
              1200: 'Ecstacy', 1210: 'MDMA', 1220: 'Pills', 1290: 'Other',
              1300: 'Opioids', 1310: 'Heroin', 1390: 'Other',
              1400: 'Dissociatives', 1410: 'Ketamine', 1420: 'GBH', 1490: 'Other',
              1500: 'Psychedelics', 1510: 'LSD', 1520: 'DMT', 1530: 'Mescaline', 1540: 'Mushrooms', 1590: 'Other',
              1600: 'Stimulants', 1610: 'Cocaine', 1620: 'Speed', 1630: 'Meth', 1690: 'Other',
              1700: 'Prescription', 1710: 'Prescription', 
              1800: 'Benzos', 1810: 'Benzos',
              1900: 'Steroids', 1910: 'Steroids',
              2000: 'Services', 2010: 'Services',
              2100: 'Tobacco', 2110: 'Tobacco',
              2200: 'Weapons', 2210: 'Arms', 2220: 'Explosives', 2230: 'Ammo', 2290: 'Other',
              2300: 'Other', 2310: 'Other',
              2400: 'Custom', 2410: 'Custom',
              2500: 'Paraphenalia', 2510: 'Scales', 2520: 'Rolling Supplies', 2530: 'Reagent Test', 2540: 'Pipes', 2550: 'Machinery', 2560: 'Grinders', 2570: 'Bongs', 2580: 'Accessories', 
              2600: 'Apparel', 2610: 'Sunglasses', 2620: 'Handbags', 2630: 'Watches', 2690: 'Other',
              2700: 'Digital Goods', 2710: 'Software', 2720: 'Ebooks', 2730: 'Money', 2740: 'Erotica'}

subcat_dict[2510]

'Scales'

In [6]:
# Per-file dataframes, collected in two lists because the two page layouts
# are parsed by different functions.
data = []   # frames produced by cleanhtml  (scrape folders after 2014-08-05)
data2 = []  # frames produced by cleanhtml2 (scrape folders up to and including 2014-08-05)

# Specify file directory with folders for different scraping sessions
datedir = "D:/Darkweb Data/dnmarchives/hydra.tar/hydra"

# Apply the matching file cleaning function to every file of each scraping
# session and collect the per-file dataframes.
for scrapedate in os.listdir(datedir):
    directory = datedir + '/' + scrapedate + '/category/'
##    converthtml(directory, r'*.html*', r'%s')
    for filename in os.listdir(directory):
        file = directory + filename  # directory already ends with '/', so no extra separator is needed
        # Plain string comparison; assumes the session folders are named YYYY-MM-DD.
        if scrapedate > '2014-08-05':
            data.append(cleanhtml(file))
        else:
            data2.append(cleanhtml2(file))

# Concatenate once, after the loop: concatenating inside the loop re-copied
# every frame collected so far for each new file. The empty-frame fallback
# keeps df/df2 defined even when one of the two layouts has no files.
df = pd.concat(data) if data else pd.DataFrame()
df2 = pd.concat(data2) if data2 else pd.DataFrame()

NameError: name 'subcat' is not defined

In [30]:
# Combine the frames from both page layouts and drop exact duplicate rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataframe = pd.concat([df, df2]).drop_duplicates()
dataframe

Unnamed: 0,Title,Sellerid,PriceUSD,PriceBTC,Rating,Reviews,Origin,Destination,Category,Market,Date
0,*New Vendor Offer* UK Blue Cheese (2 Grams),LDNGstar,$25.00,0.04269 BTC,5.0,1,United Kingdom,European union,Cannabis,Hydra,2014-08-09
1,3.5 g Sample Blue Dream Greenhouse Top Shelf,TheSecretGarden,$29.00,0.04953 BTC,5.0,0,United States,Worldwide,Cannabis,Hydra,2014-08-09
2,1 g Sample Berry Diesel Haze Honeycomb,TheSecretGarden,$45.00,0.07685 BTC,5.0,0,United States,Worldwide,Cannabis,Hydra,2014-08-09
3,4 Oz. (112 g) Berry Diesel Haze Honeycomb,TheSecretGarden,$2800.00,4.78183 BTC,5.0,0,United States,United States,Cannabis,Hydra,2014-08-09
4,2 Oz. (56 g) Berry Diesel Haze Honeycomb,TheSecretGarden,$1500.00,2.56169 BTC,5.0,0,United States,United States,Cannabis,Hydra,2014-08-09
...,...,...,...,...,...,...,...,...,...,...,...
5,French Guide : Le carding de A à Z,Hackyboy,$40.00,฿0.06867660,4.3,,Worldwide,Worldwide,Digital Goods,Hydra,2014-08-05
6,"PORN PASSES! - Lifetime Access to Brazzers, Mo...",MagicHat,$10.00,฿0.01716915,4.8,,Worldwide,Worldwide,Digital Goods,Hydra,2014-08-05
7,LiveJasmin.com [Lifetime Account] Access the B...,MagicHat,$10.00,฿0.01716915,4.8,,United States,Worldwide,Digital Goods,Hydra,2014-08-05
8,Passion-Hd.com Account - [LIFETIME PORN PREMIU...,NotoSeller,$5.00,฿0.00858458,5.0,,Worldwide,Worldwide,Digital Goods,Hydra,2014-08-05


In [31]:
# Check for inconsistencies: show the 40 most frequent Origin values.
# Entries that are not a place name (a rating or a seller name, for example)
# would suggest listings whose fields were misaligned during parsing.
origin_counts = dataframe['Origin'].value_counts()
origin_counts.head(40)

Worldwide                 17198
United States             11245
Germany                    4528
United Kingdom             3642
Undeclared                 2907
Canada                     2779
European union             2592
Netherlands                1925
Sweden                     1627
5.0                        1103
China                       938
Australia                   628
European Union              567
Hungary                     474
Denmark                     469
Spain                       258
Philippines                 188
Italy                       172
Austria                     168
Belgium                     163
Mexico                      155
Ireland                     144
Singapore                   120
Poland                      106
GoodmedsDK                  100
3.0                          85
Sri Lanka                    75
Switzerland                  74
New Zealand                  68
India                        58
persianrugsuk                45
Czech Re

In [None]:
# RDS info
#db name: cryptomarket-1
#username: dept922
#password: <redacted> (keep credentials out of the notebook; read them from an environment variable)

In [None]:
# Saves data to PostgreSQL table
#engine = create_engine('postgresql://[USERNAME]:[PASSWORD]@[DATABASE].as-southeast-1.rds.amazonaws.com:5432/postgres', echo=True)
#df.to_sql('posting',engine, if_exists='replace', index = False)