In [1]:
import re
import requests
import urllib.request as img_request
import time
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from os import makedirs
from os import listdir
from os import getcwd
import os

from sklearn.model_selection import train_test_split

# Init

Defining regex tags

In [None]:
# Regular expressions applied to the raw HTML returned by sailboatdata.com.

# Page number embedded in a pagination link; used to find the amount of pages on the domain.
n_pages_tag = re.compile(r'<a href=.*page=(\d*).*/li>')

# Link text of every anchor pointing at a /sailboat/ page; used to list all the boats.
name_tag = re.compile(r'<a href=\"https://sailboatdata\.com/sailboat/.*\">(.*)</a>')

# (label, value) pairs scraped from the page of one specific boat.
specs_tag = re.compile(r'<div class=\" col-\w\w-\d*  col-\w\w-6 sailboatdata-label \">\s*(.*):\s</div>\s<.*\s*(.*)')

# Source URL of an element marked as photo or drawing.
image_tag = re.compile(r'(?:photo|drawing)\".*src=\"(http.*)\"/>')

# Word characters that directly follow "sailboat/".
photo_draw_tag = re.compile(r'sailboat/(\w*)')

In [2]:
# Directory the notebook is running from.
cwd = getcwd()

# State that the scraping loop fills in: the latest spec values / labels,
# the accumulated table, and the running image number.
specs, categories = [], []
boat_data = pd.DataFrame()
counter = 0

# Output locations, relative to the working directory.
data_dir = "data/"
img_dir = "data/images/"

In [None]:
# Make sure the image download folder is there before scraping starts.
img_dir_missing = not os.path.exists(img_dir)
if img_dir_missing:
    print("Making dir", img_dir)
    makedirs(img_dir)

# Scraping

In [None]:
# Collect the name of every boat by walking the paginated listing.
general_url = 'https://sailboatdata.com/sailboat?page={}&paginate=25'
request_timeout = 30  # seconds; without a timeout requests.get can wait forever on a stalled connection

print("Connecting to" , general_url.format(1))
init_request = requests.get(general_url.format(1), timeout=request_timeout)
print(init_request.status_code)

# Page numbers found in the pagination links of the first page; the last one is
# taken as the total page count. Empty captures are dropped because the pattern
# uses \d*, which can match nothing and would later break int().
pages = [page for page in re.findall(n_pages_tag , init_request.text) if page]
if not pages:
    raise RuntimeError(
        "No pagination links found on {} (HTTP status {})".format(
            general_url.format(1), init_request.status_code
        )
    )
n_pages = pages[-1]

boat_types = []
for i in tqdm(range(1, int(n_pages) + 1)):
    r = requests.get(general_url.format(i), timeout=request_timeout)
    boat_types += re.findall(name_tag, r.text)
    time.sleep(0.5)  # To not throw too many requests at the website

print(len(boat_types))

In [None]:
# Scrape every boat page: download its images and record one spec row per image.
request_timeout = 30  # seconds; keeps a stalled connection from hanging the whole run
save_every = 100      # write a checkpoint CSV once at least this many new images were handled
boat_url = "https://sailboatdata.com/sailboat/{}?units=metric"

# Rows are collected in a plain list and turned into a DataFrame only when saving;
# growing a DataFrame with pd.concat inside the loop copies the whole table every time.
# Starting from the rows already in boat_data keeps the "append" behaviour of this cell.
records = boat_data.to_dict("records")
last_saved_at = counter

for boat in boat_types:
    # GETTING REQUEST
    print("Scraping",boat,"...")
    slug = boat.replace(" ","-").replace("(","").replace(")","").replace(".","")
    try:
        r = requests.get(boat_url.format(slug), timeout=request_timeout)
    except requests.RequestException as exc:
        # One unreachable page should not abort a scrape of the whole listing.
        print("Request failed for", boat, "-", exc)
        continue
    raw_text = r.text

    # SCRAPING SPECS (the same for every image of this boat, so parsed once per page)
    categories_specs = re.findall(specs_tag,raw_text)
    categories = [el[0] for el in categories_specs]
    specs = [el[1] for el in categories_specs]

    # SCRAPING IMAGES
    images = re.findall(image_tag,raw_text)
    print("         -------scraping")
    for image in images:
        img_path = str(counter).zfill(5)+".jpg"
        target = img_dir+img_path
        counter += 1

        # Look for the file where it is actually written (img_dir), so that a
        # re-run does not download images that are already on disk.
        if not os.path.isfile(target):
            print(image)
            try:
                img_request.urlretrieve(image,target)
            except Exception:
                # Retry with spaces percent-encoded, using the escaped URL for the
                # request itself and not only for the log line.
                escaped = image.replace(" ","%20")
                print("Strange url" , escaped)
                try:
                    img_request.urlretrieve(escaped,target)
                except Exception as exc:
                    # The row is still recorded below; the image-export step later
                    # drops rows whose image file cannot be opened.
                    print("Download failed", image, "-", exc)

        new_row = dict(zip(categories, specs))
        new_row['name'] = boat
        new_row['img_path'] = img_path
        records.append(new_row)

    print("Done\n")
    # Checkpoint on progress since the last save rather than on an exact counter
    # value, which a boat with several images can jump over.
    if counter - last_saved_at >= save_every:
        print("saving")
        boat_data = pd.DataFrame(records)
        boat_data.to_csv(data_dir+"boat_data.csv" , index=False)
        last_saved_at = counter

boat_data = pd.DataFrame(records)
boat_data.to_csv(data_dir+"boat_data.csv" , index = False)

In [None]:
# Show every column when a DataFrame is rendered, then reload the scraped table from disk.
pd.set_option('display.max_columns', None)
raw_csv_path = data_dir+"boat_data.csv"
data_raw = pd.read_csv(raw_csv_path)
data_raw

# Cleaning

In [None]:
# A number followed by a metric unit ("m" or "kg") and any "<...>" runs right after it;
# group 1 is the bare number.
units_cleaner_tag = re.compile(r'([\d.]+)(\s*m|\s*kg)(<.*>)*')

def clean_row(row, skip_index=32):
  """Convert the unit-suffixed strings of one scraped row into floats.

  For every value, thousands separators (",") are removed, a "<number> m" /
  "<number> kg" match is reduced to the number, and the result is passed to
  float(). Values that cannot be converted this way (free text, missing
  values, numbers that are not strings) are kept unchanged.

  Parameters:
    row: a sequence or pandas Series holding the values of one row.
    skip_index: position of the one value that is copied through without any
      conversion attempt. Defaults to 32, the position this notebook has
      always skipped -- presumably a text column; confirm against the CSV layout.

  Returns:
    A list with one entry per input value, in the same order.
  """
  cleaned_row = []
  # Iterating over the values (instead of row[i]) works for lists and for
  # label-indexed Series alike, without relying on positional integer lookups.
  for position, value in enumerate(row):
    if position == skip_index:
      cleaned_row.append(value)
      continue
    try:
      number_text = units_cleaner_tag.sub(r'\g<1>', value.replace(',', ''))
      cleaned_row.append(float(number_text))
    except (AttributeError, TypeError, ValueError):
      # AttributeError/TypeError: the value is not a string; ValueError: not a number.
      cleaned_row.append(value)
  return cleaned_row

In [None]:
# Run every raw row through clean_row, keeping the rows in their original order.
cleaned_data = [clean_row(raw_row) for _, raw_row in data_raw.iterrows()]

In [None]:
# Rebuild a DataFrame from the cleaned rows, with the same columns as the raw table.
data_clean = pd.DataFrame(cleaned_data , columns = data_raw.columns)

# Text columns that are kept as raw strings instead of being label-encoded.
not_encoded = ("Download Boat Record", "url", "img_path")

for column in data_clean.columns:
    if data_clean[column].dtype == "object" and column not in not_encoded:
        data_clean[column] = data_clean[column].astype('category')
        # Write the category labels, one per row, next to the data.
        # "/" in a column name would be read as a directory separator in the
        # file name, so it is replaced the same way the plotting code does.
        label_file = data_dir + column.replace('/', "_") + ".txt"
        pd.DataFrame(data_clean[column].cat.categories).to_csv(label_file , index = False , header = False)
        # Replace each label with its integer category code.
        data_clean[column] = data_clean[column].cat.codes

data_clean.to_csv(data_dir+"boat_data_clean.csv" , index = False)

# Splitting

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
data_train, data_test = train_test_split(
    data_clean,
    test_size=0.2,
    random_state=42,
)


In [None]:
train_dir = "data/train/"
test_dir = "data/test/"


def _export_split_images(split_data, target_dir):
    """Copy the images of one split into target_dir as RGB files.

    Every file named in split_data['img_path'] is opened from img_dir,
    converted to RGB and saved under the same name in target_dir (created if
    it does not exist yet).

    Returns split_data without the rows whose image could not be processed.
    """
    if not(os.path.exists(target_dir)):
        print("Making dir" , target_dir)
        makedirs(target_dir)

    failed = []
    for image in split_data['img_path']:
        try:
            # The context manager closes the source file even when conversion fails.
            with Image.open(img_dir+image) as img:
                img.convert('RGB').save(target_dir+image)
        except Exception as exc:
            # Also reached for unreadable or corrupt files, so the actual error is shown.
            print("Image not found" , image , "-" , exc)
            failed.append(image)

    # Drop all failed rows in one pass instead of re-filtering after every failure.
    return split_data[~split_data['img_path'].isin(failed)]


data_train = _export_split_images(data_train, train_dir)
data_test = _export_split_images(data_test, test_dir)


In [None]:
# Write both splits next to the other data files, without the index column.
for split_frame, csv_name in ((data_train, "boat_data_train.csv"), (data_test, "boat_data_test.csv")):
    split_frame.to_csv(data_dir + csv_name, index=False)

In [None]:
# Display the cleaned table.
data_clean

In [None]:
# Display the training split.
data_train

In [None]:
# Display the test split.
data_test

In [None]:
# Number of rows in the cleaned table.
len(data_clean)

# Plotting

In [None]:
# Folder that receives the generated figures; created on first use.
plot_dir = "plots"
plot_dir_missing = not os.path.exists(plot_dir)
if plot_dir_missing:
    print("Making dir", plot_dir)
    os.makedirs(plot_dir)

In [None]:
# Save one figure per feature column and per split (train / test).
train_test = ["train" , "test"]
# Scales the "too rare to plot" threshold per split; 0.2 presumably mirrors the test fraction.
size_factor = [1,0.2]
# Columns that are identifiers or free text rather than features.
skipped_columns = ("Download Boat Record", "name", "url", "img_path")

for i , data in enumerate([data_train , data_test]):
    for column in data_clean.columns:
        if column in skipped_columns:
            continue
        print("Plotting" , column)
        fig = plt.figure(figsize=(10*2,5*2))
        if data[column].dtype == "object":
            # Text column: bar chart of relative frequencies, rare values left out.
            to_remove = 50*size_factor[i]
            try:
                dat = data[column].value_counts()
                dat = dat[dat > to_remove]
                dat = dat/len(data[column]) # Normalising
                dat.plot(kind='barh')
                plt.annotate(text = "Total entries were "+str(len(data[column].value_counts())) , xy = (0.8,0.95) , xycoords = "axes fraction")
                plt.annotate(text = "Removed entries where n<"+str(to_remove) , xy = (0.8,0.9) , xycoords = "axes fraction")
            except Exception:
                # Fall back to the unfiltered counts when the filtered plot fails.
                data[column].value_counts().plot(kind='barh')
        else:
            # Numeric column: histogram with a logarithmic count axis.
            data[column].plot(kind='hist' , logy=True)
        fig.name = column
        fig.suptitle(column+str(" ")+train_test[i] , fontsize=16)
        # "/" in a column name would be read as a directory separator.
        path = os.path.join(plot_dir, column.replace('/',"_")+"_"+train_test[i]+".png")
        fig.savefig(path)
        # Close (not just clear) the figure so pyplot releases it; otherwise every
        # figure created in this loop stays registered and memory keeps growing.
        plt.close(fig)

# New: normalised label encoding and re-split

In [3]:
# Reload the cleaned table from disk.
clean_csv_path = data_dir + "boat_data_clean.csv"
data_clean = pd.read_csv(clean_csv_path)

In [5]:
# Normalise the text columns (trim, drop trailing dots, lower-case) so that
# spelling variants share one label, then replace the labels with integer codes.
labels_dir = data_dir+"labels/"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(labels_dir, exist_ok=True)

# Text columns that are kept as raw strings instead of being label-encoded.
not_encoded = ("Download Boat Record", "url", "img_path")

for column in data_clean.columns:
    is_text = data_clean[column].dtype == "object" or data_clean[column].dtype == "category"
    if is_text and column not in not_encoded:
        data_clean[column] = data_clean[column].str.strip().str.rstrip('.').str.lower()
        data_clean[column] = data_clean[column].astype('category')
        # One label per row; "/" is replaced as well because it would be read
        # as a directory separator in the file name.
        label_file = labels_dir + column.replace(" ","_").replace("/","_") + ".txt"
        pd.DataFrame(data_clean[column].cat.categories).to_csv(label_file , index = False , header = False)
        data_clean[column] = data_clean[column].cat.codes

In [116]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
data_train, data_test = train_test_split(
    data_clean,
    test_size=0.2,
    random_state=42,
)

In [117]:
# Write both splits to CSV without the index column.
train_csv_path = data_dir + "boat_data_train.csv"
test_csv_path = data_dir + "boat_data_test.csv"
data_train.to_csv(train_csv_path, index=False)
data_test.to_csv(test_csv_path, index=False)

In [130]:
# Number of rows in the training split.
len(data_train)

8396

In [131]:
# Number of rows in the test split.
len(data_test)

2099

In [118]:
# Display the "Hull Type" column of the training split.
data_train["Hull Type"]

2846    16
6875    19
7273    23
9304    19
7917    25
        ..
5734    25
5191    25
5390    40
860     19
7270    25
Name: Hull Type, Length: 8396, dtype: int8

In [119]:
# How many training rows fall into each "Hull Type" value, most frequent first.
data_train["Hull Type"].value_counts()

23    1617
25    1245
19     959
40     623
34     465
      ... 
62       1
43       1
26       1
47       1
55       1
Name: Hull Type, Length: 66, dtype: int64

In [121]:
# Display the image file name recorded for every row.
data_clean['img_path']

0        00000.jpg
1        00001.jpg
2        00002.jpg
3        00003.jpg
4        00004.jpg
           ...    
10490    10490.jpg
10491    10491.jpg
10492    10492.jpg
10493    10493.jpg
10494    10494.jpg
Name: img_path, Length: 10495, dtype: object

In [129]:
# Count the rows that point at the first image file.
int((data_clean['img_path'] == "00000.jpg").sum())

1