In [1]:
!pip install requests beautifulsoup4



In [2]:
# live scraping
import requests 
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import json

#  classification
from fastai.vision.all import *

#  recommendation
import numpy as np
from collections import defaultdict

In [29]:
class Classification_Recommendation():
    def __init__(self):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']

    
    
    def access_webpage(self, url):
        driver = webdriver.Chrome('./chromedriver') 
        driver.get(url)
        html = driver.page_source 
#         print(html)
        soup = BeautifulSoup(html, 'html.parser')
        
        return soup
    
    
    
    def scrape_data(self, container, items, item_len):
        item_n = [] 
        item_link = []
        item_img = []
        item_price = []
        
        i = 0
        while i != item_len:
            for div in container:
                for item in items:
                    
                    img_len = len(items[2])
                    item_name = item[0].lower()
                    
                    data_fetched = div.findAll(item[1], class_ = item[2])

                    if item_name == 'name':
                        item_n.extend(data_fetched)
                    elif item_name == 'link':
                        item_link.extend(data_fetched)
                    elif item_name == 'image':
                        item_img.extend(data_fetched)
                    elif item_name == 'price':
                        item_price.extend(data_fetched)
                    
                    
                i += 1
                if i == item_len:
                    break
                    
                    
        return  item_n, item_link, item_img, item_price
        
        
        
    def Filter(self, item_n, item_link, item_img, item_price):
        names = []
        prices = []
        links = []
        imgs = []
        
        for name in item_n:
            names.append(name.get_text())
            
        for link in item_link:
            links.append(link['href'])
            
        for price in item_price:
            prices.append(price.get_text())
            
        for img in item_img:
            imgs.append(img['src'])
        
        
        return names, prices, links, imgs
            
            
        
#  fetch items    
    def scrape(self, directories):
        self.dataset = []
        
        for c in range(len(directories)):
            f = open(directories[c])
            parameters = json.load(f)
            
            
            for param in parameters:
                time.sleep(5)
                
                soup = self.access_webpage(param['url'])
                
                con = param['main_container']
                container = soup.findAll(con[0], class_ = con[1])
#                 print(container)
                item_n, item_link, item_img, item_price = self.scrape_data(container, param['fetch_data_item'], param['item_len'])
                
                names, prices, links, imgs = self.Filter(item_n, item_link, item_img, item_price)


                for i in range(param['item_len']):
                    data = {
                        'Name': names[i],
                        'Price': prices[i],
                        'Image': imgs[i],
                        'Link' : links[i],
                        'Category': param['Category'],
                        'Store' : param['Store'],
                        'Gender' : param['Gender']
                    }

                    self.dataset.append(data)
                    
                print('Store:',param["Store"]    ,'Item:',param["Category"],' Gender',param["Gender"])
            
        return self.dataset
    
    
    
    
    def CM_Models(self, gender, img):
        cm_path = "C:\\Users\\emielou\\Desktop\\scraping\\ClassificationModels\\"

        if gender.lower() == 'man':
            cmodel = load_learner(f'{cm_path}m_classification\\m_classification.pkl')
        else:
            cmodel = load_learner(f'{cm_path}w_classification\\w_classification.pkl')

        img_category = cmodel.predict(item = img)

        return img_category[0], gender
    
    
    
    
    def display_classified(self, img_category, gender):
        Items = []
        
        for data in self.dataset:
            Items.append(data)
        
        display_items = []
        for i in range(len(Items)):
            if Items[i]['Category'].lower() == img_category.lower() and Items[i]['Gender'].lower() == gender.lower():
                display_items.append(Items[i])
        
        
        return display_items
    
    
    
    
    def generate_training_data(self, dataset):
        data_pairs = []

        for item in dataset:
            X = item['Name']
            Y = [item['Price'], item['Gender'], item['Category']]
            data_pairs.append([X,Y])


        count = defaultdict(int)
        c_item = defaultdict(int)
        
        for data in data_pairs:
            count[data[0]] + 1
            c_item[data[0]] + 1

            for yc in data[1]:
                count[yc] +1

        self.len_d = len(count.keys())
        self.len_item = len(c_item.keys())
        
        self.list_x = sorted(list(count.keys()), reverse=False)
        self.x_index = {x:i for (i, x) in enumerate(self.list_x)}  
        self.index_x = {i:x for (i, x) in enumerate(self.list_x)}
        
        self.item_list = sorted(list(c_item.keys()), reverse=False)
        self.x_item = {x:i for (i, x) in enumerate(self.item_list)}  
        self.item_x = {i:x for (i, x) in enumerate(self.item_list)}


        training_data = []
        for data in data_pairs:
            target = self.x_index[data[0]]
            i_target = [0 for i in range(0, self.len_d )]

            i_target[target] = 1

            i_content = []
            for y_data in data[1]:
                content = self.x_index[y_data]
                item_content = [0 for i in range(0, self.len_d )]
                item_content[content] = 1

                i_content.append(item_content)

            training_data.append([i_target, i_content])


        return np.array(training_data, dtype=object)
    
    
       
    
    def softmax(self, X):
        e_x = np.exp(X - np.max(X))
        return e_x / e_x.sum(axis=0)

    
        

    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u
                

        
        
    def backprop(self, e, h, x): 
        dl_dw2 = np.outer(h, e)  
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)
        
        pass
    
    
    
    def train(self, dataset):
        training_data = self.generate_training_data(dataset)
        
        self.w1 = np.random.uniform(-1, 1, (self.len_d, self.n) )
        self.w2 = np.random.uniform(-1, 1, (self.n, self.len_d) )
             
        self.loss = 0

        
        for i in range(self.epochs):

            for data in training_data:
                y_pred, h, u = self.forward_pass(data[0])
                
                EI = np.sum([(y_pred - y) for y in data[1]], axis=0)
                self.backprop(EI, h, data[0])

                self.loss =  -np.sum([u[y.index(1)] for y in data[1]]) + len(data[1]) * np.log(np.sum(np.exp(u)))
                 
            print('Iteration: ',i, ' Loss: ', self.loss)

        pass
    
    
    
    
    def word_vec(self, item):
        i_index = self.x_index[item]
        item_vec = self.w1[i_index]
        
        return item_vec
    
    
    
    
    def recommendation(self,dataset, item, item_len):
        item_name = []
        item_embed = []
        item_sim = {}
        
        v_w1 =  self.word_vec(item)
        
        for data in dataset:
            item_name.append(data['Name'])
            item_embed.append(self.word_vec(data['Name']))
            
        
        for i in range(len(item_name)):
            v_w2 = item_embed[i]
            A = np.dot(v_w1, v_w2)
            B = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = A / B
            
            word = item_name[i]
            item_sim[word] = theta

        words_sorted = sorted(item_sim.items(), key=lambda kv: kv[1], reverse=True)
        
        get_word = []
        get_sim = []
        
        for word, sim in words_sorted[:item_len]:
            get_word.append(word)
            get_sim.append(sim)
            
        return (get_word, get_sim)

In [30]:
settings = {
    'n': 100,
    'epochs': 10,
    'learning_rate': 0.01
}

In [31]:
srp = Classification_Recommendation()

In [32]:
dir1 = ['C:/Users/emielou/Desktop/scraping/Queries/flipkart_queries.json', 'C:/Users/emielou/Desktop/scraping/Queries/amazon_queries.json'] 

In [33]:
dataset = srp.scrape(dir1)

  driver = webdriver.Chrome('./chromedriver')


Store: Flipkart Item: Jacket  Gender Men
Store: Flipkart Item: Jeans  Gender Men
Store: Flipkart Item: Pants  Gender Men
Store: Flipkart Item: Shorts  Gender Men
Store: Flipkart Item: Shirts  Gender Men
Store: Flipkart Item: Belt  Gender Men
Store: Flipkart Item: Dress  Gender Woman
Store: Flipkart Item: Jeans  Gender Woman
Store: Flipkart Item: Pants  Gender Woman
Store: Flipkart Item: Shirts  Gender Woman
Store: Flipkart Item: Belt  Gender Woman
Store: Flipkart Item: Skirts  Gender Woman
Store: Flipkart Item: Shorts  Gender Woman
Store: Amazon Item: Jacket  Gender Men
Store: Amazon Item: Jeans  Gender Men
Store: Amazon Item: Pants  Gender Men
Store: Amazon Item: Shorts  Gender Men
Store: Amazon Item: Shirts  Gender Men
Store: Amazon Item: Belt  Gender Men
Store: Amazon Item: Dress  Gender Woman
Store: Amazon Item: Jeans  Gender Woman
Store: Amazon Item: Pants  Gender Woman
Store: Amazon Item: Shirts  Gender Woman
Store: Amazon Item: Shorts  Gender Woman
Store: Amazon Item: Belt  Gend

In [34]:
srp.train(dataset)

Iteration:  0  Loss:  36.857246778987665
Iteration:  1  Loss:  34.21193831567893
Iteration:  2  Loss:  31.802416674127727
Iteration:  3  Loss:  29.462714005742683
Iteration:  4  Loss:  27.196571620813113
Iteration:  5  Loss:  24.928103109595078
Iteration:  6  Loss:  22.634588030857582
Iteration:  7  Loss:  20.34254360073407
Iteration:  8  Loss:  18.108963944721857
Iteration:  9  Loss:  16.0243136830128


In [9]:
item_name = []
for data in dataset:
    item_name.append(data['Name'])

In [10]:
item_name[80]

'Women Regular Fit Green Cotton Blend Trousers'

In [11]:
item_vec = srp.word_vec(item_name[0])

In [12]:
rec1 = srp.recommendation(dataset, item_name[0], 20)

In [13]:
rec1

(['Men Solid Casual Jacket',
  'Women Regular Fit Maroon Viscose Rayon Trousers',
  "Amazon Essentials Men's Full-Zip Polar Fleece Jacket (Available in Big & Tall)",
  "Columbia Men's Steens Mountain 2.0 Full Zip Fleece Jacket",
  'Weatherproof Original Mens Golf Jacket (Mens Windbreaker) Classic Mens Light Jacket',
  'Pack of 2 Solid Women Black, Grey Cycling Shorts, Runni...',
  "Wrangler Authentics Men's Regular Fit Comfort Flex Waist Jean",
  'Solid Men Blue Gym Shorts, Swim Shorts, Running Shorts,...',
  "Resfeber Women's Ripped Boyfriend Jeans Cute Distressed Jeans Stretch Skinny Jeans with Hole",
  "Amazon Essentials Men's Slim-Fit Stretch Jean",
  'Men Regular Mid Rise Grey Jeans',
  'Pack of 2 Solid Men Multicolor Regular Shorts',
  "AUTIWITUA Men's Waterproof Tactical Shorts Outdoor Cargo Shorts, Lightweight Quick Dry Breathable Hiking Fishing Cargo Shorts",
  'WIHOLL Women Lace Short Sleeve Shirts Dressy Casual Tops Crew Neck Tee Shirt',
  "Legendary Whitetails Men's Buck Ca