In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import os 

In [2]:
# Directory holding the scraped pages; every entry in it is treated as one page.
# The commented-out value is an alternative directory (presumably a smaller
# test set - confirm before switching).
#SCRAPED_FILES_DIR = "./test-pages/"
SCRAPED_FILES_DIR = "./data-science-faq-bot-task-sources/"
# Output files: extracted questions and extracted comments respectively.
QUESTIONS_CSV = "questions.csv"
COMMENTS_CSV = "comments.csv"

In [3]:
def question_attr(soup, file_name):
    """ Extract the question (opening post) of a scraped discussion page.

        A page counts as a discussion page only when it carries an
        og:description meta tag; for any other page an empty dict is returned.

        Args:
            soup - BeautifulSoup object of the parsed page
            file_name - name of scraped page file

        Returns:
            {} if the page is not a discussion page, otherwise a dictionary
            with the following question attributes/keys (in this order):
                date        - creation date text (everything after the first 'on ')
                id          - discussion id (element id after the first '_')
                author_name - user name of the question author
                title       - content of the og:title meta tag
                hash_tags   - texts of the inline tags joined by spaces ('' if none)
                description - text of the question message ('' if it has no text)
                file        - file_name as passed in
                url         - content of the og:url meta tag

        Raises:
            AttributeError, KeyError, ValueError - when a discussion page lacks
            one of the mandatory elements (header date, ItemDiscussion div,
            author, message, og:title, og:url) or the date/id text does not
            have the expected 'on ' / '_' separator.
    """
    d = {} # result

    # Check if this is a discussion page, if not - return empty descriptor
    if not soup.find('meta', property="og:description"):
        return d

    # Date: keep everything after the first 'on '
    discussion_div = soup.find('div', {'class' : 'DiscussionHeader'})
    posted_text = discussion_div.find("span", {'class': 'DateCreated'})
    _, date = posted_text.text.split('on ', 1)
    d['date'] = date

    # ID: the element id looks like "Discussion_3132", keep the part after '_'
    #<div id="Discussion_3132" class="Item ItemDiscussion  Role_Member">
    item_discussion = soup.find('div', {'class' : 'ItemDiscussion'})
    _, question_id = item_discussion.attrs['id'].split('_', 1)
    d['id'] = question_id

    # Author
    author = soup.find('span', {'class':"Author"})
    d['author_name'] = author.find('a', {'class':"Username"}).text

    # Title
    d['title'] = soup.find('meta', property="og:title").attrs['content']

    # Hash tags (optional: neither the InlineTags div nor its list has to exist)
    hash_tags = []
    inline_tags_div = soup.find('div', {'class' : 'InlineTags'})
    tag_list = inline_tags_div.find('ul') if inline_tags_div else None
    if tag_list is not None:
        hash_tags = [li.text for li in tag_list.find_all('li')]
    d['hash_tags'] = " ".join(hash_tags)

    # Description: text of the question message.
    # A tag's .text already contains the text of all nested tags, so there is
    # nothing to fall back to when it is empty; the former fallback to the first
    # nested div could only return '' or fail when no nested div existed.
    question_message = item_discussion.find('div', {'class':'Message'})
    d['description'] = question_message.text

    # File
    d['file'] = file_name

    # URL
    d['url'] = soup.find('meta', property="og:url").attrs['content']

    return d

In [4]:
def comments_attr(soup, file_name, qa):
    """ Find question comments with their attributes:

        Every <li class="Item"> inside the 'main-ai-section' div is treated
        as one comment.

        Args:
            soup - BeautifulSoup object,
            file_name - name of a file with scraped page,
            qa - question attributes (only 'id' and 'hash_tags' are read)

        Returns:
            a list of dictionaries, where each dictionary has a comment text together
            with the following attributes/keys (in this order):
                date - date when comment was created (text after the first 'on '),
                question_id - id of related question,
                hash_tags - question hash tags,
                comment_id - comment id (element id after the first '_'),
                username - name of the user who created the comment,
                message - comment text,
                helpful_cnt - count of reactions whose title does not contain 'Not Helpful',
                not_helpful_cnt - count of reactions whose title contains 'Not Helpful',
                rating  - calculated as helpful_cnt - not_helpful_cnt (may be negative),
                file - name of a file with question and comment

        Raises:
            AttributeError, KeyError, ValueError - when the 'main-ai-section' div
            is missing or a comment lacks its date, id, user name or message.
    """
    discussion = soup.find('div', {'class' : 'main-ai-section'})
    comments = []

    for el in discussion.find_all('li', {'class':"Item"}):
        d = {}

        # Date
        #<span class="MItem DateCreated">
        date_created_text = el.find('span', {'class':"MItem DateCreated"}).text
        _, date_created = date_created_text.split('on ', 1)
        d['date'] = date_created

        # Question id
        d['question_id'] = qa['id']

        # Question hash tags
        d['hash_tags'] = qa['hash_tags']

        # Comment id
        #<li class="Item ItemComment Role_Member" id="Comment_3582">
        _, comment_id = el.attrs['id'].split('_', 1)
        d['comment_id'] = comment_id

        # User name
        #<a href="/profile/Hamie" class="Username">Hamie</a>
        d['username'] = el.find('a', {'class':"Username"}).text

        # Comment text (message)
        #<div class="Message">
        d['message'] = el.find('div', {'class':"Message"}).text

        # Collect user reactions for the current comment:
        #div class="ReactionRecord"
        #<span class="UserReactionWrap" data-userid="9479"
        #title="Tomshipp - Not Helpful on November 29, 2018.">
        reaction_record = el.find('div', {'class':"ReactionRecord"})
        if reaction_record is None:
            # A comment without a reaction block simply has no reactions;
            # do not lose the whole page because of it.
            user_reactions = []
        else:
            user_reactions = reaction_record.find_all('span', {'class':"UserReactionWrap"})

        # Any reaction that is not marked 'Not Helpful' counts as helpful
        not_helpful_cnt = sum(1 for reaction in user_reactions
                              if 'Not Helpful' in reaction['title'])
        helpful_cnt = len(user_reactions) - not_helpful_cnt

        d['helpful_cnt'] = helpful_cnt
        d['not_helpful_cnt'] = not_helpful_cnt

        # Comment rating
        d['rating'] = helpful_cnt - not_helpful_cnt

        # File
        d['file'] = file_name

        # Add comment with its attributes to comments list
        comments.append(d)

    return comments
            

In [5]:
def check_wifi_tv(text):
    """ Tell whether a text mentions both Wi-Fi and TV.

        The text is split with nltk.word_tokenize and the tokens are
        compared in lower case.

        Args:
            text - text to search tokens

        Returns:
            True - if the tokens contain 'tv' and at least one of
            'wifi' / 'wi-fi', False otherwise
    """
    tokens = {token.lower() for token in nltk.word_tokenize(text)}

    # Without a 'tv' token the spelling of Wi-Fi does not matter
    if 'tv' not in tokens:
        return False

    return 'wifi' in tokens or 'wi-fi' in tokens

In [6]:
def process_page(page, file_name):
    """ Read scraped page, extract question and comment attributes and
        flag questions that are about connecting the TV to Wi-Fi.

        Args:
            page - scraped page: an object whose read() returns the page markup
            file_name - name of scraped page file

        Returns:
            tuple (qa, comment_list):
                qa - question attributes as returned by question_attr() with an
                     additional 'wifi_tv' key: the result of check_wifi_tv() on the
                     hash tags, title, description and all comment messages
                comment_list - list of related comments as returned by comments_attr()
            ({}, []) is returned when the page is not a discussion page.
    """
    # Parse page and extract question attributes
    # NOTE(review): no parser is named here, so Beautiful Soup chooses one itself;
    # presumably that depends on what is installed - consider pinning it.
    soup = BeautifulSoup(page.read())
    qa = question_attr(soup, file_name)

    # Check if this is a discussion page, if not - return empty descriptors
    if not qa:
        return ({}, [])

    # Extract related comments with attributes
    comment_list = comments_attr(soup, file_name, qa)

    # Check if question is about problems connecting the TV to Wi-Fi.
    # The pieces are joined with a separator: gluing them together directly
    # would fuse the last word of one piece with the first word of the next
    # (e.g. the hash tag 'wifi' and a title starting with '65UM...') and
    # hide those tokens from the check.
    text_parts = [qa['hash_tags'], qa['title'], qa['description']]
    text_parts.extend(comment['message'] for comment in comment_list)

    # Set wifi-tv flag in the question structure.
    qa['wifi_tv'] = check_wifi_tv("\n".join(text_parts))

    return (qa, comment_list)
    

In [7]:
%%time

# Read scraped pages, extract question and comment attributes
# and write resulting data to csv files

# os.listdir() returns entries in arbitrary order; sort them so that the row
# order of the resulting csv files is the same on every run.
file_list = sorted(os.listdir(SCRAPED_FILES_DIR))
file_cnt = len(file_list)
print("*** Working with: {} pages ...".format(file_cnt))

exception_files = [] # files we could not process
exception_errors = {} # file name -> repr() of the exception raised for it

all_questions = []
all_comments = []

blank_line = " " * 200 # overwrites the progress line printed with '\r'
processed_cnt = 0 # pages processed without an exception
for file_name in file_list:
    try:
        # NOTE(review): the scraped pages are assumed to be UTF-8 encoded - confirm.
        with open(os.path.join(SCRAPED_FILES_DIR, file_name), 'r', encoding='utf-8') as page:
            print(file_name, end='\r')
            qa, comment_list = process_page(page, file_name)
    except Exception as e:
        # Best effort: one broken page must not stop the run,
        # but keep the reason so that it can be reported below.
        exception_files.append(file_name)
        exception_errors[file_name] = repr(e)
    else:
        processed_cnt += 1
        # Only questions that have at least one comment are kept
        if qa and comment_list:
            all_questions.append(qa)
            all_comments += comment_list

        print(blank_line, end='\r')

if exception_files:
    print("*** Exceptions in files: ", exception_files)
    for failed_name in exception_files:
        print("***   {}: {}".format(failed_name, exception_errors[failed_name]))

print("*** {} pages - done".format(processed_cnt))

print("*** Creating data frames and writing csv files ...")
q_df = pd.DataFrame(all_questions)
# The 'date' column is absent when no question was collected
if 'date' in q_df.columns:
    q_df['date'] = pd.to_datetime(q_df['date'])
q_df.to_csv(QUESTIONS_CSV)

c_df = pd.DataFrame(all_comments)
if 'date' in c_df.columns:
    c_df['date'] = pd.to_datetime(c_df['date'])
c_df.to_csv(COMMENTS_CSV)
print("*** All done.")

*** Working with: 1038 pages ...
*** 1038 pages - done                                                                                                                                                                                   
*** Creating data frames and writing csv files ...
*** All done.
CPU times: user 1min 1s, sys: 601 ms, total: 1min 2s
Wall time: 1min 2s


In [8]:
# Questions: print the column summary of q_df, then display its first rows
q_df.info()
q_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 9 columns):
date           881 non-null datetime64[ns]
id             881 non-null object
author_name    881 non-null object
title          881 non-null object
hash_tags      881 non-null object
description    881 non-null object
file           881 non-null object
url            881 non-null object
wifi_tv        881 non-null bool
dtypes: bool(1), datetime64[ns](1), object(7)
memory usage: 56.0+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv
0,2020-01-20 17:11:57,11804,ajfromftf,65UM6900PUA won’t stay connected to WiFi when ...,tv connect network wifi,\n Every time I turn off my...,11804-65um6900pua-won-t-stay-connected-to-wifi...,https://lgcommunity.us.com/discussion/11804/65...,True
1,2018-09-25 11:20:47,2069,Pamoola,Smart LED TV Model: 43UH6100 IS connected to W...,wifi internet home network,\n This happens on a daily ...,2069-smart-led-tv-model-43uh6100-is-connected-...,https://lgcommunity.us.com/discussion/2069/sma...,True
2,2018-11-11 16:38:32,2475,waltstanley,65UH5500,,"\n No TV viewers, any reply...",2475-65uh5500,https://lgcommunity.us.com/discussion/2475/65u...,False
3,2018-12-25 16:36:43,3008,TopBanana,"Gallery Mode-Turn Of ""No Signal"" Message",,\n Our 1st LG TV. 55SK0999P...,3008-gallery-mode-turn-of-no-signal-message,https://lgcommunity.us.com/discussion/3008/gal...,False
4,2019-08-19 21:00:57,5745,jmcl1960,"LG 55UK6200PUA Sound Mode is ""Deactivated, can...",,"\nMy LG 55UK6200PUA Sound Mode is ""Deactivated...",5745-lg-55uk6200pua-sound-mode-is-deactivated-...,https://lgcommunity.us.com/discussion/5745/lg-...,False


In [9]:
# Comments: print the column summary of c_df, then display its first rows
c_df.info()
c_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849 entries, 0 to 1848
Data columns (total 10 columns):
date               1849 non-null datetime64[ns]
question_id        1849 non-null object
hash_tags          1849 non-null object
comment_id         1849 non-null object
username           1849 non-null object
message            1849 non-null object
helpful_cnt        1849 non-null int64
not_helpful_cnt    1849 non-null int64
rating             1849 non-null int64
file               1849 non-null object
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 144.6+ KB


Unnamed: 0,date,question_id,hash_tags,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating,file
0,2020-01-21 14:11:21,11804,tv connect network wifi,18595,big80s,\n We bought two ne...,0,0,0,11804-65um6900pua-won-t-stay-connected-to-wifi...
1,2018-11-28 17:24:19,2069,wifi internet home network,4162,AhmedLG,\n Hello @Pamoola I...,0,0,0,2069-smart-led-tv-model-43uh6100-is-connected-...
2,2018-11-29 05:32:06,2069,wifi internet home network,4222,Lpops6,\n When I go to the...,0,0,0,2069-smart-led-tv-model-43uh6100-is-connected-...
3,2018-12-03 15:35:01,2069,wifi internet home network,4307,AhmedLG,\n@Lpops6 Is your LG TV connected to the Wi-Fi...,0,1,-1,2069-smart-led-tv-model-43uh6100-is-connected-...
4,2018-11-26 15:52:47,2475,,3939,Hamie,\n Hello waltstanle...,0,0,0,2475-65uh5500
