In [1]:
import sys
import pandas as pd

# BERT
from sentence_transformers import SentenceTransformer
import scipy.spatial

In [2]:
QUESTIONS_CSV = "questions.csv"
COMMENTS_CSV = "comments.csv"
BEST_ANSWERS_CSV = "best-answers.csv"

### Explore data

In [3]:
# Read questions
q_df = pd.read_csv(QUESTIONS_CSV,index_col=[0])
q_df.info()
q_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 881 entries, 0 to 880
Data columns (total 9 columns):
date           881 non-null object
id             881 non-null int64
author_name    881 non-null object
title          881 non-null object
hash_tags      440 non-null object
description    881 non-null object
file           881 non-null object
url            881 non-null object
wifi_tv        881 non-null bool
dtypes: bool(1), int64(1), object(7)
memory usage: 62.8+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv
0,2020-01-20 17:11:57,11804,ajfromftf,65UM6900PUA won’t stay connected to WiFi when ...,tv connect network wifi,\n Every time I turn off my...,11804-65um6900pua-won-t-stay-connected-to-wifi...,https://lgcommunity.us.com/discussion/11804/65...,True
1,2018-09-25 11:20:47,2069,Pamoola,Smart LED TV Model: 43UH6100 IS connected to W...,wifi internet home network,\n This happens on a daily ...,2069-smart-led-tv-model-43uh6100-is-connected-...,https://lgcommunity.us.com/discussion/2069/sma...,True
2,2018-11-11 16:38:32,2475,waltstanley,65UH5500,,"\n No TV viewers, any reply...",2475-65uh5500,https://lgcommunity.us.com/discussion/2475/65u...,False


In [4]:
# Read comments
c_df = pd.read_csv(COMMENTS_CSV, index_col=[0])
c_df.info()
c_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1849 entries, 0 to 1848
Data columns (total 10 columns):
date               1849 non-null object
question_id        1849 non-null int64
hash_tags          942 non-null object
comment_id         1849 non-null int64
username           1849 non-null object
message            1849 non-null object
helpful_cnt        1849 non-null int64
not_helpful_cnt    1849 non-null int64
rating             1849 non-null int64
file               1849 non-null object
dtypes: int64(5), object(5)
memory usage: 158.9+ KB


Unnamed: 0,date,question_id,hash_tags,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating,file
0,2020-01-21 14:11:21,11804,tv connect network wifi,18595,big80s,\n We bought two ne...,0,0,0,11804-65um6900pua-won-t-stay-connected-to-wifi...
1,2018-11-28 17:24:19,2069,wifi internet home network,4162,AhmedLG,\n Hello @Pamoola I...,0,0,0,2069-smart-led-tv-model-43uh6100-is-connected-...
2,2018-11-29 05:32:06,2069,wifi internet home network,4222,Lpops6,\n When I go to the...,0,0,0,2069-smart-led-tv-model-43uh6100-is-connected-...


In [5]:
# Helpful answers
c_df['helpful_cnt'].value_counts()

0    1683
1     133
2      23
3       8
4       2
Name: helpful_cnt, dtype: int64

In [6]:
max_helpful_comments = c_df.loc[c_df['helpful_cnt'] == 4]
max_helpful_comments

Unnamed: 0,date,question_id,hash_tags,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating,file
1366,2018-12-14 16:23:36,1621,tv 4k UHD 4k smrt tv someonehelp,4619,tboudrot,\n This issue is we...,4,0,4,1621-annoying-message-pops-up-on-my-tv
1814,2018-09-04 02:09:31,1106,,2458,patrickg,\n Same here my 65u...,4,0,4,1106-65uh5500-will-not-turn-on


In [7]:
max_helpful_comments['file']

1366    1621-annoying-message-pops-up-on-my-tv
1814            1106-65uh5500-will-not-turn-on
Name: file, dtype: object

In [8]:
max_helpful_comments['message'].values

array(["\n                            This issue is well documented on various Internet sites.\xa0 The issue -- unknown device is disconnected -- along with flaky remote controller behavior and the TV's inability to recognize wireless networks are all related and experienced by many people.\xa0 I called LG tech support two weeks ago and the service agent claimed no awareness of the issue.\xa0 Very disappointing since many people have reported it to LG tech support and only a few have been able to resolve it through warranty channels.\xa0 The older the TV, the less likelihood you'll have in getting warranty service.The issue appears to be related to the wifi board in the TV itself.\xa0 Some individuals experiencing the issue have had that board replaced and a few report success while others do not.\xa0 Since I use the wifi capability of the TV to watch Netflix and Amazon Prime, the TV is now useless to me as it can't access the wireless network.\xa0\xa0                        ",
       

In [9]:
# Not helpful answers
c_df['not_helpful_cnt'].value_counts()

0    1763
1      72
2      12
9       1
3       1
Name: not_helpful_cnt, dtype: int64

In [10]:
max_not_helpful_comments = c_df.loc[c_df['not_helpful_cnt'] == 9]
max_not_helpful_comments

Unnamed: 0,date,question_id,hash_tags,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating,file
1360,2018-08-31 20:36:39,1621,tv 4k UHD 4k smrt tv someonehelp,2390,Alexander,"\n@Bernie110, Good day,Have you identified wha...",0,9,-9,1621-annoying-message-pops-up-on-my-tv


In [11]:
max_not_helpful_comments['file']

1360    1621-annoying-message-pops-up-on-my-tv
Name: file, dtype: object

In [12]:
max_not_helpful_comments['message'].values

array(['\n@Bernie110, Good day,Have you identified what connection is causing this error? do you have multiple electronics connected to your TV? If so, try using it with one of them disconnected and every 15 minutes rotate the product that is disconnected. Once identified, ensure its properly connected, try using different ports and / or cables to confirm if its the TV port, the cable or the product its connected to.If the above does not solve the issue, you can try a factory reset.If that still does not fix the issue please let us know your model number and the software version you are currently running so it can be escalated.                        '],
      dtype=object)

In [13]:
# Answer ratings
c_df['rating'].value_counts()

 0    1612
 1     128
-1      65
 2      21
-2      11
 3       8
 4       2
-3       1
-9       1
Name: rating, dtype: int64

### Question 1. 
- 1.1. Which were the first ten threads of 2019? 
- 1.2. Which of these threads could be considered questions? 
- 1.3. Which received answers? 
- 1.4. For each answer, what is your gut judgement for how likely the answer would have been accepted by
the thread starter? Why?

#### Answers 1.1, 1.2 

In [14]:
# q19_df - threads with all questions in 2019
mask = (q_df['date'] > '2018-12-31') & (q_df['date'] <= '2019-12-31')
q19_df = q_df.loc[mask].copy()
q19_df.info()
q19_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466 entries, 4 to 873
Data columns (total 9 columns):
date           466 non-null object
id             466 non-null int64
author_name    466 non-null object
title          466 non-null object
hash_tags      245 non-null object
description    466 non-null object
file           466 non-null object
url            466 non-null object
wifi_tv        466 non-null bool
dtypes: bool(1), int64(1), object(7)
memory usage: 33.2+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv
4,2019-08-19 21:00:57,5745,jmcl1960,"LG 55UK6200PUA Sound Mode is ""Deactivated, can...",,"\nMy LG 55UK6200PUA Sound Mode is ""Deactivated...",5745-lg-55uk6200pua-sound-mode-is-deactivated-...,https://lgcommunity.us.com/discussion/5745/lg-...,False
5,2019-02-01 13:29:16,3483,johnnybiral,Some keys on my remote dosent work,Magic Remote Remote Control,\n TV Model: 43UF64000GIRem...,3483-some-keys-on-my-remote-dosent-work,https://lgcommunity.us.com/discussion/3483/som...,False
8,2019-12-07 03:32:31,8599,PickyBiker,ULTRA HD Deep Color mode blanks out the ROKU P...,,\nThere is a problem with this TV and the ROKU...,8599-ultra-hd-deep-color-mode-blanks-out-the-r...,https://lgcommunity.us.com/discussion/8599/ult...,False


In [15]:
# q19_df_10 - is a data frame with first ten threads of 2019 that could be considered questions
# (Threads without questions were filtered out at previous data extraction step in
# (extract-data.ipynb))

q19_df_sorted = q19_df.sort_values(by='date').copy()
q19_df_10 = q19_df_sorted.head(10)
q19_df_10.head(3)

Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv
326,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False
256,2019-01-02 19:47:27,3144,joanne01,Blurry picture and text,,\n just set up 60inch 60UK...,3144-blurry-picture-and-text,https://lgcommunity.us.com/discussion/3144/blu...,False
724,2019-01-03 07:38:20,3155,ericsang1,Unknown Device Is Disconnected,,\n I bought my UJ6300 tv ab...,3155-unknown-device-is-disconnected,https://lgcommunity.us.com/discussion/3155/unk...,False


#### Answer 1.3 
 - 1.3. Which received answers? 

#### To answer 1.3:
- We join 10 questions with comments using id of the question in both data frames

In [16]:
def join(q_df, c_df):
    """ Join question and comments dataframes
    
        Args:
            q - question dataframe 
            c - comments dataframe
            
        Returns:
            Left join product with duplicated columns droped and 'x-names' renamed
 
    """  
    df = pd.merge(q_df, c_df, left_on='id', right_on='question_id', how='left')
    drop_cols = ['date_y', 'question_id', 'hash_tags_y', 'file_y' ]
    df = df.drop(drop_cols, axis=1)
    df.rename(columns = {'file_x':'file', 'date_x':'date', 'hash_tags_x': 'hash_tags'}, 
              inplace = True)
    return df

#### First ten threads of 2019 with answers

In [17]:
qc10_df = join(q19_df_10, c_df).copy()
qc10_df.info()
qc10_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 15
Data columns (total 15 columns):
date               16 non-null object
id                 16 non-null int64
author_name        16 non-null object
title              16 non-null object
hash_tags          7 non-null object
description        16 non-null object
file               16 non-null object
url                16 non-null object
wifi_tv            16 non-null bool
comment_id         16 non-null int64
username           16 non-null object
message            16 non-null object
helpful_cnt        16 non-null int64
not_helpful_cnt    16 non-null int64
rating             16 non-null int64
dtypes: bool(1), int64(5), object(9)
memory usage: 1.9+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating
0,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5115,Snuffy,\n I have EXACTLY t...,1,0,1
1,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5546,Snuffy,\n Another thing I ...,1,0,1
2,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5542,dougfern,\n My experience is...,1,0,1


In [18]:
print("Questions out of first ten ones from 2019 that received answers:  ", 
      len(qc10_df['id'].unique()))
print("Total {} answers for 10 questions". format(len(qc10_df)))

Questions out of first ten ones from 2019 that received answers:   10
Total 16 answers for 10 questions


#### Answewr 1.4 
- 1.4. For each answer, what is your gut judgement for how likely the answer would have been accepted by
the thread starter? Why?

In [19]:
def add2dict(d, key, val):
    if key in d:
        d[key].append(val)
    else:
        d[key]=[val]

In [20]:
def print_qa(df, file=None):
    """ Print questions with answers, including 
    comment helpfull counts and question / comment similarity
    
        Args:
            df - data frame
            file - file to print to
            
        Rerurns:
            None
    """
    if not file:
        log = sys.stdout
    else:
        log = open(file, 'w') 
        
    qa_dict = {}
    qa_array = df[['description','message', 'helpful_cnt', 'not_helpful_cnt']].values
    for idx, qa in enumerate(qa_array):        
        add2dict(qa_dict, qa[0], (qa[1], qa[2], qa[3]))
    
    q_cnt = 0
    for q in qa_dict:
        print("\n*** Question {} {}".format(q_cnt, q), file=log)
        a_cnt = 0
        for a in qa_dict[q]:
            #print(a[0], a[1], a[2], c_sim)
            print("\n* Answer {}\n{}\n-- Helpful: {}\n-- Not Helpful: {}".format(
                a_cnt,a[0], a[1], a[2]), file=log)
            a_cnt += 1
            
        q_cnt +=1        


In [21]:
print_qa(qc10_df)


*** Question 0 
My new LG 50UK6500AUA TV and SK1 soundbar pair up via Bluetooth as expected. But upon powering down the TV and back on, the TV "sound out" ALWAYS reverts back to "Internal Speaker" even as the soundbar Bluetooth LED sits there blinking, waiting to be invited back in. I must go back through the TV settings to re-pair the two. then everything works fine until the next power-down. Docs say I should only after to pair once! What a pain. Help, please! 

* Answer 0

                            I have EXACTLY the same problem on my new 43UK6750PLD TV also with an SK1 soundbar.  To get it to connect to the soundbar when I turn the TV on again, I have to re-select it from the list of paired devices (I don't have to re-pair) but it will not connect automatically.Another way to re-connect is to turn the soundbar on and off with the TV on. Then the TV says something like "The SK1 device is requesting to connect. Do you accept?" (well, words to that effect). And then it works.But I

### <font color=blue> My gut judgement of answers</font>

#### Q0: My new LG 50UK6500AUA TV and SK1 soundbar pair up via Bluetooth as expected. But upon powering down the TV and back on, the TV "sound out" ALWAYS reverts back to "Internal Speaker" even as the soundbar Bluetooth LED sits there blinking, waiting to be invited back in. I must go back through the TV settings to re-pair the two. then everything works fine until the next power-down. Docs say I should only after to pair once! What a pain. Help, please! 

* A0: 
                            I have EXACTLY the same problem on my new 43UK6750PLD TV also with an SK1 soundbar.  To get it to connect to the soundbar when I turn the TV on again, I have to re-select it from the list of paired devices (I don't have to re-pair) but it will not connect automatically.Another way to re-connect is to turn the soundbar on and off with the TV on. Then the TV says something like "The SK1 device is requesting to connect. Do you accept?" (well, words to that effect). And then it works.But I don't want to keep having to do this each time I turn my TV on !         

 <font color=blue>Not an optimal, temporary solution. Can be accepted when there is no better one. </font>
 
* A1: 
                            Another thing I found was that after about 30 minutes on Bluetooth the sound was out of sync with the picture. In the end I bought an optical cable and now it works properly.       

<font color=blue>Does not answer the question. Can not be accepted. </font>


* A2: 
                            My experience is exactly like yours. Had LG service here under warranty; they swapped out something but it had no effect. Told me it was the soundbar (yeah, right) so I'll exchange it just to say I did it. Went to Costco to verify I wasn't crazy; their display model worked exactly as you & I expect it to work..."Bluetooth" setting holds even during power cycle.  Sooner or later I'm going to demand a replacement under warranty. 

<font color=blue>Does not answer the question. Can not be accepted. </font>

* A3: 
                            Yes. optical cable came with my SK1 and it works fine. Unfortunately it's not long enough for the headphones I plan to use in my La-Z-Boy across the room! That's why I want the Bluetooth to work as it's supposed to and as I paid for. I'm going to keep pursuing this issue even if -- as you pointed out -- I can work around it with another couple of clicks.  

<font color=blue>Additional note from the author of the question. Not an answer the question. Can not be accepted. </font>

#### Question 2. 
- How many initial posts from 2019 concerned problems connecting the TV to Wi-Fi? 
- Do any of these have answers which potentially could have resolved the problem? 
- If you were to only make a bot for answering Wi-Fi threads, what answer string 
do you think would be the best? 
- Give in code or pseudocode a method which given the initial post determines 
whether to respond to it with this Wi-Fi answer string. 
- How many of the 2019 wifi threads would your answer string be used for and 
what do you think would be the expected acceptance rate of these answers?

#### Answer 2.1

In [22]:
# All questions and answers in 2019
qc19_df_ = join(q19_df, c_df).copy()
qc19_df = qc19_df_.sort_values(by='date').copy()
qc19_df.info()
qc19_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 373 to 429
Data columns (total 15 columns):
date               1023 non-null object
id                 1023 non-null int64
author_name        1023 non-null object
title              1023 non-null object
hash_tags          574 non-null object
description        1023 non-null object
file               1023 non-null object
url                1023 non-null object
wifi_tv            1023 non-null bool
comment_id         1023 non-null int64
username           1023 non-null object
message            1023 non-null object
helpful_cnt        1023 non-null int64
not_helpful_cnt    1023 non-null int64
rating             1023 non-null int64
dtypes: bool(1), int64(5), object(9)
memory usage: 120.9+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating
373,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5115,Snuffy,\n I have EXACTLY t...,1,0,1
376,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5551,dougfern,\n Yes. optical cab...,0,0,0
375,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5542,dougfern,\n My experience is...,1,0,1
374,2019-01-01 22:24:24,3132,dougfern,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,5546,Snuffy,\n Another thing I ...,1,0,1
298,2019-01-02 19:47:27,3144,joanne01,Blurry picture and text,,\n just set up 60inch 60UK...,3144-blurry-picture-and-text,https://lgcommunity.us.com/discussion/3144/blu...,False,5148,JamalofLG,\nAre you streaming the content via the intern...,0,0,0


#### Initial posts from 2019 concerned problems connecting the TV to Wi-Fi

In [23]:
wifi_df = qc19_df[qc19_df['wifi_tv'] == 1]
wifi_df.info()
wifi_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 427 to 211
Data columns (total 15 columns):
date               117 non-null object
id                 117 non-null int64
author_name        117 non-null object
title              117 non-null object
hash_tags          42 non-null object
description        117 non-null object
file               117 non-null object
url                117 non-null object
wifi_tv            117 non-null bool
comment_id         117 non-null int64
username           117 non-null object
message            117 non-null object
helpful_cnt        117 non-null int64
not_helpful_cnt    117 non-null int64
rating             117 non-null int64
dtypes: bool(1), int64(5), object(9)
memory usage: 13.8+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating
427,2019-01-06 14:22:27,3188,alireza88,how to display lg tv Service Menu ????,lg49sj800v unknown device disconnected wifi tu...,\ntried everything but still no luck ...i read...,3188-how-to-display-lg-tv-service-menu,https://lgcommunity.us.com/discussion/3188/how...,True,5260,silentmeow,\n There is a bunch...,0,0,0
170,2019-01-06 19:01:46,3196,theo,Wi fi not turning on,Wi-Fi,"\n Hi, i have a tv model 4...",3196-wi-fi-not-turning-on,https://lgcommunity.us.com/discussion/3196/wi-...,True,5298,Saarducci,\n Sounds like some...,0,0,0
992,2019-01-10 08:50:37,3248,mdart,ASK LG Why cant I tun on TV,google home ask lg,\n All other command work w...,3248-ask-lg-why-cant-i-tun-on-tv,https://lgcommunity.us.com/discussion/3248/ask...,True,5519,JamalofLG,\nmdart the reason why is that when the unit i...,0,0,0


In [24]:
print("2019 posts, total: {}; WiFi/TV problems: {} ".format( len(qc19_df), len(wifi_df)))

2019 posts, total: 1023; WiFi/TV problems: 117 


### Finding answers to potentially resolve the problem in Wi-Fi threads that FAQ bot may use

To find answers we use __Multiple Choice Question Answering__ model trained on two different datasets:

1. just a few good answers selected manually
2. all available answers to WiFI-TV thread

As expected model trained with small dataset has a little choice of answers and performs poorly. While model trained in second case finds much better results. In this last case answer strings that model generates can be used for all WiFi-TV threads


#### 1. Find WiFi-TV comments taged 'helpful' in 2019 that can be used as WiFi-TV FAQ answers

In [25]:
wifi_help_df = wifi_df[wifi_df['helpful_cnt'] > 0]#['message']
wifi_help_df[['helpful_cnt', 'comment_id', 'message']]

Unnamed: 0,helpful_cnt,comment_id,message
942,1,12175,\n I'm having the s...
932,1,6631,"\nVischo, we can try a few things: 1) Go to Se..."
686,1,10076,\n No problem. I wi...
157,1,9190,\n I am having the ...
214,1,10674,"\nShazAust_665, If the TV can find and connect..."
620,2,12029,"\nSheiky__, I would recommend checking to ensu..."
88,1,13576,\n After updating s...


In [26]:
wifi_help_df[['comment_id','message']].values

array([[12175,
        "\n                            I'm having the same problem after the YouTube update from a few days ago. Only on YouTube.OLED55C8PLA.\xa0 It'll be working fine, then just freeze, respond to no input from the remote, and I'll need to turn the TV off for the remote to start working again. It's rendered one of my most used apps on the TV useless. "],
       [6631,
        '\nVischo, we can try a few things: 1) Go to Settings, All Settings, General, QuickStart+, turn off, power off the TV & unplug for 1 minute. This would help with freeing up some of the RAM, and eliminate the freezing. 2) While you have updated the internet speed to 1GBPS, we would need to do a speed test. Open the web browser and type in fast.com and it will automatically run a Speed Test. Once completed, advise as to what speed the TV is receiving.  '],
       [10076,
        '\n                            No problem. I will have this checked out.                        '],
       [9190,
        '

In [27]:
# Drop useless messages: 9190, 10076
wifi_help_df.drop(wifi_help_df[wifi_help_df['comment_id'] == 9190].index, inplace = True)
wifi_help_df.drop(wifi_help_df[wifi_help_df['comment_id'] == 10076].index, inplace = True) 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [28]:
wifi_faq_array = wifi_help_df['message'].values
wifi_faq_array

array(["\n                            I'm having the same problem after the YouTube update from a few days ago. Only on YouTube.OLED55C8PLA.\xa0 It'll be working fine, then just freeze, respond to no input from the remote, and I'll need to turn the TV off for the remote to start working again. It's rendered one of my most used apps on the TV useless. ",
       '\nVischo, we can try a few things: 1) Go to Settings, All Settings, General, QuickStart+, turn off, power off the TV & unplug for 1 minute. This would help with freeing up some of the RAM, and eliminate the freezing. 2) While you have updated the internet speed to 1GBPS, we would need to do a speed test. Open the web browser and type in fast.com and it will automatically run a Speed Test. Once completed, advise as to what speed the TV is receiving.  ',
       '\nShazAust_665, If the TV can find and connect to the 2.4 GHz network,\xa0there would not be a defect with the TV. I would say the TV does not support the 5 GHz network. \

#### 2.Create Multiple Choice Question Answering  model

#### Answer 2.4 Method which given the initial post determines whether to respond to it with this Wi-Fi answer string.

In [29]:
def model_demo(embedder, corpus, queries):
    """ Demonstrate Multiple Choice Question Answering model
    
        Args:
            embedder - SentenceTransformer to embed text
            corpus - corpus to train model 
            queries - query strings to find best match for
    """

    corpus_embeddings = embedder.encode(corpus)
    query_embeddings = embedder.encode(queries)

        # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 5
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])

        print("*** Query:\n", query)
        print("\nTop 5 most similar sentences in corpus:")
        print("\n\n======================\n\n")

        for idx, distance in results[0:closest_n]:
            print("(Score: %.4f)" % (1-distance), corpus[idx].strip())
            print("\n\n======================\n\n")



In [30]:
# Query sentences:
queries = [""" Hi,\xa0 i have a tv model 49uh850v and frequentlly is disconecting 
from internetnt while i\xa0 watch youtube or netflix and then wi fi cannot turn on, 
any solution?if i unplug tv for a while the problem is fixed but then again the problem 
appears again, any solution?
"""]

#### 3.a. Train and run model on comments taged 'helpful' in 2019

In [31]:
%%time
# Create Model
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Find best matches
corpus_wifi_faq = wifi_faq_array
model_demo(embedder, corpus_wifi_faq, queries)

*** Query:
  Hi,  i have a tv model 49uh850v and frequentlly is disconecting 
from internetnt while i  watch youtube or netflix and then wi fi cannot turn on, 
any solution?if i unplug tv for a while the problem is fixed but then again the problem 
appears again, any solution?


Top 5 most similar sentences in corpus:




(Score: 0.8621) I'm having the same problem after the YouTube update from a few days ago. Only on YouTube.OLED55C8PLA.  It'll be working fine, then just freeze, respond to no input from the remote, and I'll need to turn the TV off for the remote to start working again. It's rendered one of my most used apps on the TV useless.




(Score: 0.6964) ShazAust_665, If the TV can find and connect to the 2.4 GHz network, there would not be a defect with the TV. I would say the TV does not support the 5 GHz network.




(Score: 0.6225) Sheiky__, I would recommend checking to ensure the TV is on HDMI 1. If already confirmed, Switch to another HDMI input and switch the TV to see

#### 3.b. Train and run model on *all* WiFi-TV comments from 2019

In [32]:
%%time
# Create Model
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Find best matches
corpus_wifi_all = wifi_df['message'].values
model_demo(embedder, corpus_wifi_all, queries)

*** Query:
  Hi,  i have a tv model 49uh850v and frequentlly is disconecting 
from internetnt while i  watch youtube or netflix and then wi fi cannot turn on, 
any solution?if i unplug tv for a while the problem is fixed but then again the problem 
appears again, any solution?


Top 5 most similar sentences in corpus:




(Score: 0.8791) My 65” B7 and have the same problem. My tv randomly becomes unresponsive and slow with the remote. Buttons on tv is fast. Then today I tried to kill YouTube app (longe press back) and everything become smooth again. Something is wrong with the YouTube app and it only getting worse.




(Score: 0.8621) I'm having the same problem after the YouTube update from a few days ago. Only on YouTube.OLED55C8PLA.  It'll be working fine, then just freeze, respond to no input from the remote, and I'll need to turn the TV off for the remote to start working again. It's rendered one of my most used apps on the TV useless.




(Score: 0.8236) There is a bunch of us wh

#### Function to get best answer and its score

In [33]:
def best_answer(embedder, query, corpus, corpus_embeddings):
    """ Find the closest sentence of the corpus 
    for a query sentence based on cosine similarity and 
    return the best sentence with its score
    
        Args:
            embedder - SentenceTransformer to embed text        
            query - query strings
            query_embeddings - query embeddings
            corpus - corpus used to train model
            corpus_embeddings - corpus embedding
            
        Returns:
            (1- distance, sentence) - best score and best sentence 
    """ 
    query_embedding = embedder.encode(query)[0]
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    idx = results[0][0]
    distance = results[0][1]
    best_result = ( "%.4f"%(1-distance), corpus[idx].strip())
    
    return best_result

#### Find best answers with model trained on *all* WiFi-TV comments from 2019

In [34]:
%%time
# Create Model
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
corpus_wifi_all_embeddings = embedder.encode(corpus_wifi_all)

CPU times: user 39.3 s, sys: 1.82 s, total: 41.1 s
Wall time: 11.9 s


In [35]:
def print_best_answer(embedder, query, corpus, corpus_embeddings):
    print("***Query:\n", query, "\n*** Best Answer:\n",
      best_answer(embedder, query, corpus, corpus_embeddings))

In [36]:
%%time
x1 = [""" Hi,\xa0 i have a tv model 49uh850v and frequentlly is disconecting 
from internetnt while i\xa0 watch youtube or netflix and then wi fi cannot turn on, 
any solution?if i unplug tv for a while the problem is fixed but then again the problem 
appears again, any solution?
"""]
x2 = ["""\ntried everything but still no luck ...i read somewhere the ridiculous " 
unknown device disconnected / wifi turned off " might be fixed doing a hard reset .\xa0my 
issue got worse ever since i update to latest firmware ...
any help would be greatly appreciated .\xa0thnx .model : 49sj800\xa0                """]

queries = [x1,x2]
for x in queries:
    print_best_answer(embedder, x, corpus_wifi_all, corpus_wifi_all_embeddings)

***Query:
 [' Hi,\xa0 i have a tv model 49uh850v and frequentlly is disconecting \nfrom internetnt while i\xa0 watch youtube or netflix and then wi fi cannot turn on, \nany solution?if i unplug tv for a while the problem is fixed but then again the problem \nappears again, any solution?\n'] 
*** Best Answer:
 ('0.8791', 'My 65” B7 and have the same problem. My tv randomly becomes unresponsive and slow with the remote. Buttons on tv is fast. Then today I tried to kill YouTube app (longe press back) and everything become smooth again. Something is wrong with the YouTube app and it only getting worse.')
***Query:
 ['\ntried everything but still no luck ...i read somewhere the ridiculous " \nunknown device disconnected / wifi turned off " might be fixed doing a hard reset .\xa0my \nissue got worse ever since i update to latest firmware ...\nany help would be greatly appreciated .\xa0thnx .model : 49sj800\xa0                '] 
*** Best Answer:
 ('0.8319', 'There is a bunch of us who have t

### Question 3 and 4
- Analyze the forum and come up with the best five answer strings for the bot. 
- Produce and share a CSV which contains a row for each thread started in 2019 
with the thread URL, the first post, and which of the five answer
strings the bot should respond with, if any. 
- Provide an estimate for how many of those responses would have been
accepted respectively rejected.

- Suppose that we could do the work of Question 3 but for up to 50 answer strings. Estimate the number of accepted and rejected proposed answers for 2019 threads for the best one answer string, for the best two, etc, up to the best 50 answer strings. 

### Find best answers for all threads in 2019

#### Answer 3.2:  CSV which contains a row for each thread started in 2019 with the thread URL, the first post, and which of the five answer strings the bot should respond with, if any.

In [37]:
# All 2019 questions with comments
df = join(q19_df, c_df).copy()
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1023 entries, 0 to 1022
Data columns (total 15 columns):
date               1023 non-null object
id                 1023 non-null int64
author_name        1023 non-null object
title              1023 non-null object
hash_tags          574 non-null object
description        1023 non-null object
file               1023 non-null object
url                1023 non-null object
wifi_tv            1023 non-null bool
comment_id         1023 non-null int64
username           1023 non-null object
message            1023 non-null object
helpful_cnt        1023 non-null int64
not_helpful_cnt    1023 non-null int64
rating             1023 non-null int64
dtypes: bool(1), int64(5), object(9)
memory usage: 120.9+ KB


Unnamed: 0,date,id,author_name,title,hash_tags,description,file,url,wifi_tv,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating
0,2019-08-19 21:00:57,5745,jmcl1960,"LG 55UK6200PUA Sound Mode is ""Deactivated, can...",,"\nMy LG 55UK6200PUA Sound Mode is ""Deactivated...",5745-lg-55uk6200pua-sound-mode-is-deactivated-...,https://lgcommunity.us.com/discussion/5745/lg-...,False,11591,jmcl1960,\n It's on Cinema m...,0,0,0
1,2019-08-19 21:00:57,5745,jmcl1960,"LG 55UK6200PUA Sound Mode is ""Deactivated, can...",,"\nMy LG 55UK6200PUA Sound Mode is ""Deactivated...",5745-lg-55uk6200pua-sound-mode-is-deactivated-...,https://lgcommunity.us.com/discussion/5745/lg-...,False,11606,JamalofLG,"\njmcl1960, Are you using a sound bar or the i...",0,0,0
2,2019-08-19 21:00:57,5745,jmcl1960,"LG 55UK6200PUA Sound Mode is ""Deactivated, can...",,"\nMy LG 55UK6200PUA Sound Mode is ""Deactivated...",5745-lg-55uk6200pua-sound-mode-is-deactivated-...,https://lgcommunity.us.com/discussion/5745/lg-...,False,11647,jmcl1960,\n I am using inter...,0,0,0


In [38]:
df.columns

Index(['date', 'id', 'author_name', 'title', 'hash_tags', 'description',
       'file', 'url', 'wifi_tv', 'comment_id', 'username', 'message',
       'helpful_cnt', 'not_helpful_cnt', 'rating'],
      dtype='object')

In [39]:
drop_cols = ['author_name','comment_id','username']
df = df.drop(drop_cols, axis=1)

In [40]:
# Data frame with row for each thread started in 2019 
# that have thread URL and the first post
df['date'] = pd.to_datetime(df['date']) 
q19c_df = df.loc[df.groupby('id')['date'].idxmin()].copy()
q19c_df.info()
q19c_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466 entries, 373 to 429
Data columns (total 12 columns):
date               466 non-null datetime64[ns]
id                 466 non-null int64
title              466 non-null object
hash_tags          245 non-null object
description        466 non-null object
file               466 non-null object
url                466 non-null object
wifi_tv            466 non-null bool
message            466 non-null object
helpful_cnt        466 non-null int64
not_helpful_cnt    466 non-null int64
rating             466 non-null int64
dtypes: bool(1), datetime64[ns](1), int64(4), object(6)
memory usage: 44.1+ KB


Unnamed: 0,date,id,title,hash_tags,description,file,url,wifi_tv,message,helpful_cnt,not_helpful_cnt,rating
373,2019-01-01 22:24:24,3132,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,\n I have EXACTLY t...,1,0,1
298,2019-01-02 19:47:27,3144,Blurry picture and text,,\n just set up 60inch 60UK...,3144-blurry-picture-and-text,https://lgcommunity.us.com/discussion/3144/blu...,False,\nAre you streaming the content via the intern...,0,0,0
796,2019-01-03 07:38:20,3155,Unknown Device Is Disconnected,,\n I bought my UJ6300 tv ab...,3155-unknown-device-is-disconnected,https://lgcommunity.us.com/discussion/3155/unk...,False,"\nHi, I'm gathering data to escalate this issu...",0,0,0
711,2019-01-03 08:00:11,3156,49UK6200PUA HDMI Problems,,\n After only having this t...,3156-49uk6200pua-hdmi-problems,https://lgcommunity.us.com/discussion/3156/49u...,False,\nrocky17901 1) Disconnect every device from t...,0,0,0
147,2019-01-03 17:43:35,3161,"Two different products, two different remotes,...",Remote Infrared Remote malfunctioning,\n I now own:LG 24LJ48440 T...,3161-two-different-products-two-different-remo...,https://lgcommunity.us.com/discussion/3161/two...,False,"\n Hi Mralick,I wan...",0,0,0


In [41]:
# c19_df - all comments in 2019
mask_c = (c_df['date'] > '2018-12-31') & (c_df['date'] <= '2019-12-31')
c19_df = c_df.loc[mask_c].copy()
c19_df.info()
c19_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1079 entries, 7 to 1848
Data columns (total 10 columns):
date               1079 non-null object
question_id        1079 non-null int64
hash_tags          582 non-null object
comment_id         1079 non-null int64
username           1079 non-null object
message            1079 non-null object
helpful_cnt        1079 non-null int64
not_helpful_cnt    1079 non-null int64
rating             1079 non-null int64
file               1079 non-null object
dtypes: int64(5), object(5)
memory usage: 92.7+ KB


Unnamed: 0,date,question_id,hash_tags,comment_id,username,message,helpful_cnt,not_helpful_cnt,rating,file
7,2019-08-19 21:04:08,5745,,11591,jmcl1960,\n It's on Cinema m...,0,0,0,5745-lg-55uk6200pua-sound-mode-is-deactivated-...
8,2019-08-20 13:47:18,5745,,11606,JamalofLG,"\njmcl1960, Are you using a sound bar or the i...",0,0,0,5745-lg-55uk6200pua-sound-mode-is-deactivated-...
9,2019-08-21 14:06:48,5745,,11647,jmcl1960,\n I am using inter...,0,0,0,5745-lg-55uk6200pua-sound-mode-is-deactivated-...


#### Answer to Question 4: up to 50 best answer strings.

In [42]:
# Create corpus from all comments (answers) in the thread, from all years
#corpus_all = c_df['message'].values #1849 messages

# Create corpus from all comments (answers) in 2019
#corpus_all = c19_df['message'].values #1079 messages

# Create corpus from all _helpful_ comments (answers) in 2019
corpus_all = c19_df[c19_df['helpful_cnt'] > 0]['message'].values

len(corpus_all)

83

In [43]:
%%time
# Create Model from all comments
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
corpus_all_embeddings = embedder.encode(corpus_all)

CPU times: user 37.4 s, sys: 1.81 s, total: 39.2 s
Wall time: 11.5 s


#### CSV which contains a row for each thread started in 2019 with the thread URL, the first post, and which of the five answer strings the bot should respond with

In [44]:
%%time

# Add best answer with score to 2019 questions data frame 

q19c_df[['best_score', 'best_answer']] = q19c_df.apply(lambda x: 
                                                       pd.Series(
                                                           best_answer(
                                                               embedder, 
                                                               [x['description']], 
                                                               corpus_all, 
                                                               corpus_all_embeddings)
                                                       ), axis=1)

CPU times: user 4min 1s, sys: 4.62 s, total: 4min 5s
Wall time: 1min 1s


In [45]:
q19c_df.info()
q19c_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466 entries, 373 to 429
Data columns (total 14 columns):
date               466 non-null datetime64[ns]
id                 466 non-null int64
title              466 non-null object
hash_tags          245 non-null object
description        466 non-null object
file               466 non-null object
url                466 non-null object
wifi_tv            466 non-null bool
message            466 non-null object
helpful_cnt        466 non-null int64
not_helpful_cnt    466 non-null int64
rating             466 non-null int64
best_score         466 non-null object
best_answer        466 non-null object
dtypes: bool(1), datetime64[ns](1), int64(4), object(8)
memory usage: 51.4+ KB


Unnamed: 0,date,id,title,hash_tags,description,file,url,wifi_tv,message,helpful_cnt,not_helpful_cnt,rating,best_score,best_answer
373,2019-01-01 22:24:24,3132,Re-pairing TV/Bluetooth soundbar after shutdown,,\nMy new LG 50UK6500AUA TV and SK1 soundbar pa...,3132-re-pairing-tv-bluetooth-soundbar-after-sh...,https://lgcommunity.us.com/discussion/3132/re-...,False,\n I have EXACTLY t...,1,0,1,0.8698,I have EXACTLY the same problem on my new 43UK...
298,2019-01-02 19:47:27,3144,Blurry picture and text,,\n just set up 60inch 60UK...,3144-blurry-picture-and-text,https://lgcommunity.us.com/discussion/3144/blu...,False,\nAre you streaming the content via the intern...,0,0,0,0.7711,I have the LG 65UJ6300 which my wife bought on...
796,2019-01-03 07:38:20,3155,Unknown Device Is Disconnected,,\n I bought my UJ6300 tv ab...,3155-unknown-device-is-disconnected,https://lgcommunity.us.com/discussion/3155/unk...,False,"\nHi, I'm gathering data to escalate this issu...",0,0,0,0.7402,So we are having the same issue. We bought ou...


#### Write best answers to csv file

In [46]:
q19c_df.to_csv(BEST_ANSWERS_CSV)

#### Explore some best answers in 2019 thread

In [47]:
# Select questions that have 'not helpful' tags
not_helpful_df = q19c_df[q19c_df['not_helpful_cnt'] != 0]
not_helpful_df.info()
not_helpful_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 319 to 863
Data columns (total 14 columns):
date               30 non-null datetime64[ns]
id                 30 non-null int64
title              30 non-null object
hash_tags          19 non-null object
description        30 non-null object
file               30 non-null object
url                30 non-null object
wifi_tv            30 non-null bool
message            30 non-null object
helpful_cnt        30 non-null int64
not_helpful_cnt    30 non-null int64
rating             30 non-null int64
best_score         30 non-null object
best_answer        30 non-null object
dtypes: bool(1), datetime64[ns](1), int64(4), object(8)
memory usage: 3.3+ KB


Unnamed: 0,date,id,title,hash_tags,description,file,url,wifi_tv,message,helpful_cnt,not_helpful_cnt,rating,best_score,best_answer
319,2019-03-01 17:09:38,3730,HDMI Input No Longer Works on my 55UK6300PUE,HDMI Problem TV INPUT,\n The HDMI input #2 on my ...,3730-hdmi-input-no-longer-works-on-my-55uk6300pue,https://lgcommunity.us.com/discussion/3730/hdm...,False,"\nproblemunsolved, two things you can try as a...",0,2,-2,0.8634,I've Google searched the internet for this pro...
682,2019-05-30 01:37:32,4479,LG 55SM8600PUA Netflix issues,,"\n Hello, I recently purcha...",4479-lg-55sm8600pua-netflix-issues,https://lgcommunity.us.com/discussion/4479/lg-...,True,"\nTrahurst, thanks for the info. Have you noti...",0,1,-1,0.8576,I'm having the same problem after the YouTube ...
659,2019-06-02 16:56:06,4520,55UK6090PUA randomly starts my BPM25 Blu-Ray,ATMOS ARC SIMPLINK LG 55UK6090 BPM25 Blu Ray,\n We recently bought a 55U...,4520-55uk6090pua-randomly-starts-my-bpm25-blu-ray,https://lgcommunity.us.com/discussion/4520/55u...,False,"\nDWNKS49, the SIMPLINK settings is causing th...",0,1,-1,0.8239,I have EXACTLY the same problem on my new 43UK...


In [48]:
# Select questions that have 'not helpful' tags
#queries = not_helpful_df['description'].head(5).values
#queries = not_helpful_df['description'].sample(5).values

In [49]:
# Randomly select any 2019 questions 
queries = q19c_df['description'].sample(5).values
for x in queries:
    print_best_answer(embedder, [x], corpus_all, corpus_all_embeddings)

***Query:
 ["\n                    I literally just opened and set up my new LG UHD TV AI ThinQ 43UK63, and the top of the TV has a thin purple line of light across it and the physical top of the TV is curved upward in this same area. I cannot find anywhere online that says anything about this, and it makes me believe I have gotten a damaged product. I don't know if this is a setting that needs to be adjusted or if it a defect. PLEASE HELP!                "] 
*** Best Answer:
 ('0.7955', 'I have EXACTLY the same problem on my new\xa043UK6750PLD TV also with an SK1 soundbar.\xa0 To get it to connect to the soundbar when I turn the TV on again, I have to re-select it from the list of paired devices (I don\'t have to re-pair) but it will not connect automatically.Another way to re-connect is to turn the soundbar on and off with the TV on. Then the TV says something like "The SK1 device is requesting to connect. Do you accept?" (well, words to that effect). And then it works.But I don\'t w