# MSIN0093: Business Strategy and Analytics – Sentiment analysis of YouTube comments

# Table of Contents
* [0. Importing dependencies](#0.-Importing-dependencies)
* [1. Data scraping with YouTube API](#1.-Data-scraping-with-YouTube-API)
* [2. Topic analysis](#2.-Topic-analysis)
    * [2a. Data cleaning and pre-processing](#2a.-Data-cleaning-and-pre-processing)
    * [2b. One-word frequency distribution](#2b.-One-word-frequency-distribution)
    * [2c. Two-word frequency distribution](#2b.-Two-word-frequency-distribution)
* [3. Sentiment analysis](#3.-Sentiment-analysis)
    * [3a. Data cleaning and pre-processing](#3a.-Data-cleaning-and-pre-processing)
    * [3b. Feature identification](#3b.-Feature-identification)
    * [3c. Sentiment scoring](#3c.-Sentiment-scoring)

# 0. Importing dependencies

In [1]:
# Display full output rather than just the last line of output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# some modules might require pip install
# modules for data scraping
import os
import pickle
import google.oauth2.credentials
import pandas as pd
#from googleapiclient.discovery import build
#from googleapiclient.errors import HttpError
#from google_auth_oauthlib.flow import InstalledAppFlow
#from google.auth.transport.requests import Request
#import google.oauth2.credentials


# some modules might require pip install
# modules for topic and sentiment analysis
import numpy as np
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
import nltk.data
tokenizer = nltk.downloader.download('punkt')
from nltk import ngrams
import re
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to /home/faculty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# 1. Data scraping with YouTube API

In [None]:
# API details
client_file = "client_secret.json"
scopes = ['https://www.googleapis.com/auth/youtube.force-ssl']
api_service_name = 'youtube'
api_version = 'v3'

In [None]:
# to be run on first try in case of issues with authentification

def get_authenticated_service():
     flow = InstalledAppFlow.from_client_secrets_file(client_file, scopes)
     credentials = flow.run_console()
     return build(api_service_name, api_version, credentials = credentials)

In [None]:
if __name__ == '__main__':
    # when running locally, disable OAuthlib's HTTPs verification
    # when running in production *do not* leave this option enabled
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
    service = get_authenticated_service()

In [None]:
# Main authentification function and import of libraries and modules

def get_authenticated_service():
    credentials = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            credentials = pickle.load(token)
    #  Check if the credentials are invalid or do not exist
    if not credentials or not credentials.valid:
        # Check if the credentials have expired
        if credentials and credentials.expired and credentials.refresh_token:
            credentials.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                client_file, scopes)
            credentials = flow.run_console()
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(credentials, token)
        
    return build(api_service_name, api_version, credentials = credentials)

In [None]:
# defining functions for extracting comments

def extract_infos_from_comment(comment,fields=["textOriginal"]):
    snippet = comment.get("snippet")
    if(snippet):
        return( {key:snippet.get(key) for key in fields})
    else:
        return(None)

def get_comment_from_raw_result(result,fields=["textOriginal"]):
    main_comment = None
    replies = []
    snippet = result.get("snippet")
    if(snippet):
        top_level_comment = snippet.get("topLevelComment")
        if(top_level_comment):
            main_comment = extract_infos_from_comment(top_level_comment,fields=fields)
        else:
            pass
    else:
        pass
    list_replies = result.get("replies")
    if(list_replies):
        comments = list_replies.get("comments")
        if(comments):
            for comment in comments:
                replies.append(extract_infos_from_comment(comment,fields=fields))
    dic = {"Main_comment":main_comment,"replies":replies}            
    return(dic)

def get_all_comments_from_results(results,fields=["textOriginal"]):
    items = results.get("items")
    all_comments = [get_comment_from_raw_result(item,fields=fields) for item in items]
    return(all_comments)

def get_all_comments(config_request,fields=["textOriginal"],verbose=False):
    all_comments = []
    service = get_authenticated_service()
    results = service.commentThreads().list(**config_request).execute()
    current_page = 0
    n_total_comments = 0
    while results:
        current_page += 1
        if( verbose):
            print("parsing comments for page {}..".format(current_page))
        comments_this_page = get_all_comments_from_results(results,fields=fields)
        n_comments = len(comments_this_page)
        n_total_comments += n_comments
        if(verbose):
            print("Found {} comments on this page..".format(n_comments))
        all_comments.append(comments_this_page)
        if 'nextPageToken' in results:
                    config_request['pageToken'] = results['nextPageToken']
                    results = service.commentThreads().list(**config_request).execute()
        else:
            break
    dic = {"all_comments":all_comments,"n_comments":n_total_comments}
    return(dic)

In [None]:
# requesting comments from selected video

config_request = {"part":"id,snippet,replies",
                  "order":"time",
                  "videoId":"0wR8-9tjpP4"} # insert unique video id here

fields = ["textOriginal","publishedAt",'videoId','authorDisplayName','likeCount']

all_comments = get_all_comments(config_request,fields=fields,verbose=True)

In [None]:
config_request = {"part":"id,snippet,replies",
                  "order":"time",
                  "videoId":"0wR8-9tjpP4"}


In [None]:
# looping through output to build lists for dataframe

comments_temp = []
comment_time_temp = []
video_id_temp = []
comment_author_temp = []
like_count_temp = []
reply_flag_temp = []

for full_list in all_comments["all_comments"]:
    for element in full_list:
        for j in element:
            if j == "Main_comment":
                comments_temp.append(element["Main_comment"]["textOriginal"])
                comment_time_temp.append(element["Main_comment"]["publishedAt"])
                video_id_temp.append(element["Main_comment"]["videoId"])
                comment_author_temp.append(element["Main_comment"]["authorDisplayName"])
                like_count_temp.append(element["Main_comment"]["likeCount"])
                reply_flag_temp.append(0)
            elif j  == "replies":
                for reply in element["replies"]:
                    comments_temp.append(reply["textOriginal"])
                    comment_time_temp.append(reply["publishedAt"])
                    video_id_temp.append(reply["videoId"])
                    comment_author_temp.append(reply["authorDisplayName"])
                    like_count_temp.append(reply["likeCount"])
                    reply_flag_temp.append(1)

In [None]:
# creating dataframe
output_df = pd.DataFrame({'Video ID': video_id_temp,
              'Comment': comments_temp,
              'Author name': comment_author_temp,
              'Timestamp': comment_time_temp,
              'Likes': like_count_temp,
              'Reply flag': reply_flag_temp})

output_df

In [None]:
# exporting dataframe to csv file
video_name = config_request["videoId"]
output_df.to_csv(video_name + '_comments.csv')

# 2. Topic Analysis

# 2a. Data cleaning and pre-processing

In [3]:
# importing file that contains all youtube comments 
corpus_raw_df = pd.read_csv('all_comments.csv') 

# importing video list to extract information of which video refers to which brand
mapping_df = pd.read_csv('mapping.csv') 

# join mapping_df into corpus_raw_df
corpus_raw_df = pd.merge(corpus_raw_df, mapping_df, on='id', how='left')

# drop unnecessary columns
corpus_raw_df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Comment #', 'Video ID', 'Timestamp', 'Likes', 'Reply flag', 'Author name'], inplace= True)

# rearranging columns
corpus_raw_df = corpus_raw_df.reindex(columns=['id','Comment_id','Comment','brand'])

# lowercasing column names
corpus_raw_df.columns = map(str.lower, corpus_raw_df.columns)

In [4]:
# creating function for removing emojis
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

## 2b. One-word frequency distribution

In [5]:
# splitting text/comment into list of sentences
corpus_raw = [nltk.sent_tokenize(str(comment)) for comment in corpus_raw_df['comment']]

# converting list of lists to one single list
corpus_raw = [item for sublist in corpus_raw for item in sublist]

# removing to lowercase so that cases, such as "HR" and "hr", can be matched
corpus_raw = [sentence.lower() for sentence in corpus_raw]

# removing non-letter characters
corpus_raw = [re.sub('[^a-zA-Z]',' ', str(sentence)) for sentence in corpus_raw] 

# removing emojis
corpus_raw = [remove_emoji(sentence) for sentence in corpus_raw]

# splitting sentences into list of words
corpus_words = [word for sentence in corpus_raw for word in sentence.split()]

# removing stopwords (e.g. "and", "he", "are") 
corpus_words = [word for word in corpus_words if word not in stop_words]

# creating dictionary of all words from corpus with their counts
words_counts = {}
for word in corpus_words:
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
# sorting dictionary from highest to lowest value 
bag_of_words_ranked = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)

# showing the 250 most frequent words
bag_of_words_ranked[:250]

[('watch', 4860),
 ('apple', 2538),
 ('thanks', 1921),
 ('garmin', 1813),
 ('like', 1538),
 ('series', 1425),
 ('review', 1309),
 ('great', 1308),
 ('one', 1238),
 ('fenix', 1221),
 ('get', 1198),
 ('would', 1093),
 ('video', 1015),
 ('use', 965),
 ('good', 948),
 ('battery', 903),
 ('really', 848),
 ('still', 754),
 ('much', 753),
 ('running', 726),
 ('think', 702),
 ('gps', 694),
 ('also', 694),
 ('new', 681),
 ('need', 662),
 ('better', 652),
 ('watches', 633),
 ('want', 630),
 ('thank', 629),
 ('always', 629),
 ('love', 622),
 ('app', 616),
 ('time', 601),
 ('se', 598),
 ('life', 578),
 ('phone', 571),
 ('x', 560),
 ('best', 557),
 ('buy', 555),
 ('run', 552),
 ('got', 545),
 ('see', 538),
 ('know', 536),
 ('music', 535),
 ('features', 528),
 ('pro', 526),
 ('screen', 514),
 ('hr', 488),
 ('go', 473),
 ('even', 467),
 ('work', 456),
 ('sure', 454),
 ('well', 454),
 ('nice', 437),
 ('iphone', 437),
 ('thing', 424),
 ('heart', 421),
 ('day', 410),
 ('could', 403),
 ('rate', 395),
 ('

# 2c. Two-word frequency distribution

In [6]:
# splitting sentences into n-grams word combinations (e.g. n=2, "hr monitor")
n = 2 # testing for 2-words combinations
corpus_ngram = list(ngrams(corpus_words, n))

# creating dictionary of all words from corpus with their counts.
ngram_counts = {}
for word in corpus_ngram:
        if word not in ngram_counts:
            ngram_counts[word] = 1
        ngram_counts[word] += 1
        
# sorting dictionary from highest to lowest value
ngram_ranked = sorted(ngram_counts.items(), key=lambda x: x[1], reverse=True)

# showing the ten most frequent n-words combination
ngram_ranked[:250]

[(('apple', 'watch'), 1609),
 (('battery', 'life'), 479),
 (('heart', 'rate'), 362),
 (('great', 'review'), 314),
 (('watch', 'series'), 234),
 (('great', 'video'), 222),
 (('thanks', 'watching'), 213),
 (('fenix', 'x'), 191),
 (('always', 'display'), 183),
 (('garmin', 'connect'), 158),
 (('sleep', 'tracking'), 156),
 (('galaxy', 'watch'), 150),
 (('smart', 'watch'), 142),
 (('watch', 'se'), 137),
 (('sure', 'thing'), 136),
 (('chase', 'summit'), 130),
 (('x', 'pro'), 121),
 (('blood', 'oxygen'), 112),
 (('garmin', 'fenix'), 104),
 (('fenix', 'pro'), 101),
 (('venu', 'sq'), 101),
 (('thanks', 'much'), 98),
 (('thanks', 'great'), 93),
 (('watch', 'face'), 83),
 (('would', 'like'), 82),
 (('would', 'recommend'), 81),
 (('much', 'better'), 78),
 (('vo', 'max'), 77),
 (('hope', 'helps'), 76),
 (('dc', 'rainmaker'), 76),
 (('thanks', 'review'), 76),
 (('desfit', 'thanks'), 76),
 (('review', 'thanks'), 75),
 (('pro', 'solar'), 72),
 (('x', 'plus'), 71),
 (('thank', 'much'), 71),
 (('first',

# 3. Sentiment analysis

## 3a. Data cleaning and pre-processing

In [7]:
# creating subset for each brand and applying little data cleaning

# ------------------- SUBSET GARMIN -------------------

# creating subset for Garmin-related comments
corpus_garmin = corpus_raw_df[corpus_raw_df['brand'] == 'Garmin'] 

# reporting number of comments before comment to sentence transformation
print("Number of comments in Garmin subset:", len(corpus_garmin))

# splitting text/comment into list of sentences
corpus_garmin = [nltk.sent_tokenize(str(comment)) for comment in corpus_garmin["comment"]]

# converting list of lists to one single list
corpus_garmin = [item for sublist in corpus_garmin for item in sublist]

# lowercase for better matching with keyword lists
corpus_garmin = [sentence.lower() for sentence in corpus_garmin] 


# ------------------- SUBSET APPLE -------------------

# creating subset for Apple-related comments
corpus_apple = corpus_raw_df[corpus_raw_df['brand'] == 'Apple'] 

# reporting number of comments before comment to sentence transformation
print("Number of comments in Apple subset:", len(corpus_apple))

# splitting text/comment into list of sentences
corpus_apple = [nltk.sent_tokenize(str(comment)) for comment in corpus_apple["comment"]]

# converting list of lists to one single list
corpus_apple = [item for sublist in corpus_apple for item in sublist]

# lowercase for better matching with keyword lists
corpus_apple = [sentence.lower() for sentence in corpus_apple] 

# reporting number of comments after comment to sentence transformation
print("Number of sentences in Apple subset:", len(corpus_garmin))
print("Number of sentences in Garmin subset:", len(corpus_apple))

Number of comments in Garmin subset: 6878
Number of comments in Apple subset: 7431
Number of sentences in Apple subset: 15734
Number of sentences in Garmin subset: 14718


# 3b. Feature-identification

In [54]:
# defining lists of associated keywords for each feature
keywords_topic_gps = ['gps', 'geolocation']
keywords_topic_hr = ['hr', 'heart rate', 'hr', 'bpm']
keywords_topic_battery = ['battery', 'battery life', 'charging', 'battery lifetime']
keywords_topic_vo2 = ['vo2', 'pulseox', 'oxygen']
keywords_topic_price = ['budget', 'overpriced', 'expensive', 'affordable', 'dollar', "$" 'dollars', 'cheap', '€', 'euro', 'euros']

**Identifying feature-related sentences for Apple**

In [19]:
# removing sentences with question marks
to_delete = []
for sentence in corpus_apple:
    for word in sentence:
        if word == "?":
            to_delete.append(sentence)

corpus_apple_2 = []

for sentence in corpus_apple:
    if sentence not in to_delete:
        corpus_apple_2.append(sentence)
        
# reporting number of sentences before and after removing questions
print("number of sentences before removing questions:", len(corpus_apple))
print("number of sentences after removing question marks:", len(corpus_apple_2))

number of sentences before removing questions: 14718
number of sentences after removing question marks: 12847


In [57]:
# removing sentences that refer to other brands
non_apple = ['forerunner', 'garmin', 'vivoactive', 'huawei', 'polar', 'fenix', 'withings', 'suunto', 'xiaomi', 'fitbit']
to_delete_2 = []
non_apple_count = 0

for sentence in corpus_apple_2:
    for word in sentence.split():
        if word in non_apple:
                non_apple_count += 1
                to_delete_2.append(sentence)  

corpus_apple_3 = []

for sentence in corpus_apple_2:
    if sentence not in to_delete_2:
        corpus_apple_3.append(sentence)

# reporting number of sentences after removing non-Apple brand related sentences
print("number of sentences after removing non-Apple brand related sentences:", len(corpus_apple_3)) 


# assigning sentences of a review to a topic list if a word in the sentence matches a keyword of a topic list
app_assigned_to_topic_gps = []
app_assigned_to_topic_hr = []
app_assigned_to_topic_battery = []
app_assigned_to_topic_vo2 = []
app_assigned_to_topic_price = []

for sentence in corpus_apple_3:
    for word in sentence.split():
        if word in keywords_topic_hr:
            app_assigned_to_topic_hr.append(sentence)
        if word in keywords_topic_gps:
            app_assigned_to_topic_gps.append(sentence)
        if word in keywords_topic_battery:
            app_assigned_to_topic_battery.append(sentence)
        if word in keywords_topic_vo2:
            app_assigned_to_topic_vo2.append(sentence)
        if word in keywords_topic_price:
            app_assigned_to_topic_price.append(sentence)

                       
# reporting number of extracted sentences after one-word matching
app_sentences_one_w = len(app_assigned_to_topic_hr) + len(app_assigned_to_topic_gps) + len(app_assigned_to_topic_battery) + len(app_assigned_to_topic_vo2) + len(app_assigned_to_topic_price) 
                
print("Number of sentences after one-word matching:", app_sentences_one_w)

# matching with two-word combinations
for sentence in corpus_apple_3:
    for grams in ngrams(sentence.split(), 2): # creating two word combination
        if ' '.join(grams) in keywords_topic_hr:
            if sentence not in app_assigned_to_topic_hr:
                app_assigned_to_topic_hr.append(sentence)
        if ' '.join(grams) in keywords_topic_gps:
            if sentence not in app_assigned_to_topic_gps:
                app_assigned_to_topic_gps.append(sentence)
        if ' '.join(grams) in keywords_topic_battery:
            if sentence not in app_assigned_to_topic_battery:
                app_assigned_to_topic_battery.append(sentence)
        if ' '.join(grams) in keywords_topic_vo2:
            if sentence not in app_assigned_to_topic_vo2:
                app_assigned_to_topic_vo2.append(sentence)
        if ' '.join(grams) in keywords_topic_price:
            if sentence not in app_assigned_to_topic_price:
                app_assigned_to_topic_price.append(sentence)
                
app_sentences_two_w = len(app_assigned_to_topic_hr) + len(app_assigned_to_topic_gps) + len(app_assigned_to_topic_battery) + len(app_assigned_to_topic_vo2) + len(app_assigned_to_topic_price) 
                
print("Number of sentences after two-word matching:", app_sentences_two_w)

# screening list of assigned sentences of topic gps
gps_fraud = []
for sentence in app_assigned_to_topic_gps:
    if sentence in app_assigned_to_topic_hr or app_assigned_to_topic_battery or app_assigned_to_topic_vo2 or app_assigned_to_topic_price:
        gps_fraud.append(sentence)


# screening list of assigned sentences to topic hr
hr_fraud = []
for sentence in app_assigned_to_topic_hr:
    if sentence in app_assigned_to_topic_gps or app_assigned_to_topic_battery or app_assigned_to_topic_vo2 or app_assigned_to_topic_price:
        hr_fraud.append(sentence)

# screening list of assigned sentences to topic battery
battery_fraud = []
for sentence in app_assigned_to_topic_battery:
    if sentence in app_assigned_to_topic_hr or app_assigned_to_topic_gps or app_assigned_to_topic_vo2 or app_assigned_to_topic_price:
        battery_fraud.append(sentence)    
        
# screening list of assigned sentences to topic vo2
vo2_fraud = []
for sentence in app_assigned_to_topic_vo2:
    if sentence in app_assigned_to_topic_hr or app_assigned_to_topic_gps or app_assigned_to_topic_battery or app_assigned_to_topic_price:
        vo2_fraud.append(sentence) 
        
# screening list of assigned sentences to topic price
price_fraud = []
for sentence in app_assigned_to_topic_price:
    if sentence in app_assigned_to_topic_hr or app_assigned_to_topic_gps or app_assigned_to_topic_battery or app_assigned_to_topic_vo2:
           price_fraud.append(sentence) 
    
      
# reporting number of sentences after validation
app_sentences_after = len(app_assigned_to_topic_gps) + len(app_assigned_to_topic_hr) + len(app_assigned_to_topic_battery) + len(app_assigned_to_topic_vo2) + len(app_assigned_to_topic_price) 
             
print("Number of sentences after validation:", app_sentences_after)

# reporting number of sentences for each feature topic
print("Number of sentences for gps:", len(app_assigned_to_topic_gps))
print("Number of sentences for hr:", len(app_assigned_to_topic_hr))
print("Number of sentences for battery:", len(app_assigned_to_topic_battery))
print("Number of sentences for vo2:", len(app_assigned_to_topic_vo2))
print("Number of sentences for price:", len(app_assigned_to_topic_price))

number of sentences after removing non-Apple brand related sentences: 12306
Number of sentences after one-word matching: 808
Number of sentences after two-word matching: 860
Number of sentences after validation: 860
Number of sentences for gps: 135
Number of sentences for hr: 94
Number of sentences for battery: 419
Number of sentences for vo2: 136
Number of sentences for price: 76


**Exporting data for Apple**

In [59]:
# create dataframe for each list
app_gps_df = pd.DataFrame(app_assigned_to_topic_gps, columns = ['comment'])
app_hr_df = pd.DataFrame(app_assigned_to_topic_hr, columns = ['comment'])
app_battery_df = pd.DataFrame(app_assigned_to_topic_battery, columns = ['comment'])
app_vo2_df = pd.DataFrame(app_assigned_to_topic_vo2, columns = ['comment'])
app_price_df = pd.DataFrame(app_assigned_to_topic_price, columns = ['comment'])

# add label column for topic
app_gps_df['topic'] = 'gps'
app_hr_df['topic'] = 'hr'
app_battery_df['topic'] = 'battery'
app_vo2_df['topic'] = 'vo2'
app_price_df['topic'] = 'price'

# join dataframes together
frames = [app_gps_df,
          app_hr_df,
          app_battery_df,
          app_vo2_df,
          app_price_df]


apple_topic_comments = pd.concat(frames)

# reporting number of sentences before removing duplicates
print("Number of comments after removing duplicates:", len(apple_topic_comments))

# removing duplicates to ensure one sentence is not assigned to more than one feature topic
apple_topic_comments.drop_duplicates(subset=['comment'], inplace = True)

# reporting number of sentences after removing duplicates
print("Number of comments after removing duplicates:", len(apple_topic_comments))

# reporting number of sentences for each feature topic
print("Number of sentences for gps:", len(apple_topic_comments[apple_topic_comments['topic'] == 'gps']))
print("Number of sentences for hr:", len(apple_topic_comments[apple_topic_comments['topic'] == 'hr']))
print("Number of sentences for battery:", len(apple_topic_comments[apple_topic_comments['topic'] == 'battery']))
print("Number of sentences for vo2:", len(apple_topic_comments[apple_topic_comments['topic'] == 'vo2']))
print("Number of sentences for price:", len(apple_topic_comments[apple_topic_comments['topic'] == 'price']))

# export to csv
date = '2020-12-08'
apple_topic_comments.to_csv(date + '_apple_topic_comments.csv', index=False)

Number of comments after removing duplicates: 860
Number of comments after removing duplicates: 772
Number of sentences for gps: 122
Number of sentences for hr: 80
Number of sentences for battery: 378
Number of sentences for vo2: 125
Number of sentences for price: 67


**Identifying feature-related sentences for Garmin**

In [61]:
# removing sentences with question marks
to_delete = []
for sentence in corpus_garmin:
    for word in sentence:
        if word == "?":
            to_delete.append(sentence)

corpus_garmin_2 = []

for sentence in corpus_garmin:
    if sentence not in to_delete:
        corpus_garmin_2.append(sentence)
        
# reporting number of sentences before and after removing questions
print("number of sentences before removing questions:", len(corpus_garmin))
print("number of sentences after removing question marks:", len(corpus_garmin_2))

number of sentences before removing questions: 15734
number of sentences after removing question marks: 13020


In [62]:
# removing sentences that refer to other brands
non_garmin = ['apple','huawei', 'polar', 'withings', 'suunto', 'xiaomi', 'fitbit']
to_delete_3 = []
non_garmin_count = 0

for sentence in corpus_garmin_2:
    for word in sentence.split():
        if word in non_garmin:
                non_garmin_count += 1
                to_delete_3.append(sentence)  

corpus_garmin_3 = []

for sentence in corpus_garmin_2:
    if sentence not in to_delete_3:
        corpus_garmin_3.append(sentence)

# reporting number of sentences after removing non-Garmin brand related sentences
print("number of sentences after removing non-Garmin brand related sentences:", len(corpus_garmin_3)) 


# assigning sentences of a review to a topic list if a word in the sentence matches a keyword of a topic list
gar_assigned_to_topic_gps = []
gar_assigned_to_topic_hr = []
gar_assigned_to_topic_battery = []
gar_assigned_to_topic_vo2 = []
gar_assigned_to_topic_price = []

for sentence in corpus_garmin_3:
    for word in sentence.split():
        if word in keywords_topic_hr:
            gar_assigned_to_topic_hr.append(sentence)
        if word in keywords_topic_gps:
            gar_assigned_to_topic_gps.append(sentence)
        if word in keywords_topic_battery:
            gar_assigned_to_topic_battery.append(sentence)
        if word in keywords_topic_vo2:
            gar_assigned_to_topic_vo2.append(sentence)
        if word in keywords_topic_price:
            gar_assigned_to_topic_price.append(sentence)

                       
# reporting number of extracted sentences after one-word matching
gar_sentences_one_w = len(gar_assigned_to_topic_hr) + len(gar_assigned_to_topic_gps) + len(gar_assigned_to_topic_battery) + len(gar_assigned_to_topic_vo2) + len(gar_assigned_to_topic_price) 
                
print("Number of sentences after one-word matching:", gar_sentences_one_w)

# matching with two-word combinations
for sentence in corpus_garmin_3:
    for grams in ngrams(sentence.split(), 2): # creating two word combination
        if ' '.join(grams) in keywords_topic_hr:
            if sentence not in gar_assigned_to_topic_hr:
                gar_assigned_to_topic_hr.append(sentence)
        if ' '.join(grams) in keywords_topic_gps:
            if sentence not in gar_assigned_to_topic_gps:
                gar_assigned_to_topic_gps.append(sentence)
        if ' '.join(grams) in keywords_topic_battery:
            if sentence not in gar_assigned_to_topic_battery:
                gar_assigned_to_topic_battery.append(sentence)
        if ' '.join(grams) in keywords_topic_vo2:
            if sentence not in gar_assigned_to_topic_vo2:
                gar_assigned_to_topic_vo2.append(sentence)
        if ' '.join(grams) in keywords_topic_price:
            if sentence not in gar_assigned_to_topic_price:
                gar_assigned_to_topic_price.append(sentence)
                
gar_sentences_two_w = len(gar_assigned_to_topic_hr) + len(gar_assigned_to_topic_gps) + len(gar_assigned_to_topic_battery) + len(gar_assigned_to_topic_vo2) + len(gar_assigned_to_topic_price) 
                
print("Number of sentences after two-word matching:", gar_sentences_two_w)

# screening list of assigned sentences of topic gps
gar_gps_fraud = []
for sentence in gar_assigned_to_topic_gps:
    if sentence in gar_assigned_to_topic_hr or gar_assigned_to_topic_battery or gar_assigned_to_topic_vo2 or gar_assigned_to_topic_price:
        gar_gps_fraud.append(sentence)


# screening list of assigned sentences to topic hr
gar_hr_fraud = []
for sentence in gar_assigned_to_topic_hr:
    if sentence in gar_assigned_to_topic_gps or gar_assigned_to_topic_battery or gar_assigned_to_topic_vo2 or gar_assigned_to_topic_price:
        gar_hr_fraud.append(sentence)

# screening list of assigned sentences to topic battery
gar_battery_fraud = []
for sentence in gar_assigned_to_topic_battery:
    if sentence in gar_assigned_to_topic_hr or gar_assigned_to_topic_gps or gar_assigned_to_topic_vo2 or gar_assigned_to_topic_price:
        gar_battery_fraud.append(sentence)    
        
# screening list of assigned sentences to topic vo2
gar_vo2_fraud = []
for sentence in gar_assigned_to_topic_vo2:
    if sentence in gar_assigned_to_topic_hr or gar_assigned_to_topic_gps or gar_assigned_to_topic_battery or gar_assigned_to_topic_price:
        gar_vo2_fraud.append(sentence) 
        
# screening list of assigned sentences to topic price
gar_price_fraud = []
for sentence in gar_assigned_to_topic_price:
    if sentence in gar_assigned_to_topic_hr or gar_assigned_to_topic_gps or gar_assigned_to_topic_battery or gar_assigned_to_topic_vo2:
           gar_price_fraud.append(sentence) 
    
      
# reporting number of sentences after validation
gar_sentences_after = len(gar_assigned_to_topic_gps) + len(gar_assigned_to_topic_hr) + len(gar_assigned_to_topic_battery) + len(gar_assigned_to_topic_vo2) + len(gar_assigned_to_topic_price) 
             
print("Number of sentences after validation:", gar_sentences_after)

# reporting number of sentences for each feature topic
print("Number of sentences for gps:", len(gar_assigned_to_topic_gps))
print("Number of sentences for hr:", len(gar_assigned_to_topic_hr))
print("Number of sentences for battery:", len(gar_assigned_to_topic_battery))
print("Number of sentences for vo2:", len(gar_assigned_to_topic_vo2))
print("Number of sentences for price:", len(gar_assigned_to_topic_price))

number of sentences after removing non-Apple brand related sentences: 12645
Number of sentences after one-word matching: 846
Number of sentences after two-word matching: 959
Number of sentences after validation: 959
Number of sentences for gps: 238
Number of sentences for hr: 323
Number of sentences for battery: 279
Number of sentences for vo2: 29
Number of sentences for price: 90


**Exporting data for Garmin**

In [63]:
# create dataframe for each list
gar_gps_df = pd.DataFrame(gar_assigned_to_topic_gps, columns = ['comment'])
gar_hr_df = pd.DataFrame(gar_assigned_to_topic_hr, columns = ['comment'])
gar_battery_df = pd.DataFrame(gar_assigned_to_topic_battery, columns = ['comment'])
gar_vo2_df = pd.DataFrame(gar_assigned_to_topic_vo2, columns = ['comment'])
gar_price_df = pd.DataFrame(gar_assigned_to_topic_price, columns = ['comment'])

# add label column for topic
gar_gps_df['topic'] = 'gps'
gar_hr_df['topic'] = 'hr'
gar_battery_df['topic'] = 'battery'
gar_vo2_df['topic'] = 'vo2'
gar_price_df['topic'] = 'price'

# join dataframes together
frames_2 = [gar_gps_df,
          gar_hr_df,
          gar_battery_df,
          gar_vo2_df,
          gar_price_df]


garmin_topic_comments = pd.concat(frames_2)

# reporting number of sentences before removing duplicates
print("Number of comments after removing duplicates:", len(garmin_topic_comments))

# removing duplicates to ensure one sentence is not assigned to more than one feature topic
garmin_topic_comments.drop_duplicates(subset=['comment'], inplace = True)

# reporting number of sentences after removing duplicates
print("Number of comments after removing duplicates:", len(garmin_topic_comments))

# reporting number of sentences for each feature topic
print("Number of sentences for gps:", len(garmin_topic_comments[garmin_topic_comments['topic'] == 'gps']))
print("Number of sentences for hr:", len(garmin_topic_comments[garmin_topic_comments['topic'] == 'hr']))
print("Number of sentences for battery:", len(garmin_topic_comments[garmin_topic_comments['topic'] == 'battery']))
print("Number of sentences for vo2:", len(garmin_topic_comments[garmin_topic_comments['topic'] == 'vo2']))
print("Number of sentences for price:", len(garmin_topic_comments[garmin_topic_comments['topic'] == 'price']))

# export to csv
date = '2020-12-08'
garmin_topic_comments.to_csv(date + '_garmin_topic_comments.csv', index=False)

Number of comments after removing duplicates: 959
Number of comments after removing duplicates: 849
Number of sentences for gps: 225
Number of sentences for hr: 291
Number of sentences for battery: 223
Number of sentences for vo2: 24
Number of sentences for price: 86


# 3a. Sentiment scoring

In [66]:
# ------------------- GARMIN -------------------

# load data
garmin_df = pd.read_csv('2020-12-08_garmin_topic_comments.csv') 

# create empty dictionary for sentiment scores
garmin_sentiment = {}

# run loop to iterate through every review while applying sentiment scoring
for i,x in enumerate(garmin_df.comment):
    garmin_sentiment[i] = vader.polarity_scores(x)
    
# convert dictionary into dataframe
garmin_sentiment = pd.DataFrame(garmin_sentiment)

# transpose to get correct form
garmin_sentiment = garmin_sentiment.transpose()

# add index column to dataframes
garmin_sentiment.reset_index(drop=True, inplace=True)
garmin_df.reset_index(drop=True, inplace=True)

# join dataframesa together 
garmin_results = pd.concat([garmin_df, garmin_sentiment],  axis=1, ignore_index=False)

# export to csv
garmin_results.to_csv('03_Results/garmin_results.csv', index=False)


# ------------------- APPLE -------------------

# load data
apple_df = pd.read_csv('2020-12-08_apple_topic_comments.csv') 

# create empty dictionary for sentiment scores
apple_sentiment = {}

# run loop to iterate through every review while applying sentiment scoring
for i,x in enumerate(apple_df.comment):
    apple_sentiment[i] = vader.polarity_scores(x)
    
# convert dictionary into dataframe
apple_sentiment = pd.DataFrame(apple_sentiment)

# transpose to get correct form
apple_sentiment = apple_sentiment.transpose()

# add index column to dataframes
apple_sentiment.reset_index(drop=True, inplace=True)
apple_df.reset_index(drop=True, inplace=True)

# join dataframesa together 
apple_results = pd.concat([apple_df, apple_sentiment],  axis=1, ignore_index=False)

# export to csv
apple_results.to_csv('03_Results/apple_results.csv', index=False)

# reporting number of sentences for each brand to cross-check with previous data set
print('Number of feature-related sentences for Apple:', len(apple_results))
print('Number of feature-related sentences for Garmin:', len(garmin_results))

Number of feature-related sentences for Apple: 772
Number of feature-related sentences for Garmin: 849
