<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> Reading comment dataset</h2>
</div>

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import json
import string
import nltk

# topic modeling
import gensim
from gensim import corpora, models
from gensim.test.utils import datapath
from gensim.test.utils import datapath

# stemmer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# stopWords
# fetch the NLTK English stopword corpus and keep the word list in `sw`,
# which preprocess() uses to filter tokens
# NOTE(review): only the 'stopwords' resource is downloaded here; nltk.word_tokenize
# and WordNetLemmatizer (used in preprocess) presumably need their own NLTK
# resources as well — confirm they are installed on a fresh machine
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words("english")

# NLP packages
from textblob import TextBlob

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emadarmiti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# directory that holds the per-video "*comments.json" dumps
DIR_PATH = "./../../../comments/"

# accumulator: one dict per comment, filled by the loading loop
comment_data = list()

In [16]:
# Read every "*comments.json" file in DIR_PATH and flatten its comments into
# `comment_data` (one dict per comment).

# suffix that marks a comment dump
COMMENT_FILE_SUFFIX = 'comments.json'

# start from an empty list so re-running this cell does not append duplicates
comment_data.clear()

# go over all files in the dir (sorted so the row order is reproducible)
for file_name in sorted(os.listdir(DIR_PATH)):

    # skip anything that is not a json comment file
    if not file_name.endswith(COMMENT_FILE_SUFFIX):
        continue

    # get the entire path of the file
    json_file_path = os.path.join(DIR_PATH, file_name)

    # load file to a list of comment dicts; the encoding is explicit so the
    # result does not depend on the platform's default locale
    with open(json_file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # the video id is the file name without the suffix and the single
    # separator character in front of it (same as the former `[:-14]`)
    video_id = file_name[:-(len(COMMENT_FILE_SUFFIX) + 1)]

    # go over the comments
    for comment in json_data:

        # 'author' may be absent or null; fall back to an empty dict so the
        # row is still kept with user_id = None instead of raising
        author = comment.get('author') or {}

        # slice comment meta data and append it to the final list
        comment_data.append({
            "comment_id": comment.get('id'),
            "comment": comment.get('commentText'),
            "user_id": author.get('id'),
            "video_id": video_id,
        })

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> Create dataframe</h2>
</div>

In [17]:
# create the dataframe: one row per comment dict collected in `comment_data`
comment_data_df = pd.DataFrame(comment_data)

In [29]:
# show first five rows of the dataframe built from the JSON files
comment_data_df.head()

Unnamed: 0,comment_id,comment,user_id,video_id
0,UgxxbiYbyE-8dvwdhhN4AaABAg,Who&#39;s from 2021,UCZ5PuK7lI3nbyJzF6FqGr4A,ClRH1aQuP_Q
1,UgxG1-BosIxdVhhBaAt4AaABAg,Ahhh. So this is what dbrand looked like back ...,UCawT3eZDr1TevBhNHy-0Ilw,ClRH1aQuP_Q
2,Ugy4sBuOaAl16zTyZA14AaABAg,MY GOD,UCv-mX9oG3kSBcxEDHg299KA,ClRH1aQuP_Q
3,UgziFtIlBMiQnPZsggp4AaABAg,2020,UCYMPRzaW0ElU5aLZKpsuEBg,ClRH1aQuP_Q
4,UgzJCQUwyi68Uu7fexp4AaABAg,It’s the year 2020 and little did anyone think...,UC8Vswk4tmtNxb73eipXBLQg,ClRH1aQuP_Q


In [43]:
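# save dataframe (kept commented out; uncomment to re-create the cached parquet file)
# NOTE: this writes to the current directory, while the next cell reads the file
# from './../../../' — move the file or align the two paths
# comment_data_df.to_parquet('comment_data_df.parquet.gzip', compression='gzip')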

In [4]:
# read dataframe from the cached parquet file
# (this rebinds `comment_data_df`, replacing the frame built from the JSON files above)
comment_data_df = pd.read_parquet('./../../../comment_data_df.parquet.gzip')

In [34]:
# show first five rows of the dataframe loaded from parquet
comment_data_df.head()

Unnamed: 0,comment_id,comment,user_id,video_id
0,UgxxbiYbyE-8dvwdhhN4AaABAg,Who&#39;s from 2021,UCZ5PuK7lI3nbyJzF6FqGr4A,ClRH1aQuP_Q
1,UgxG1-BosIxdVhhBaAt4AaABAg,Ahhh. So this is what dbrand looked like back ...,UCawT3eZDr1TevBhNHy-0Ilw,ClRH1aQuP_Q
2,Ugy4sBuOaAl16zTyZA14AaABAg,MY GOD,UCv-mX9oG3kSBcxEDHg299KA,ClRH1aQuP_Q
3,UgziFtIlBMiQnPZsggp4AaABAg,2020,UCYMPRzaW0ElU5aLZKpsuEBg,ClRH1aQuP_Q
4,UgzJCQUwyi68Uu7fexp4AaABAg,It’s the year 2020 and little did anyone think...,UC8Vswk4tmtNxb73eipXBLQg,ClRH1aQuP_Q


In [6]:
# get (number of rows, number of columns)
comment_data_df.shape

(15154352, 4)

> We have about 15 million comments (15,154,352 rows).

In [7]:
# per column: does it contain at least one missing value?
comment_data_df.isna().any()

comment_id    False
comment       False
user_id       False
video_id      False
dtype: bool

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> Topic modeling</h2>
</div>

### - preprocessing

In [8]:
def preprocess(data):
    """
    Turn a raw comment string into a list of stemmed tokens.

    Steps: strip ASCII punctuation, tokenize with NLTK, drop tokens that appear
    in the module-level stopword list `sw`, then lemmatize each remaining token
    as a verb and Porter-stem the result.

    Parameters
    ----------
    data : str
        Raw comment text.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in the text.
    """
    # NOTE(review): comments contain HTML entities (e.g. "Who&#39;s"); removing
    # punctuation leaves the entity digits behind ("Who39s"). Tokens are also
    # not lowercased before the stopword check. Both are kept as-is so the
    # tokens stay the same as before — confirm this matches how the LDA
    # dictionary was built before changing it.
    data = data.translate(str.maketrans('', '', string.punctuation))

    # set membership is O(1); `sw` itself is a list
    stop_words = set(sw)

    # build the stemmer and lemmatizer once per call instead of once per token
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    return [
        ps.stem(lemmatizer.lemmatize(w, pos='v'))
        for w in nltk.word_tokenize(data)
        if w not in stop_words
    ]


### - reading model

In [11]:
# define lda model path
# NOTE(review): this is an absolute path on one developer's machine, so the cell
# will not run elsewhere. gensim's `datapath` helper is meant for files inside
# gensim's own test-data folder and presumably returns an absolute path
# unchanged — TODO confirm, and consider a path relative to the notebook.
lda_model_path = datapath("/Users/emadarmiti/Desktop/cap-s6/data mining/"+
                          "community_detection/Community-Detection/Data-Processing-(Stage 1)/LDAmodel")

In [12]:
# restore the previously trained LDA model saved at `lda_model_path`
lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_path)

In [13]:
# define dictionary path
# NOTE(review): absolute, machine-specific path wrapped in gensim's `datapath`
# test helper, which presumably returns an absolute path unchanged — TODO
# confirm, and consider a path relative to the notebook.
dictionary_path = datapath("/Users/emadarmiti/Desktop/cap-s6/data mining/"+
                          "community_detection/Community-Detection/Data-Processing-(Stage 1)/dictionary")

In [14]:
# restore the gensim dictionary saved at `dictionary_path`
dictionary = corpora.Dictionary.load(dictionary_path)

### - topic modeling

In [None]:
# get the topic of all comments: preprocess each comment, convert it to a
# bag-of-words with `dictionary`, run it through `lda_model`, and keep the entry
# with the highest second element (presumably a (topic id, probability) pair).
# `max` returns the same entry the former sort-then-take-first did (the first
# one on ties) without sorting the whole list for every comment.
comment_data_df['topic'] = comment_data_df['comment'].apply(
    lambda comment: max(
        lda_model[dictionary.doc2bow(preprocess(comment))],
        key=lambda tup: tup[1],
    )
)


<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'>Sentiment analysis </h2>
</div>

In [35]:
def sentiment_calc(text):
    """
    Calculate the sentiment polarity of a text with TextBlob.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    float or None
        The TextBlob polarity of `text`, or None if the text cannot be
        processed by TextBlob (e.g. it is not a string).
    """
    try:
        return TextBlob(text).sentiment.polarity
    # catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate and a long-running
    # `.apply` over all comments can be interrupted
    except Exception:
        return None


In [None]:
# get the sentiment of the comments: apply sentiment_calc to every comment
# and store the result in a new 'sentiment' column
comment_data_df['sentiment'] = comment_data_df['comment'].apply(sentiment_calc)

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> Dealing with low confidence comment topics</h2>
</div>

### - reading datasets

In [2]:
# read the comment topic-modeling results from the cached parquet file
comment_topic = pd.read_parquet('./../../../comment_data_df_topic_modeling.parquet.gzip')

In [23]:
# read the video topic-modeling results from the Stage 1 CSV file
video_topic = pd.read_csv("./../../Data-Processing-(Stage 1)/videosAndTopics.csv")

### - get low topic confidence comments

In [4]:
# keep only the comments whose topic confidence is below 0.6
is_low_confidence = comment_topic['confidence'] < 0.6
low_confidence_comments = comment_topic[is_low_confidence]

In [7]:
# show first five low-confidence comments
low_confidence_comments.head()

Unnamed: 0,comment_id,comment,user_id,video_id,topicID,confidence
0,UgxxbiYbyE-8dvwdhhN4AaABAg,Who&#39;s from 2021,UCZ5PuK7lI3nbyJzF6FqGr4A,ClRH1aQuP_Q,11,0.516047
1,UgxG1-BosIxdVhhBaAt4AaABAg,Ahhh. So this is what dbrand looked like back ...,UCawT3eZDr1TevBhNHy-0Ilw,ClRH1aQuP_Q,15,0.484454
2,Ugy4sBuOaAl16zTyZA14AaABAg,MY GOD,UCv-mX9oG3kSBcxEDHg299KA,ClRH1aQuP_Q,20,0.51657
3,UgziFtIlBMiQnPZsggp4AaABAg,2020,UCYMPRzaW0ElU5aLZKpsuEBg,ClRH1aQuP_Q,28,0.51654
6,Ugx_hnT8TVbD5Ru7joh4AaABAg,Wow 2013 to 2009,UCow13dvmTlgnAsOFfXh_ysw,ClRH1aQuP_Q,21,0.41554


In [6]:
# share of comments below the confidence threshold (a fraction, not a percentage)
len(low_confidence_comments) / len(comment_topic)

0.3500789740135375

> About 35% of the comments have a topic confidence below the 0.6 threshold.

### - replace low confidence comment topic with video topic

In [10]:
# drop the comment-level topic id column; it is replaced by the video topic below
# (`columns=` instead of a positional axis, which pandas 2.0 no longer accepts)
low_confidence_comments = low_confidence_comments.drop(columns=['topicID'])

In [11]:
# show first five low-confidence comments after dropping 'topicID'
low_confidence_comments.head()

Unnamed: 0,comment_id,comment,user_id,video_id,confidence
0,UgxxbiYbyE-8dvwdhhN4AaABAg,Who&#39;s from 2021,UCZ5PuK7lI3nbyJzF6FqGr4A,ClRH1aQuP_Q,0.516047
1,UgxG1-BosIxdVhhBaAt4AaABAg,Ahhh. So this is what dbrand looked like back ...,UCawT3eZDr1TevBhNHy-0Ilw,ClRH1aQuP_Q,0.484454
2,Ugy4sBuOaAl16zTyZA14AaABAg,MY GOD,UCv-mX9oG3kSBcxEDHg299KA,ClRH1aQuP_Q,0.51657
3,UgziFtIlBMiQnPZsggp4AaABAg,2020,UCYMPRzaW0ElU5aLZKpsuEBg,ClRH1aQuP_Q,0.51654
6,Ugx_hnT8TVbD5Ru7joh4AaABAg,Wow 2013 to 2009,UCow13dvmTlgnAsOFfXh_ysw,ClRH1aQuP_Q,0.41554


In [24]:
# delete unnecessary columns and rename the video column so it matches the
# 'video_id' key used in the comment dataframe
# (`columns=` instead of a positional axis, which pandas 2.0 no longer accepts)
video_topic = (
    video_topic
    .drop(columns=['Unnamed: 0', 'stems', 'confidence'])
    .rename(columns={"videoId": "video_id"})
)

In [25]:
# show first five rows of the cleaned video topic dataframe
video_topic.head()

Unnamed: 0,video_id,topicID
0,-7gyHZEving,26
1,-cOYX11AfPc,14
2,-g1mHQwkpQY,26
3,-lCQMFC2D5Q,26
4,-My0ls6Da-c,26


In [27]:
# attach each video's topic to its low-confidence comments via 'video_id'
# (inner join, the pandas default: comments with no matching video are not kept)
new_topics_low_confidence = pd.merge(
    low_confidence_comments,
    video_topic,
    on='video_id',
    how='inner',
)

In [28]:
# show first five low-confidence comments with the video topic attached
new_topics_low_confidence.head()

Unnamed: 0,comment_id,comment,user_id,video_id,confidence,topicID
0,UgxxbiYbyE-8dvwdhhN4AaABAg,Who&#39;s from 2021,UCZ5PuK7lI3nbyJzF6FqGr4A,ClRH1aQuP_Q,0.516047,28
1,UgxG1-BosIxdVhhBaAt4AaABAg,Ahhh. So this is what dbrand looked like back ...,UCawT3eZDr1TevBhNHy-0Ilw,ClRH1aQuP_Q,0.484454,28
2,Ugy4sBuOaAl16zTyZA14AaABAg,MY GOD,UCv-mX9oG3kSBcxEDHg299KA,ClRH1aQuP_Q,0.51657,28
3,UgziFtIlBMiQnPZsggp4AaABAg,2020,UCYMPRzaW0ElU5aLZKpsuEBg,ClRH1aQuP_Q,0.51654,28
4,Ugx_hnT8TVbD5Ru7joh4AaABAg,Wow 2013 to 2009,UCow13dvmTlgnAsOFfXh_ysw,ClRH1aQuP_Q,0.41554,28


In [33]:
# take the comments at or above the 0.6 confidence threshold (they keep their
# own topic), append the low-confidence comments that now carry the video topic,
# and drop the 'confidence' column
# (`columns=` instead of a positional axis, which pandas 2.0 no longer accepts)
new_comment_topic = pd.concat([
    comment_topic[comment_topic['confidence'] >= 0.6],
    new_topics_low_confidence,
]).drop(columns=['confidence'])

In [34]:
# show first five rows of the combined dataframe
new_comment_topic.head()

Unnamed: 0,comment_id,comment,user_id,video_id,topicID
4,UgzJCQUwyi68Uu7fexp4AaABAg,It’s the year 2020 and little did anyone think...,UC8Vswk4tmtNxb73eipXBLQg,ClRH1aQuP_Q,28
5,UgyvxSrlfJ7p4c3qFFl4AaABAg,"Little did people know, hes a tech genius",UCxlQBu6zxLgKix4AEqFQA9Q,ClRH1aQuP_Q,2
7,UgzHIjDvxP-MRm9VupN4AaABAg,i was bummed by Gelaskins. I have a 17&quot; l...,UChzJZI4ZT9UlsPEqcvlXGkw,ClRH1aQuP_Q,24
8,UgzJN2-mHNd1qRUllJV4AaABAg,I put the skin on totally wrong.,UCF3S5iELTiGWSG4uC0O4PxA,ClRH1aQuP_Q,15
9,UgxOwy95Z7ERmXlZ7Rh4AaABAg,i hate air bubbles! lol,UCA_MKLGMYPPfncthtkrcN0g,ClRH1aQuP_Q,9


In [35]:
# get (number of rows, number of columns) of the combined dataframe
# NOTE(review): the recorded output has 15,141,505 rows, fewer than the 15,154,352
# rows reported for comment_data_df earlier in the notebook; presumably comments
# whose video_id has no match in video_topic were dropped by the merge — TODO confirm
new_comment_topic.shape

(15141505, 5)