In [168]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import os
from datetime import datetime
from nltk.corpus import stopwords
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from nltk.util import ngrams
import html
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel as gensim_LdaModel, CoherenceModel
import matplotlib.pyplot as plt

# libs for saving model
import pickle

# Plotting tools
import pyLDAvis
import pyLDAvis.lda_model
import matplotlib.pyplot as plt
%matplotlib inline
import pyLDAvis.gensim_models as gensimvis

In [169]:
# Fetch the NLTK resources this notebook relies on. Packages that are already
# present locally are reported as up-to-date rather than downloaded again.
nltk.download('punkt')                       # tokenizer models, needed by word_tokenize
nltk.download('averaged_perceptron_tagger')  # POS tagger model, needed by nltk.pos_tag
nltk.download('wordnet')                     # WordNet data, needed by WordNetLemmatizer
nltk.download('stopwords')                   # stop-word lists, needed by stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data processing
### Define the data structure of rows

In [170]:
# Column -> dtype mapping handed to pd.read_csv when loading the yearly comment files.
#
# Notes on a few Reddit fields:
# - is_gold: true/false. If true, will only match if the author has reddit gold;
#   if false, will only match if they do not have gold.
# - is_submitter: true/false (only relevant when checking comments). If true, will only
#   match if the author was also the submitter of the post being commented inside;
#   if false, will only match if they were not.
# - send_replies: when True, messages will be sent to the submission author when
#   comments are made to the submission.
data_type = {
    # identifiers and free text
    "subreddit": "string",
    "subreddit_id": "string",
    "subreddit_type": "string",
    "author": "string",
    "body": "string",
    # timestamps are loaded as text and parsed later
    "created_date": "string",
    "created_utc": "string",
    "retrieved_on": "string",
    # thread structure
    "id": "string",
    "parent_id": "string",
    "link_id": "string",
    # counters
    "score": "int",
    "total_awards_received": "int",
    "controversiality": "int",
    "gilded": "int",
    "collapsed_because_crowd_control": "int",
    # moderation details
    "collapsed_reason": "string",
    "distinguished": "string",
    "removal_reason": "string",
    # author details
    "author_created_utc": "string",
    "author_fullname": "string",
    "author_patreon_flair": "bool",
    "author_premium": "bool",
    # flags
    "can_gild": "bool",
    "can_mod_post": "bool",
    "collapsed": "bool",
    "is_submitter": "bool",
    "_edited": "string",
    "locked": "bool",
    "quarantined": "bool",
    "no_follow": "bool",
    "send_replies": "bool",
    "stickied": "bool",
    "author_flair_text": "string",
}

### read all the csv files using the defined data structure, remove deleted rows

Some of the rows are marked as "deleted", which indicates that their contents were removed for some reason. We need to remove them, as they do not contribute to our analysis.

In [171]:
# One-off preprocessing step: read each yearly CSV (2008-2019), drop the rows whose
# body is '[deleted]' or '[removed]', and write the result to '<year>_revised.csv'.
# The code is wrapped in a string literal (i.e. disabled) because the revised files
# have already been generated; evaluating this cell only echoes the string.

"""
for x in range(2008, 2020):
    df = pd.read_csv(f'./data/{x}.csv')
    df = df[(df.body != '[deleted]') & (df.body != '[removed]')]
    df.drop(columns = ['Unnamed: 0'], inplace = True)
    df.to_csv(f'./data/{x}_revised.csv')
"""


"\nfor x in range(2008, 2020):\n    df = pd.read_csv(f'./data/{x}.csv')\n    df = df[(df.body != '[deleted]') & (df.body != '[removed]')]\n    df.drop(columns = ['Unnamed: 0'], inplace = True)\n    df.to_csv(f'./data/{x}_revised.csv')\n"

### Identify All Posts Related to iPhone on Reddit

We conducted a review and found that the subreddits **apple** and **iphone** contain the most relevant discussions about the iPhone. However, the 'apple' subreddit also includes posts about other Apple products like Macs, iPod Touch, and iTunes. To ensure relevance, we will filter out these non-iPhone related posts. Our approach assumes that all posts within a single discussion thread focus on the same topic. Therefore, we will retain any thread in the 'apple' subreddit if at least one post within that thread mentions the iPhone.

In [172]:
def locate_iphone_post(data):
    """
    Find the discussion threads of the 'apple' subreddit that mention the iPhone.

    Parameters
    ----------
    data : pd.DataFrame
        Comment rows; the columns ``subreddit``, ``body`` and ``link_id`` are read.

    Returns
    -------
    array-like
        The unique ``link_id`` values of 'apple' rows whose body contains
        'iphone' (case-insensitive).
    """
    df_apple = data[data.subreddit == 'apple']
    # na=False: a missing body counts as "no mention". Without it the mask can
    # contain NaN, which boolean indexing rejects.
    mentions_iphone = df_apple.body.str.contains('iphone', case=False, na=False)
    return df_apple.loc[mentions_iphone, 'link_id'].unique()

In [173]:
def extract_all_iphone_post(data):
    """
    Select the rows that belong to an iPhone discussion.

    A row is kept when it was posted in the 'iphone' subreddit, or when its
    link_id is one of the ids returned by locate_iphone_post(data).
    """
    iphone_thread_ids = locate_iphone_post(data)
    posted_in_iphone_sub = data.subreddit == 'iphone'
    part_of_iphone_thread = data.link_id.isin(iphone_thread_ids)
    return data[posted_in_iphone_sub | part_of_iphone_thread]

In [174]:
# final clean, only retain iPhone related posts each year

def get_iphone_data_yearly(data, filename = None):
    """
    Keep only the iPhone-related rows of one year's data and optionally save them.

    Parameters
    ----------
    data : pd.DataFrame
        One year's comment rows, as accepted by extract_all_iphone_post.
    filename : str or None
        When given, the filtered frame is written there with ``to_csv``
        (the index is written too, as before).

    Returns
    -------
    pd.DataFrame
        The filtered rows without the 'Unnamed: 0' column.
    """
    # extract_all_iphone_post returns a filtered selection of `data`. Dropping the
    # column without inplace=True builds a new frame instead of mutating that
    # selection (which triggers SettingWithCopyWarning). errors='ignore' keeps the
    # function usable when the CSV was written without an index column.
    df = extract_all_iphone_post(data).drop(columns=['Unnamed: 0'], errors='ignore')
    if filename is not None:
        df.to_csv(filename)
    return df

In [177]:
# One-off extraction step: for each year (2008-2019) read '<year>_revised.csv' with the
# dtypes in `data_type` and save its iPhone-related rows to '<year>_iphone_v2.csv'.
# The code is wrapped in a string literal (i.e. disabled) because it has already been
# run; evaluating this cell only echoes the string.

"""
for x in range(2008, 2020):
    file_name = f'./data/{x}_revised.csv'
    save_path = f'./data/{x}_iphone_v2.csv'
    data = pd.read_csv(file_name, dtype=data_type, header = 0)
    get_iphone_data_yearly(data, filename = save_path)
"""


"\nfor x in range(2008, 2020):\n    file_name = f'./data/{x}_revised.csv'\n    save_path = f'./data/{x}_iphone_v2.csv'\n    data = pd.read_csv(file_name, dtype=data_type, header = 0)\n    get_iphone_data_yearly(data, filename = save_path)\n"

### process the special chars in the comment

The Reddit comment data appear to be raw text, which means that HTML character entities are preserved. For example, we often encounter the following entities in the comments:

1. `&lt;` is an HTML entity for the less-than symbol ("<").
2. `&gt;` is an HTML entity for the greater-than symbol (">").

We need to convert these entities back to their original characters to ensure that the text is correctly interpreted and analyzed. This can usually be done using HTML parsing libraries.

Also, escape sequences such as `\n` and `\'` need to be replaced.

In [178]:
def remove_escape_seq(data):
    """
    Decode HTML entities and strip literal escape sequences from the 'body' column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a text column named 'body'. The column is overwritten in place.

    Returns
    -------
    pd.DataFrame
        The same frame object that was passed in.

    Notes
    -----
    Every body is decoded on its own until it stops changing, so entities that were
    encoded more than once (e.g. '&amp;gt;') are fully resolved. Decoding no longer
    depends on whether some row happens to contain '&gt;'. Missing bodies are left
    untouched.
    """
    def _unescape_fully(text, max_rounds=5):
        # Leave missing values (None / NaN / pd.NA) alone.
        if not isinstance(text, str):
            return text
        # Bounded loop: stop as soon as another pass changes nothing.
        for _ in range(max_rounds):
            decoded = html.unescape(text)
            if decoded == text:
                break
            text = decoded
        return text

    body = data['body'].apply(_unescape_fully)

    # These are *literal* two-character sequences (a backslash followed by a letter)
    # found in the raw text, not real control characters.
    escape_sequences = {"\\n": " ", "\\r": " ", "\\t": " ", "\\'": "'", '\\"': '"', '\\b': '', '\\0': ''}
    for seq, replacement in escape_sequences.items():
        body = body.str.replace(seq, replacement, regex=False)

    # .str.strip() tolerates missing values, unlike calling x.strip() per element.
    data['body'] = body.str.strip()

    return data

In [179]:
def convert_create_utc(data):
    """
    Parse the 'created_utc' column into pandas datetimes.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    parsed_timestamps = pd.to_datetime(data['created_utc'])
    data['created_utc'] = parsed_timestamps
    return data

In [180]:
def aggregate_post_by_link_id(data) -> pd.DataFrame:
    """
    Merge all comments of a thread into one document.

    Parameters
    ----------
    data : pd.DataFrame
        Comment rows; the columns ``link_id``, ``created_utc`` and ``body`` are read.

    Returns
    -------
    pd.DataFrame
        One row per ``link_id`` with the columns ``link_id`` and ``body``, where
        ``body`` is the thread's comments joined by single spaces in ascending
        ``created_utc`` order.
    """
    # Rows without a body cannot be joined (' '.join raises on missing values).
    with_text = data.dropna(subset=['body'])
    # A stable sort keeps comments that share a timestamp in their original order,
    # so the concatenated document is deterministic.
    chronological = with_text.sort_values(by='created_utc', kind='stable')
    grouped_texts = chronological.groupby('link_id')['body'].apply(' '.join).reset_index()
    return grouped_texts

In [181]:
# testing function, or reduce noises
def combine_special_words(data):
    """
    Collapse common sentiment synonyms in the 'body' column to 'good' / 'bad'.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a text column named 'body'. The column is overwritten in place.

    Returns
    -------
    pd.DataFrame
        The same frame object that was passed in.

    Notes
    -----
    Only whole words are replaced (case-insensitively). Plain substring replacement
    would corrupt longer words, e.g. 'greater' -> 'gooder' or 'slacking' -> 'sbad'.
    Surrounding whitespace of each body is stripped afterwards.
    """
    # Disabled experiment, kept for reference: merge multi-word product / feature
    # names into single hyphenated tokens before topic modelling.
    #
    # sw = {'face id': 'face-id', 'touch id': 'touch-id', 'apple pay': 'apple-pay', '3d touch': '3d-touch',
    #      'ipad pro': 'ipad-pro', 'ipod pro': 'ipod-pro', 'ipadpro': 'ipad-pro', 'ipodprod': 'ipod-pro',
    #      'mac pro': 'mac-pro', 'macpro': 'mac-pro', 'apple watch': 'apple-watch', 'applewatch': 'apple-watch',
    #      'homepod': 'home-pod', 'home pod': 'home-pod', 'apple tv': 'apple-tv', 'appletv': 'apple-tv', 'apple music': 'apple-music',
    #      'applemusic': 'apple-music', 'lightning cable': 'lightning-cable', 'true tone': 'true-tone',
    #      'retina display': 'retina-display',  'dark mode': 'dark-mode', 'rose gold': 'rose-gold', 'jet black': 'jet black',
    #      'space gray': 'space-gray', 'oled display': 'oled-display', 'truedepth camera': 'truedepth-camera',
    #      'truedepth': 'truedepth-camera', 'night shift': 'night-shift', 'portrait mode': 'portrait-mode', 'live photos': 'live-photos',
    #      'live photo': 'live-photos', 'force touch': 'force-touch', 'slo-mo video': 'slo-mo-video', 'battery life': 'battery-life',
    #      'wireless charger': 'wireless-charger', 'colour': 'color', 'google map': 'google-map', 'macbook pro': 'mac-pro',
    #      'macbook air': 'mac-air', 'ipad air': 'ipad-air', 'release date': 'release-date', 'wi fi': 'wifi', '30 pin': '30-pin',
    #      'black friday': 'black-friday', 'ipad mini': 'ipad-mini', 'ipod touch': 'ipod-touch'}
    #
    # sw2 = {'lightning': 'lightning-cable', 'retina': 'retina-display', 'oled': 'oled-display', 'portrait': 'portrait-mode',
    #       'jailbreaking': 'jailbreak', 'jailbroken': 'jailbreak'}
    #
    # for seq, replacement in sw.items():
    #     data['body'] = data['body'].str.replace(seq, replacement, regex=False)
    # data['body'] = data.body.apply(lambda x: x.strip())
    #
    # for seq, replacement in sw2.items():
    #     data['body'] = data['body'].str.replace(seq, replacement, regex=False)
    # data['body'] = data.body.apply(lambda x: x.strip())

    good_synonyms = (
        "nice", "excellent", "great", "amazing", "fantastic", "awesome", "wonderful",
        "perfect", "fabulous", "superb", "outstanding", "beautiful", "remarkable",
        "impressive", "splendid", "terrific", "marvelous", "positive", "satisfactory",
        "pretty", "pleasing",
    )
    bad_synonyms = (
        "poor", "terrible", "awful", "horrible", "dreadful", "abysmal", "worse",
        "lousy", "atrocious", "inferior", "unsatisfactory", "inadequate", "substandard",
        "unsuitable", "unpleasant", "negative", "deficient", "mediocre", "pathetic",
        "lacking", "undesirable",
    )

    canonical = {word: "good" for word in good_synonyms}
    canonical.update({word: "bad" for word in bad_synonyms})

    # One pass over the text: \b...\b restricts matches to whole words.
    synonym_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in canonical) + r')\b',
        flags=re.IGNORECASE,
    )
    data['body'] = (
        data['body']
        .str.replace(synonym_pattern, lambda match: canonical[match.group(0).lower()], regex=True)
        .str.strip()
    )

    return data

In [182]:
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to a plain
# list of words. The corpus reader is still reachable as `nltk.corpus.stopwords`.
stopwords = nltk.corpus.stopwords.words('english')

# Corpus-specific words to ignore in addition to NLTK's English stop words.
# Duplicates are harmless because the final container is a set. Multi-word entries
# (e.g. '9am et', 'wi fi') never match in clean_text, which tests single
# whitespace-separated tokens.
improved_sw_list = ['gonna', 'understand', 'seen', 'wanted', 'haha', 'max', 'restart', 'deleted', 'happening', 'possible', 'understand', 
                    'literally', 'matter', 'based', 'regarding', 'possible', 'latest', 'np', 'handed', 'iphone', 'use', 'people', 'http', 
                    'https', 'www', 'com', 'really', 'apple', 'actually', 'thanks', 'thank', 'think', 'phone', 'oh', 'sorry', 'hi', 'imgur', 
                    # fixed: a missing comma used to merge 'want' 'wants' into the single bogus entry 'wantwants'
                    'like', 'get', 'got', 'used', 'make', 'work', 'worked', 'apps', 'want', 'wants', 'file', 'thing', 'say', 'know', 
                    'knew', 'reddit', 'subreddit', 'need', 'using', 'app', 'year', 'month', 'day', 'going', 'window', 'product', 'good', 
                    'pay', 'song', 'way', 'love', 'great', 'free', 'point', 'product', 'new', 'try', 'fix', 'work', 'issue', 'problem', 'got', 
                    'store', 'order', 'ordered', 'date', 'ship', 'shipping', 'org', 'usage', 'wikipedia', 'updates', 'update', 'feature', 
                    'gb', 'backup', 'file', 'space', 'lot', 'time', 'better', 'look', 'right', 'maybe', 'might', 'can', 'could', 'be', 'come', 
                    'device', 'user', 'run', 'nice', 'version', 'buy', 'software', 'hardware', 'application', 'mac', 'article', 'commit', 
                    'comment', 'opinion', 'reason', 'mean', 'little', 'computer', 'job', 'mobile', 'market', 'sure', 'yes', 'no', 'let', 
                    'probably', 'game', 'price', 'jailbreaking', 'open', 'close', 'best', 'fun', 'guy', 'gay', 'shit', 'past', 'fit', 
                    'pretty', 'cool', 'long', 'story', 'real', 'video', 'company', 'platform', 'version', 'case', 'getting', 'idea', 
                    'bought', 'play', 'post', 'link', 'review', 'awesome', 'said', 'fact', 'different', 'making', 'technology', 'service', 
                    'data', 'icon', 'home', 'page', 'button', 'jpg', 'png', 'imac', 'pc', 'desktop', 'drive', 'music', 'developer', 'dev', 
                    '99', 'message', 'text', 'hour', 'bit', 'unlocked', 'customer', 'io', 'help', 'release', 'able', 'took', 'feel', 'felt', 
                    'turn', 'street', 'city', 'town', 'cheng', 'check', 'delete', 'add', 'enter', 'quality', 'sound', 'support', 'team', 
                    'read', 'ad', 'iphones', 'old', 'new', 'tv', 'control', 'plan', 'business', 'hand', 'big', 'download', 'upload', 
                    'complain', 'install', 'installed', 'wait', 'code', 'site', 'edit', 'option', 'image', 'year', 'thought', 'word', 
                    'trying', 'try', 'tell', 'paid', 'install', 'friend', 'question', 'man', 'woman', 'girl', 'boy', 'kind', 'hope', 
                    'hopefully', 'tried', 'account', 'number', 'called', 'laptop', 'called', 'week', 'launch', 'password', 'money', 'dollar', 
                    'lol', 'wow', 'machine', 'gen', 'paying', 'buying', 'op', 'definitely', 'second', 'running', 'worked', 'agree', 
                    'wrong', 'place', 'forum', 'far', 'away', 'click', 'talk', 'car', 'came', 'difference', 'hard', 'reading', 'hold', 
                    'example', 'notification', 'happens', 'listen', 'see', 'copy', 'access', 'looking', 'default', 'miss', 'switch', 
                    'switched', 'happened', 'start', 'started', 'note', 'today', 'wallpaper', 'change', 'close', 'today', 'wallpaper', 
                    'change', 'close', 'likely', 'mail', 'email', 'list', 'push', 'reset', 'went', 'charging', 'ask', 'answer', 
                    'ago', 'asked', 'guess', 'useful', 'easy', 'playing', 'sell', 'cost', 'black', 'white', 'setting', 'report', 'told', 
                    'tomorrow', 'morning', 'hey', 'night', 'simple', 'step', 'night', 'simple', 'step', 'picture', '10', '100', '200',
                    'shipped', '14', '13', '100', '50', 'fucking', 'party', 'mt', '30', 'screen', 'test', 'remote', 'fine', '9am', 
                    'et', '9am et', 'automatically', 'posted', 'posted', 'regrettably', 'posted', 'noticed', 'tablet', 'model', 
                    'calling', 'wi fi', 'send', '2fapple', '2fr', 'rule', 'available', 'available', 'keyboard', 'keyboard', 
                    'thread' ,'inbox', 'mailbox', 'performed', 'alarm', 'le', 'stuff', 'subreddit compose', 'moderator subreddit compose', 
                    'compose', 'normal', 'end', 'topic', 'saturday', 'wednesday', 'self', 'promotion', 'bank', 'sr', 'sort', '100', 
                    '100', '20', 'pro', 'air', 'ipads', 'id', 'retina', 'display', 'lightning', 'mode', 'downloading', 'unlimited', 'mini', 
                    'saying', 'save', 'sm', 'yeah', 'want', 'imgur', 'www', 
                    'turned', 'on', 'later', 'soon', 'express', 'reposting', 'iup', 'left', 'type', 'instead', 'luck', 'hello', 'congrats',
                    'ok', 'winning', 'win', '20the', '20this', '20of', '20by', '2fu', 'fall', '20have', 'notice', 'cheaper', 'current', 'value',
                    'random', 'att', '19', 'minute', 'yesterday', 'color', 'set', 'easier', '24', '25', 'ebay', 'older', 'line', 
                   ]

stopwords.extend(improved_sw_list)
# Set for O(1) membership tests during text cleaning.
stop_words_list = set(stopwords)

# Function to clean text data
def clean_text(text):
    """
    Normalise one document for topic modelling.

    Steps: lowercase; replace every character other than a-z, 0-9, '&' and '-' with
    a space; split on whitespace; drop stop words; lemmatize with WordNetLemmatizer;
    join the surviving lemmas with single spaces.

    Stop words are checked both before and after lemmatization, so inflected forms
    whose lemma is a stop word (e.g. 'phones' -> 'phone') are removed as well.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned, space-separated lemmas.
    """
    text = text.lower()
    # Letters, digits, '&' and '-' are kept; everything else becomes a separator.
    # text = re.sub(r'-', '_', text)
    text = re.sub(r'[^a-z0-9&-]', ' ', text)

    lemmatizer = WordNetLemmatizer()
    kept_lemmas = []
    for word in text.split():
        if word in stop_words_list:
            continue
        lemma = lemmatizer.lemmatize(word)
        # The lemma itself may be a stop word even when the surface form was not.
        if lemma in stop_words_list:
            continue
        kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

In [183]:
def extract_nouns_adjectives(text):
    """
    Keep only the nouns and adjectives of a document.

    The text is stripped of characters other than letters, digits, '&' and '-',
    tokenized, and POS-tagged. Tokens whose tag starts with 'NN' or 'JJ' are kept
    unless their lowercase form is a stop word, then lemmatized.

    Parameters
    ----------
    text : str
        Raw document text (original casing is kept for tagging).

    Returns
    -------
    str
        The selected lemmas, lowercased and joined by single spaces.
    """
    text = re.sub(r'[^a-zA-Z0-9&-]', ' ', text)
    tagged = nltk.pos_tag(word_tokenize(text))

    lemmatizer = WordNetLemmatizer()
    filtered_words = [
        lemmatizer.lemmatize(word)
        for word, tag in tagged
        if tag.startswith(('NN', 'JJ')) and word.lower() not in stop_words_list
    ]

    # str.lower() returns a new string; the original code discarded that result
    # and returned the text with its original casing.
    return ' '.join(filtered_words).lower()

In [184]:
def nlp_data_process(posts: pd.Series) -> pd.Series:
    """
    Run clean_text over every document in `posts` and return the cleaned Series.

    extract_nouns_adjectives is an alternative per-document cleaner that is
    currently not used here.
    """
    return posts.apply(clean_text)

LDA

### Methodology ###
1. Data Collection and Preparation:

   * We collected a substantial dataset of Reddit comments, year-wise, to ensure a comprehensive analysis.
   * Rigorous preprocessing was performed on this textual data to clean and standardize it for effective topic modeling.
  
  
2. Topic Extraction Using LDA:

   * For each year's data, we applied Latent Dirichlet Allocation (LDA), a powerful technique for topic modeling.
   * To determine the optimal number of topics for each year, we utilized GridSearch provided by scikit-learn. This approach allowed us to identify the most coherent and meaningful number of topics for each year's dataset.
   
   
3. Observations:

   * A notable trend emerged from our analysis: the number of topics increased with each passing year.
   * This increase is attributed to two primary factors:
      * Growing Data Volume: As the volume of Reddit comments expanded annually, it naturally led to a broader spectrum of discussions.
      * Divergence of Topics: Over the years, the discussions on Reddit became more diverse and multifaceted, reflecting the evolving interests and concerns of the Reddit community.
      
      
4. Manual Review and Categorization:

   * With the topics for each year extracted, the next step involved a manual review. This review process was crucial to deeply understand the context and nuances of each topic.
   * Our goal is to classify these topics into a fixed number of overarching categories. This categorization will not only streamline the topics for easier comprehension but also help in identifying common or persistent themes across different years.
   
   
5. Significance and Next Steps


   * Understanding Community Evolution: This analysis provides valuable insights into how online communities evolve, highlighting changes in interests, concerns, and popular discussions over time.
   * Strategic Application: The findings from this study can inform content strategies, marketing approaches, and community engagement plans for entities interested in leveraging Reddit's vast user base.
   * Future Exploration: Building upon this research, we aim to explore correlations between these topics and external factors such as global events, technological advancements, and cultural shifts.

In [209]:
def train_lda_model(posts, vectorizer = None, doc_topic_matrix = None,  num_topics = 12, num_iter = 10, tf_idf=False):
    """
    Fit a scikit-learn LDA model on a collection of documents.

    Parameters
    ----------
    posts : pd.Series
        Documents; only used when the vectorizer / matrix have to be built here.
    vectorizer : fitted vectorizer or None
        Reused as-is when `doc_topic_matrix` is given as well.
    doc_topic_matrix : matrix or None
        Despite its name, this is the matrix LDA is fitted on (the output of
        ``vectorizer.fit_transform``). Reused as-is when `vectorizer` is given too.
    num_topics : int
        Number of LDA components.
    num_iter : int
        Maximum number of LDA iterations.
    tf_idf : bool
        Use TfidfVectorizer instead of CountVectorizer when building features.

    Returns
    -------
    tuple
        ``(vectorizer, dtm, lda)``.
    """
    # `is None` rather than `== None`: comparing a matrix with `==` is element-wise
    # for arrays and therefore ambiguous inside a boolean expression.
    if vectorizer is None or doc_topic_matrix is None:
        print('--- starting processing the data in sklearn ---')
        cleaned_posts = nlp_data_process(posts)

        print('training WordCountVec and doc-topic-matrix')
        # Both vectorizers share the same vocabulary settings.
        vectorizer_cls = TfidfVectorizer if tf_idf else CountVectorizer
        vectorizer = vectorizer_cls(max_df = 0.8, min_df = 0.02, stop_words = 'english', ngram_range = (1,3),
                                    token_pattern=r'(?u)\b\w[\w&-]*\w\b')
        dtm = vectorizer.fit_transform(cleaned_posts) # doc-word-matrix
    else:
        print(f'word-vec and doc-topic-matrix are provided, skip the fisrt part')
        # Previously `dtm` was never assigned on this path, so lda.fit raised NameError.
        dtm = doc_topic_matrix

    print('training LDA model')
    lda = LatentDirichletAllocation(n_components = num_topics, random_state = 0, n_jobs = -1, learning_decay = 0.8, max_iter = num_iter)
    lda.fit(dtm)

    print('--- finish training ---')
    return((vectorizer, dtm, lda))

In [210]:
# Function to display topics
def display_topics(model, feature_names, no_top_words, save_path = ''):
    """
    Collect the highest-weighted words of every topic in `model.components_`.

    Returns a dict mapping 'Topic <i>:' to a comma-separated string of the
    `no_top_words` features with the largest weights, in descending order.
    When `save_path` is not the empty string, the same content is also written
    to that file, one '<key> <words>' line per topic.
    """
    topic_dict = {}
    for topic_idx, weights in enumerate(model.components_):
        # indices ordered from largest to smallest weight, truncated to the top N
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[i] for i in top_indices]
        topic_dict["Topic {:d}:".format(topic_idx)] = ", ".join(top_words)

    if save_path != '':
        with open(save_path, 'w') as file:
            file.writelines(f"{topic} {words}\n" for topic, words in topic_dict.items())

    return topic_dict

In [211]:
def display_lda_vis(vectorizer, dtm, model):
    """
    Build the pyLDAvis panel for a fitted scikit-learn LDA model.

    Parameters
    ----------
    vectorizer : fitted vectorizer that produced `dtm`
    dtm : matrix the model was fitted on
    model : fitted LDA model to visualise

    Returns
    -------
    The prepared pyLDAvis panel (rendered when it is the last expression of a cell).
    """
    pyLDAvis.enable_notebook()
    # Use the `model` argument; the original referenced a global named `lda`, which
    # raises NameError or silently visualises a different model.
    panel = pyLDAvis.lda_model.prepare(model, dtm, vectorizer, mds='tsne')
    return panel

In [212]:
def get_document_topic_matrix_gensim(lda_model, corpus):
    """
    Build the dense document-topic matrix of a Gensim LDA model.

    Parameters
    ----------
    lda_model : model exposing ``num_topics`` and ``get_document_topics``
    corpus : sequence of bag-of-words documents

    Returns
    -------
    np.ndarray
        Array of shape ``(len(corpus), num_topics)``; entry ``[i, k]`` is the
        probability of topic ``k`` in document ``i`` (0 when the model did not
        report that topic for the document).
    """
    num_topics = lda_model.num_topics
    doc_topic_matrix = np.zeros((len(corpus), num_topics))

    for doc_idx, bow in enumerate(corpus):
        doc_topics = lda_model.get_document_topics(bow, minimum_probability=0)
        # Place each probability by its topic id. The returned list may be shorter
        # than num_topics (near-zero topics can be left out), so a positional
        # assignment of the whole row would mis-broadcast or fail.
        for topic_id, prob in doc_topics:
            doc_topic_matrix[doc_idx, topic_id] = prob

    return doc_topic_matrix

In [213]:
def train_lda_model_gensim(posts, num_topic = 12):
    """
    Fit a Gensim LDA model on already preprocessed posts.

    Every post is split on whitespace into tokens, a dictionary and a bag-of-words
    corpus are built from them, and the model is trained on that corpus.

    Returns the tuple ``(id2word, corpus, lda_model)``.
    """
    print('--- starting processing the data ---')

    # one token list per post
    tokenized_posts = [post.split() for post in posts]

    # Vocabulary, pruned of tokens that occur in fewer than 30 documents (absolute
    # count) or in more than 80% of the documents (fraction); of the remainder only
    # the 10000 most frequent tokens are kept.
    vocabulary = corpora.Dictionary(tokenized_posts)
    vocabulary.filter_extremes(no_below=30, no_above=0.8, keep_n=10000)

    # bag-of-words representation of every post
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in tokenized_posts]

    print('training LDA model')
    lda_settings = dict(
        corpus=bow_corpus,
        id2word=vocabulary,
        num_topics=num_topic,
        random_state=0,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
    )
    fitted_model = gensim_LdaModel(**lda_settings)

    print('--- finish training ---')
    return (vocabulary, bow_corpus, fitted_model)

In [214]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score its coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound of the topic counts (exclusive, as in ``range``)
    start : Smallest topic count
    step : Increment between topic counts

    Returns:
    -------
    model_list : List of the trained LDA models
    coherence_values : 'c_v' coherence of each model, in the same order
    """
    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        candidate = gensim_LdaModel(corpus=corpus, num_topics=topic_count, id2word=dictionary, random_state=0)
        model_list.append(candidate)

        scorer = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [215]:
def gensim_display_topics(lda_model, num_top_words):
    """
    Print every topic of a Gensim LDA model with its `num_top_words` top words.

    One 'Topic <id>: <formatted words>' entry is printed per topic, each followed
    by a blank line. Nothing is returned.
    """
    all_topics = lda_model.show_topics(
        num_topics=lda_model.num_topics,
        num_words=num_top_words,
        formatted=True,
    )
    for topic_id, description in all_topics:
        print(f"Topic {topic_id}: {description}\n")

In [216]:
# this function is not used as we use sklearn instead of gensim
def search_best_num_topics(posts, start = 1, stop = 40, step = 1):
    """
    Train Gensim LDA models over a range of topic counts and score their coherence.

    Parameters
    ----------
    posts : iterable of str
        Preprocessed posts; each is split on whitespace into tokens.
    start, stop, step : int
        Topic counts to try, as in ``range(start, stop, step)``.

    Returns
    -------
    tuple
        ``(model_list, coherence_values)`` as returned by compute_coherence_values.
    """
    print('getting the id2word and corpus')
    data_words = [post.split() for post in posts]

    id2word = corpora.Dictionary(data_words)

    # Drop tokens that appear in fewer than 30 documents (absolute count) or in more
    # than 80% of the documents (fraction), then keep the 10000 most frequent tokens.
    # `no_below` is an absolute document count: the previous value 0.02 filtered
    # nothing. 30 matches train_lda_model_gensim.
    id2word.filter_extremes(no_below=30, no_above=0.8, keep_n=10000)

    # Term-document frequency corpus
    corpus = [id2word.doc2bow(text) for text in data_words]

    print('calculate the LDA model and corresponding coherence score')
    # The upper bound is this function's `stop` parameter. The original passed an
    # undefined name `limit`, which raised NameError.
    model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words,
                                                            start=start, limit=stop, step=step)

    return(model_list, coherence_values)

In [217]:
def search_for_best_esti(year, start_num, end_num, step, iters: list):
    """
    Grid-search the number of topics and iterations of scikit-learn LDA for one year.

    Parameters
    ----------
    year : int
        Year whose './data/<year>_iphone.csv' file is loaded.
    start_num, end_num, step : int
        Candidate ``n_components`` values, as in ``range(start_num, end_num, step)``.
    iters : list
        Candidate ``max_iter`` values.

    Returns
    -------
    GridSearchCV
        The fitted grid search object.
    """
    data_path = './data/' + str(year) + '_iphone.csv'
    df = pd.read_csv(data_path, dtype=data_type, header = 0)
    df = remove_escape_seq(df)
    df = convert_create_utc(df)
    grouped_posts = aggregate_post_by_link_id(df)
    posts = nlp_data_process(grouped_posts.body)

    # Only the document-term matrix of this call is used below. The topic count must
    # be passed by keyword: the second positional parameter of train_lda_model is
    # `vectorizer`, not `num_topics`.
    _, dtm, _ = train_lda_model(posts, num_topics=6)

    # Define Search Param
    search_params = {'n_components': list(range(start_num, end_num, step)), 'max_iter': iters}

    # Init the Model (seeded, like the models trained elsewhere in this notebook,
    # so repeated searches are comparable)
    lda_model = LatentDirichletAllocation(n_jobs=-1, random_state=0)

    # Init Grid Search Class
    model = GridSearchCV(lda_model, param_grid=search_params, verbose=4)

    # Do the Grid Search
    model.fit(dtm)
    return(model)

In [218]:
def train_lda_of_year(year: int, save_path = '', save = False, tf_idf = False, version = 'v1'):
    """
    Run the full pipeline for one year: load, clean, aggregate, and fit LDA.

    Parameters
    ----------
    year : int
        Year to process; './data/<year>_iphone_<version>.csv' is read.
    save_path : str
        Path prefix for the outputs. When non-empty, the topic words are written to
        '<save_path>_topics.txt'. When empty, no topics file is written.
    save : bool
        When True, also pickle the model artefacts next to `save_path`.
    tf_idf : bool
        Forwarded to train_lda_model (TF-IDF instead of raw counts).
    version : str
        Dataset version suffix used in the input and output file names.

    Returns
    -------
    tuple
        ``(vectorizer, dtm, lda)``.

    Side effects
    ------------
    Always writes './data/<year>_aggregated_posts_<version>.csv'.
    """
    # load the data
    path_data = f'./data/{str(year)}_iphone_{version}.csv'

    # process data
    print(f'processing the data of year {str(year)}')
    df = pd.read_csv(path_data, dtype=data_type, header = 0)
    df = remove_escape_seq(df)
    df = combine_special_words(df)
    df = convert_create_utc(df)
    grouped_posts = aggregate_post_by_link_id(df)
    grouped_posts[['link_id', 'body']].to_csv(f'./data/{year}_aggregated_posts_{version}.csv')
    posts = nlp_data_process(grouped_posts.body)

    # Grid-search results (from search_for_best_esti) that the values below started from:
    #   2009 (2008): topic = 2,  max_iter = 20
    #   2011:        topic = 4,  max_iter = 40
    #   2013 (2012): topic = 6,  max_iter = 40
    #   2015 (2014): topic = 12, max_iter = 60
    #   2016:        topic = 18, max_iter = 80
    #   2017 (2018, 2019): topic = 24, max_iter = 90
    # Those results were not good enough, so the topic counts were increased by hand.
    # year -> (num_topics, num_iter); any year not listed uses (21, 80).
    params_by_year = {
        2008: (1, 10),
        2009: (9, 20),
        2010: (9, 20),
        2011: (10, 40),
        2012: (12, 40),
        2013: (12, 40),
        2014: (17, 60),
        2015: (16, 60),
        2016: (20, 80),
        2017: (24, 90),
        2018: (24, 90),
    }
    num_topics, num_iter = params_by_year.get(year, (21, 80))

    print(f'start training the model')
    (vectorizer, dtm, lda) = train_lda_model(posts, num_topics=num_topics, num_iter=num_iter, tf_idf=tf_idf)

    # Document-topic distribution for the training documents
    doc_topic_matrix = lda.transform(dtm)

    print(f'printing the results')
    # With an empty prefix the old code still wrote a stray '_topics.txt' into the
    # working directory; pass '' through so display_topics skips the file.
    topics_path = save_path + '_topics.txt' if save_path != '' else ''
    display_topics(lda, vectorizer.get_feature_names_out(), 30, save_path = topics_path)

    if save == True:
        with open(save_path + '.pkl', 'wb') as fout:
            pickle.dump((vectorizer, dtm, lda), fout)
        with open(save_path + '_dtm' + '.pkl', 'wb') as fout:
            pickle.dump(dtm, fout)
        with open(save_path + '_doc_topic_matrix.pkl', 'wb') as fout:
            pickle.dump(doc_topic_matrix, fout)

    return(vectorizer, dtm, lda)

In [219]:
def train_lda_of_whole_date(version = 'v1'):
    """
    Train and save one LDA model per year, for 2008 through 2019.

    Each year is handled by train_lda_of_year with raw counts (tf_idf=False) and
    saving enabled; outputs use the prefix './data/lda_model_<year>_<version>'.
    """
    first_year, last_year = 2008, 2019
    for year in range(first_year, last_year + 1):
        output_prefix = f'./data/lda_model_{year}_{version}'
        train_lda_of_year(year, output_prefix, save = True, tf_idf = False, version=version)

In [220]:
# comment it out as we already process the data
# train_lda_of_whole_date('v2')

In [221]:
def convert_topics_to_csv_with_pandas(year, version):
    """
    Load one year's topics file into a DataFrame.

    Reads './data/lda_model_<year>_<version>_topics.txt', whose lines look like
    'Topic <index>: <comma-separated keywords>'.

    Parameters
    ----------
    year : int
        Year of the topics file; also stored in the 'Year' column.
    version : str
        Dataset version suffix used in the file name.

    Returns
    -------
    pd.DataFrame
        One row per topic with the columns 'Year', 'Topic Index' (as text),
        'Keywords', 'Topic Category' and 'Topic Story' (the last two empty).
        Despite the function name, nothing is written to disk here.
    """
    txt_file_path = f'./data/lda_model_{year}_{version}_topics.txt'
    columns = ['Year', 'Topic Index', 'Keywords', 'Topic Category', 'Topic Story']
    topics_data = []

    with open(txt_file_path, 'r') as txt_file:
        for raw_line in txt_file:
            line = raw_line.strip()
            if not line:
                # blank lines used to raise IndexError
                continue

            # Split on the first ': ' only, so keywords that contain ': ' stay whole.
            header, _, keywords = line.partition(': ')
            # header is 'Topic X' (or 'Topic X:' when the keyword list is empty)
            topic_index = header.rstrip(':').split(' ')[1]

            topics_data.append({
                'Year': year,
                'Topic Index': topic_index,
                'Keywords': keywords,
                'Topic Category': '',  # Empty for now
                'Topic Story': ''      # Empty for now
            })

    # Explicit columns keep the schema stable even when the file has no topics.
    return pd.DataFrame(topics_data, columns=columns)

In [222]:
def convert_all_topics(data_path, version = 'v1'):
    """
    Combine the topics of every year (2008-2019) into one CSV file.

    Parameters
    ----------
    data_path : str
        Destination file. It is written without the index and with ';' as the
        separator.
    version : str
        Dataset version suffix of the per-year topics files.
    """
    yearly_topic_data = [convert_topics_to_csv_with_pandas(y, version) for y in range(2008, 2020)]

    # A single concat instead of growing the frame pairwise inside a loop.
    df = pd.concat(yearly_topic_data, ignore_index=True)

    df.to_csv(data_path, index=False, sep=';')

In [223]:
# convert_all_topics('./data/topics_iphonve_v2.csv', 'v2')