In [15]:
import pandas as pd
import numpy as np
from pathlib import Path
import glob
import os
from datetime import datetime
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from nltk.util import ngrams
import html

## Data processing
### Define the data structure of rows

In [3]:
# is_gold - true/false - If true, will only match if the author has reddit gold. If false, will only match if they do not have gold.
# is_submitter - true/false - (only relevant when checking comments) If true, will only match if the author 
# was also the submitter of the post being commented inside. If false, will only match if they were not.
# send_replies – When True, messages will be sent to the submission author when comments are made to the submission

data_type = {"subreddit": "string", "subreddit_id": "string", "subreddit_type": "string", "author": "string", "body" : "string", 
            "created_date" : "string", "created_utc": "string", "retrieved_on" : "string", 
            "id": "string", "parent_id": "string", "link_id": "string", "score": "int", "total_awards_received": "int", 
            "controversiality": "int", "gilded": "int", 
            "collapsed_because_crowd_control": "int", "collapsed_reason": "string", "distinguished": "string", "removal_reason": "string",
            "author_created_utc": "string", "author_fullname": "string", "author_patreon_flair": "bool", "author_premium": "bool",
            "can_gild": "bool", "can_mod_post": "bool", "collapsed": "bool", "is_submitter": "bool", "_edited": "string", "locked": "bool",
            "quarantined": "bool", "no_follow": "bool", "send_replies": "bool", "stickied": "bool", "author_flair_text": "string"}

### read all the csv files using the defined data structure, remove deleted rows

Some of the rows are marked as "deleted" which indicate the contents had been removed for some reasons. We need to remove then as they do not contribute to our analysis.

In [4]:
# read the data and remove [deleted] row
# we comment out the codes as we already process the data

"""
for x in range(2008, 2020):
    df = pd.read_csv(f'./data/{x}.csv')
    df = df[(df.body != '[deleted]') & (df.body != '[removed]')]
    df.drop(columns = ['Unnamed: 0'], inplace = True)
    df.to_csv(f'./data/{x}_revised.csv')
"""


"\nfor x in range(2008, 2020):\n    df = pd.read_csv(f'./data/{x}.csv')\n    df = df[(df.body != '[deleted]') & (df.body != '[removed]')]\n    df.drop(columns = ['Unnamed: 0'], inplace = True)\n    df.to_csv(f'./data/{x}_revised.csv')\n"

### Identify All Posts Related to iPhone on Reddit

We conducted a review and found that the subreddits **apple** and **iphone** contain the most relevant discussions about the iPhone. However, the 'apple' subreddit also includes posts about other Apple products like Macs, iPod Touch, and iTunes. To ensure relevance, we will filter out these non-iPhone related posts. Our approach assumes that all posts within a single discussion thread focus on the same topic. Therefore, we will retain any thread in the 'apple' subreddit if at least one post within that thread mentions the iPhone."

In [5]:
def locate_iphone_post(data):
    """
    Find the parent_ids that contain discussions about iPhone.
    """
    df_apple = data[data.subreddit == 'apple']
    related_discussion = df_apple[df_apple.body.str.contains('iphone', flags = re.IGNORECASE)]
    return(related_discussion.parent_id.unique())

In [6]:
def extract_all_iphone_post(data):
    ids = locate_iphone_post(data)
    return(data[(data.subreddit == 'iphone') | (data.parent_id.isin(ids))])

In [7]:
# final clean, only retain iPhone related posts each year

def get_iphone_data_yearly(data, filename = None):
    """
    """
    df = extract_all_iphone_post(data)
    df.drop(columns = ['Unnamed: 0'], inplace = True)
    if filename != None:
        df.to_csv(filename)
    return(df)

In [8]:
# comment the following code because we already run it
"""
for x in range(2008, 2020):
    file_name = f'./data/{x}_revised.csv'
    save_path = f'./data/{x}_iphone.csv'
    data = pd.read_csv(file_name, dtype=data_type, header = 0)
    get_iphone_data_yearly(data, filename = save_path)
"""

"\nfor x in range(2008, 2020):\n    file_name = f'./data/{x}_revised.csv'\n    save_path = f'./data/{x}_iphone.csv'\n    data = pd.read_csv(file_name, dtype=data_type, header = 0)\n    get_iphone_data_yearly(data, filename = save_path)\n"

### process the special chars in the comment

It seems that the Reddit comment data are raw text data which means the HTML character entities are preserved. For example, we ofter encounter the following chars in the comment:

1. `&lt;` is an HTML entity for the less-than symbol ("<").
2. `&gt;` is an HTML entity for the greater-than symbol (">").

We need to convert these entities back to their original characters to ensure that the text is correctly interpreted and analyzed. This can usually be done using HTML parsing libraries.

Also, escape sequences such as `\n` and `\'` need to be replaced.

In [109]:
def remove_escape_seq(data):
    """
    """
    while data.body.str.contains('&gt;').sum() > 0:
        data['body'] = data['body'].apply(html.unescape)
        
    # Replace common escape sequences
    escape_sequences = {"\\n": " ", "\\r": " ", "\\t": " ", "\\'": "'", '\\"': '"', '\\b': '', '\\0': ''}
    for seq, replacement in escape_sequences.items():
        data['body'] = data['body'].str.replace(seq, replacement, regex = False)

    return data

LDA

to do

In [159]:
df_2 = pd.read_csv('./data/2009.csv', dtype=data_type, header = 0)
df_trial = df_2
apple_related_comment = df_trial[df_trial.subreddit.isin(['iphone']) & (df_trial.body != '[deleted]')].loc[:, ('parent_id', 'body')]

# Grouping comments by 'parent_id'
apple_related_comment = apple_related_comment.groupby('parent_id')['body'].apply(' '.join).reset_index()

nltk.download('stopwords')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('iphone')
stopwords.append('use')
stopwords.append('people')
stopwords.append('http')
stopwords.append('www')
stopwords.append('com')
stopwords.append('really')
stopwords.append('apple')
stopwords.append('actually')
stopwords.append('thanks')
stopwords.append('thank')
stopwords.append('think')
stopwords.append('phone')
stopwords.append('oh')

# Function to clean text data
def clean_text(text):
    # Lowercasing
    text = text.lower()
    # Removing non-alphabetic characters
    text = re.sub(r'[^a-z0-9]', ' ', text)
    # Tokenization
    words = text.split()
    # Removing stopwords and lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    # Rejoining the words back into a single string
    text = ' '.join(words)
    return text

# Cleaning the comments
apple_related_comment['cleaned_body'] = apple_related_comment['body'].apply(clean_text)

# Displaying the first few rows of cleaned data
apple_related_comment[['body', 'cleaned_body']].head()

# Extracting features for LDA
vectorizer = CountVectorizer(max_df = 0.8, min_df = 10, stop_words = 'english')
dtm = vectorizer.fit_transform(apple_related_comment['cleaned_body'])

# Fitting LDA model
lda = LatentDirichletAllocation(n_components=15, random_state=0)
lda.fit(dtm)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d:" % (topic_idx)] = ", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    return topic_dict

# Displaying the topics
no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'Topic 0:': 'amp, network, like, service, contract, 3g, battery, year, make, know',
 'Topic 1:': 'like, keyboard, want, apps, screen, sm, feature, time, button, make',
 'Topic 2:': 'case, screen, battery, new, contact, like, time, 3g, store, replace',
 'Topic 3:': 'update, post, link, yes, new, reddit, hope, mm, sorry, paste',
 'Topic 4:': 'app, like, apps, developer, comment, reddit, store, gt, make, time',
 'Topic 5:': 'itunes, jailbreak, restore, firmware, jailbreaking, 3gs, jailbroken, file, dev, tool',
 'Topic 6:': 'memory, apps, gt, app, problem, safari, running, jailbroken, run, time',
 'Topic 7:': 'app, apps, 3g, setting, icon, theme, sbsettings, cydia, like, thing',
 'Topic 8:': 'game, app, google, free, like, great, version, 99, fun, play',
 'Topic 9:': 'know, game, time, like, sure, want, good, 10, way, look',
 'Topic 10:': 'plan, data, user, month, photo, carrier, pay, voice, free, year',
 'Topic 11:': 'app, apps, flash, page, store, video, want, site, web, like',
 'Topic 

In [279]:
df_2 = pd.read_csv('./data/2009.csv', dtype=data_type, header = 0)

In [280]:
df_trial = df_2
apple_related_comment = df_trial[df_trial.subreddit.isin(['iphone']) & (df_trial.body != '[deleted]')].loc[:, ('parent_id', 'body')]

# Grouping comments by 'parent_id'
apple_related_comment = apple_related_comment.groupby('parent_id')['body'].apply(' '.join).reset_index()

In [282]:
# Downloading necessary NLTK components
nltk.download('stopwords')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('iphone')
stopwords.append('use')
stopwords.append('people')
stopwords.append('http')
stopwords.append('www')
stopwords.append('com')
stopwords.append('really')
stopwords.append('apple')
stopwords.append('actually')
stopwords.append('thanks')
stopwords.append('thank')
stopwords.append('think')
stopwords.append('phone')
stopwords.append('oh')
stopwords.append('sorry')
stopwords.append('hi')
stopwords.append('even')

tech_term = [ 'app', 'apps', 'iphone', 'phone', 'device', 'mobile', 'screen', 'button', 'battery', 'camera', 'keyboard', 
               'wifi', 'network', 'service', 'call', 'data', 'version', 'update', 'store', 'itunes', 'cydia', 'firmware', 'jailbreak', 'jailbroken', 
               'jailbreaking', 'flash', 'link', 'post', 'code', 'file', 'download', 'sync', 'contact', 'email', 'gps', '3g', '3gs', 'icon', 'theme', 
               'sbsettings', 'winterboard', 'blog', 'dev', 'tool', 'setting', 'settings', 'card', 'memory', 'photo', 'video', 'youtube', 'promo', 
               'promo code', 'gt', 'org', 'pdf', 'mp3', 'ipod', 'itunes', 'mac', 'apple', 'safari', 'google', 'gmail', 'stanza', 'verizon', 
               'customer', 'contract', 'price', 'unlock', 'install', 'price', 'free', 'developer', 'web', 'site', 'page', 'application', 'app store', 'view', 
               'look like', 'touch']

common_term = ['like', 'one', 'get', 'would', 'know', 'think', 'make', 'go', 'want', 'need', 'say', 'come', 'time', 'take', 'see', 'look', 'use', 'really', 
               'good', 'great', 'well', 'still', 'also', 'much', 'could', 'way', 'thing', 'got', 'around', 'first', 'new', 'lot', 'try', 'might', 'even', 
               'something', 'anything', 'everything', 'nothing', 'day', 'year', 'month', 'week', 'today', 'yesterday', 'tomorrow', 'now', 'then', 'here', 
               'there', 'where', 'why', 'how', 'what', 'which', 'who', 'whom', 'amp', 'let', 'put', 'end', 'start', 'seem', 'feel', 'sound', 'look', 'tend', 
               'may', 'might', 'must', 'will', 'shall', 'can', 'could', 'should', 'would', 'did', 'do', 'does', 'done', 'have', 'has', 'had', 'give', 'given', 
               'gave', 'take', 'taken', 'took', 'say', 'said', 'telling', 'tell', 'told', 'go', 'went', 'gone', 'going', 'keep', 'kept', 'keeping', 'seem', 
               'seemed', 'seeming', 'seems', 'become', 'became', 'becomes', 'becoming', 'stay', 'stayed', 'staying', 'stays', 'fall', 'fell', 'fallen', 'falling', 
               'stand', 'stood', 'standing', 'stands', 'become', 'became', 'becoming', 'becomes', 'come', 'came', 'coming', 'comes', 'provide', 'provided', 'provides', 
               'providing', 'include', 'included', 'includes', 'including', 'continue', 'continued', 'continues', 'continuing', 'expect', 'expected', 'expecting', 
               'expects', 'hope', 'hoped', 'hopes', 'hoping', 'appear', 'appeared', 'appearing', 'appears', 'remain', 'remained', 'remaining', 'remains', 'suggest', 
               'suggested', 'suggesting', 'suggests', 'want', 'wanted', 'wanting', 'wants', 'wish', 'wished', 'wishes', 'wishing', 'prefer', 'preferred', 'preferring', 
               'prefers', 'desire', 'desired', 'desires', 'desiring', 'love', 'loved', 'loves', 'loving', 'like', 'liked', 'likes', 'liking', 'admire', 'admired', 
               'admires', 'admiring', 'appreciate', 'appreciated', 'appreciates', 'appreciating', 'value', 'valued', 'values', 'valuing', 'choose', 'chose', 'chosen']

#stopwords.extend(tech_term)
#stopwords.extend(common_term)


# Function to clean text data
def clean_text(text):
    # Lowercasing
    text = text.lower()
    # Removing non-alphabetic characters
    text = re.sub(r'[^a-z0-9%]', ' ', text)
    # Tokenization
    words = text.split()
    # Removing stopwords and lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords]
    # Rejoining the words back into a single string
    text = ' '.join(words)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kalok\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [283]:
# Cleaning the comments
apple_related_comment['cleaned_body'] = apple_related_comment['body'].apply(clean_text)

# Displaying the first few rows of cleaned data
apple_related_comment[['body', 'cleaned_body']].head()

Unnamed: 0,body,cleaned_body
0,That would never fit into any normal pocket. ...,would never fit normal pocket ipod big stretch...
1,You apparently got downvoted for criticizing A...,apparently got downvoted criticizing lame prod...
2,"+1 for you. Relax. One person downvoted him, h...",1 relax one person downvoted 4 big conspiracy
3,It is a big conspiracy. It's reddit.,big conspiracy reddit
4,WORST comes to worst you do a restore and you ...,worst come worst restore working nothing lose


In [284]:
# Function to extract n-grams (bigrams and trigrams)
def extract_ngrams(data, num):
    n_grams = ngrams(data.split(), num)
    return [' '.join(grams) for grams in n_grams]

# Extracting bigrams and trigrams
bigrams = []
trigrams = []
for doc in apple_related_comment['cleaned_body']:
    bigrams.extend(extract_ngrams(doc, 2))
    trigrams.extend(extract_ngrams(doc, 3))

# Counting occurrences of bigrams and trigrams
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

# Displaying the most common bigrams and trigrams
most_common_bigrams = bigram_counts.most_common(10)
most_common_trigrams = trigram_counts.most_common(10)

most_common_bigrams, most_common_trigrams

([('app store', 219),
  ('3 0', 199),
  ('3 1', 134),
  ('look like', 81),
  ('ipod touch', 76),
  ('sound like', 74),
  ('1 2', 72),
  ('battery life', 57),
  ('pretty much', 56),
  ('seems like', 54)],
 [('3 1 2', 52),
  ('webobjects mzstore woa', 27),
  ('mzstore woa wa', 27),
  ('youtube watch v', 26),
  ('woa wa viewsoftware', 26),
  ('wa viewsoftware id', 26),
  ('amp mt 8', 26),
  ('itunes webobjects mzstore', 25),
  ('en wikipedia org', 23),
  ('wikipedia org wiki', 23)])

In [285]:
# Extracting features for LDA
vectorizer = CountVectorizer(ngram_range=(1, 3), max_df = 0.8, min_df = 25)
dtm = vectorizer.fit_transform(apple_related_comment['cleaned_body'])

# Fitting LDA model
lda = LatentDirichletAllocation(n_components=12, random_state=0)
lda.fit(dtm)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d:" % (topic_idx)] = ", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    return topic_dict

# Displaying the topics
no_top_words = 15
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

{'Topic 0:': 'amp, 3g, call, network, data, service, would, year, contract, plan, month, 3gs, make, get, like',
 'Topic 1:': 'game, play, app, work, good, free, get, fun, great, version, like, music, control, time, playing',
 'Topic 2:': 'google, gt, contact, sync, like, know, yes, sure, would, work, one, calendar, way, photo, voice',
 'Topic 3:': 'jailbreak, apps, jailbroken, jailbreaking, cydia, get, itunes, restore, firmware, need, install, app, device, store, 3gs',
 'Topic 4:': 'case, like, screen, button, make, one, keyboard, would, also, look, feel, type, find, great, time',
 'Topic 5:': 'app, store, app store, apps, get, developer, would, time, pretty, buy, good, make, money, like, one',
 'Topic 6:': 'get, see, new, update, got, one, like, mm, used, first, could, please, time, would, guy',
 'Topic 7:': 'apps, app, battery, icon, application, push, theme, background, sbsettings, text, cydia, life, much, like, mail',
 'Topic 8:': 'reddit, comment, app, flash, get, way, back, page,

In [286]:
lda.perplexity(dtm)

623.6987584402699

In [287]:
lda.score(dtm)

-512819.72850376583

In [288]:
# Get topic distribution for each document
topic_distributions = lda.transform(dtm)

# Calculate the weight of each topic
topic_weights = topic_distributions.sum(axis=0) / topic_distributions.sum()

# Print the weight of each topic
for topic_idx, topic_weight in enumerate(topic_weights):
    print(f"Topic {topic_idx}: {topic_weight:.2%}")

Topic 0: 8.51%
Topic 1: 8.05%
Topic 2: 7.41%
Topic 3: 7.71%
Topic 4: 9.43%
Topic 5: 10.94%
Topic 6: 9.26%
Topic 7: 7.12%
Topic 8: 8.88%
Topic 9: 7.96%
Topic 10: 6.38%
Topic 11: 8.35%


In [179]:
lda.transform(dtm)

array([[0.56647282, 0.00666667, 0.00666667, ..., 0.0066667 , 0.00666667,
        0.00666668],
       [0.00512821, 0.00512821, 0.00512821, ..., 0.00512821, 0.00512822,
        0.00512822],
       [0.01666677, 0.01666671, 0.01666669, ..., 0.01666674, 0.01666669,
        0.76666588],
       ...,
       [0.00606063, 0.00606062, 0.00606061, ..., 0.00606062, 0.00606062,
        0.00606062],
       [0.00333334, 0.00333334, 0.00333333, ..., 0.00333334, 0.00333334,
        0.00333335],
       [0.02222227, 0.02222225, 0.02222227, ..., 0.02222223, 0.02222222,
        0.02222225]])

In [104]:
from gensim.models.phrases import Phrases, Phraser

# Creating bigrams and trigrams
bigram = Phrases(apple_related_comment['cleaned_body'], min_count=5, threshold=100)
trigram = Phrases(apple_related_comment['cleaned_body'], threshold=100)

# Phraser for efficiency
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Function to form bigrams and trigrams
def make_ngrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Applying the function to our dataset
ngrams_corpus = make_ngrams(apple_related_comment['cleaned_body'])

# Creating a new dictionary and corpus for LDA with n-grams
dictionary_ngrams = corpora.Dictionary(ngrams_corpus)
dictionary_ngrams.filter_extremes(no_below=20, no_above=0.85)
corpus_ngrams = [dictionary_ngrams.doc2bow(text) for text in ngrams_corpus]

# Applying LDA using Gensim on the data with n-grams
lda_model_with_ngrams = LdaModel(corpus_with_ngrams, num_topics=15, id2word=dictionary_with_ngrams, passes=15)

# Extracting and displaying topics with n-grams
lda_topics_with_ngrams = lda_model_with_ngrams.print_topics(num_words=10)
lda_topics_with_ngrams

[(0,
  '0.007*"rice" + 0.003*"bitching" + 0.003*"toilet" + 0.003*"free app" + 0.003*"parallel" + 0.003*"phone really" + 0.003*"rotated" + 0.002*"nerd" + 0.002*"launching app" + 0.002*"hate spotlight"'),
 (1,
  '0.005*"air" + 0.004*"apt" + 0.004*"regarding" + 0.003*"many people" + 0.003*"dark" + 0.003*"23" + 0.002*"upvote" + 0.002*"geohot" + 0.002*"get work" + 0.001*"furthermore"'),
 (2,
  '0.003*"dollar" + 0.003*"white" + 0.003*"art" + 0.003*"lie" + 0.002*"nsfw" + 0.002*"attempt" + 0.002*"gadget" + 0.002*"find one" + 0.002*"operation" + 0.002*"would great"'),
 (3,
  '0.002*"protection" + 0.002*"know work" + 0.002*"paragraph" + 0.002*"apps without" + 0.001*"advertisement" + 0.001*"download apps" + 0.001*"bigboss" + 0.001*"radio station" + 0.001*"know saurik" + 0.001*"via itunes"'),
 (4,
  '0.020*"iphone" + 0.013*"phone" + 0.012*"app" + 0.009*"like" + 0.009*"one" + 0.008*"would" + 0.008*"get" + 0.008*"use" + 0.008*"apple" + 0.007*"time"'),
 (5,
  '0.006*"rock" + 0.005*"stanza" + 0.005*"h