In [4]:
!pip install zstandard emoji vaderSentiment scikit-learn

Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji
  Downloading emoji-2.11.1-py2.py3-none-any.whl (433 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zstandard, emoji, vaderSentiment
Successfully installed emoji-2.11.1 vaderSentiment-3.3.2 zstandard-0.22.0


In [5]:
import pandas as pd
import zstandard as zstd
import re
import emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.preprocessing import QuantileTransformer
import numpy as np
import gc

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
def clean_text(text):
    # remove emojis
    text = emoji.demojize(text, delimiters=("", ""))

    # remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+\.(com|net|org)\S*', '', text)
    # remove markdown
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # remove references to other subreddits
    text = re.sub(r'@\w+|r/\w+', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s.,!?:;-]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [5]:
def normalize_scores(dataframe):
    #  apply rank gauss normalization
    scaler = QuantileTransformer(
        output_distribution='normal', n_quantiles=min(len(dataframe), 1000))

    normalized_data = scaler.fit_transform(dataframe[['score']])

    dataframe['normalized_score'] = normalized_data

    return dataframe

In [6]:
# Keyword list relevant to China
keywords = [
    "China", "Chinese", "Yuan", "¥", "Renminbi", "CNY", "RMB", "Beijing", "Shanghai", "Shenzhen", "People's Currency", "元", "圆"
    "Hong Kong", "CCP", "Xi Jinping", "Belt and Road", "Silk Road",
    "Taiwan", "Trade War", "Sino",
    "PBOC", "Alibaba", "Tencent", "Baidu", "Weibo", "Huawei", "ByteDance", "ZTE", "Sinopec", "Haier", "Geely", "Xiaomi", "Hang Seng",
    "Mandarin", "Cantonese", "Sichuan", "Fujian",
    "Guangdong", "One Belt One Road", "AIIB"
]


def filter_keywords(text, keywords):
    return any(keyword.lower() in text.lower() for keyword in keywords)

In [7]:
analyzer = SentimentIntensityAnalyzer()


def get_vader_score(text):
    return analyzer.polarity_scores(text)['compound']

# r/FOREX

## Posts

In [37]:
#forex_posts_path = "/content/drive/My Drive/CIS-5190-Project/Code/raw/Forex_submissions.zst"
forex_posts_path = "/content/drive/My Drive/Forex_submissions.zst"

with open(forex_posts_path, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    decompressed_data = reader.read().decode('utf-8')

In [38]:
forex_posts = pd.read_json(decompressed_data, lines=True)

In [39]:
forex_posts['date'] = pd.to_datetime(forex_posts['created_utc'], unit='s')

In [40]:
forex_posts.shape[0]

94495

In [41]:
forex_posts = forex_posts[['subreddit', 'id',
                           'selftext', 'title', 'date', 'score']]

In [42]:
forex_posts = forex_posts[forex_posts['score'] >= 0]

In [43]:
# concatenate the body to the title
forex_posts['text'] = forex_posts.apply(
    lambda row: row['title'] + (': ' + row['selftext'] if row['selftext'] else ''), axis=1)

In [44]:
forex_posts['related_to_china'] = forex_posts['text'].apply(
    filter_keywords, keywords=keywords)
forex_posts = forex_posts[forex_posts['related_to_china']]

forex_posts.shape[0]

682

In [45]:
forex_posts['cleaned_body'] = forex_posts['text'].apply(clean_text)

In [46]:
forex_posts['vader_score'] = forex_posts['cleaned_body'].apply(get_vader_score)

In [47]:
forex_posts = normalize_scores(forex_posts)

In [48]:
forex_posts['weighted_vader'] = forex_posts['vader_score'] * \
    forex_posts['normalized_score']

In [49]:
forex_posts['weighted_vader']

245     -0.000000
404     -0.238959
502     -0.000000
507     -0.261850
509     -0.000000
           ...   
91739   -0.373830
91937    0.144065
92571   -1.051826
92944   -1.539004
93842   -4.692922
Name: weighted_vader, Length: 682, dtype: float64

In [50]:
forex_posts['vader_score'].describe()

count    682.000000
mean       0.119478
std        0.611240
min       -0.998500
25%       -0.352700
50%        0.000000
75%        0.708325
max        0.999800
Name: vader_score, dtype: float64

In [51]:
forex_posts['date'].describe()

count                              682
mean     2017-12-29 15:14:56.197946880
min                2010-01-21 15:40:27
25%      2015-09-08 05:01:22.750000128
50%                2018-04-17 17:41:42
75%                2020-05-28 12:37:58
max                2022-12-18 20:43:12
Name: date, dtype: object

In [52]:
forex_post_ids = set(forex_posts['id'])


In [53]:
forex_posts = forex_posts[['date', 'weighted_vader']]

In [None]:
forex_posts.to_csv('/content/drive/My Drive/CIS-5190-Project/Code/cleaned/forex_posts.csv', index=False)

In [None]:
del forex_posts
gc.collect()


NameError: name 'forex_posts' is not defined

## Comments

In [None]:
forex_comments_path = "/content/drive/My Drive/CIS-5190-Project/Code/raw/Forex_comments.zst"

with open(forex_comments_path, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    decompressed_data = reader.read().decode('utf-8')

In [None]:
forex_comments = pd.read_json(decompressed_data, lines=True)

In [None]:
forex_comments.shape[0]

In [None]:
forex_comments['link_id'] = forex_comments['link_id'].apply(lambda x: x[3:])


In [None]:
forex_comments = forex_comments[['subreddit', 'score', 'body', 'parent_id', 'created_utc']]
forex_comments.shape[0]

In [None]:
forex_comments = forex_comments[forex_comments['score'] >= 0]
forex_comments.shape[0]

In [None]:
forex_comments

In [None]:
forex_comments['related_to_china'] = forex_comments['body'].apply(
    filter_keywords, keywords=keywords)

forex_comments = forex_comments[(forex_comments['related_to_china']) | (forex_comments['parent_id'].isin(post_ids))]

forex_comments.shape[0]

In [None]:
forex_comments.head()

In [None]:
print(forex_comments.shape[0])

In [None]:
forex_comments['body'] = forex_comments['body'].apply(clean_text)

# r/China

In [None]:
#china_path = "/content/drive/My Drive/CIS-5190-Project/Code/raw/China_submissions.zst"
china_path = "/content/drive/My Drive/China_submissions.zst"


with open(china_path, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    decompressed_data = reader.read().decode('utf-8')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
china_posts = pd.read_json(decompressed_data, lines=True)

In [None]:
china_posts['date'] = pd.to_datetime(china_posts['created_utc'], unit='s')

In [None]:
china_posts.shape[0]

225916

In [None]:
china_posts = china_posts[['subreddit', 'id',
                           'selftext', 'title', 'date', 'score']]

In [None]:
china_posts = china_posts[china_posts['score'] >= 0]

In [None]:
# concatenate the body to the title
china_posts['text'] = china_posts.apply(
    lambda row: row['title'] + (': ' + row['selftext'] if row['selftext'] else ''), axis=1)

In [None]:
china_posts['cleaned_body'] = china_posts['text'].apply(clean_text)

In [None]:
china_posts['vader_score'] = china_posts['cleaned_body'].apply(get_vader_score)

In [None]:
china_posts = normalize_scores(china_posts)

In [55]:
forex_posts

Unnamed: 0,date,weighted_vader
245,2010-01-21 15:40:27,-0.000000
404,2010-04-07 20:59:12,-0.238959
502,2010-06-22 02:16:33,-0.000000
507,2010-06-24 19:07:33,-0.261850
509,2010-06-25 20:36:11,-0.000000
...,...,...
91739,2022-10-29 23:53:48,-0.373830
91937,2022-11-03 12:35:43,0.144065
92571,2022-11-17 08:42:56,-1.051826
92944,2022-11-27 14:10:51,-1.539004


In [56]:
china_posts

Unnamed: 0,subreddit,id,selftext,title,date,score,text,cleaned_body,vader_score,normalized_score
0,China,66nlq,,QQ和QQ堂,2008-01-26 02:23:12,1,QQ和QQ堂,qqqq,0.0000,-0.494971
1,China,66uad,,ZDT: Chinese learning tool,2008-01-28 10:22:16,5,ZDT: Chinese learning tool,zdt: chinese learning tool,0.0000,0.336178
2,China,675k3,,"Before Olympic Games, China quells dissent",2008-01-30 21:20:16,2,"Before Olympic Games, China quells dissent","before olympic games, china quells dissent",0.0000,-0.023839
3,China,67c15,,List of the best Chinese learning tools,2008-02-01 10:06:36,3,List of the best Chinese learning tools,list of the best chinese learning tools,0.6369,0.135911
4,China,67te0,,"Universal, Sony BMG, and Warner Sue Baidu",2008-02-06 05:57:46,1,"Universal, Sony BMG, and Warner Sue Baidu","universal, sony bmg, and warner sue baidu",0.0000,-0.494971
...,...,...,...,...,...,...,...,...,...,...
225911,China,1005l0u,,Wangzhihe Soyabean cube. Ever try it?,2022-12-31 22:40:20,4,Wangzhihe Soyabean cube. Ever try it?,wangzhihe soyabean cube. ever try it?,0.0000,0.245840
225912,China,10062wd,,"Viral NetEase 2022 Year in Review, deleted wit...",2022-12-31 23:05:33,23,"Viral NetEase 2022 Year in Review, deleted wit...","viral netease 2022 year in review, deleted wit...",0.0000,1.061884
225913,China,10063ju,,my chill spot in chongqing [20:54],2022-12-31 23:06:30,8,my chill spot in chongqing [20:54],my chill spot in chongqing 20:54,0.0000,0.551076
225914,China,1006bt5,I'd like to send the GM of a factory in China ...,I'd like to send the GM of a factory in China ...,2022-12-31 23:18:22,0,I'd like to send the GM of a factory in China ...,id like to send the gm of a factory in china i...,0.9195,-5.199338


In [57]:
china_posts['weighted_vader'] = china_posts['vader_score'] * \
    china_posts['normalized_score']

In [58]:
china_posts['weighted_vader']

0        -0.000000
1         0.000000
2        -0.000000
3         0.086562
4        -0.000000
            ...   
225911    0.000000
225912    0.000000
225913    0.000000
225914   -4.780791
225915   -4.956009
Name: weighted_vader, Length: 225916, dtype: float64

In [59]:
china_posts['vader_score'].describe()

count    225916.00000
mean          0.04227
std           0.45890
min          -1.00000
25%          -0.25000
50%           0.00000
75%           0.38020
max           1.00000
Name: vader_score, dtype: float64

In [60]:
china_posts['date'].describe()

count                           225916
mean     2018-09-10 04:39:53.121483776
min                2008-01-26 02:23:12
25%                2016-09-10 07:18:16
50%         2019-05-19 05:56:56.500000
75%      2020-11-27 17:22:08.249999872
max                2022-12-31 23:39:49
Name: date, dtype: object

In [61]:
china_posts = china_posts[['date', 'weighted_vader']]

In [62]:
#china_posts.to_csv('/content/drive/My Drive/CIS-5190-Project/Code/cleaned/china.csv', index=False)
china_posts.to_csv('/content/drive/My Drive/china.csv', index=False)

In [63]:
del china_posts
gc.collect()


8

# r/News

In [6]:
#news_path = "/content/drive/My Drive/CIS-5190-Project/Code/raw/news_submissions.zst"
news_path = "/content/drive/My Drive/news_submissions.zst"

with open(news_path, 'rb') as fh:
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(fh)
    decompressed_data = reader.read().decode('utf-8')

In [None]:
news = pd.read_json(decompressed_data, lines=True)

In [None]:
news['date'] = pd.to_datetime(news['created_utc'], unit='s')

In [None]:
news.shape[0]

In [None]:
news = news[['subreddit', 'id',
                           'selftext', 'title', 'date', 'score']]

In [None]:
news = news[news['score'] >= 0]

In [None]:
# concatenate the body to the title
news['text'] = news.apply(
    lambda row: row['title'] + (': ' + row['selftext'] if row['selftext'] else ''), axis=1)

In [None]:
news['related_to_china'] = news['text'].apply(
    filter_keywords, keywords=keywords)
news = news[news['related_to_china']]

news.shape[0]

In [None]:
news['cleaned_body'] = news['text'].apply(clean_text)

In [None]:
news['vader_score'] = news['cleaned_body'].apply(get_vader_score)

In [None]:
news = normalize_scores(news)

In [None]:
news['weighted_vader'] = news['vader_score'] * \
    news['normalized_score']

In [None]:
news['weighted_vader']

In [None]:
news['vader_score'].describe()

count    682.000000
mean       0.119478
std        0.611240
min       -0.998500
25%       -0.352700
50%        0.000000
75%        0.708325
max        0.999800
Name: vader_score, dtype: float64

In [None]:
news['date'].describe()

count                              682
mean     2017-12-29 15:14:56.197946880
min                2010-01-21 15:40:27
25%      2015-09-08 05:01:22.750000128
50%                2018-04-17 17:41:42
75%                2020-05-28 12:37:58
max                2022-12-18 20:43:12
Name: date, dtype: object

In [None]:
news = forex_posts[['date', 'weighted_vader']]

In [None]:
news.to_csv('/content/drive/My Drive/CIS-5190-Project/Code/cleaned/news.csv', index=False)

In [None]:
del news
gc.collect()


NameError: name 'forex_posts' is not defined