In [1]:
!pip install nltk



In [2]:
import re
import unicodedata
import string
import pandas as pd
import nltk
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

Load data file with header

In [3]:
dataset = pd.read_csv('MM5427_COVID-19_Tweets_2.csv', encoding='latin-1', header=0)
dataset.text = [row.encode('latin-1').decode('utf-8', 'ignore') for row in dataset.text]

# 1.I

Construct a list acc_tag that contain all account tags for each tweet

In [4]:
acc_tag = []
for text in dataset['text']:
    # split the text by whitespace
    text_content = re.split(r'\s|[(),.;!?/\'"]', text)
    # filter the list of items that starts with "@" and append to the acc_tag list
    acc_tag.append(list(filter(lambda x: x.startswith('@') and len(x) > 1, text_content)))
# add the acc_tag as the new column
dataset['acc_tag'] = acc_tag

Do the same for hashtag and URL

In [5]:
hashtag = []
for text in dataset['text']:
    text_content = re.split(r'\s|[(),.;!?/\'"]', text)
    hashtag.append(list(filter(lambda x: x.startswith('#') and len(x) > 1, text_content)))
dataset['hashtag'] = hashtag

In [6]:
URL = []
for text in dataset['text']:
    text_content = re.split(r'\s', text)
    URL.append(list(filter(lambda x: x.startswith('http'), text_content)))
dataset['URL'] = URL

In [7]:
dataset.head()

Unnamed: 0,date,time,user_id,user_follower_count,user_like_count,user_friend_count,user_media_count,user_post_count,user_list_count,user_verified,...,user_account_type,user_account_age,reply_count,like_count,retweet_count,quote_count,text,acc_tag,hashtag,URL
0,1/1/2020,10:05:36,1.2012e+18,524,889,425,441,1644,0,0,...,1,736,1,6,8,3,@AlwayACritic @DariusVolket @ZubSpike @AusMaze...,"[@AlwayACritic, @DariusVolket, @ZubSpike, @Aus...","[#China, #SARS, #chinesevirus]",[]
1,1/1/2020,2:02:52,1380602000.0,102,495,864,10,136,0,0,...,0,58587,0,0,0,0,Can’t drink can’t smoke wonderful way to start...,[],[#flu],[]
2,1/1/2020,2:13:03,8.24622e+17,3269,2131,5001,130,1933,20,0,...,1,25668,0,1,0,0,Great use of medical #science in the drama nar...,[@bbcradio4],"[#science, #Flu]",[https://t.co/AXHQi7wwgi]
3,1/1/2020,16:11:05,2749415000.0,767,3079,83,156,11499,13,0,...,1,47040,0,0,1,0,SARS is back! In one of the most heavily traff...,[],"[#Wuhan, #China, #SARS, #HongKong]",[https://t.co/iF4sS1WxMf]
4,1/1/2020,19:01:09,433035800.0,7645,9290,984,950,37606,21,0,...,1,70673,0,2,0,0,Tim and I spent #NYE playing #Pandemic togethe...,[],"[#NYE, #Pandemic]",[https://t.co/mNHuuvXA9Q]


# 1.II

In [8]:
# put the acc_tag sublist into one single list
acc_tag_single_list = [item for sublist in acc_tag for item in sublist]
# count the number of each acc in the list
acc_counter = Counter(acc_tag_single_list)
# get the top 10 countered acc
top_10_acc = acc_counter.most_common(10)
print(top_10_acc)

[('@realDonaldTrump', 812), ('@WHO', 747), ('@CDCgov', 321), ('@narendramodi', 269), ('@DrTedros', 153), ('@BorisJohnson', 144), ('@POTUS', 143), ('@PMOIndia', 140), ('@MoHFW_INDIA', 114), ('@CNN', 97)]


The result shows that country leaders and international health organizations are the most tagged.
This may reflect people seeking information, express opinion or emotion towards authority

In [9]:
# Do the same for hashtag and URL
hashtag_single_list = [item for sublist in hashtag for item in sublist]
hash_counter = Counter(hashtag_single_list)
top_10_hash = hash_counter.most_common(10)
print(top_10_hash)

[('#coronavirus', 16760), ('#COVID19', 14314), ('#Coronavirus', 3922), ('#CoronavirusOutbreak', 3359), ('#COVID2019', 2938), ('#CoronavirusPandemic', 2108), ('#covid19', 1594), ('#CoronaVirus', 1490), ('#CoronaVirusUpdate', 1477), ('#Corona', 1337)]


All 10 hashtags are related to COVID

In [10]:
URL_single_list = [item for sublist in URL for item in sublist]
URL_counter = Counter(URL_single_list)
top_10_URL = URL_counter.most_common(10)
print(top_10_URL)

[('https://t.co/Fbzw6mR9Q5', 12), ('http', 12), ('https://t.co/nFY1lZJJ2I', 12), ('https:/', 11), ('https://t.c', 11), ('https://t.co/vY4fVgAjuk', 10), ('https://t.', 9), ('https://t.co/huLTzc781F', 7), ('https://t.co/', 7), ('https://t.co', 6)]


Those URL are mainly news related. Showing people wants to spread the information in the social media

In [11]:
# Define a function for removing punctuation
def remove_punctuation(input_string):
    # Create a translation table mapping punctuation characters to empty string
    translator = str.maketrans('', '', string.punctuation + '‘’“”–•・❝❞')
    # Transform the full-width characters to half-with characters
    normalized_text = unicodedata.normalize('NFKC', input_string)
    # Remove punctuation using the translation table
    no_punct = normalized_text.translate(translator)
    return no_punct

In [12]:
# Use the result in 1.I to do the removal
processed_text = []
delimiter = ' '
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
for i in range(dataset.shape[0]):
    # get the data of the row
    text = dataset['text'][i]
    acc_tag = dataset['acc_tag'][i]
    hashtag = dataset['hashtag'][i]
    URL = dataset['URL'][i]
    # remove account tags
    for tag1 in acc_tag:
        text = text.replace(tag1, '')
    # remove hashtag
    for tag2 in hashtag:
        text = text.replace(tag2, '')
    # remove URL
    for tag3 in URL:
        text = text.replace(tag3, '')
    # remove punctuations by above function
    text = remove_punctuation(text)
    # tokenize the text, remove the stop words and join the tokens to sentence again
    tokens = word_tokenize(text)
    processed_text.append(delimiter.join([w for w in tokens if not w in stop_words]))
dataset['processed_text'] = processed_text

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
dataset.head()

Unnamed: 0,date,time,user_id,user_follower_count,user_like_count,user_friend_count,user_media_count,user_post_count,user_list_count,user_verified,...,user_account_age,reply_count,like_count,retweet_count,quote_count,text,acc_tag,hashtag,URL,processed_text
0,1/1/2020,10:05:36,1.2012e+18,524,889,425,441,1644,0,0,...,736,1,6,8,3,@AlwayACritic @DariusVolket @ZubSpike @AusMaze...,"[@AlwayACritic, @DariusVolket, @ZubSpike, @Aus...","[#China, #SARS, #chinesevirus]",[],Lets welcome new year decade exciting Im tryin...
1,1/1/2020,2:02:52,1380602000.0,102,495,864,10,136,0,0,...,58587,0,0,0,0,Can’t drink can’t smoke wonderful way to start...,[],[#flu],[],Cant drink cant smoke wonderful way start 2020
2,1/1/2020,2:13:03,8.24622e+17,3269,2131,5001,130,1933,20,0,...,25668,0,1,0,0,Great use of medical #science in the drama nar...,[@bbcradio4],"[#science, #Flu]",[https://t.co/AXHQi7wwgi],Great use medical drama narrative 15 Minute Dr...
3,1/1/2020,16:11:05,2749415000.0,767,3079,83,156,11499,13,0,...,47040,0,0,1,0,SARS is back! In one of the most heavily traff...,[],"[#Wuhan, #China, #SARS, #HongKong]",[https://t.co/iF4sS1WxMf],SARS back In one heavily trafficked airports w...
4,1/1/2020,19:01:09,433035800.0,7645,9290,984,950,37606,21,0,...,70673,0,2,0,0,Tim and I spent #NYE playing #Pandemic togethe...,[],"[#NYE, #Pandemic]",[https://t.co/mNHuuvXA9Q],Tim I spent playing together FaceTime We one t...


# 1.IV

In [14]:
lower_text = []
for i in range(dataset.shape[0]):
    # apply lowercase for each row
    lower_text.append(dataset['processed_text'][i].lower())
dataset['processed_text'] = lower_text

# 1.V

In [15]:
stem_text = []
delimiter = ' '
stemmer = PorterStemmer()
for i in range(dataset.shape[0]):
    stem_token = []
    # tokenize the text and apply stemming for each token
    for tokens in word_tokenize(dataset['processed_text'][i]):
        stem_token.append(stemmer.stem(tokens))
    stem_text.append(delimiter.join(stem_token))
dataset['stemmed_text'] = stem_text

In [16]:
dataset.head()

Unnamed: 0,date,time,user_id,user_follower_count,user_like_count,user_friend_count,user_media_count,user_post_count,user_list_count,user_verified,...,reply_count,like_count,retweet_count,quote_count,text,acc_tag,hashtag,URL,processed_text,stemmed_text
0,1/1/2020,10:05:36,1.2012e+18,524,889,425,441,1644,0,0,...,1,6,8,3,@AlwayACritic @DariusVolket @ZubSpike @AusMaze...,"[@AlwayACritic, @DariusVolket, @ZubSpike, @Aus...","[#China, #SARS, #chinesevirus]",[],lets welcome new year decade exciting im tryin...,let welcom new year decad excit im tri upbeat ...
1,1/1/2020,2:02:52,1380602000.0,102,495,864,10,136,0,0,...,0,0,0,0,Can’t drink can’t smoke wonderful way to start...,[],[#flu],[],cant drink cant smoke wonderful way start 2020,cant drink cant smoke wonder way start 2020
2,1/1/2020,2:13:03,8.24622e+17,3269,2131,5001,130,1933,20,0,...,0,1,0,0,Great use of medical #science in the drama nar...,[@bbcradio4],"[#science, #Flu]",[https://t.co/AXHQi7wwgi],great use medical drama narrative 15 minute dr...,great use medic drama narr 15 minut drama my l...
3,1/1/2020,16:11:05,2749415000.0,767,3079,83,156,11499,13,0,...,0,0,1,0,SARS is back! In one of the most heavily traff...,[],"[#Wuhan, #China, #SARS, #HongKong]",[https://t.co/iF4sS1WxMf],sars back in one heavily trafficked airports w...,sar back in one heavili traffick airport world...
4,1/1/2020,19:01:09,433035800.0,7645,9290,984,950,37606,21,0,...,0,2,0,0,Tim and I spent #NYE playing #Pandemic togethe...,[],"[#NYE, #Pandemic]",[https://t.co/mNHuuvXA9Q],tim i spent playing together facetime we one t...,tim i spent play togeth facetim we one turn sp...


# 1.VI

Finding the emoji. I notice there are some emoji stuck together. 

I tried both considering it to be one emoji and multiple emoji

In [17]:
emoticons1 = []
emoticons2 = []
for i in range(dataset.shape[0]):
    text = dataset['stemmed_text'][i]
    # Consider consecutive emoji as one emoji
    emoticons1.append(re.findall('[\U0001F600-\U0001F64F]+', text))
    # Consider consecutive emoji as multiple emoji
    emoticons_multi = re.finditer('[\U0001F600-\U0001F64F]', text)
    for emoji in emoticons_multi:
        emoticons2.append(emoji.group())
# create single list of emoji for counting        
emoticons_single_list1 = [item for sublist in emoticons1 for item in sublist]
emoticons_counter1 = Counter(emoticons_single_list1)
top_3_emoticons1 = emoticons_counter1.most_common(3)
print(top_3_emoticons1)

emoticons_counter2 = Counter(emoticons2)
top_3_emoticons2 = emoticons_counter2.most_common(3)
print(top_3_emoticons2)

[('🙏', 560), ('😷', 346), ('😂', 242)]
[('🙏', 757), ('😂', 602), ('😷', 462)]


The top 3 results are the same but in different order.

It shows people are likely to use multiple 😂 to express themselves

# 2.I

Two models for Count vectorizer

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_count_reply = CountVectorizer(max_df=0.5, min_df=0.02)
vectorizer_count_like = CountVectorizer(max_df=0.5, min_df=0.02)

# 2.IIa

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [20]:
dataset['reply_count'].describe()

count    51221.000000
mean         8.971984
std        103.724886
min          0.000000
25%          0.000000
50%          1.000000
75%          4.000000
max       8221.000000
Name: reply_count, dtype: float64

In [21]:
dataset.groupby('reply_count').size()

reply_count
0       20276
1        9713
2        5263
3        3112
4        2138
        ...  
5550        1
6935        1
7141        1
7649        1
8221        1
Length: 405, dtype: int64

As the data have long tail, I try to keep 99% of the data and exclude the remaining

In [22]:
dataset2 = dataset[dataset['reply_count'] <= dataset['reply_count'].quantile(0.99)]
content = dataset2['stemmed_text'].values
y = dataset2['reply_count'].values

content_train, content_test, y_train, y_test = train_test_split(content, y, test_size=0.2, random_state=99)

vectorizer_count_reply.fit(content)

X = vectorizer_count_reply.transform(content).toarray()
X_train = vectorizer_count_reply.transform(content_train).toarray()
X_test = vectorizer_count_reply.transform(content_test).toarray()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 108.31
Training R square: 0.010701
Testing MSE: 110.75
Testing R square: 0.012406


# 2.IIb

In [23]:
dataset['like_count'].describe()

count     51221.000000
mean        147.968099
std        4008.755911
min           0.000000
25%           5.000000
50%          16.000000
75%          45.000000
max      717503.000000
Name: like_count, dtype: float64

In [24]:
dataset.groupby('like_count').size()

like_count
0         3856
1         2806
2         2193
3         1608
4         1554
          ... 
81190        1
152986       1
196017       1
428200       1
717503       1
Length: 1596, dtype: int64

As the data have long tail, I try to keep 99% of the data and exclude the remaining

In [25]:
dataset3 = dataset[dataset['like_count'] <= dataset['like_count'].quantile(0.99)]
content = dataset3['stemmed_text'].values
y = dataset3['like_count'].values

content_train, content_test, y_train, y_test = train_test_split(content, y, test_size=0.2, random_state=99)

vectorizer_count_reply.fit(content)

X = vectorizer_count_reply.transform(content).toarray()
X_train = vectorizer_count_reply.transform(content_train).toarray()
X_test = vectorizer_count_reply.transform(content_test).toarray()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 20440.03
Training R square: 0.008071
Testing MSE: 20454.69
Testing R square: 0.009841


# 2.III

The MSE of "reply" prediction is significantly lower than the MSE of "like" prediction. Meaning the people are more predictable to reply the tweet with specific words. While people may randomly give like regardless of the content

# 2.IV

In [26]:
# Two models for Tf-IDf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf_reply = TfidfVectorizer(max_df = 0.5, min_df = 0.02)
vectorizer_tfidf_like = TfidfVectorizer(max_df = 0.5, min_df = 0.02)

# 2.Va

In [27]:
content = dataset2['stemmed_text'].values
y = dataset2['reply_count'].values

vectorizer_tfidf_reply.fit(content)

content_train, content_test, y_train, y_test = train_test_split(content, y, test_size=0.2, random_state=99)
X = vectorizer_tfidf_reply.transform(content).toarray()
X_train = vectorizer_tfidf_reply.transform(content_train).toarray()
X_test = vectorizer_tfidf_reply.transform(content_test).toarray()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 108.53
Training R square: 0.008544
Testing MSE: 110.95
Testing R square: 0.010605


# 2.Vb

In [28]:
content = dataset3['stemmed_text'].values
y = dataset3['like_count'].values

vectorizer_tfidf_like.fit(content)

content_train, content_test, y_train, y_test = train_test_split(content, y, test_size=0.2, random_state=99)
X = vectorizer_tfidf_like.transform(content).toarray()
X_train = vectorizer_tfidf_like.transform(content_train).toarray()
X_test = vectorizer_tfidf_like.transform(content_test).toarray()

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 20474.87
Training R square: 0.006368
Testing MSE: 20501.33
Testing R square: 0.007583



The MSE of "reply" is again significantly lower than MSE of "like" prediction. 

The result of Count and TF-IDF vectorizer are very similar.

# 3.I

In [29]:
# load NRC-Emotion-lexicon
lexicon = pd.read_csv('NRC-Emotion-Lexicon.txt', sep = '\t', names = ['term', 'category', 'associated'])

# 3.II

In [30]:
# positive and negative sentiment word list
pos_list = list(lexicon[(lexicon['category'] == 'positive') & (lexicon['associated'] == 1)].term)
neg_list = list(lexicon[(lexicon['category'] == 'negative') & (lexicon['associated'] == 1)].term)

In [31]:
def sentiment_score(text_list, sen_list):
    temp_list = []
    check_list = []
    for t in text_list:
        if len(t) > 0:
            tokenized_text = word_tokenize(t)
            temp = 0
            word1 = []
            for w in sen_list:
                if tokenized_text.count(w) > 0:
                    word1.append(w)
                temp += tokenized_text.count(w)
            temp_list.append(temp/len(tokenized_text))
            check_list.append(word1)
        else:
            temp_list.append(0)
            check_list.append([])
    return temp_list, check_list

In [32]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
# Lemmatize text before applying lexicon
lemmatized_text = []
delimiter = ' '
lemmatizer = WordNetLemmatizer()
for text in dataset['processed_text']:
    token = []
    # tokenize the text and apply lemmatization for each token
    for tokens in word_tokenize(text):
        token.append(lemmatizer.lemmatize(tokens))
    lemmatized_text.append(delimiter.join(token))
# replace the text
dataset['lemmatized_text'] = lemmatized_text

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
dataset['pos_score'] ,dataset['pos_word_list'] = sentiment_score(dataset['lemmatized_text'], pos_list)
dataset['neg_score'] ,dataset['neg_word_list'] = sentiment_score(dataset['lemmatized_text'], neg_list)

dataset.head()

Unnamed: 0,date,time,user_id,user_follower_count,user_like_count,user_friend_count,user_media_count,user_post_count,user_list_count,user_verified,...,acc_tag,hashtag,URL,processed_text,stemmed_text,lemmatized_text,pos_score,pos_word_list,neg_score,neg_word_list
0,1/1/2020,10:05:36,1.2012e+18,524,889,425,441,1644,0,0,...,"[@AlwayACritic, @DariusVolket, @ZubSpike, @Aus...","[#China, #SARS, #chinesevirus]",[],lets welcome new year decade exciting im tryin...,let welcom new year decad excit im tri upbeat ...,let welcome new year decade exciting im trying...,0.1,"[exciting, launch]",0.1,"[pneumonia, rumor]"
1,1/1/2020,2:02:52,1380602000.0,102,495,864,10,136,0,0,...,[],[#flu],[],cant drink cant smoke wonderful way start 2020,cant drink cant smoke wonder way start 2020,cant drink cant smoke wonderful way start 2020,0.125,[wonderful],0.0,[]
2,1/1/2020,2:13:03,8.24622e+17,3269,2131,5001,130,1933,20,0,...,[@bbcradio4],"[#science, #Flu]",[https://t.co/AXHQi7wwgi],great use medical drama narrative 15 minute dr...,great use medic drama narr 15 minut drama my l...,great use medical drama narrative 15 minute dr...,0.083333,[medical],0.0,[]
3,1/1/2020,16:11:05,2749415000.0,767,3079,83,156,11499,13,0,...,[],"[#Wuhan, #China, #SARS, #HongKong]",[https://t.co/iF4sS1WxMf],sars back in one heavily trafficked airports w...,sar back in one heavili traffick airport world...,sars back in one heavily trafficked airport wo...,0.0,[],0.055556,[heavily]
4,1/1/2020,19:01:09,433035800.0,7645,9290,984,950,37606,21,0,...,[],"[#NYE, #Pandemic]",[https://t.co/mNHuuvXA9Q],tim i spent playing together facetime we one t...,tim i spent play togeth facetim we one turn sp...,tim i spent playing together facetime we one t...,0.076923,[love],0.076923,[spent]


# 3.III

In [34]:
# 4 emotion word list
fear_list = list(lexicon[(lexicon['category'] == 'fear') & (lexicon['associated'] == 1)].term)
anger_list = list(lexicon[(lexicon['category'] == 'anger') & (lexicon['associated'] == 1)].term)
sadness_list = list(lexicon[(lexicon['category'] == 'sadness') & (lexicon['associated'] == 1)].term)
joy_list = list(lexicon[(lexicon['category'] == 'joy') & (lexicon['associated'] == 1)].term)

In [35]:
dataset['fear_score'] ,dataset['fear_word_list'] = sentiment_score(dataset['lemmatized_text'], fear_list)
dataset['anger_score'] ,dataset['anger_word_list'] = sentiment_score(dataset['lemmatized_text'], anger_list)
dataset['sadness_score'] ,dataset['sadness_word_list'] = sentiment_score(dataset['lemmatized_text'], sadness_list)
dataset['joy_score'] ,dataset['joy_word_list'] = sentiment_score(dataset['lemmatized_text'], joy_list)

In [36]:
dataset.head()

Unnamed: 0,date,time,user_id,user_follower_count,user_like_count,user_friend_count,user_media_count,user_post_count,user_list_count,user_verified,...,neg_score,neg_word_list,fear_score,fear_word_list,anger_score,anger_word_list,sadness_score,sadness_word_list,joy_score,joy_word_list
0,1/1/2020,10:05:36,1.2012e+18,524,889,425,441,1644,0,0,...,0.1,"[pneumonia, rumor]",0.1,"[mysterious, pneumonia]",0.0,[],0.05,[rumor],0.05,[exciting]
1,1/1/2020,2:02:52,1380602000.0,102,495,864,10,136,0,0,...,0.0,[],0.0,[],0.0,[],0.0,[],0.125,[wonderful]
2,1/1/2020,2:13:03,8.24622e+17,3269,2131,5001,130,1933,20,0,...,0.0,[],0.083333,[medical],0.0,[],0.0,[],0.0,[]
3,1/1/2020,16:11:05,2749415000.0,767,3079,83,156,11499,13,0,...,0.055556,[heavily],0.0,[],0.0,[],0.0,[],0.0,[]
4,1/1/2020,19:01:09,433035800.0,7645,9290,984,950,37606,21,0,...,0.076923,[spent],0.0,[],0.0,[],0.0,[],0.076923,[love]


# 3.IVa

In [37]:
dataset2 = dataset[dataset['reply_count'] <= dataset['reply_count'].quantile(0.99)]
X = dataset2[['pos_score', 'neg_score', 'fear_score', 'anger_score', 'fear_score', 'sadness_score']]
y = dataset2['reply_count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 109.46
Training R square: 0.000076
Testing MSE: 112.19
Testing R square: -0.000479


# 3.IVb

In [38]:
dataset3 = dataset[dataset['like_count'] <= dataset['like_count'].quantile(0.99)]
X = dataset3[['pos_score', 'neg_score', 'fear_score', 'anger_score', 'fear_score', 'sadness_score']]
y = dataset3['like_count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 20611.66
Training R square: -0.000350
Testing MSE: 20657.51
Testing R square: 0.000022


# 3.V

When comparing between the MSE of "reply" and "like" using the 2 sentiment and 4 emotion value, the result still shows "reply" is more predictable than "like"

However, when MSE between word vector method and sentiment-emotion method, I find that the word vector result is better on both "reply" and "like" prediction. Which shows that the sentiment and emotion alone does not have good prediction power.


# 3.VI

In [39]:
X = dataset2[['pos_score', 'neg_score', 'fear_score', 'anger_score', 'fear_score', 'sadness_score',
              'user_follower_count','user_like_count', 'user_friend_count', 'user_media_count',
              'user_post_count', 'user_list_count' , 'user_verified', 'user_default_profile', 
              'user_account_type', 'user_account_age']]
y = dataset2['reply_count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 103.46
Training R square: 0.055022
Testing MSE: 104.65
Testing R square: 0.066774


In [40]:
X = dataset3[['pos_score', 'neg_score', 'fear_score', 'anger_score', 'fear_score', 'sadness_score',
              'user_follower_count','user_like_count', 'user_friend_count', 'user_media_count',
              'user_post_count', 'user_list_count' , 'user_verified', 'user_default_profile',
              'user_account_type', 'user_account_age']]
y = dataset3['like_count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

mse_cv = -cross_val_score(linear_reg, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
score_cv = cross_val_score(linear_reg, X_train, y_train, cv=10)

print("Training MSE: %.2f" % mse_cv.mean())
print("Training R square: %.6f" % score_cv.mean())
print("Testing MSE: %.2f" % mean_squared_error(y_test, linear_reg.predict(X_test)))
print("Testing R square: %.6f" % linear_reg.score(X_test, y_test, sample_weight=None))

Training MSE: 19524.48
Training R square: 0.052670
Testing MSE: 19363.70
Testing R square: 0.062653


# 3.VII

The MSE of using sentiment, emotion and user characteristics is the lowest among all 3 methods. Although the "reply" prediction is still much lower than "like" prediction, this method achieved the best result.
The R square is also much better than the other 2 methods (word vector and sentiment-emotion)

Sentiment and emotion do have value in prediction because it achieved simular result and word vector, but it is only part of the factor. We need to add in some other factors, like user characteristics, to get more accurate result