## Feature Engineering

In [34]:
import pandas as pd
import os
import nltk
import string
import datetime as dt
from datetime import datetime, timedelta
import re

pd.set_option('display.max_colwidth', 100) # To extend column width

wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')

nltk.download()
string.punctuation

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# Set path to cryptocurrency csv files
path = '../Data/Reddit_Comments/Cryptocurrency/'

files = os.listdir(path)

# Create list of csv file names
csv_list = []
for f in files:
    csv_list.append(f)

In [18]:
csv_list = csv_list[:-1]
csv_list

['Cryptocurrency_2021-08-07.csv',
 'Cryptocurrency_2021-08-08.csv',
 'Cryptocurrency_2021-08-09.csv',
 'Cryptocurrency_2021-08-10.csv',
 'Cryptocurrency_2021-08-11.csv',
 'Cryptocurrency_2021-08-12.csv',
 'Cryptocurrency_2021-08-13.csv',
 'Cryptocurrency_2021-08-14.csv',
 'Cryptocurrency_2021-08-15.csv',
 'Cryptocurrency_2021-08-16.csv',
 'Cryptocurrency_2021-08-17.csv',
 'Cryptocurrency_2021-08-18.csv',
 'Cryptocurrency_2021-08-19.csv',
 'Cryptocurrency_2021-08-20.csv',
 'Cryptocurrency_2021-08-21.csv',
 'Cryptocurrency_2021-08-22.csv',
 'Cryptocurrency_2021-08-23.csv',
 'Cryptocurrency_2021-08-24.csv',
 'Cryptocurrency_2021-08-25.csv',
 'Cryptocurrency_2021-08-26.csv',
 'Cryptocurrency_2021-08-27.csv',
 'Cryptocurrency_2021-08-28.csv',
 'Cryptocurrency_2021-08-29.csv',
 'Cryptocurrency_2021-08-30.csv',
 'Cryptocurrency_2021-08-31.csv',
 'Cryptocurrency_2021-09-01.csv',
 'Cryptocurrency_2021-09-02.csv',
 'Cryptocurrency_2021-09-03.csv',
 'Cryptocurrency_2021-09-04.csv',
 'Cryptocurren

In [19]:
csv_list[0]

'Cryptocurrency_2021-08-07.csv'

## Reading Twitter Training data

In [20]:
data = pd.read_csv('../Data/Twitter/Tweets.csv')
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &...",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [21]:
twitter_df = data[['airline_sentiment', 'text']]
twitter_df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials to the experience... tacky.
2,neutral,@VirginAmerica I didn't today... Must mean I need to take another trip!
3,negative,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &..."
4,negative,@VirginAmerica and it's a really big bad thing about it


In [22]:
# shape of the dataset?

print("Input data has {} rows and {} columns".format(len(twitter_df), len(twitter_df.columns)))

Input data has 14640 rows and 2 columns


In [23]:
# How many are positive, neutral, negative

print("Out of {} rows, {} are positive, {} are negative, {} are neutral".format(len(twitter_df),
                                                       len(twitter_df[twitter_df['airline_sentiment']=='positive']),
                                                       len(twitter_df[twitter_df['airline_sentiment']=='negative']),
                                                       len(twitter_df[twitter_df['airline_sentiment']=='neutral'])))

Out of 14640 rows, 2363 are positive, 9178 are negative, 3099 are neutral


In [24]:
# How much missing data is there?

print("Number of null in label: {}".format(twitter_df['airline_sentiment'].isnull().sum()))
print("Number of null in text: {}".format(twitter_df['text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


## Vectorizing Data

In [25]:

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

## CountVectorizer

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(twitter_df['text'])
print(X_counts.shape)

(14640, 15603)


In [27]:
X_counts

<14640x15603 sparse matrix of type '<class 'numpy.int64'>'
	with 149787 stored elements in Compressed Sparse Row format>

In [28]:
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names())
X_counts_df.head(10)

Unnamed: 0,Unnamed: 1,0,00,0011,0016,006,0162389030167,0162424965446,0162431184663,0167560070877,...,zkatcher,zombie,zone,zoom,zrh,zrhairport,zukes,zurich,zurichnew,ʖ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## N-Grams CountVectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

ngram_vect = CountVectorizer(ngram_range=(2,2),analyzer=clean_text)
X_counts = ngram_vect.fit_transform(twitter_df['text'])
print(X_counts.shape)

(14640, 15603)


## TfidfVectorizer

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(twitter_df['text'])
print(X_tfidf.shape)

(14640, 15603)


In [31]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
X_tfidf_df.head()

Unnamed: 0,Unnamed: 1,0,00,0011,0016,006,0162389030167,0162424965446,0162431184663,0167560070877,...,zkatcher,zombie,zone,zoom,zrh,zrhairport,zukes,zurich,zurichnew,ʖ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.166305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# ML Classifiers

## GridSearchCV

In [36]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13793,13794,13795,13796,13797,13798,13799,13800,13801,13802
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

In [None]:
# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(twitter_df['text'])
X_tfidf_feat = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
X_tfidf_feat.head()

In [None]:
# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(twitter_df['text'])
X_count_feat = pd.DataFrame(X_count.toarray(), columns=count_vect.get_feature_names())
X_count_feat.head()

## CountVectorizer

In [37]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_count_feat, twitter_df['airline_sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,914.177585,49.846273,3.654449,0.676397,,150,"{'max_depth': None, 'n_estimators': 150}",0.748634,0.731899,0.667691,0.766393,0.673497,0.717623,0.039961,1
11,1766.410734,291.820779,4.177546,1.751319,,300,"{'max_depth': None, 'n_estimators': 300}",0.753757,0.730874,0.651298,0.76127,0.672473,0.713934,0.044172,2
9,86.413659,11.388772,0.893883,0.19893,,10,"{'max_depth': None, 'n_estimators': 10}",0.716872,0.697063,0.642077,0.74112,0.697063,0.698839,0.032684,3
8,1672.046039,106.587841,4.662355,0.74182,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.701844,0.744536,0.659153,0.70765,0.656079,0.693852,0.033027,4
7,867.945438,39.744043,3.690105,0.459352,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.701844,0.738046,0.656079,0.710724,0.659495,0.693238,0.031327,5


## TfidfVectorizer

In [38]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf_feat, twitter_df['airline_sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()

## Read cryptocurrency subreddit comments

In [None]:
df = pd.read_csv(path + csv_list[0])
df.head()

Unnamed: 0,author,author_fullname,author_premium,body,collapsed_reason_code,comment_type,created_utc,score,id,parent_id,permalink
0,HiCarumba,t2_95jx76vi,False,Yes Brother. This is the way.,,,1628333279,1,h81jxny,t1_h81jvaa,/r/CryptoCurrency/comments/ozr28z/it_was_prophesied/h81jxny/
1,baeiby,t2_7htt0be3,False,"Lol to be very honest, I knew close to zilch abt investing in general. Didn't even know what DCA...",,,1628333279,2,h81jxnr,t1_h81jn5u,/r/CryptoCurrency/comments/ozqz9m/i_trust_people_here_more_than_my_circle_of/h81jxnr/
2,NotRyanPace,t2_xgk0k,False,"End of year, take the ""Last call for ETH under $10k"" comments seriously. That's a reasonable pri...",,,1628333278,1,h81jxmq,t1_h81jqki,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxmq/
3,BountyManagerBD,t2_8nbdzghw,False,ESCROW PROTOCOL are aiming to attract users from outside of the cryptocurrency space,,,1628333277,1,h81jxlg,t3_o2e67b,/r/CryptoCurrency/comments/o2e67b/escrow_protocol_ico_crowdfunding_platform_with/h81jxlg/
4,Gabus_Bego,t2_bbs0wmdt,False,"Thank you, my dude.",,,1628333276,1,h81jxk6,t1_h81jtfb,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxk6/


In [None]:
# Remove all rows that have NaN in df['body']
df = df[df['body'].notna()]

Unnamed: 0,author,author_fullname,author_premium,body,collapsed_reason_code,comment_type,created_utc,score,id,parent_id,permalink
0,HiCarumba,t2_95jx76vi,False,Yes Brother. This is the way.,,,1628333279,1,h81jxny,t1_h81jvaa,/r/CryptoCurrency/comments/ozr28z/it_was_prophesied/h81jxny/
1,baeiby,t2_7htt0be3,False,"Lol to be very honest, I knew close to zilch abt investing in general. Didn't even know what DCA...",,,1628333279,2,h81jxnr,t1_h81jn5u,/r/CryptoCurrency/comments/ozqz9m/i_trust_people_here_more_than_my_circle_of/h81jxnr/
2,NotRyanPace,t2_xgk0k,False,"End of year, take the ""Last call for ETH under $10k"" comments seriously. That's a reasonable pri...",,,1628333278,1,h81jxmq,t1_h81jqki,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxmq/
3,BountyManagerBD,t2_8nbdzghw,False,ESCROW PROTOCOL are aiming to attract users from outside of the cryptocurrency space,,,1628333277,1,h81jxlg,t3_o2e67b,/r/CryptoCurrency/comments/o2e67b/escrow_protocol_ico_crowdfunding_platform_with/h81jxlg/
4,Gabus_Bego,t2_bbs0wmdt,False,"Thank you, my dude.",,,1628333276,1,h81jxk6,t1_h81jtfb,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxk6/


In [None]:
df.isna().any()

author                   False
author_fullname           True
author_premium            True
body                     False
collapsed_reason_code     True
comment_type              True
created_utc              False
score                    False
id                       False
parent_id                False
permalink                False
dtype: bool

In [None]:
# Remove automoderator authors
df = df[df.author != "AutoModerator"]

Unnamed: 0,author,author_fullname,author_premium,body,collapsed_reason_code,comment_type,created_utc,score,id,parent_id,permalink
0,HiCarumba,t2_95jx76vi,False,Yes Brother. This is the way.,,,1628333279,1,h81jxny,t1_h81jvaa,/r/CryptoCurrency/comments/ozr28z/it_was_prophesied/h81jxny/
1,baeiby,t2_7htt0be3,False,"Lol to be very honest, I knew close to zilch abt investing in general. Didn't even know what DCA...",,,1628333279,2,h81jxnr,t1_h81jn5u,/r/CryptoCurrency/comments/ozqz9m/i_trust_people_here_more_than_my_circle_of/h81jxnr/
2,NotRyanPace,t2_xgk0k,False,"End of year, take the ""Last call for ETH under $10k"" comments seriously. That's a reasonable pri...",,,1628333278,1,h81jxmq,t1_h81jqki,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxmq/
3,BountyManagerBD,t2_8nbdzghw,False,ESCROW PROTOCOL are aiming to attract users from outside of the cryptocurrency space,,,1628333277,1,h81jxlg,t3_o2e67b,/r/CryptoCurrency/comments/o2e67b/escrow_protocol_ico_crowdfunding_platform_with/h81jxlg/
4,Gabus_Bego,t2_bbs0wmdt,False,"Thank you, my dude.",,,1628333276,1,h81jxk6,t1_h81jtfb,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jxk6/


In [None]:
comments_df = df.copy()

In [None]:
bitcoin_df = df[df['body'].str.contains('Bitcoin|BTC', case=False)]

Unnamed: 0,author,author_fullname,author_premium,body,collapsed_reason_code,comment_type,created_utc,score,id,parent_id,permalink
6,atomwest314,t2_x50okt,True,i noticed this too peeps like 'o wow ICP go up must be good now??' like nope... btc and eth risi...,,,1628333275,3,h81jxhz,t3_ozr2hx,/r/CryptoCurrency/comments/ozr2hx/can_we_please_remember_that_your_coin_being_in/h81jxhz/
23,Shrappy16,t2_4e2hp2d0,False,"Probably know the answer: If you earned enough in ETH to buy 1 BTC, would you do it or keep rid...",,,1628333265,1,h81jx4f,t3_oznwv4,/r/CryptoCurrency/comments/oznwv4/eth_just_completed_a_historymaking_16_green/h81jx4f/
29,Odd_Copy_8077,t2_ajd0894j,False,"A boy asked his bitcoin-investing dad for 1 bitcoin for his birthday\n\nDad: What? 40,660 dollar...",,,1628333261,4,h81jwxn,t1_h81jr69,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81jwxn/
91,Much-Weekend-8882,t2_66sa37r1,False,Is this all time high of eth/btc ratio?,,,1628333204,2,h81juiz,t3_ozinnf,/r/CryptoCurrency/comments/ozinnf/daily_discussion_august_7_2021_gmt0/h81juiz/
114,itsglocx,t2_c3slvp4i,False,can you imagine being mad at bitcoin going down but owning it at that price haha,,,1628341913,1,h81wcat,t3_ozsz9f,/r/CryptoCurrency/comments/ozsz9f/a_snapshot_in_time_back_when_bitcoin_holders/h81wcat/


In [None]:
# Saving cleaned dataframe
df.to_csv("../Data/Reddit_Comments/Cleaned/comments_cleaned.csv")