# BERT Sentiment Analysis on all Reddit /r/wallstreetbets

### Validate Environment

In [1]:
!python --version

Python 3.8.0


In [2]:
!which python

/Users/melissacirtain/work/envs/ait/bin/python


In [3]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import pandas as pd
import numpy as np

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print('Pulled pretrained BERT transformer')

df = pd.read_csv('../datawork/reddit_wsb.csv')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pulled pretrained BERT transformer


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,stock
0,0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41,IMXI
1,1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,GME
2,2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,
3,3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,"NEW,GME"
4,4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,"GME,AMC,FARM,ESGD,ESGE,ESGU,SUSB,SUSC,AWTM,EAG..."


In [7]:
# drop extra index
df = df[['title', 'score', 'id', 'url', 'comms_num', 'created',
       'body', 'timestamp', 'stock']]

In [16]:
# find bad rows and drop them
df.index.dtype
df.iloc[932].head()
df.iloc[930:935].head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,stock
930,"The game is rigged, robinhood and other broker...",4,l6wwdm,https://www.reddit.com/r/wallstreetbets/commen...,2,1611870000.0,How is this even legal? Do we have any recours...,2021-01-28 23:43:24,"ELYS,GAMR,IGT,BSPE,GME,FCPI"
931,PLTR'Ds Robinhood sent the flows to you,1,l6wwdi,https://www.reddit.com/r/wallstreetbets/commen...,2,1611870000.0,PLTR is still Robinhood approved🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀🚀,2021-01-28 23:43:24,SENT
932,Since RH has removed the ability to buy more o...,0,l6wwdh,https://www.reddit.com/r/wallstreetbets/commen...,4,1611870000.0,,2021-01-28 23:43:24,"RH,BBY,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,..."
933,Still Phase 1,2,l6wwda,https://i.redd.it/9uurekzfu2e61.jpg,0,1611870000.0,,2021-01-28 23:43:23,"BIGC,BTRS,CETXP,CETXW,DALI,ERIC,QQQ,UEPS,ABEV,..."
934,Hold the line boiss 🚀🚀🚀🚀,6,l6wwd9,https://www.reddit.com/r/wallstreetbets/commen...,1,1611870000.0,,2021-01-28 23:43:23,"HOLD,PLPC,KVLE,NCLH"


### Data Cleanup/Inspection

In [21]:
junk = df.iloc[932]['stock']
junk # this is rendering badly in excel as the CSV, but the data is correct in the dataframe...

'RH,BBY,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,AROW,ARPO,ARQT,ARRY,ARTL,AR

In [22]:
df.iloc[1751]['stock']


'NIO,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,AROW,ARPO,ARQT,ARRY,ARTL,ARTNA

In [23]:
df.iloc[2508]['stock']

'HOLD,HOLD,HOLD,BBY,POST,NWS,NWSA,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,A

In [24]:
df.iloc[2854]['stock']

'AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,AROW,ARPO,ARQT,ARRY,ARTL,ARTNA,ART

In [25]:
df.iloc[4616]['stock']

'AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,AROW,ARPO,ARQT,ARRY,ARTL,ARTNA,ART

In [26]:
df.iloc[4722]['stock']

'CHNG,CHNGU,CHGX,CLII,CLII.U,CLII.W,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA

In [27]:
df.iloc[5023]['stock']

'BBY,AACQ,AAL,AAME,AAOI,AAON,AAPL,AAWW,ABCB,ABEO,ABIO,ABMD,ABNB,ABST,ABTX,ABUS,ACAD,ACBI,ACCD,ACER,ACET,ACGL,ACHC,ACIA,ACIU,ACIW,ACLS,ACMR,ACNB,ACOR,ACRS,ACRX,ACST,ACTG,ADBE,ADES,ADI,ADIL,ADMA,ADMP,ADMS,ADN,ADP,ADPT,ADSK,ADTN,ADTX,ADUS,ADV,ADVM,ADXS,AEGN,AEHL,AEHR,AEI,AEIS,AEMD,AEP,AERI,AESE,AEY,AEYE,AEZS,AFBI,AFIB,AFIN,AFINO,AFINP,AFMD,AFRM,AGEN,AGFS,AGFY,AGIO,AGLE,AGNC,AGNCM,AGNCN,AGNCO,AGNCP,AGRX,AGTC,AGYS,AHAC,AHCO,AHPI,AIHS,AIKI,AIMC,AIRG,AIRT,AKAM,AKBA,AKER,AKRO,AKTS,AKUS,ALBO,ALCO,ALDX,ALEC,ALGM,ALGN,ALGS,ALGT,ALIM,ALJJ,ALLK,ALLO,ALNA,ALNY,ALOT,ALPN,ALRM,ALRN,ALRS,ALSK,ALT,ALTA,ALTM,ALTO,ALTR,ALTU,ALVR,ALXN,ALXO,AMAL,AMAT,AMCX,AMD,AMED,AMEH,AMGN,AMHC,AMKR,AMNB,AMOT,AMPG,AMPH,AMRB,AMRK,AMRS,AMSC,AMSF,AMST,AMSWA,AMTB,AMTBB,AMTI,AMTX,AMWD,AMZN,ANAB,ANAT,ANDE,ANGI,ANGN,ANGO,ANIK,ANIP,ANIX,ANNX,ANSS,AOUT,APA,APDN,APEI,APEN,APLS,APLT,APOG,APPF,APPH,APPN,APPS,APR,APRE,APTX,APVO,APXT,APYX,AQB,AQMS,AQST,ARAV,ARAY,ARBG,ARCB,ARCT,ARDS,ARDX,AREC,ARKO,ARKR,ARNA,AROW,ARPO,ARQT,ARRY,ARTL,ARTNA

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [None]:
#df.iloc[]['stock']

In [8]:
# What else might we look for?

# grab indices matching the above, inspect rows not in that index...
known_gme_idx = df[((df['body'].notna()) & (df['body'].str.lower().str.contains('|'.join(strings_to_search)))) | 
  (df['title'].str.lower().str.contains('|'.join(strings_to_search)))].index

### Classify Titles
- so we can make a time series

In [32]:
%%time

print(df.shape)

# Make a GME-only classified dataframe
from transformers import pipeline

# Allocate a pipeline for sentiment-analysis
classifier = pipeline('sentiment-analysis')

print([x for x in df.columns])

# Drop non-gme posts
#df = df.iloc[known_gme_idx]

# Apply classifier over title for the GME-related rows
print('classifying titles')
df['title_sentiment'] = df['title'].apply(classifier)


(36668, 11)


Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_77']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'stock', 'short_body', 'body_sentiment']
classifying titles
CPU times: user 1h 27min 48s, sys: 8min 18s, total: 1h 36min 6s
Wall time: 1h 8min 44s


In [33]:
df.to_csvv('intermediate_class_with_stocks.csv')

### Classify bodies

In [34]:
%%time

df.head()

CPU times: user 160 µs, sys: 1e+03 ns, total: 161 µs
Wall time: 167 µs


Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,stock,short_body,body_sentiment,title_sentiment
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41,IMXI,,fail,"[{'label': 'NEGATIVE', 'score': 0.993991911411..."
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,GME,,fail,"[{'label': 'NEGATIVE', 'score': 0.999741792678..."
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,,The CEO of NASDAQ pushed to halt trading “to g...,fail,"[{'label': 'NEGATIVE', 'score': 0.999687314033..."
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,"NEW,GME",,fail,"[{'label': 'NEGATIVE', 'score': 0.996105611324..."
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,"GME,AMC,FARM,ESGD,ESGE,ESGU,SUSB,SUSC,AWTM,EAG...",,fail,"[{'label': 'NEGATIVE', 'score': 0.996898174285..."


### Replace NaN with ''

In [35]:
df['body'][df['body'].isna()] = ''
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['body'][df['body'].isna()] = ''


Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,stock,short_body,body_sentiment,title_sentiment
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41,IMXI,,fail,"[{'label': 'NEGATIVE', 'score': 0.993991911411..."
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10,GME,,fail,"[{'label': 'NEGATIVE', 'score': 0.999741792678..."
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,,The CEO of NASDAQ pushed to halt trading “to g...,fail,"[{'label': 'NEGATIVE', 'score': 0.999687314033..."
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57,"NEW,GME",,fail,"[{'label': 'NEGATIVE', 'score': 0.996105611324..."
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56,"GME,AMC,FARM,ESGD,ESGE,ESGU,SUSB,SUSC,AWTM,EAG...",,fail,"[{'label': 'NEGATIVE', 'score': 0.996898174285..."


### Custom classifier wrapper

In [36]:
%%time
#assert False

df['short_body'] = df['body'].str[:500]

# Apply classifier over bodies for the GME-related rows that aren't null
print('classifying bodies')
def classify_bodies(row):
    try:
        return classifier(row)
    except Exception as e:
        print(f'failed on row: \n\n{row} \n\n***** {e}\n\n')
        return 'fail'
    
df['body_sentiment'] = df['short_body'].apply(classify_bodies)

classifying bodies
CPU times: user 2h 8min 31s, sys: 25min 5s, total: 2h 33min 36s
Wall time: 1h 47min 56s


In [37]:
df.head()
df.to_csv('full_class_with_stocks.csv')

### Split classifications into labels and scores

In [38]:
%%time

# split out scores and labels for sentiment: body and title
df['body_sent'] = df['body_sentiment'].apply(lambda x: x[0]['label'])
df['body_score'] = df['body_sentiment'].apply(lambda x: x[0]['score'])
df['title_sent'] = df['title_sentiment'].apply(lambda x: x[0]['label'])
df['title_score'] = df['title_sentiment'].apply(lambda x: x[0]['score'])

CPU times: user 52.3 ms, sys: 2.7 ms, total: 55 ms
Wall time: 53.9 ms


### Save classified CSV

In [39]:
df.to_csv('classified_and_split_gme_posts_plus_stocks.csv')

In [40]:
df.to_csv('classified_and_split_all_posts_plus_stocks.csv')