In [2]:
# Import required libraries

from psaw import PushshiftAPI
import praw
import datetime as dt
import pandas as pd
import csv
from pathlib import Path


In [3]:
  # Initiate api
  
  api = PushshiftAPI()

In [4]:
# use serach_submissions to pull only posts in wallstreeetbets that contain CEI in it

subs = api.search_submissions(
    q='CEI',
    subreddit='wallstreetbets'
)

In [5]:
# Create dataframe to hold results

subs_df = pd.DataFrame([submission.d_ for submission in subs])



In [6]:
# Determining size of data
subs_df.shape

(477, 89)

In [7]:
# View all columns to choose required columns
subs_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subre

In [8]:
# Testing column names to review data 
subs_df[['author','title','selftext','score','created_utc']].sample(1)

Unnamed: 0,author,title,selftext,score,created_utc
110,Fireguy69420,PROG Short Squeeze???,Somehow my last post got deleted. SI at 24% w ...,1,1633107554


In [9]:
# Convert datetime to readable format
subs_df['date'] = pd.to_datetime(subs_df['created_utc'], utc =True, unit='s')

In [10]:
# Create new dataframe to hold required columns
subs_df_2 = subs_df[['title','selftext','date','score','num_comments']]

display(subs_df_2.head(5))
display(subs_df_2.tail(5))


Unnamed: 0,title,selftext,date,score,num_comments
0,$cei could it be a sleeping giant? Could it re...,,2021-10-10 15:01:53+00:00,1,0
1,#cei the most manipulated stock in the market....,,2021-10-09 17:12:26+00:00,1,0
2,$PED - Pedevco Corp.,This company have been popped up after $CEI ra...,2021-10-09 11:15:13+00:00,1,1
3,What is up with CEI?,The thing literally is down 99.9% in the last ...,2021-10-09 08:03:49+00:00,1,1
4,https://twitter.com/KerrisdaleCap/status/14464...,[removed],2021-10-09 01:41:07+00:00,1,3


Unnamed: 0,title,selftext,date,score,num_comments
472,CEI too the moon,100%+ returns today what do you guys think?\n\...,2018-09-24 21:10:24+00:00,1,0
473,The cheapest thing on Robinhood currently is [...,[removed],2018-02-28 20:04:11+00:00,21,37
474,BUY CEI U WILL 9X TIMES INCREASE UR MONEY$$$ h...,[removed],2017-11-22 16:34:32+00:00,1,0
475,"ALERT FOR TUES, NOV 20, 2017 3 HOTTTT WALL ST....",[removed],2017-11-21 07:04:42+00:00,1,0
476,XXII and CEI today?,Anyone hopping on or am I going to be the only...,2017-10-06 12:57:00+00:00,7,13


In [11]:
# Convert format to required format
subs_df_2['date']= subs_df_2.loc[:,'date'].apply(lambda x:pd.Timestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:

# Remove emojis

subs_df_2 = subs_df_2.astype(str).apply(lambda x: x.str.encode('ascii','ignore').str.decode('ascii'))

# Remove URLs

subs_df_2 = subs_df_2.replace(to_replace=r'https?:\/\/.*[\r\n]*',value='',regex=True)
# Remove [removed]

subs_df_2 = subs_df_2.replace('[removed]','')

# Removed posts without a date

subs_df_2 = subs_df_2.dropna(subset=['date'])


In [13]:
# Export dataframe into a csv
subs_df_2.to_csv(Path('../Resources/reddit_data.csv'),encoding='utf-8', index=None, header=True)
subs_df_2.tail(10)

Unnamed: 0,title,selftext,date,score,num_comments
467,Whats the deal with $CEI?,I know Im going to lose half of you autists be...,2019-09-22 13:49:09,2,7
468,"Someone please look at Camber Energy, $CEI",historically energy sector has gone up after a...,2019-08-19 17:43:14,0,3
469,$CEI is at 420. TIME TO GO ALL IN.,,2019-07-15 13:24:22,1,0
470,"Help, cut losses or hold",Bag holding $CEI. Come Monday should I cut los...,2019-07-14 18:50:01,4,18
471,Challenge find the worst performing stock ever,What's the worst performing stock you can find...,2019-04-21 16:59:31,56,76
472,CEI too the moon,100%+ returns today what do you guys think?\n\...,2018-09-24 21:10:24,1,0
473,The cheapest thing on Robinhood currently is [...,,2018-02-28 20:04:11,21,37
474,BUY CEI U WILL 9X TIMES INCREASE UR MONEY$$$,,2017-11-22 16:34:32,1,0
475,"ALERT FOR TUES, NOV 20, 2017 3 HOTTTT WALL ST....",,2017-11-21 07:04:42,1,0
476,XXII and CEI today?,Anyone hopping on or am I going to be the only...,2017-10-06 12:57:00,7,13
