# Project 3: Web Scraping


## Part 1

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import datetime as dt
import time
import requests

In [None]:
# Pushshift submission-search endpoint, restricted to the "wow" subreddit.
url = "https://api.pushshift.io/reddit/search/submission?subreddit=wow"

In [None]:
# Fetch one page of results from the API. requests applies no timeout by
# default, so set one to keep the cell from hanging if the server stalls.
res = requests.get(url, timeout=30)

In [None]:
# Show the HTTP status code of the response as the cell output.
res.status_code

In [None]:
# Stop here if the request did not succeed; the message reports which URL
# failed and with what status so the failure can be diagnosed from the output.
assert res.status_code == 200, f"Request to {res.url} failed with status {res.status_code}"

### Let's Get That Data

In [None]:
# Decode the response body as JSON and display its keys.
json_data = res.json()
json_data.keys()

In [None]:
# Number of records stored under the 'data' key of the response.
len(json_data['data'])

In [None]:
# Access a single record (index 24); the trailing ';' suppresses the cell output.
# NOTE(review): raises IndexError if the response holds fewer than 25 records.
json_data['data'][24];

In [None]:
# Load the records into a DataFrame and preview the first rows.
results_df = pd.DataFrame(json_data['data'])
results_df.head()

In [None]:
# List the columns available in the results.
results_df.columns

In [None]:
# Columns kept for the analysis.
subfields = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
# Take an explicit copy: later cells modify `posts`, and writing into a
# column-subset of results_df triggers pandas' SettingWithCopyWarning.
posts = results_df[subfields].copy()
posts.head(20)

In [None]:
#remove duplicates
# Rows sharing a title are treated as duplicates; the first occurrence is kept.
# Reassign rather than using inplace=True, which mutates a frame derived from
# results_df and has no performance benefit.
posts = posts.drop_duplicates(subset='title')

In [None]:
# Preview the first six rows.
posts.head(6)

In [None]:
# Number of rows currently in posts.
len(posts)

In [None]:
# Distinct values present in the is_self column.
posts['is_self'].unique()

In [None]:
# filter only is_self == true

# is_self appears to hold booleans rather than strings, so comparing it with
# the string "True" matches no rows (hence the empty DataFrame seen earlier).
# Cast the flag to 0/1 integers; the following cell filters on the value 1.
# .assign builds a new frame instead of writing into a slice of another frame.
posts = posts.assign(is_self=posts['is_self'].astype(int))
posts.head(6)

In [None]:
# Keep only the rows whose is_self flag equals 1, then preview the result.
posts = posts.loc[posts['is_self'].eq(1)]
posts.head()

In [None]:
#create a 'timestamp' column using created_utc

# created_utc holds epoch seconds. Convert explicitly as UTC:
# dt.date.fromtimestamp interprets the value in the machine's local timezone
# and discards the time of day. Apply .dt.date to the result if only the
# calendar date is needed.
# .assign returns a new frame rather than writing into a filtered slice.
posts = posts.assign(timestamp=pd.to_datetime(posts['created_utc'], unit='s', utc=True))


In [None]:
# Preview the first rows of posts.
posts.head()

# Put It All Together!

In [None]:
#establish parameters
subreddits = ['wow', 'lostarkgame']
kind = 'submission'

#establish url base
base_url = f"https://api.pushshift.io/reddit/search/{kind}"  #our API endpoint

#for loop variables :
# `after` accepts an epoch value or an integer + "s,m,h,d" (e.g. 30d for 30 days)
day_window = 2  # step between successive `after` cut-offs, in days
n = 7           # number of cut-offs queried per subreddit (2d, 4d, ..., 14d)

#make my empty list to push my data into and we will concat it later
posts = []


for mmo in subreddits:
    #construct full url
    stem = f"{base_url}?subreddit={mmo}&size=100"   # we are pulling a max of 100 posts per request

    for i in range(1, n + 1):
        #create custom url using 'after'
        # NOTE(review): only `after` is set (no `before`), so each window is
        # open-ended and consecutive requests can return overlapping posts;
        # consider de-duplicating on `id` downstream.
        URL = f"{stem}&after={day_window*i}d"
        print("Querying from " + URL)
        # requests has no default timeout; set one so a stalled server cannot hang the loop.
        res = requests.get(URL, timeout=30)
        assert res.status_code == 200, f"Request to {URL} failed with status {res.status_code}"
        # Named `payload` rather than `json` to avoid shadowing the stdlib module name.
        payload = res.json()['data']
        df = pd.DataFrame(payload)
        posts.append(df)
        time.sleep(5)  # pause between requests to stay polite to the API
    print(f"Query complete from the {mmo} subreddit!")

print("Query Complete!")




Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=2d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=4d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=6d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=8d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=10d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=12d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=wow&size=100&after=14d
Query complete from the wow subreddit!
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=lostarkgame&size=100&after=2d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=lostarkgame&size=100&after=4d
Querying from https://api.pushshift.io/reddit/search/submission?subreddit=lost

In [None]:
# Number of rows in the first DataFrame collected by the query loop.
len(posts[0])

In [None]:
# Access the first collected DataFrame; the trailing ';' suppresses the cell output.
posts[0];

In [None]:
# Stack the per-query DataFrames into a single frame. ignore_index=True
# renumbers the rows 0..N-1 instead of repeating each query's own 0-based
# index. The isinstance guard makes the cell safe to re-run once `posts` has
# already been replaced by the combined DataFrame.
if isinstance(posts, list):
    posts = pd.concat(posts, ignore_index=True)

In [None]:
# Total number of rows in the combined frame.
len(posts)

1400

In [None]:
# Preview the first rows of the combined frame.
posts.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,author_premium,awarders,can_mod_post,contest_mode,created_utc,domain,full_link,gildings,id,is_created_from_ads_ui,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable,is_self,is_video,link_flair_background_color,link_flair_css_class,link_flair_richtext,link_flair_template_id,link_flair_text,link_flair_text_color,link_flair_type,locked,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,post_hint,preview,pwls,removed_by_category,retrieved_on,score,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_subscribers,subreddit_type,thumbnail,thumbnail_height,thumbnail_width,title,total_awards_received,treatment_tags,upvote_ratio,url,url_overridden_by_dest,whitelist_status,wls,author_flair_background_color,author_flair_template_id,author_flair_text_color,media,media_embed,secure_media,secure_media_embed,media_metadata,crosspost_parent,crosspost_parent_list,suggested_sort
0,[],False,balling_ball,,[],,text,t2_daf00eis,False,False,False,[],False,False,1644700695,i.redd.it,https://www.reddit.com/r/wow/comments/sr1ihc/m...,{},sr1ihc,False,False,False,False,True,False,False,False,,transmog,"[{'e': 'text', 't': 'Transmog'}]",bf146dee-6659-11eb-b4c9-0e48bba495d9,Transmog,dark,richtext,False,False,True,0,0,False,all_ads,/r/wow/comments/sr1ihc/my_warrior_mog/,False,image,"{'enabled': True, 'images': [{'id': 'ssRjx4HDY...",6,automod_filtered,1644700706,1,,True,False,False,wow,t5_2qio8,2170548,public,https://b.thumbs.redditmedia.com/a-_SmWcQqgNBL...,78.0,140.0,my Warrior mog,0,[],1.0,https://i.redd.it/x5zhkwtxxgh81.jpg,https://i.redd.it/x5zhkwtxxgh81.jpg,all_ads,6,,,,,,,,,,,
1,[],False,TethanProps,,[],,text,t2_9o07cm8n,False,False,False,[],False,False,1644700708,/r/wow/comments/sr1inj/im_in_the_process_of_ma...,https://www.reddit.com/r/wow/comments/sr1inj/i...,{},sr1inj,False,True,False,False,False,True,False,True,,art,"[{'e': 'text', 't': 'Art'}]",af6b23d2-494b-11ea-a552-0e312f896259,Art,dark,richtext,False,False,True,0,0,False,all_ads,/r/wow/comments/sr1inj/im_in_the_process_of_ma...,False,hosted:video,"{'enabled': False, 'images': [{'id': 'rY-h5qZl...",6,,1644700718,1,,True,False,False,wow,t5_2qio8,2170548,public,https://b.thumbs.redditmedia.com/hyCh48Rux97mG...,140.0,140.0,I'm in the process of making a Frostmourne lig...,0,[],1.0,https://v.redd.it/1ep4ymwmxgh81,https://v.redd.it/1ep4ymwmxgh81,all_ads,6,,,,,,,,,,,
2,[],False,Kiemsargis,,[],,text,t2_9mwsvxod,False,False,False,[],False,False,1644700808,/r/wow/comments/sr1jwe/can_somebody_explain_me...,https://www.reddit.com/r/wow/comments/sr1jwe/c...,{},sr1jwe,False,True,False,False,False,True,False,True,,question,"[{'e': 'text', 't': 'Question'}]",a8efbf86-494b-11ea-9ee5-0ea9890373cb,Question,dark,richtext,False,False,True,0,0,False,all_ads,/r/wow/comments/sr1jwe/can_somebody_explain_me...,False,hosted:video,"{'enabled': False, 'images': [{'id': 'KcEgUH_z...",6,,1644700819,1,,True,False,False,wow,t5_2qio8,2170548,public,https://b.thumbs.redditmedia.com/RjDunLEgKGdY3...,78.0,140.0,Can somebody explain me what's going on there ?,0,[],1.0,https://v.redd.it/8lxil29bygh81,https://v.redd.it/8lxil29bygh81,all_ads,6,,,,,,,,,,,
3,[],False,Henstelfs,,[],,text,t2_74icf,False,False,False,[],False,False,1644701325,self.wow,https://www.reddit.com/r/wow/comments/sr1qdk/t...,{},sr1qdk,False,True,False,False,False,True,True,False,,question,"[{'e': 'text', 't': 'Question'}]",a8efbf86-494b-11ea-9ee5-0ea9890373cb,Question,dark,richtext,False,False,True,0,0,False,all_ads,/r/wow/comments/sr1qdk/tribute_ideas_for_a_fri...,False,,,6,,1644701336,1,Me and some irl friends and Wow friends had a ...,True,False,False,wow,t5_2qio8,2170547,public,self,,,Tribute ideas for a friend who passed away?,0,[],1.0,https://www.reddit.com/r/wow/comments/sr1qdk/t...,,all_ads,6,,,,,,,,,,,
4,[],False,BengtJJ,,[],,text,t2_gkbsx,False,False,False,[],False,False,1644701779,self.wow,https://www.reddit.com/r/wow/comments/sr1vva/h...,{},sr1vva,False,True,False,False,False,True,True,False,,question,"[{'e': 'text', 't': 'Question'}]",a8efbf86-494b-11ea-9ee5-0ea9890373cb,Question,dark,richtext,False,False,True,0,0,False,all_ads,/r/wow/comments/sr1vva/healer_main_92/,False,,,6,,1644701789,1,Returning player and I feel like it is quite o...,True,False,False,wow,t5_2qio8,2170546,public,self,,,Healer main 9.2,0,[],1.0,https://www.reddit.com/r/wow/comments/sr1vva/h...,,all_ads,6,,,,,,,,,,,


In [None]:
# Save the combined posts as CSV. The path is relative, so the file lands in
# the current working directory; the DataFrame index is written as well
# (to_csv's default), as an unnamed first column.
# NOTE(review): Drive is mounted at /content/drive earlier in the notebook, but
# this relative path may not resolve there — confirm the intended location.
posts.to_csv('subreddits_post.csv')

# New Section