### Problem Statement
- We are data scientists hired by a studio that provides yoga lessons, tasked with identifying potential yoga students in a meditation subreddit in order to maximize profit.
- Our focus subreddits are r/yoga and r/Meditation.
- Deep-dive into the topics the two subreddits have in common, and the topics the yoga subreddit lacks, to bring in new users/students.
- Identify the classification model with the higher accuracy/recall score, and use it to find the key differences a yoga studio needs to address to better attract the meditation community.
- Communicate the improvement criteria that will bring in better subscription rates and maximize the profit gained from marketing expenses.

In [1]:
import requests
import pandas as pd
from datetime import datetime

In [2]:
# time.sleep() is used to pause between API requests so the server is not flooded.
import time

In [3]:
def scrap_posts(subreddit, n_posts, before=1640908800):
    """Scrape submissions from a subreddit through the Pushshift search API.

    The function pages backwards in time: every request asks for up to 100
    submissions created before the oldest submission collected so far.

    Parameters
    ----------
    subreddit : str
        Subreddit name without the ``r/`` prefix, e.g. ``'yoga'``.
    n_posts : int
        Number of *requests* (pages) to issue, not the number of posts.
        Each page holds up to 100 submissions, so at most ``n_posts * 100``
        rows are returned.
    before : int, optional
        Epoch timestamp in seconds; only submissions created before it are
        requested. Defaults to 1640908800 (2021-12-31 00:00:00 UTC).

    Returns
    -------
    pandas.DataFrame
        One row per submission, with the fields returned by the API plus a
        ``created`` column (``created_utc`` converted by
        ``datetime.fromtimestamp``, i.e. in the local timezone of the machine
        running the notebook). An empty DataFrame is returned when nothing
        could be fetched (``n_posts == 0``, an HTTP error on the first page,
        or no matching submissions).
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    page_size = 100
    posts = []
    res = None  # stays None when n_posts == 0, so the status print is skipped

    for _ in range(n_posts):
        params = {
            'subreddit': subreddit,
            'size': page_size,
            'before': before,
        }

        # A timeout keeps the notebook from hanging forever on a stalled server.
        res = requests.get(url, params=params, timeout=30)
        # Pause between requests so the API is not flooded.
        time.sleep(0.5)

        if res.status_code != 200:
            print(f'Error Code {res.status_code}, {res.reason}')
            break

        batch = res.json().get('data', [])
        if not batch:
            # No older submissions left: stop instead of indexing an empty page.
            break

        posts.extend(batch)
        # The next page starts right before the oldest submission of this page.
        before = batch[-1]['created_utc']

    if res is not None:
        print(f"r/{subreddit} - Code:{res.status_code}, Status:{res.reason}")

    # create dataframe for scrapped posts
    df = pd.DataFrame(posts)
    if df.empty:
        # Nothing was fetched; there is no 'created_utc' column to convert.
        print(f"Scrapped 0 posts from r/{subreddit}")
        print()
        return df

    df['created'] = df['created_utc'].apply(lambda ts: datetime.fromtimestamp(ts))

    # Stamp the time span covered by the scrape (first row is the newest post).
    latest_post_stamped = datetime.fromtimestamp(df['created_utc'].iloc[0])
    last_post_stamped = datetime.fromtimestamp(df['created_utc'].iloc[-1])

    print(f"Scrapped {df.shape[0]} posts from {latest_post_stamped} to {last_post_stamped}")
    print()

    return df

In [4]:
# Scrape r/yoga: 15 requests of up to 100 submissions each.
df_yoga = scrap_posts(subreddit='yoga', n_posts=15)

r/yoga - Code:200, Status:OK
Scrapped 1500 posts from 2021-12-31 07:55:04 to 2021-11-07 19:16:53



In [5]:
# List the column names of the scraped r/yoga DataFrame.
df_yoga.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts',
       'over_18', 'parent_whitelist_status', 'permalink', 'pinned',
       'post_hint', 'preview', 'pwls', 'removed_by_category', 'retrieved_on',
       'score', 'selftext', 'send_replies', 'spoiler', 'stickied', 'subreddit',
       'subreddit_id', 'subreddit_subscribers', 'subreddit_

In [6]:
# (rows, columns) of the r/yoga scrape.
df_yoga.shape

(1500, 81)

In [7]:
# Preview the first rows of the r/yoga scrape.
df_yoga.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,gallery_data,is_gallery,media_metadata,crosspost_parent,crosspost_parent_list,author_flair_template_id,author_cakeday,edited,banned_by,created
0,[],False,shibahuskymom,,[],,text,t2_2kjzb7tv,False,False,...,,,,,,,,,,2021-12-31 07:55:04
1,[],False,fdrecordings,,[],,text,t2_8lmsmdn3,False,False,...,,,,,,,,,,2021-12-31 07:49:45
2,[],False,fdrecordings,,[],,text,t2_8lmsmdn3,False,False,...,,,,,,,,,,2021-12-31 07:49:26
3,[],False,meggriffinsglasses,,[],,text,t2_dbgfda3k,False,False,...,,,,,,,,,,2021-12-31 06:33:38
4,[],False,Friendly_Popo,,[],,text,t2_kjpgn,False,False,...,,,,,,,,,,2021-12-31 04:53:59


In [8]:
# Peek at the body text of the first five r/yoga posts.
df_yoga['selftext'].head(5)

0                                                     
1                                                     
2                                                     
3    I am super out of shape so I am 90% sure that ...
4    Hello yogis,\n\nAnybody aware of interesting y...
Name: selftext, dtype: object

In [9]:
# Persist the scraped r/yoga posts to data/yoga.csv, without the DataFrame index.
# NOTE(review): assumes the data/ directory already exists — confirm before a fresh run.
df_yoga.to_csv('data/yoga.csv', index=False)


In [10]:
# Scrape r/meditation: 15 requests of up to 100 submissions each.
df_med = scrap_posts(subreddit='meditation', n_posts=15)

r/meditation - Code:200, Status:OK
Scrapped 1500 posts from 2021-12-31 07:38:50 to 2021-12-03 11:39:22



In [11]:
# (rows, columns) of the r/meditation scrape.
df_med.shape

(1500, 86)

In [12]:
# Preview the first rows of the r/meditation scrape.
df_med.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,author_flair_template_id,poll_data,live_audio,event_end,event_is_live,event_start,media_metadata,banned_by,edited,created
0,[],False,Extension_Mouse686,,[],,text,t2_7neqv2mr,False,False,...,,,,,,,,,,2021-12-31 07:38:50
1,[],False,saad2607,,[],,text,t2_3arwul82,False,False,...,,,,,,,,,,2021-12-31 07:04:07
2,[],False,dshep9729,,[],,text,t2_1glxiccy,False,False,...,,,,,,,,,,2021-12-31 07:03:52
3,[],False,hartmanners,,[],,text,t2_24pivn1q,False,False,...,,,,,,,,,,2021-12-31 06:35:51
4,[],False,Alina_1981,,[],,text,t2_bk6wx6vq,False,False,...,,,,,,,,,,2021-12-31 06:11:53


In [13]:
# Peek at the body text of the first five r/meditation posts.
df_med['selftext'].head(5)

0    Now that I have burned all the past karma that...
1                                            [removed]
2                                                     
3    I have been meditating for 220 consecutive day...
4                                                     
Name: selftext, dtype: object

In [14]:
# Persist the scraped r/meditation posts to data/med.csv, without the DataFrame index.
# NOTE(review): assumes the data/ directory already exists — confirm before a fresh run.
df_med.to_csv('data/med.csv', index=False)