In [2]:
!pip install praw

Collecting praw
  Downloading praw-7.5.0-py3-none-any.whl (176 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.2.3-py3-none-any.whl (53 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: websocket-client, prawcore, update-checker, praw
Successfully installed praw-7.5.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.2.3


In [20]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-3.1.2-py2.py3-none-any.whl (87 kB)
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-1.5.1 tldextract-3.1.2


# Reddit Data Collection

In [21]:
import praw
import pandas as pd
import numpy as np
import datetime as dt
import tldextract
import matplotlib.pyplot as plt
import re

### Extracting /r/india data using praw

In [2]:
user_agent = "Scapper 1.0 by /u/deepanshu0008"

reddit = praw.Reddit(
    client_id = "QGnayb8IUgunsNbVN4pnsw",
    client_secret = "U1SvgZ3tYqLIu2T1dr6w3TUOQ_I1gA",
    user_agent = user_agent
)

subreddit = reddit.subreddit('india')

In [3]:
flairs = ['AskIndia', 'Business/Finance', 'Food', 'Non-Political', 'Photography', 'Policy/Economy', 'Politics', 'Scheduled',
          'Science/Technology', 'Coronavirus']

In [4]:
def get_date(created):
    return dt.datetime.fromtimestamp(created)

In [5]:
labels_dict = {"flair":[], "title":[], "score":[], "id":[], "url":[], "comms_num": [], "created": [], "body":[], "author":[], "comments":[]}

In [None]:
for flair in flairs:
    
    get_subreddits = subreddit.search(flair, limit=400)
    
    for submission in get_subreddits:
        labels_dict['flair'].append(flair)
        labels_dict['title'].append(submission.title)
        labels_dict['score'].append(submission.score)
        labels_dict['id'].append(submission.id)
        labels_dict['url'].append(submission.url)
        labels_dict["comms_num"].append(submission.num_comments)
        labels_dict["created"].append(submission.created)
        labels_dict["body"].append(submission.selftext)
        labels_dict["author"].append(submission.author)
        
        submission.comments.replace_more(limit=None)
        comment = ''
        for top_level_comment in submission.comments:
            comment = comment + ' ' + top_level_comment.body
        labels_dict["comments"].append(comment)

In [None]:
data = pd.DataFrame(labels_dict)
data.sample

In [None]:
timedata = data['created'].apply(get_date)
data = data.assign(timestamp = timedata)
del data['created']

In [None]:
# Shuffling the rows
data = data.sample(frac=1).reset_index(drop=True)

In [None]:
# Saving the data to csv file
data.to_csv('reddit-india-data.csv', index = False)
data.head()

In [14]:
data = pd.read_csv('reddit-india-data.csv')
data.tail()

Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp
2275,Policy/Economy,Crisis in economy retrievable with major polic...,7,j8z0st,https://www.sundayguardianlive.com/news/crisis...,0,,Free_Physics,,2020-10-11 18:02:43
2276,Scheduled,Monthly video games thread. December 2016 [Sch...,46,5fwcie,https://www.reddit.com/r/india/comments/5fwcie...,131,Let us use this thread to discuss games that w...,axaytsg,Bought games legally for the first time in li...,2016-12-02 02:10:03
2277,Business/Finance,What would be a better decision? [Serious],24,abqu80,https://www.reddit.com/r/india/comments/abqu80...,17,"24 year old male, working in a day job from 8 ...",nosleepnomore,"Bhai 8am-8pm is not a day job, it's slavery.\...",2019-01-02 20:10:50
2278,Non-Political,The forgotten promise of 1949: The RSS wrote a...,30,9pqnpq,https://www.thehindu.com/opinion/lead/the-forg...,8,,bliss_tree,"you know what bhakts will say?\r\n\r\n>""what ...",2018-10-20 16:36:14
2279,Non-Political,[Non Political] [NP] (Ask India) Medical postg...,1,85b66l,https://www.reddit.com/r/india/comments/85b66l...,0,Realised that many discontinue use after getti...,Vickythiside,,2018-03-19 02:50:46


In [15]:
def string(value):
    return str(value)

In [16]:
data['title'] = data['title'].apply(string)
data['body'] = data['body'].apply(string)
data['comments'] = data['comments'].apply(string)

In [17]:
# Importing required libraries

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer()
STOPWORDS = set(stopwords.words('english'))
def text_cleaning(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    text = ' '.join(wordnet.lemmatize(word) for word in text.split() if word not in STOPWORDS)
    return text

In [26]:
data['title'] = data['title'].apply(text_cleaning)
data['body'] = data['body'].apply(text_cleaning)
data['comments'] = data['comments'].apply(text_cleaning)

In [27]:
combined_features = data["title"] + data["comments"] + data["url"] + data["body"]
data = data.assign(combined_features = combined_features)

In [28]:
data.to_csv('data.csv')

In [29]:
pd.read_csv('data.csv').head()

Unnamed: 0.1,Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp,combined_features
0,0,Business/Finance,Interest PF taxable: Finance minister eye PF i...,51,laoy5g,https://timesofindia.indiatimes.com/business/i...,20,,satyasys,"PF return fully taxable, point investing PF. G...",2021-02-02 19:30:37,Interest PF taxable: Finance minister eye PF i...
1,1,Food,How survive 500rs(food) 2 weeks?,55,kr3ztg,https://www.reddit.com/r/india/comments/kr3ztg...,79,Hey guys. So time salary going late I'll recei...,Luc_90,"1. 2 kg cheap rice, 50/kg so, (Don't buy boile...",2021-01-06 07:40:59,How survive 500rs(food) 2 weeks?1. 2 kg cheap ...
2,2,Scheduled,"Right wing group labelling resource document ""...",143,lbwl1r,https://www.reddit.com/r/india/comments/lbwl1r...,17,"Recently Greta Thunberg tweeted ""toolkit"" peop...",gobargorab,Anything father modi make go crazy upset. That...,2021-02-04 09:27:18,"Right wing group labelling resource document ""..."
3,3,Food,Ask: What amount pocket money give children?,5,m0auzn,https://www.reddit.com/r/india/comments/m0auzn...,39,"Also, supposed buy pocket money actually spend...",what_is_inflation,You guy get pocket money? My parent never gave...,2021-03-08 21:25:43,Ask: What amount pocket money give children?Yo...
4,4,Photography,"I’ve Recently generated interest photography, ...",0,aaakn4,https://i.redd.it/9tcehs8vz0721.jpg,14,,thesarcasticpage,A photo like letter viewer story want convey. ...,2018-12-29 03:35:37,"I’ve Recently generated interest photography, ..."
