# Project 3: Web APIs & Classification

## Step 2: Downloading data - Conservatives

### Importing libraries

In [6]:
import pandas as pd
import numpy as np
import requests
import time
import re

### Crawling the Conservatives' posts

#### First level of crawling - list of posts

In [7]:
# Listing endpoint of the subreddit: the .json suffix makes Reddit answer in JSON format
url = "https://www.reddit.com/r/Conservatives.json"


In [8]:
# Function to make HTTP requests to Reddit and receive the information in json format
# Each page has ~25 posts so the loop will go through 40 times to get ~1000 posts
# Each page contains information such as the posts' name (similar to an id), title, url of the individual post etc
# Request status 200 means success
# Add a latency of 0.2 seconds after each request

def extract_posts(url, n_pages=40, delay=0.2):
    """Collect posts from a Reddit listing by walking its paginated .json endpoint.

    Every successful response contributes the entries found under
    ``["data"]["children"]``; the value under ``["data"]["after"]`` is sent as
    the ``after`` parameter of the next request. The page counter is printed
    before each request.

    Parameters
    ----------
    url : str
        Address of the listing in .json form.
    n_pages : int, optional
        Maximum number of pages to request (default 40).
    delay : float, optional
        Seconds to wait between two requests (default 0.2).

    Returns
    -------
    list
        The raw ``children`` entries of all pages fetched. Crawling stops early
        when a request does not return status 200 (the status code is printed)
        or when the listing reports no further page.
    """
    headers = {"User-agent": "Bot DSI"}
    posts = []
    after = None

    for i in range(n_pages):
        print(i)
        # The first request carries no cursor; later ones continue after the last post seen
        params = {} if after is None else {"after": after}
        res = requests.get(url, params=params, headers=headers)
        if res.status_code != 200:
            # Report the failing status and keep whatever was collected so far
            print(res.status_code)
            break

        listing = res.json()["data"]
        posts.extend(listing["children"])
        after = listing["after"]
        if after is None:
            # No further page: another request without a cursor would return
            # the first page again and duplicate the posts already collected
            break
        time.sleep(delay)
    return posts

In [9]:
# First level of crawling: download the list of posts from the subreddit listing
posts = extract_posts(url)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39


#### Processing data from the first layer of crawling

In [11]:
# Function to keep only the key information crawled from the first layer of crawling
# In order to save the text in the csv file properly, every character other than letters, digits, spaces, newlines and periods is replaced by a space
def trim_posts(posts):
    """Reduce raw listing entries to the fields needed for the next steps.

    Parameters
    ----------
    posts : list
        Entries whose ``"data"`` dictionary holds at least the keys
        ``"name"``, ``"title"`` and ``"permalink"``.

    Returns
    -------
    list of dict
        One dictionary per post with the keys ``"name"``, ``"title"`` (every
        character other than letters, digits, spaces, newlines and periods
        replaced by a space) and ``"url_comments"`` (the permalink prefixed
        with the Reddit root address).
    """
    url_root = "https://www.reddit.com"

    posts_trimmed = []
    for post in posts:
        data = post["data"]
        posts_trimmed.append({
            "name": data["name"],
            # Raw string: the former '\.' was an invalid escape sequence in a normal string literal
            "title": re.sub(r'[^a-zA-Z0-9 \n.]', ' ', data["title"]),
            "url_comments": url_root + data["permalink"],
        })

    return posts_trimmed

In [12]:
# Keep only the name, cleaned title and comments-page url of each crawled post
posts_trimmed = trim_posts(posts)

In [13]:
# Inspect the first trimmed post
posts_trimmed[0]

{'name': 't3_ktjva7',
 'title': 'ATTENTION  If you are here from the left or the right. DO NOT ADVOCATE VIOLENCE or for anyone to be killed. DO NOT advocate civil war or ALLUDE TO IT. This means ALL OF YOU. Left or right  we will ban you...and reddit will action your account.',
 'url_comments': 'https://www.reddit.com/r/conservatives/comments/ktjva7/attention_if_you_are_here_from_the_left_or_the/'}

#### Saving results of first level of crawling in csv

In [14]:
# Build a table from the trimmed posts: one row per post, one column per key
df_posts = pd.DataFrame(posts_trimmed)

In [15]:
# Preview the first rows of the post table
df_posts.head()

Unnamed: 0,name,title,url_comments
0,t3_ktjva7,ATTENTION If you are here from the left or th...,https://www.reddit.com/r/conservatives/comment...
1,t3_kv5nyw,Exactly Where and How Did Trump Incite the Mob,https://www.reddit.com/r/conservatives/comment...
2,t3_kvth8k,Bullies,https://www.reddit.com/r/conservatives/comment...
3,t3_kvsq5f,Facebook confirms data sharing agreements with...,https://www.reddit.com/r/conservatives/comment...
4,t3_kvcfmm,NorthKoreaDPRK,https://www.reddit.com/r/conservatives/comment...


In [16]:
# Save the post list; index=False keeps the row index out of the csv file
df_posts.to_csv("../data/conservatives.csv", index=False)

### Read the csv file with permalinks to extract the comments under each post

In [17]:
# Reload the saved post list so that the second level of crawling starts from the csv file
df_posts = pd.read_csv("../data/conservatives.csv")

In [18]:
# Check that the reloaded table has the expected columns
df_posts.head()

Unnamed: 0,name,title,url_comments
0,t3_ktjva7,ATTENTION If you are here from the left or th...,https://www.reddit.com/r/conservatives/comment...
1,t3_kv5nyw,Exactly Where and How Did Trump Incite the Mob,https://www.reddit.com/r/conservatives/comment...
2,t3_kvth8k,Bullies,https://www.reddit.com/r/conservatives/comment...
3,t3_kvsq5f,Facebook confirms data sharing agreements with...,https://www.reddit.com/r/conservatives/comment...
4,t3_kvcfmm,NorthKoreaDPRK,https://www.reddit.com/r/conservatives/comment...


In [19]:
# Add new columns to store selftext and comments, initialised with empty strings;
# they are filled in row by row when the comments pages are crawled
df_posts["comments"] = ""
df_posts["selftext"] = ""

In [20]:
# Number of rows (posts) and columns
df_posts.shape

(993, 5)

In [21]:
# Preview the table with the two new, still empty, columns
df_posts.head()

Unnamed: 0,name,title,url_comments,comments,selftext
0,t3_ktjva7,ATTENTION If you are here from the left or th...,https://www.reddit.com/r/conservatives/comment...,,
1,t3_kv5nyw,Exactly Where and How Did Trump Incite the Mob,https://www.reddit.com/r/conservatives/comment...,,
2,t3_kvth8k,Bullies,https://www.reddit.com/r/conservatives/comment...,,
3,t3_kvsq5f,Facebook confirms data sharing agreements with...,https://www.reddit.com/r/conservatives/comment...,,
4,t3_kvcfmm,NorthKoreaDPRK,https://www.reddit.com/r/conservatives/comment...,,


In [28]:
# function to extract selftext and comments based on permalinks to the "comments" page

def extract_selftext_comments(url):
    """Fetch one post's comments page (.json form) and return its text content.

    Parameters
    ----------
    url : str
        Address of the comments page in .json form.

    Returns
    -------
    dict
        ``{"selftext": ..., "comments": ...}``. ``"comments"`` is the
        space-joined ``"body"`` of the entries under the second element of the
        response; ``"selftext"`` is the space-joined selftext of the post and,
        when present, of its first crosspost parent. Both values are empty
        strings when the request does not return status 200 (the status code
        is printed in that case).
    """
    headers = {"User-agent": "Bot DSI"}
    res = requests.get(url, headers=headers)

    # Defined before the status check so that a failed request still yields
    # a well-formed (empty) result instead of an UnboundLocalError
    comments_array = []
    selftext = []

    if res.status_code == 200:
        json_text = res.json()

        # gather comments; entries without a "body" are skipped
        for child in json_text[1]["data"]["children"]:
            try:
                comments_array.append(child["data"]["body"])
            except (KeyError, TypeError):
                pass

        # selftext does not necessarily exist - skipped if not available
        try:
            post_data = json_text[0]["data"]["children"][0]["data"]
        except (KeyError, IndexError, TypeError):
            post_data = {}

        try:
            selftext.append(post_data["selftext"])
        except (KeyError, TypeError):
            pass

        # a crosspost carries the text of the original post in its first parent entry
        try:
            selftext.append(post_data["crosspost_parent_list"][0]["selftext"])
        except (KeyError, IndexError, TypeError):
            pass
    else:
        print(res.status_code)

    output = {"selftext": " ".join(selftext), "comments": " ".join(comments_array)}

    return output


In [32]:
# Second level of crawling: visit the comments page of every post and store its text.
# Every character other than letters, digits, spaces, newlines and periods is replaced
# by a space; the pattern is a raw string and is compiled once, outside the loop.
non_text = re.compile(r'[^a-zA-Z0-9 \n.]')

for i in df_posts.index:
    print(i)
    # Turn the permalink into its .json form; a dedicated name keeps the listing
    # `url` used for the first level of crawling from being overwritten
    comments_url = df_posts.loc[i, "url_comments"].rstrip("/") + ".json"
    print(comments_url)
    selftext_comments = extract_selftext_comments(comments_url)
    selftext = non_text.sub(' ', selftext_comments["selftext"])
    comments = non_text.sub(' ', selftext_comments["comments"])
    df_posts.loc[i, "selftext"] = selftext
    df_posts.loc[i, "comments"] = comments
    # Short pause between two requests
    time.sleep(0.1)

0
https://www.reddit.com/r/conservatives/comments/ktjva7/attention_if_you_are_here_from_the_left_or_the.json
1
https://www.reddit.com/r/conservatives/comments/kv5nyw/exactly_where_and_how_did_trump_incite_the_mob.json
2
https://www.reddit.com/r/conservatives/comments/kvth8k/bullies.json
3
https://www.reddit.com/r/conservatives/comments/kvsq5f/facebook_confirms_datasharing_agreements_with.json
4
https://www.reddit.com/r/conservatives/comments/kvcfmm/northkoreadprk.json
5
https://www.reddit.com/r/conservatives/comments/kvthk8/businesses_that_went_woke_after_capitol_riot.json
6
https://www.reddit.com/r/conservatives/comments/kvtfga/bye_bye_best_buy.json
7
https://www.reddit.com/r/conservatives/comments/kvs1oh/limbaugh_titans_of_tech_are_now_running_america.json
8
https://www.reddit.com/r/conservatives/comments/kv3066/so_true.json
9
https://www.reddit.com/r/conservatives/comments/kvao14/ben_carson_reacts_to_trump_being_banned_from.json
10
https://www.reddit.com/r/conservatives/comments/kvr

79
https://www.reddit.com/r/conservatives/comments/kurzyd/big_tech_oligarchs_collude_to_ban_parler_from.json
80
https://www.reddit.com/r/conservatives/comments/kvcj75/rdonaldtrump_has_been_banned.json
81
https://www.reddit.com/r/conservatives/comments/kv04wr/candace_owens_facebook_censored_a_sitting.json
82
https://www.reddit.com/r/conservatives/comments/kuvn73/nancy_pelosi_says_she_wants_donald_trump.json
83
https://www.reddit.com/r/conservatives/comments/kuvgfp/as_big_tech_muffles_conservative_voices_dystopian.json
84
https://www.reddit.com/r/conservatives/comments/kuymr7/communist_china_openly_celebrates_genocidal.json
85
https://www.reddit.com/r/conservatives/comments/kuuqwa/conceding_defeat_to_fraud_is_not_patriotic.json
86
https://www.reddit.com/r/conservatives/comments/kuwpy1/while_digitalocean_is_committed_to_supporting_a.json
87
https://www.reddit.com/r/conservatives/comments/kvitio/the_ghost_who_votes.json
88
https://www.reddit.com/r/conservatives/comments/kujlp7/arnold_schwa

156
https://www.reddit.com/r/conservatives/comments/ktxjo9/paypal_and_shopify_remove_trumprelated_accounts.json
157
https://www.reddit.com/r/conservatives/comments/kulisw/update_dc_officer_sicknicks_death_was_driven_by.json
158
https://www.reddit.com/r/conservatives/comments/kumytt/pelosi_tried_to_foment_a_military_coup_against.json
159
https://www.reddit.com/r/conservatives/comments/kum0v3/android_users_these_are_the_4_steps_to_avoid.json
160
https://www.reddit.com/r/conservatives/comments/ku1roc/rush_the_swamp_is_scared_of_the_next_12_days.json
161
https://www.reddit.com/r/conservatives/comments/kul318/great_american_clint_eastwood_marks_a_milestone.json
162
https://www.reddit.com/r/conservatives/comments/ku242v/pro_trump_supporter_jon_dawson_on_tiktok_removed.json
163
https://www.reddit.com/r/conservatives/comments/kulhrq/texas_attorney_general_vows_to_fight_big_tech.json
164
https://www.reddit.com/r/conservatives/comments/kubx4z/can_digital_platforms_be_allowed_to_stay_biased.json


233
https://www.reddit.com/r/conservatives/comments/kteskd/apple_issues_and_ultimatum_to_parlor_censor.json
234
https://www.reddit.com/r/conservatives/comments/kts2my/waitthat_new_coronavirus_strain_might_not_exist.json
235
https://www.reddit.com/r/conservatives/comments/ktbkwu/tucker_carlson_the_people_who_run_the_republican.json
236
https://www.reddit.com/r/conservatives/comments/ktfrxa/second_trump_impeachment_is_possible_monday.json
237
https://www.reddit.com/r/conservatives/comments/kt548d/whitlock_ignoring_the_concerns_of_trump.json
238
https://www.reddit.com/r/conservatives/comments/ktihae/parler_is_removed_from_google_play_but_tiktok.json
239
https://www.reddit.com/r/conservatives/comments/ktbmew/bongino_recalls_when_leftwing_mob_threatened.json
240
https://www.reddit.com/r/conservatives/comments/ktgkth/tor_browser.json
241
https://www.reddit.com/r/conservatives/comments/ktbjre/chinese_communist_party_bans_members_from.json
242
https://www.reddit.com/r/conservatives/comments/ks

309
https://www.reddit.com/r/conservatives/comments/kskzfj/pelosi_calls_on_vp_pence_to_invoke_25th_amendment.json
310
https://www.reddit.com/r/conservatives/comments/ksf4jb/georgia_dem_vernon_jones_announces_hes_switching.json
311
https://www.reddit.com/r/conservatives/comments/ksd6g2/tucker_carlson_as_long_as_people_sincerely.json
312
https://www.reddit.com/r/conservatives/comments/kshdrn/were_leftist_provocateurs_leading_the_way_into.json
313
https://www.reddit.com/r/conservatives/comments/ksic9i/capitol_police_inability_to_secure_seat_of_us.json
314
https://www.reddit.com/r/conservatives/comments/ksq8wp/the_pence_letter.json
315
https://www.reddit.com/r/conservatives/comments/kspbx5/californias_internet_censorship_office_is.json
316
https://www.reddit.com/r/conservatives/comments/ksiaxn/facebook_instagram_to_block_trumps_account_for.json
317
https://www.reddit.com/r/conservatives/comments/ksf1f8/trump_vows_orderly_transition_on_jan_20.json
318
https://www.reddit.com/r/conservatives/

385
https://www.reddit.com/r/conservatives/comments/kr41hc/progop_georgia_battleground_fund_raises_58.json
386
https://www.reddit.com/r/conservatives/comments/kr0swp/epidemiologist_says_influenza_cases_are_being.json
387
https://www.reddit.com/r/conservatives/comments/kqyhsp/trump_democrats_will_turn_us_into_oneparty.json
388
https://www.reddit.com/r/conservatives/comments/kqnl9e/report_400_exintel_officers_investigating_2020.json
389
https://www.reddit.com/r/conservatives/comments/kqoypc/trump_theres_no_way_we_lost_georgia_that_was_a.json
390
https://www.reddit.com/r/conservatives/comments/kqp067/saudi_arabia_to_end_feud_with_qatar_in_jared.json
391
https://www.reddit.com/r/conservatives/comments/kqk5f0/supreme_court_of_texas_to_austin_mayor_you_dont.json
392
https://www.reddit.com/r/conservatives/comments/kqqp6l/liberals_a_who_new_breed_of_stupid.json
393
https://www.reddit.com/r/conservatives/comments/krbfjq/mcconnell_to_gop_colleagues_i_wont_judge_anybody.json
394
https://www.reddi

461
https://www.reddit.com/r/conservatives/comments/kqlata/tom_cotton_will_not_oppose_counting_the_certified.json
462
https://www.reddit.com/r/conservatives/comments/kqbn9j/2021_the_left_is_serious_now_this_year_is_clearly.json
463
https://www.reddit.com/r/conservatives/comments/kqky4d/fauci_march_2020_its_possible_hundreds_of.json
464
https://www.reddit.com/r/conservatives/comments/kqa1d4/maga_patriots_must_win_the_gop_civil_war_the.json
465
https://www.reddit.com/r/conservatives/comments/kqkwmh/watch_the_trailer_the_daily_wire_to_premiere.json
466
https://www.reddit.com/r/conservatives/comments/kpzlnu/the_prayer_to_open_the_117th_congress_ended_with.json
467
https://www.reddit.com/r/conservatives/comments/kqk3m7/iran_parliament_discusses_bill_to_eliminate.json
468
https://www.reddit.com/r/conservatives/comments/kqizig/the_gender_pay_gap_is_not_a_myth_here_are_6.json
469
https://www.reddit.com/r/conservatives/comments/kpsu34/sen_blackburn_an_audit_will_answer_questions_as.json
470
htt

536
https://www.reddit.com/r/conservatives/comments/kpl2or/pelosi_bans_gender_terms_mother_daughter_father.json
537
https://www.reddit.com/r/conservatives/comments/kp1wl3/i_thought_i_saw_this_vandalism_before_oh_look.json
538
https://www.reddit.com/r/conservatives/comments/kpu7zk/the_wapo_trump_phone_call_my_review.json
539
https://www.reddit.com/r/conservatives/comments/kpmnsa/fox_news_vs.json
540
https://www.reddit.com/r/conservatives/comments/koz1w7/he_has_receipts_liberal_reporter_calls_out.json
541
https://www.reddit.com/r/conservatives/comments/kpahdr/watch_shocking_footage_emerges_of_police_storming.json
542
https://www.reddit.com/r/conservatives/comments/kqd2pz/the_media_are_lying_about_trumps_phone_call_with.json
543
https://www.reddit.com/r/conservatives/comments/kpacm8/nancy_pelosis_house_vandalized_with_leftist.json
544
https://www.reddit.com/r/conservatives/comments/kpa6rp/civil_war_tucker_carlson_hits_his_own_network_in.json
545
https://www.reddit.com/r/conservatives/comm

612
https://www.reddit.com/r/conservatives/comments/ko7xac/john_kerry_reveals_bidens_devotion_to_radical.json
613
https://www.reddit.com/r/conservatives/comments/kognhb/brexit_boris_johnsons_trade_deal_ends_the_fight.json
614
https://www.reddit.com/r/conservatives/comments/ko4lga/alyssa_milano_ditches_me_too_movement_to_boost.json
615
https://www.reddit.com/r/conservatives/comments/knq1io/americans_blast_kamala_for_kwanzaa_claim_it_didnt.json
616
https://www.reddit.com/r/conservatives/comments/knx3hy/go_woke_go_broke_pro_sports_suffer_ratings.json
617
https://www.reddit.com/r/conservatives/comments/ko2i8w/as_pence_moves_to_dismiss_election_lawsuit_the.json
618
https://www.reddit.com/r/conservatives/comments/ko2kdp/the_only_thing_corporate_media_learned_covering.json
619
https://www.reddit.com/r/conservatives/comments/knpuhf/90_of_military_ballots_observed_had_no_paper.json
620
https://www.reddit.com/r/conservatives/comments/ko2jbm/nbc_spreads_prowuhan_antiamerica_propaganda_to.json
621

687
https://www.reddit.com/r/conservatives/comments/knyenx/wannabe_jeopardy_host_ken_jennings_is_a_kavanaugh.json
688
https://www.reddit.com/r/conservatives/comments/knr6h3/the_left_takes_on_gods_country_we_stand_at_a_time.json
689
https://www.reddit.com/r/conservatives/comments/knxagx/kooky_cnn_doc_only_regime_change_will_get.json
690
https://www.reddit.com/r/conservatives/comments/knx9ly/dick_thornburgh_former_pennsylvania_governor_and.json
691
https://www.reddit.com/r/conservatives/comments/knx4g5/de_blasio_calls_nyc_bike_attack_on_suv_absolutely.json
692
https://www.reddit.com/r/conservatives/comments/kn9ukr/slam_sen_josh_hawley_dunks_on_walmart_for.json
693
https://www.reddit.com/r/conservatives/comments/knsp51/trumps_top_10_accomplishments_of_2020.json
694
https://www.reddit.com/r/conservatives/comments/kn2ewa/how_a_vindictive_classmate_and_a_cowardly.json
695
https://www.reddit.com/r/conservatives/comments/knzpgk/trump_releases_video_update_all_of_america_needs.json
696
https://

762
https://www.reddit.com/r/conservatives/comments/kmr9rz/paul_mccartney_tells_man_who_thought_she_was_a.json
763
https://www.reddit.com/r/conservatives/comments/kmeds4/the_future_gop_voters_want_can_be_summed_up_in.json
764
https://www.reddit.com/r/conservatives/comments/kmn5gv/title.json
765
https://www.reddit.com/r/conservatives/comments/kmeac2/3_of_nancy_pelosis_power_grabs_congress_needs_to.json
766
https://www.reddit.com/r/conservatives/comments/kmzzqz/donald_trump_fires_red_balls_at_his_own_camp.json
767
https://www.reddit.com/r/conservatives/comments/kmros9/the_link_between_donald_trump_and_george.json
768
https://www.reddit.com/r/conservatives/comments/kmgyui/ukraine_press_conference_explicitly_ties_hunter.json
769
https://www.reddit.com/r/conservatives/comments/kmjalg/is_it_any_wonder_liberal_states_are_shrinking.json
770
https://www.reddit.com/r/conservatives/comments/kmlhry/boston_removes_lincoln_emancipation_memorial.json
771
https://www.reddit.com/r/conservatives/comment

838
https://www.reddit.com/r/conservatives/comments/kll8lb/police_fleeing_portland_in_unprecedented_numbers.json
839
https://www.reddit.com/r/conservatives/comments/km2h0k/leftism_and_higher_education_continued.json
840
https://www.reddit.com/r/conservatives/comments/km2dwz/will_cities_survive_2020.json
841
https://www.reddit.com/r/conservatives/comments/kl8dag/you_cannot_love_the_poor_and_support_lockdowns_if.json
842
https://www.reddit.com/r/conservatives/comments/klbia4/star_of_bernie_sanders_ad_indicted_on_24.json
843
https://www.reddit.com/r/conservatives/comments/kluwwh/chinese_citizen_journalist_sentenced_to_four.json
844
https://www.reddit.com/r/conservatives/comments/kluupv/dr_fauci_admits_he_has_treated_the_american.json
845
https://www.reddit.com/r/conservatives/comments/klwyt0/2030_agenda_for_sustainable_development_41_pages.json
846
https://www.reddit.com/r/conservatives/comments/km07ja/between_deplorables.json
847
https://www.reddit.com/r/conservatives/comments/klla53/les

914
https://www.reddit.com/r/conservatives/comments/kkrfht/new_england_journal_of_medicine_advocates.json
915
https://www.reddit.com/r/conservatives/comments/kk7xv8/minnesota_lawmakers_say_coronavirus_deaths_could.json
916
https://www.reddit.com/r/conservatives/comments/kkc85l/glenn_greenwald_slams_media_coverage_of.json
917
https://www.reddit.com/r/conservatives/comments/kkom0z/house_tea_party_stars_a_decade_on_where_are_they.json
918
https://www.reddit.com/r/conservatives/comments/kkkou1/why_michael_horowitz_deserves_the_medal_of.json
919
https://www.reddit.com/r/conservatives/comments/kkb8m3/woman_charged_for_graphic_threat_to_michigan_gop.json
920
https://www.reddit.com/r/conservatives/comments/kka6wr/giuliani_election_fraud_evidence_going_to_blow_up.json
921
https://www.reddit.com/r/conservatives/comments/kkrpdx/meet_xavier_degroat_the_white_houses_first_intern.json
922
https://www.reddit.com/r/conservatives/comments/kkva9a/twelve_times_the_lockdowners_were_wrong.json
923
https://

990
https://www.reddit.com/r/conservatives/comments/kjia6k/bill_de_blasio_nyc_deputies_will_ensure_uk.json
991
https://www.reddit.com/r/conservatives/comments/kjqolb/video_communist_destruction_guides_modern_social.json
992
https://www.reddit.com/r/conservatives/comments/kjht34/chairman_neal_introduces_the_cash_act_to_provide.json


In [33]:
# Check the last rows after the crawl: comments and selftext are filled where text was found
df_posts.tail()

Unnamed: 0,name,title,url_comments,comments,selftext
988,t3_kjra0i,A Communist Christmas,https://www.reddit.com/r/conservatives/comment...,,
989,t3_kjn2tt,Supreme Court sets deadline for election fight...,https://www.reddit.com/r/conservatives/comment...,Fuckery gt The Jan. 22 deadline might become ...,
990,t3_kjia6k,Bill de Blasio NYC Deputies Will Ensure UK Qu...,https://www.reddit.com/r/conservatives/comment...,local cops have become the shock troops of the...,
991,t3_kjqolb,Video Communist DESTRUCTION Guides Modern Soc...,https://www.reddit.com/r/conservatives/comment...,,
992,t3_kjht34,Chairman Neal Introduces the CASH Act to Provi...,https://www.reddit.com/r/conservatives/comment...,It s only the 700 billion in pork and special...,


In [34]:
# Save the posts together with their selftext and comments; index=False keeps the row index out of the file
df_posts.to_csv("../data/conservatives_comments.csv", index=False)