## Subreddit analysis

In [1]:
import pandas as pd
from scraping import get_all_comments_from_subreddit

## Getting the data

In [2]:
# Subreddit to analyse, written as a path (presumably relative to the
# reddit.com root — the log output shows https://reddit.com/r/AskReddit/...).
subreddit_url = "/r/askreddit"
# Project-local scraper imported from `scraping`. Its result is fed straight
# into pd.DataFrame further down — presumably one record per comment with
# `content` and `score` fields; TODO confirm against the scraper.
comments = get_all_comments_from_subreddit(subreddit_url)

Getting posts from /r/askreddit...
Getting comments from https://reddit.com/r/AskReddit/comments/c0qaj7/have_you_ever_quit_a_job_without_another_lined_up/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0ocp5/it_people_of_reddit_what_is_your_goto_generic/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0matc/which_two_and_two_did_you_just_recently_put/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0kwdw/teachers_of_reddit_what_is_the_weirdest_excuse/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0kyo0/americans_whove_visited_european_countries_what/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0liuk/liberals_of_reddit_what_is_your_most_conservative/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0n5pd/its_a_post_apocalyptic_world_however_reddit_is/.json...
Getting comments from https://reddit.com/r/AskReddit/comments/c0ovb8/serious_doctor_of_

## Cleaning data

In [13]:
# Build the comments table and order it from the highest to the lowest score.
df = pd.DataFrame(comments).sort_values(by='score', ascending=False)

In [14]:
# Strip URLs (scheme://host[/path...]) from the comment text.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave every URL in place.
df['content'] = df['content'].str.replace(
    r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', regex=True
)

In [15]:
# Preview the first rows; the frame was sorted by descending score above.
df.head()

Unnamed: 0,content,score
104,Valve released Steam. \n\nTook me entirely too...,35254
563,A sugar glider. Kept hearing a high pitched sq...,27285
79,"In Spanish, hats are called ""sombrero"" because...",27142
1037,Fyre Festival VIP suites,24677
80,I race cars in an amateur league. There is a b...,23204


In [16]:
import nltk
# Fetch the NLTK resources used by later cells: 'punkt' (presumably the model
# behind nltk.word_tokenize) and 'stopwords' (for nltk.corpus.stopwords).
# NOTE(review): recent NLTK releases load the tokenizer data from the
# 'punkt_tab' resource instead — confirm whether it must be downloaded too.
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Tokenize comment (turn phrase into list of words)

In [17]:
# Replace each comment string with its list of NLTK tokens. Punctuation marks
# come out as separate tokens (e.g. '.' and ','), so later per-token metrics
# include them.
df['content'] = df['content'].apply(nltk.word_tokenize)
# Stray characters after the call made this line a syntax error; removed.
df.head()

Unnamed: 0,content,score
104,"[Valve, released, Steam, ., Took, me, entirely...",35254
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142
1037,"[Fyre, Festival, VIP, suites]",24677
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204


## Number of words per comment

In [18]:
# Token count per comment. `content` holds token lists at this point, so any
# punctuation tokens are part of the count.
df['word_count'] = df['content'].map(len)
df.head()

Unnamed: 0,content,score,word_count
104,"[Valve, released, Steam, ., Took, me, entirely...",35254,14
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285,32
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142,18
1037,"[Fyre, Festival, VIP, suites]",24677,4
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204,45


## Number of characters per comment

In [19]:
# Total characters per comment: the summed length of every token, so the
# whitespace that separated the tokens is not counted.
df['char_len'] = df['content'].apply(lambda tokens: sum(map(len, tokens)))
df.head()

Unnamed: 0,content,score,word_count,char_len
104,"[Valve, released, Steam, ., Took, me, entirely...",35254,14,56
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285,32,122
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142,18,67
1037,"[Fyre, Festival, VIP, suites]",24677,4,21
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204,45,169


## Average word length (characters per word)

In [23]:
def avg_word(comment):
    """Return the mean token length of a tokenized comment.

    Args:
        comment: Sequence of token strings.

    Returns:
        The summed length of all tokens divided by the number of tokens,
        or 0.0 when the sequence is empty.
    """
    # An empty token list (an empty comment, or one that contained nothing
    # but a URL before cleaning) would otherwise raise ZeroDivisionError and
    # abort the whole .apply() over the column.
    if len(comment) == 0:
        return 0.0
    return sum(len(word) for word in comment) / len(comment)
# Mean characters per token for every comment, computed by avg_word.
df['avg_word'] = df['content'].map(avg_word)
df.head()

Unnamed: 0,content,score,word_count,char_len,avg_word,stopwords,upper_words
104,"[Valve, released, Steam, ., Took, me, entirely...",35254,14,56,4.0,4,0
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285,32,122,3.8125,12,1
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142,18,67,3.722222,3,0
1037,"[Fyre, Festival, VIP, suites]",24677,4,21,5.25,0,1
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204,45,169,3.755556,17,3


## Number of English stopwords

In [24]:
from nltk.corpus import stopwords
# English stop-word list from NLTK. `stop` stays a list for any other cell
# that relies on it; the membership tests below go through a set so each
# lookup is constant-time instead of scanning the whole list per token.
stop = stopwords.words('english')
stop_set = set(stop)
# NOTE(review): the comparison is case-sensitive. NLTK's English list appears
# to be lowercase, so capitalised tokens such as 'The' would not be counted —
# confirm whether that is intended.
df['stopwords'] = df['content'].apply(
    lambda tokens: sum(1 for token in tokens if token in stop_set)
)
df.head()

Unnamed: 0,content,score,word_count,char_len,avg_word,stopwords,upper_words
104,"[Valve, released, Steam, ., Took, me, entirely...",35254,14,56,4.0,4,0
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285,32,122,3.8125,12,1
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142,18,67,3.722222,3,0
1037,"[Fyre, Festival, VIP, suites]",24677,4,21,5.25,0,1
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204,45,169,3.755556,17,3


## Number of uppercase words

In [25]:
# Count the tokens written entirely in upper case. str.isupper() is also true
# for a single capital letter, so tokens like 'A' or 'I' are included.
df['upper_words'] = df['content'].apply(
    lambda tokens: sum(1 for token in tokens if token.isupper())
)
df.head()

Unnamed: 0,content,score,word_count,char_len,avg_word,stopwords,upper_words
104,"[Valve, released, Steam, ., Took, me, entirely...",35254,14,56,4.0,4,0
563,"[A, sugar, glider, ., Kept, hearing, a, high, ...",27285,32,122,3.8125,12,1
79,"[In, Spanish, ,, hats, are, called, ``, sombre...",27142,18,67,3.722222,3,0
1037,"[Fyre, Festival, VIP, suites]",24677,4,21,5.25,0,1
80,"[I, race, cars, in, an, amateur, league, ., Th...",23204,45,169,3.755556,17,3
