## Inspection of the compressing data

In [2]:
import bz2
import pandas as pd
import re
import numpy as np

In [None]:
# reddit_comments_2019_02.csv and reddit_posts_2019_02.csv.gz are a sample of 1.000.000 data each dowloaded
# ordering created_utc as the original ds is ordered by subreddit.

In [3]:
comments_df = pd.read_csv('/Users/giuliagalli/Documents/GitHub/TFM/reddit/00_data/reddit_comments_2019_02.csv.gz', compression='gzip', 
                                 header=0, sep=',', quotechar='"')

In [4]:
comments_df.shape

(1000000, 3)

In [5]:
comments_df.head(10)

Unnamed: 0,created_utc,body,subreddit
0,1548979200,"Nah I think that’s the biggest one, I’m in it ...",RocketLeagueExchange
1,1548979200,Checking in,pornfree
2,1548979200,traditionally you have a 2nd tortilla undernea...,Austin
3,1548979200,Yeah it can get bad. Muting earlier than later...,Smite
4,1548979200,Sorry,FortNiteBR
5,1548979200,Just ran a all boarding fleet and did decently...,BFGArmada
6,1548979200,They’re really letting Donger down,USLPRO
7,1548979200,All my mail is for the previous 6 tenants...so...,chicago
8,1548979200,It's okay! The only way we can find out the an...,fursuit
9,1548979200,It because when you first start playing your m...,gaming


In [6]:
comments_df.describe()

Unnamed: 0,created_utc
count,1000000.0
mean,1548988000.0
std,5301.421
min,1548979000.0
25%,1548984000.0
50%,1548988000.0
75%,1548993000.0
max,1548998000.0


In [7]:
posts_df = pd.read_csv('/Users/giuliagalli/Documents/GitHub/TFM/reddit/00_data/reddit_posts_2019_02.csv.gz', compression='gzip', 
                                 header=0, sep=',', quotechar='"')

In [8]:
posts_df.shape

(1000000, 4)

In [9]:
posts_df.head(10)

Unnamed: 0,created_utc,title,selftext,subreddit
0,1548979200,Xiaomi President Shows Off Its Folding Phone,,businesstalkdaily
1,1548979200,[GAME THREAD] Our Milwaukee Bucks (36-13) visi...,#r/MkeBucks ANNOUNCEMENTS AND REMINDERS\n\n-\t...,MkeBucks
2,1548979200,Wearing only a thong and a smile ;D,,palegirls
3,1548979200,01-31 23:59 - 'Pee walls in the boys bathrooms...,'''\n\nNow this might sound really strange but...,removalbot
4,1548979200,Wtf?! Its time already.. Where are the leaks/p...,The event should have started by now so where ...,IdleHeroes
5,1548979200,Skrill removed all of bangarang from Apple Mus...,[deleted],EDM
6,1548979200,[WIP] Heeeeeeeeere's Johnny!,,CrossStitch
7,1548979200,Orang defend,,TheGreenArmy
8,1548979200,My stocks,,logodesign
9,1548979201,What location are you most likely to be killed...,[deleted],AskReddit


In [10]:
posts_df.describe()

Unnamed: 0,created_utc
count,1000000.0
mean,1549062000.0
std,47966.24
min,1548979000.0
25%,1549025000.0
50%,1549059000.0
75%,1549105000.0
max,1549144000.0


In [11]:
comments_df.body.sample(7)

381529    Lowest price in my area is $65 (Total Wine).  ...
597378                          I wouldn’t call it small ��
963412                  Great work. Guessing you’re Aussie?
916563                Some of these curves are annoying... 
558577                                            [deleted]
993741                             Sleep tight sweet bot~! 
555360           I think i saw you un the fairly oddparents
Name: body, dtype: object

In [12]:
posts_df.title.sample(10)

87669            [QC] Are these really bad reps? Please LMK
911242    You are offered $1,000,000 to plan the most aw...
961178    [PS4] W: Trusted user to help transfer items H...
237074                                              Gay son
561870                       The power of cute complies you
668543                                    Maybe Maybe Maybe
851581             Do You Feel The Music or The Victories ?
429516                    Letter of Consideration EReassign
928328               Mash This (a picture this megamashup!)
344560    «Κύμα» αλληλεγγύης για το κορίτσι που διάβαζε ...
Name: title, dtype: object

In [13]:
posts_df.selftext.sample(10)

357763                                                  NaN
400622                                                  NaN
594488                                                  NaN
167771                                            [deleted]
573924                                            [deleted]
28865                                             [deleted]
295                                               [deleted]
379971                                                  NaN
373564                                                  NaN
540292    When they first came out I did the same thing....
Name: selftext, dtype: object

In [None]:
# Let's see if whe HTML tags in our comments and posts

In [14]:
# WIP
#test = posts_df.title.compile(r'<[^>]+>')
#test.sample(10)

AttributeError: 'Series' object has no attribute 'compile'

### Cleaning Post DS

In [16]:
# we have a lot of NaN values on the self text but we will keep it because the of the title not NaN
# we have 8 NaN on the title where we can check if have values on the selftext
posts_df.isnull().sum()

created_utc         0
title               8
selftext       535906
subreddit           0
dtype: int64

In [17]:
# where the title is NaN, also the selftext is NaN so we can drop them
posts_df[posts_df["title"].isnull()]

Unnamed: 0,created_utc,title,selftext,subreddit
487442,1549057095,,,u_hannaonprime
487870,1549057159,,,u_hannaonprime
578866,1549070350,,,u_honda
579490,1549070451,,,u_honda
592231,1549072492,,,u_hannaonprime
592442,1549072521,,,u_hannaonprime
837726,1549121657,,,ProgrammerHumor
896334,1549129709,,,Anime_For_Otakus


In [22]:
test = posts_df.dropna(subset=['title'])

In [23]:
test.shape

(999992, 4)

In [24]:
# let's drop into posts ds only the 8 raws with NaN values in title and selftext
posts_df_clear = posts_df.dropna(subset=['title'])

In [25]:
# we can check it
posts_df_clear.isnull().sum()

created_utc         0
title               0
selftext       535898
subreddit           0
dtype: int64

Now we can analyze the length of our text

In [26]:
posts_df_clear.title.map(lambda x: len(x)).max()

938

In [70]:
posts_df_clear['length_title']= posts_df_clear.title.map(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [71]:
posts_df_clear.sort_values(by=['length_title'], ascending=False)

Unnamed: 0,created_utc,title,selftext,subreddit,lengh_title,lenght_title,length_title
68201,1548989072,Spurs coach Gregg Popovich was asked about how...,,u_LumpyObject,938,938,938
574899,1549069731,do u hate ur life !! �������������������������...,,NatureIsFuckingLit,517,517,517
673962,1549086507,asdasd�� �� �� �� �� �� �� �� �� ���� �� �� ��...,,Community_Demo,455,455,455
528873,1549062786,L train you're dragon[gggg&amp;gggggy&amp;gygg...,[deleted],ChoosingBeggars,364,364,364
907434,1549131205,WATCH NFL FINAL MATCH LIVE@@&gt;Patriots vs Ra...,[removed],CowboysRamsLiveTv,363,363,363
908474,1549131343,[[ !!!~&gt;&gt; NFL TV THREAD &lt;&lt;!!! ]] P...,[removed],CowboysRamsLiveTv,360,360,360
908831,1549131389,rEDDIT[[**WATCH --- &gt; Patriots vs Rams Live...,[removed],CowboysRamsLiveTv,352,352,352
909716,1549131508,[[ ~@#&gt; Patriots vs Rams Live S?t?r?e?a?m? ...,[removed],CowboysRamsLiveTv,348,348,348
908238,1549131314,2019 SUPER BOWL LIII Live-#@$^&gt; Patriots vs...,[removed],CowboysRamsLiveTv,340,340,340
907232,1549131178,%:::&gt;WaTcH@ Patriots vs Rams Live %:::&gt;s...,[removed],CowboysRamsLiveTv,338,338,338


In [72]:
posts_df_clear.title.map(lambda x: len(x)).min()

1

In [74]:
comments_df_clear.body.map(lambda x: len(x)).min()

1

Exploring text with length < 4

In [88]:
posts_df_clear.loc[posts_df_clear['length_title'] < 4]

Unnamed: 0,created_utc,title,selftext,subreddit,lengh_title,lenght_title,length_title


Cleaning not relevant data

In [85]:
posts_df_clear = posts_df_clear[(posts_df_clear["length_title"] > 4) & (posts_df_clear["selftext"] != "[removed]")]
posts_df_clear = posts_df_clear[(posts_df_clear["length_title"] > 4) & (posts_df_clear["selftext"] != "[deleted]")]
posts_df_clear = posts_df_clear[(posts_df_clear["length_title"] > 4) & (posts_df_clear["selftext"] != "NaN")]

Let's see how is big now our df

In [103]:
posts_df_clear.shape

(742467, 7)

### Cleaning Comments DS

In [15]:
# we have NaN values on the body of the subreddit that we can drop
comments_df.isnull().sum()

created_utc    0
body           7
subreddit      0
dtype: int64

In [18]:
# let's drop into comments ds
test = comments_df.dropna(axis=0)

In [19]:
test.shape

(999993, 3)

In [20]:
comments_df_clear = comments_df.dropna(axis=0)

In [21]:
# we can check it
comments_df_clear.isnull().sum()

created_utc    0
body           0
subreddit      0
dtype: int64

Now we can analyze the length of our text

In [73]:
comments_df_clear.body.map(lambda x: len(x)).max()

10088

In [86]:
comments_df_clear['length_title']= comments_df_clear.body.map(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [87]:
comments_df_clear.sort_values(by=['length_title'], ascending=False)

Unnamed: 0,created_utc,body,subreddit,length_title
679868,1548991504,&gt;&gt; Your 5% figure continues to be bas...,entertainment,10088
873295,1548995221,** [FRESH] King Gizzard and the Lizard Wizard:...,indieheadscirclejerk,10000
116320,1548981366,"3-6 is good, folks. At least I really love it....",danganronpa,9918
112979,1548981306,#####&amp;#009;\n\n######&amp;#009;\n\n####&am...,pathofexile,9846
404400,1548986490,\n\nThe history of the Smallpox vaccine and it...,conspiracy,9726
361063,1548985722,"#Kuroyami, The White Knight, and Tape Face\n\n...",StrawHatRPG,9691
329197,1548985158,Copypasting it from the other threads to peopl...,httyd,9687
44181,1548980084,I’ve moved some things around to improve the f...,Catholicism,9683
8743,1548979384,## [�� Link para a notícia completa](https://o...,brasilnoticias,9668
637247,1548990711,## [�� Link para a notícia completa](https://o...,brasilnoticias,9668


Exploring text with length < 10, more than post_ds because on the body we also want to exclude remouved and delated text

In [98]:
comments_df_clear.loc[comments_df_clear['length_title'] < 10]

Unnamed: 0,created_utc,body,subreddit,length_title
4,1548979200,Sorry,FortNiteBR,5
12,1548979200,G,furry_irl,1
14,1548979200,[removed],Art,9
15,1548979200,Ez,StockMarket,2
17,1548979200,[deleted],Life,9
22,1548979200,r/creepy,hmmm,8
31,1548979200,[deleted],HitBoxPorn,9
33,1548979200,[deleted],privacy,9
36,1548979200,[removed],PoliticalDiscussion,9
44,1548979200,��,gonewild30plus,2


In [100]:
comments_df_clear = comments_df_clear[(comments_df_clear["length_title"] > 10)]

Let's see how is big now our df

In [102]:
comments_df_clear.shape

(876155, 4)

In [None]:
# analizar si en lugar que filtrar por fecha no sería mejor por ranking y seleccionar los reddit más usados

# Como eliminar simbulos y enlaces