## Load File as .CSV

In [1]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Show every column when a wide frame is displayed, then load the raw posts export.
pd.set_option("display.max_columns", None)
reddit = pd.read_csv("reddit_posts.csv")

## Exploratory Data Analysis

In [3]:
# The dataset is comprised of 26688 rows and 53 columns. Many of the columns contain NaN values.
reddit.head(n=10)

Unnamed: 0,adserver_click_url,adserver_imp_pixel,archived,author,author_flair_css_class,author_flair_text,contest_mode,created_utc,disable_comments,distinguished,domain,downs,edited,gilded,hide_score,href_url,id,imp_pixel,is_self,link_flair_css_class,link_flair_text,locked,media,media_embed,mobile_ad_url,name,num_comments,original_link,over_18,permalink,post_hint,preview,promoted,promoted_by,promoted_display_name,promoted_url,quarantine,retrieved_on,saved,score,secure_media,secure_media_embed,selftext,spoiler,stickied,subreddit,subreddit_id,third_party_tracking,third_party_tracking_2,thumbnail,title,ups,url
0,,,False,johnnyawesome0,,,False,1480697304,,,self.techsupport,0.0,False,0.0,False,,5g49s2,,True,,,False,,{},,t3_5g49s2,1.0,,False,/r/techsupport/comments/5g49s2/help_with_audio...,,,,,,,False,1484297000.0,False,1.0,,{},I have a Sony surround sound system for a blu-...,False,False,techsupport,t5_2qioo,,,self,Help with audio set-up,1.0,https://www.reddit.com/r/techsupport/comments/...
1,,,False,Silverfin113,,,False,1480697424,,,self.learnprogramming,0.0,False,0.0,False,,5g4a5p,,True,,,False,,{},,t3_5g4a5p,8.0,,False,/r/learnprogramming/comments/5g4a5p/optimizing...,,,,,,,False,1484297000.0,False,23.0,,{},I've written what seems to be a prohibitively ...,False,False,learnprogramming,t5_2r7yd,,,self,Optimizing code for speed,23.0,https://www.reddit.com/r/learnprogramming/comm...
2,,,False,bookbooksbooks,,,False,1480697613,,,self.gamedev,0.0,False,0.0,False,,5g4att,,True,discussion cat-talk,Discussion,False,,{},,t3_5g4att,5.0,,False,/r/gamedev/comments/5g4att/seeking_tales_of_de...,,,,,,,False,1484297000.0,False,12.0,,{},"I'm writing an article called ""Video Games Tha...",False,False,gamedev,t5_2qi0a,,,self,Seeking Tales of Development Woe (and Triumph)...,12.0,https://www.reddit.com/r/gamedev/comments/5g4a...
3,,,False,[deleted],,,False,1480697634,,,self.learnprogramming,0.0,1480698462,0.0,False,,5g4awr,,True,solved,Solved,False,,{},,t3_5g4awr,9.0,,False,/r/learnprogramming/comments/5g4awr/java_findi...,,,,,,,False,1484297000.0,False,0.0,,{},[deleted],False,False,learnprogramming,t5_2r7yd,,,default,[Java] Finding smallest value in an array,0.0,https://www.reddit.com/r/learnprogramming/comm...
4,,,False,caffeine_potent,,,False,1480697748,,,self.learnpython,0.0,1480709138,0.0,False,,5g4bcr,,True,,,False,,{},,t3_5g4bcr,12.0,,False,/r/learnpython/comments/5g4bcr/currying_functi...,,,,,,,False,1484297000.0,False,6.0,,{},I have the following representation of argumen...,False,False,learnpython,t5_2r8ot,,,self,currying functions using functools,6.0,https://www.reddit.com/r/learnpython/comments/...
5,,,False,uzbek1993,,,False,1480697775,,,self.learnprogramming,0.0,False,0.0,False,,5g4bfi,,True,,,False,,{},,t3_5g4bfi,1.0,,False,/r/learnprogramming/comments/5g4bfi/text_edito...,,,,,,,False,1484297000.0,False,1.0,,{},I am about to create a website where users use...,False,False,learnprogramming,t5_2r7yd,,,self,Text Editor integration,1.0,https://www.reddit.com/r/learnprogramming/comm...
6,,,False,[deleted],,,False,1480697781,,,self.Python,0.0,False,0.0,False,,5g4bg2,,True,,,False,,{},,t3_5g4bg2,5.0,,False,/r/Python/comments/5g4bg2/how_can_i_use_differ...,,,,,,,False,1484297000.0,False,0.0,,{},[deleted],False,False,Python,t5_2qh0y,,,default,How can I use (different 'groups' of 8 differe...,0.0,https://www.reddit.com/r/Python/comments/5g4bg...
7,,,False,excitedaboutemacs,,,False,1480697827,,,self.learnpython,0.0,False,0.0,False,,5g4bkq,,True,,,False,,{},,t3_5g4bkq,11.0,,False,/r/learnpython/comments/5g4bkq/what_are_some_w...,,,,,,,False,1484297000.0,False,8.0,,{},This [post](https://www.reddit.com/r/learnpyth...,False,False,learnpython,t5_2r8ot,,,self,What are some ways to learn efficient python?,8.0,https://www.reddit.com/r/learnpython/comments/...
8,,,False,Steakuddin,,,False,1480697900,,,self.linux,0.0,False,0.0,False,,5g4btt,,True,,,False,,{},,t3_5g4btt,4.0,,False,/r/linux/comments/5g4btt/requesting_help_with_...,,,,,,,False,1484297000.0,False,2.0,,{},[removed],False,False,linux,t5_2qh1a,,,default,Requesting help with moving a file from local ...,2.0,https://www.reddit.com/r/linux/comments/5g4btt...
9,,,False,Firenzzz,,,False,1480698099,,,self.techsupport,0.0,False,0.0,False,,5g4ch0,,True,,,False,,{},,t3_5g4ch0,0.0,,False,/r/techsupport/comments/5g4ch0/no_screen_on_st...,self,{u'images': [{u'source': {u'url': u'https://i....,,,,,False,1484297000.0,False,1.0,,{},As the title says. Yesterday I've finally had ...,False,False,techsupport,t5_2qioo,,,self,No screen on startup when display has freesync...,1.0,https://www.reddit.com/r/techsupport/comments/...


In [4]:
# Summary statistics for the numeric columns; a count of 0 means the column is entirely NaN.
reddit.describe()

Unnamed: 0,adserver_click_url,adserver_imp_pixel,created_utc,disable_comments,downs,gilded,href_url,imp_pixel,mobile_ad_url,num_comments,original_link,promoted,promoted_by,promoted_display_name,promoted_url,retrieved_on,score,third_party_tracking,third_party_tracking_2,ups
count,0.0,0.0,26688.0,0.0,26688.0,26688.0,0.0,0.0,0.0,26688.0,0.0,0.0,0.0,0.0,0.0,26688.0,26688.0,0.0,0.0,26688.0
mean,,,1481789000.0,,0.0,0.0006,,,,5.632794,,,,,,1484390000.0,7.789006,,,7.789006
std,,,741157.5,,0.0,0.042406,,,,47.256441,,,,,,51667.07,216.126418,,,216.126418
min,,,1480551000.0,,0.0,0.0,,,,0.0,,,,,,1484291000.0,0.0,,,0.0
25%,,,1481138000.0,,0.0,0.0,,,,0.0,,,,,,1484377000.0,1.0,,,1.0
50%,,,1481753000.0,,0.0,0.0,,,,2.0,,,,,,1484403000.0,1.0,,,1.0
75%,,,1482420000.0,,0.0,0.0,,,,5.0,,,,,,1484431000.0,2.0,,,2.0
max,,,1483099000.0,,0.0,6.0,,,,6109.0,,,,,,1484456000.0,26573.0,,,26573.0


In [5]:
# There are 22 columns that are either entirely NaN or more than 80% NaN (link_flair_text is the least sparse at ~83%).
# These will be investigated to decide whether imputing, binarizing, or dummifying them is worthwhile.
# Count the nulls once and reuse the result for the filter instead of scanning the frame twice.
null_counts = reddit.isnull().sum()
null_counts[null_counts > 0]

adserver_click_url        26688
adserver_imp_pixel        26688
author_flair_css_class    26253
author_flair_text         26337
disable_comments          26688
distinguished             26603
href_url                  26688
imp_pixel                 26688
link_flair_css_class      22396
link_flair_text           22078
media                     26420
mobile_ad_url             26688
original_link             26688
post_hint                 23175
preview                   23175
promoted                  26688
promoted_by               26688
promoted_display_name     26688
promoted_url              26688
secure_media              26420
third_party_tracking      26688
third_party_tracking_2    26688
dtype: int64

In [6]:
# Column dtypes and non-null counts for each of the 53 columns.
reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26688 entries, 0 to 26687
Data columns (total 53 columns):
adserver_click_url        0 non-null float64
adserver_imp_pixel        0 non-null float64
archived                  26688 non-null bool
author                    26688 non-null object
author_flair_css_class    435 non-null object
author_flair_text         351 non-null object
contest_mode              26688 non-null bool
created_utc               26688 non-null int64
disable_comments          0 non-null float64
distinguished             85 non-null object
domain                    26688 non-null object
downs                     26688 non-null float64
edited                    26688 non-null object
gilded                    26688 non-null float64
hide_score                26688 non-null bool
href_url                  0 non-null float64
id                        26688 non-null object
imp_pixel                 0 non-null float64
is_self                   26688 non-null bool
link_fla

### Data Dictionary and brief analysis

In [7]:
# This column has no values at all. It will be dropped.
reddit["adserver_click_url"].value_counts()

Series([], Name: adserver_click_url, dtype: int64)

In [8]:
# This column is completely empty too, so it will be dropped.
reddit["adserver_imp_pixel"].value_counts()

Series([], Name: adserver_imp_pixel, dtype: int64)

In [9]:
# Every row has a value, but every value is False, so none of these posts were archived. A constant column can't be useful:
# moving it from a categorical format to a numerical format would just give all 0s. It will have to be dropped.
reddit["archived"].value_counts()

False    26688
Name: archived, dtype: int64

In [10]:
# 5947 rows have the author '[deleted]', which means 22% of the posts have no author connected to them. The author could
# still be used as a predictive factor: one could try to determine the message based on the author. This could be a model
# to look into. KEEP
reddit["author"].value_counts()

[deleted]               5947
wilsonharry              156
AutoModerator             90
ebpnovin                  58
angelovstanton            46
Hgbnty                    46
metthewhayden             26
kevindepp142              25
RingSavvy                 20
eastpointsoftware         19
redditgizmos              17
Sexual_Lettuce            17
Ceasar_JL                 16
Anonymously-Used          15
llSourcell                15
Xradam                    14
GerrardSlippedHahaha      13
danny199234               12
martinfisleburn           12
WaterdotBottledot         12
oasis1272                 12
denmarkf                  12
rms_returns               11
printerstechsupport       11
RoboticPlayer             11
sangramsate               10
xlxs                      10
patientplatypus           10
connormcwood              10
pythonion                 10
                        ... 
Kronichh                   1
segguzi                    1
oakes                      1
Aburamu       

In [11]:
# Only 1.6% of the rows have values. With more non-NaN values this column might be useful, but there is so little
# information available in this dataset that I don't see how it could help. This shall be dropped.
reddit["author_flair_css_class"].value_counts()

py32bg                 84
commercial-indie       46
py27bg                 43
hobbyist               33
py32bggh               29
none                   29
py3int                 25
noob                   21
intermediate           15
student                13
py3intgh               13
py27bggh               12
DefaultFlair            9
commercial-other        8
trusted                 8
py2int                  8
py3-empty               7
swift                   7
objc-swift              5
flair                   3
pro                     3
bell-bronze             3
python                  2
php-contributor         1
py32ad                  1
py27ad                  1
ex-commercial-other     1
emacs                   1
objc                    1
py27exgh                1
py2-empty               1
py2intgh                1
Name: author_flair_css_class, dtype: int64

In [12]:
# Only 1.3% of the rows have values. As with author_flair_css_class, I don't believe there is enough data to determine
# whether it is relevant. Most of the texts seem to be handles or random words/numbers. This shall be dropped.
reddit["author_flair_text"].value_counts()

@LucklessSeven                                 17
☺                                              10
Nooblet Brewer                                 10
HH0718                                          8
Trusted                                         7
Swift                                           7
@TheThroneOfLies                                7
@your_twitter_handle                            6
re.tar                                          6
0 0                                             6
Beginner                                        5
Github username                                 5
I'm horrible.                                   5
thatguywiththatname                             4
yelaxify                                        4
adammichaelwood                                 4
wostensen                                       4
@catworm_studios                                4
Objective-C / Swift                             4
__simple_is_better_than_complex__               4


In [13]:
# In theory I could make this column 0 and 1, but there are only 5 True values. It seems to be something that only the
# moderators, not the author, can change. There is no data in this set regarding moderators, the column doesn't seem to
# correlate with any other feature column, and the values are almost all False, so I don't think it will be useful.
# This shall be dropped.
reddit["contest_mode"].value_counts()

False    26683
True         5
Name: contest_mode, dtype: int64

In [14]:
# The creation time is stored as an integer timestamp in Coordinated Universal Time. It might be useful to see when authors
# made these particular posts, but the values first need converting to Python datetimes.
# NOTE(review): they look like epoch seconds, so pd.to_datetime(..., unit='s') should work - confirm. KEEP for now.
reddit["created_utc"].value_counts()

1482356776    3
1481768887    3
1480868953    2
1480718188    2
1482008551    2
1482172844    2
1481159209    2
1481809400    2
1480994523    2
1482856061    2
1481487058    2
1482791468    2
1482954479    2
1481165113    2
1481900741    2
1482361330    2
1480604738    2
1483049772    2
1481651817    2
1480638775    2
1481151390    2
1480891204    2
1480930221    2
1480948703    2
1483080521    2
1482267458    2
1481323062    2
1481235886    2
1482958389    2
1482013489    2
             ..
1481497486    1
1481190288    1
1480604562    1
1481323411    1
1482594859    1
1480970081    1
1482261317    1
1481542468    1
1481477993    1
1481845520    1
1480729361    1
1481325330    1
1481454355    1
1480721173    1
1481001756    1
1481130781    1
1482990369    1
1481093923    1
1482398502    1
1480977192    1
1481098029    1
1481253681    1
1481718578    1
1482689332    1
1482818357    1
1482627894    1
1482187577    1
1482314559    1
1481908403    1
1480863750    1
Name: created_utc, Lengt

In [15]:
# All 26688 rows are NaN. I don't know why the comments would have been disabled, but there is no useful data here, so the
# column shall be dropped.
reddit["disable_comments"].value_counts()

Series([], Name: disable_comments, dtype: int64)

In [16]:
# According to the API this identifies a special type of user, such as a moderator or administrator. There are only 85
# non-NaN values, all of them 'moderator'. I can't see how this column is relevant with so few data points. If there were
# more I might be able to classify a user based on their status and tie that in with their comment counts and score, but
# as there is so little data I shall have to drop this column.
reddit["distinguished"].value_counts()

moderator    85
Name: distinguished, dtype: int64

In [17]:
# This column is fully populated and I think it will be very helpful in predicting the type of post, since the domain is a
# very specific category and the text associated with it should correlate with the topic. It will be analyzed further
# alongside the other kept columns to determine how relevant that correlation is. KEEP
reddit["domain"].value_counts()

self.techsupport                        11423
self.learnprogramming                    3448
self.learnpython                         1724
self.gamedev                             1191
self.technology                          1152
self.web_design                           749
self.Python                               566
self.javahelp                             536
self.javascript                           482
self.linux                                465
self.engineering                          454
self.csshelp                              393
self.iOSProgramming                       315
self.swift                                249
youtube.com                               188
self.PHP                                  175
self.computerscience                      159
self.compsci                              147
self.java                                 138
self.django                               124
self.netsec                                97
self.css                          

In [18]:
# Every value is 0.0. I don't know the relevance of this column, but with no data to work with I'll have to drop it.
reddit["downs"].value_counts()

0.0    26688
Name: downs, dtype: int64

In [19]:
# 3516 posts were edited while the remaining 23172 were not. The issue here is that the column mixes the literal False with
# UTC edit timestamps. This might be useful if I can figure out how to convert those to a local date/time, but I have a
# number of blocks in my way. If I could determine a relationship, if there is any, between users and edited posts, it
# could help in classifying with NLP to predict what type of post will be in each domain. I'll come back and work on this.
# KEEP for now
reddit["edited"].value_counts()

False         23172
1481225395        2
1481742672        2
1481174981        2
1482695058        1
1482994918        1
1480755938        1
1480641893        1
1480793582        1
1482304047        1
1480731235        1
1480961991        1
1482021384        1
1480980235        1
1482877979        1
1482869053        1
1482739451        1
1482418844        1
1481869850        1
1482589747        1
1482720557        1
1482434072        1
1480710350        1
1480805978        1
1481439050        1
1480572874        1
1480775272        1
1481048938        1
1483013810        1
1481930191        1
              ...  
1480815702        1
1482073500        1
1480778570        1
1481479749        1
1482606231        1
1481047701        1
1481047306        1
1482694400        1
1482248695        1
1482276782        1
1481607136        1
1482401060        1
1481247701        1
1482901784        1
1482431865        1
1481007819        1
1480842521        1
1481319966        1
1482803900        1


In [20]:
# Gilding is when someone gives reddit 'gold' as a gift for a post. Very little was given in this data set: only 10 rows
# are gilded at all. I don't see how this would help in my modeling, so I shall drop it.
reddit["gilded"].value_counts()

0.0    26678
1.0        8
6.0        1
2.0        1
Name: gilded, dtype: int64

In [21]:
# Every value is False. A column with only one value will not be helpful. It shall be dropped.
reddit["hide_score"].value_counts()

False    26688
Name: hide_score, dtype: int64

In [22]:
# There are no urls in this column; every value is NaN. The column shall be dropped.
reddit["href_url"].value_counts()

Series([], Name: href_url, dtype: int64)

In [23]:
# Every id is unique and the ids look random: they don't seem to be connected to any one person, topic, or domain in
# particular, so I'm not sure how helpful they would be. This shall be dropped.
reddit["id"].value_counts()

5gaxe9    1
5g00tr    1
5fzf5z    1
5kf3yr    1
5gaijt    1
5iq2lb    1
5io84i    1
5kmu7v    1
5g7hpf    1
5jm04a    1
5iwk30    1
5hhyj5    1
5gh7x4    1
5ga07c    1
5ihgft    1
5gkeuz    1
5i2bzp    1
5l161g    1
5fvyuv    1
5g9bml    1
5i0cji    1
5jpnv9    1
5ja3l4    1
5g3o5g    1
5k1alz    1
5l1jld    1
5jgw28    1
5h7mae    1
5hj9cw    1
5h6aas    1
         ..
5i0du9    1
5hklzj    1
5i5o4c    1
5gv5ws    1
5ho4cl    1
5iptbt    1
5hdufv    1
5g1ami    1
5g8tgi    1
5g2pzi    1
5is7an    1
5kekl8    1
5hm66w    1
5hyf3j    1
5jiaf8    1
5hokho    1
5gi8eq    1
5im9mu    1
5g9c6e    1
5kgcdd    1
5kiz6r    1
5jbfqg    1
5hzf6d    1
5ilf4u    1
5i120t    1
5k2sny    1
5gt6lp    1
5g530b    1
5jxagg    1
5kyms9    1
Name: id, Length: 26688, dtype: int64

In [24]:
# This entire column consists of NaN values. It shall be dropped.
reddit["imp_pixel"].value_counts()

Series([], Name: imp_pixel, dtype: int64)

In [25]:
# Apparently True means the post is a self post, i.e. it doesn't link outside of reddit. I'm not sure of the relevance.
# Maybe certain authors make more self posts than others, so maybe it can be used as a classifier. A fair number (2094) are
# False; these either link to something outside of reddit or contain a link that is not a reddit post. Maybe a video or
# picture? I will analyze this further and see if there is a correlation to authors. KEEP for now
reddit["is_self"].value_counts()

True     24594
False     2094
Name: is_self, dtype: int64

In [26]:
# Only 16% of the rows contain actual data; the remainder are NaN. I can't see the relevance, especially when many of the
# values have to do with 'cat-talk'. If only 16% were missing I might be able to work with the column and see whether
# there were correlations to who posted the topic or what the topic was, but with so little categorical data I can't make
# predictions about what the missing values could have been. It shall be dropped.
reddit["link_flair_css_class"].value_counts()

issue-resolved             967
discussion                 586
question                   569
general                    505
help                       374
solved                     200
removed                    184
gen                        107
darkred                    101
discussion cat-talk         93
mec                         84
resolved                    73
ele                         39
unsolved                    36
game cat-game               29
homework                    27
project                     26
topic                       25
adventofcode                25
assets cat-event            24
cat-weekly                  21
adblock                     18
video cat-resource          14
tutorial cat-resource       13
resource                    12
civ                         10
article cat-resource        10
netneutrality               10
tutorial                    10
orange                       9
postmortem cat-resource      8
comcast                      8
announce

In [27]:
# This column seems to be related to link_flair_css_class. Only 17% of it actually contains data; the remainder are NaN.
# As with that column, if the proportions were reversed I might be able to form correlations, but the data isn't there.
# I'll have to drop both columns.
reddit["link_flair_text"].value_counts()

Solved                                                        1143
Discussion                                                     682
Question                                                       569
help                                                           354
removed: Learning                                              163
R1.iii: tech support/question/help                             122
[GENERAL]                                                      107
Business                                                       107
[MECHANICAL]                                                    84
Resolved                                                        73
Hardware                                                        73
Repost                                                          70
reject: question                                                66
Security                                                        54
R1.i: guidelines                                              

In [28]:
# Moderators can lock a post so it can no longer be added to or edited if it is deemed 'off topic', 'rash', or of a
# language or topic considered unsightly. Only 22 posts here are True. I'm not sure of the relevance of those 22. I might
# just keep it for now until the majority of the columns get dropped and then return to see if it is related in any way to
# other columns, i.e. a particular author or domain. So KEEP for now.
reddit["locked"].value_counts()

False    26666
True        22
Name: locked, dtype: int64

In [29]:
# About 1% of the rows contain some type of media. Listing the values themselves just gives urls, thumbnails, and links
# to what are probably pictures or YouTube videos. I don't see how this could be relevant to modeling. It shall be dropped.
# count() returns the number of non-null rows directly, without building a frequency table of every distinct value.
reddit["media"].count()

268

In [30]:
# The data looks very similar to what was seen in media, except that every row has a media_embed value.
# These fields are populated with the data usually required to show the inline previews. This column is of no use: it
# isn't properly formatted to be read, and data scraping would have to be done on the JSON. I don't think it would be of
# any help in the modeling. This shall be dropped.
# count() returns the number of non-null rows directly, without building a frequency table of every distinct value.
reddit["media_embed"].count()

26688

In [31]:
# The entire column is filled with NaNs. There's no data to work with. This shall be dropped.
reddit["mobile_ad_url"].value_counts()

Series([], Name: mobile_ad_url, dtype: int64)

In [32]:
# This column holds the same data as id, just with a t3_ prefix in front of the alphanumeric pattern. I still don't see
# the relevance: the two columns are related to each other, but they don't seem to have a relationship to any other
# column. They are still random alphanumeric patterns with no connection to a particular author, topic, domain etc.
# It shall be dropped.
reddit["name"].value_counts()

t3_5jbyql    1
t3_5j2bk0    1
t3_5ibnd6    1
t3_5l2m2z    1
t3_5hqmg4    1
t3_5ge1rd    1
t3_5ht7fz    1
t3_5kjasx    1
t3_5gvjny    1
t3_5idw88    1
t3_5kkj1m    1
t3_5k8riu    1
t3_5fu5gc    1
t3_5hjj7p    1
t3_5g6fv2    1
t3_5i6bn6    1
t3_5hvn92    1
t3_5jupse    1
t3_5he5ux    1
t3_5j1u9c    1
t3_5kfw4s    1
t3_5h26a3    1
t3_5itapx    1
t3_5h0rx6    1
t3_5goks9    1
t3_5g4vy1    1
t3_5jqvbe    1
t3_5kl5lk    1
t3_5kfrsc    1
t3_5hi4af    1
            ..
t3_5kufzh    1
t3_5i38ly    1
t3_5kuzwk    1
t3_5goygf    1
t3_5g1aaq    1
t3_5jh1tj    1
t3_5g5mht    1
t3_5gspbi    1
t3_5iffnp    1
t3_5ky8sy    1
t3_5jw9du    1
t3_5k8bti    1
t3_5kryuz    1
t3_5g1j3w    1
t3_5gyp4z    1
t3_5gxxuh    1
t3_5ga2f8    1
t3_5ihb28    1
t3_5h0b7e    1
t3_5i7mft    1
t3_5inyv3    1
t3_5iplh8    1
t3_5ii6fo    1
t3_5g4clp    1
t3_5l0jjy    1
t3_5ij6ff    1
t3_5gt4v3    1
t3_5h7vpk    1
t3_5kpxux    1
t3_5gcv3u    1
Name: name, Length: 26688, dtype: int64

In [33]:
# The API says this is supposed to be an integer between 1 and 500, but there are some values over 500. I want to revisit
# this column out of curiosity to see if there is anything else out of the ordinary with these >=500 comment rows. So KEEP
# for now, but I'm not sure of the relevance. They might be outliers; I don't know if that's possible with this kind of
# data. There may also be a relationship between the number of comments and authors, so that is another aspect to look into.
reddit["num_comments"].value_counts()

0.0       6855
1.0       4309
2.0       3345
3.0       2307
4.0       1977
5.0       1495
6.0       1187
7.0        888
8.0        691
9.0        555
10.0       403
11.0       340
12.0       266
13.0       234
14.0       201
15.0       165
16.0       130
17.0       124
18.0       102
19.0        96
21.0        64
20.0        63
22.0        58
23.0        56
24.0        45
27.0        39
25.0        39
26.0        33
29.0        33
31.0        32
          ... 
175.0        1
98.0         1
163.0        1
319.0        1
181.0        1
127.0        1
200.0        1
121.0        1
88.0         1
130.0        1
252.0        1
172.0        1
173.0        1
339.0        1
651.0        1
223.0        1
358.0        1
162.0        1
424.0        1
653.0        1
284.0        1
131.0        1
123.0        1
167.0        1
146.0        1
2757.0       1
271.0        1
281.0        1
189.0        1
270.0        1
Name: num_comments, Length: 176, dtype: int64

In [34]:
# The column is filled with NaNs. It shall be dropped.
reddit["original_link"].value_counts()

Series([], Name: original_link, dtype: int64)

In [35]:
# over_18 flags whether the post itself is marked as 18+ (NSFW) content; it does not describe the age of the author.
# NOTE(review): confirm against the Reddit API documentation. Only 41 posts are flagged True. The column would have to be
# converted to 0s and 1s, but it might still help in classification and predicting. KEEP.
reddit["over_18"].value_counts()

False    26647
True        41
Name: over_18, dtype: int64

In [36]:
# This seems to contain the area of discussion (the subreddit), the alphanumeric id associated with the post, and a brief
# slug of the title describing what the author is writing about. Each value is unique. This could be helpful.
# I have to analyze this a little more, but KEEP.
reddit["permalink"].value_counts()

/r/Python/comments/5k0678/any_step_by_step_guide_how_to_use_caseless_model/                1
/r/techsupport/comments/5ggz48/hp_pavilion_cant_recognize_new_hard_drive/                  1
/r/learnprogramming/comments/5ftmrv/help_me_understand_the_logic_of_and_vs_or_in_a/        1
/r/techsupport/comments/5h3x7z/downloading_civ_v_and_getting_error_code_53/                1
/r/techsupport/comments/5kmtoj/high_pitched_audio_web_whatsapp_microsoft_edge/             1
/r/swift/comments/5i4qcg/im_having_trouble_maintaining_the_image_inside_an/                1
/r/learnprogramming/comments/5injaz/what_is_this_called_and_what_language_do_i_need/       1
/r/learnprogramming/comments/5jyz1p/python_timeout_for_a_running_process/                  1
/r/learnprogramming/comments/5g846o/beginner_help_with_creating_a_website/                 1
/r/techsupport/comments/5gp869/both_monitors_go_black_then_come_back_on_randomly/          1
/r/learnpython/comments/5k7f8z/subreddit_bot_how_do_i_keep_the_passwor

In [37]:
# Only 13% of the rows have viable data, and the only value is 'self'. I don't think this will be very helpful. It seems
# to be something deeply embedded in the coding of reddit itself to do with images. This shall be dropped.
reddit["post_hint"].value_counts()

self    3513
Name: post_hint, dtype: int64

In [38]:
# This came up with the same kind of links as media and media_embed. It is interesting that the number of populated rows
# matches post_hint, which makes sense given what I found out about post_hint: it has to do with images, links, videos etc.
# In this case it's not going to be useful, and like media and media_embed I would have to do extra work on the actual
# urls inside the preview to find information which might or might not be useful. So I'm dropping the column.
# count() returns the number of non-null rows directly, without building a frequency table of every distinct value.
reddit["preview"].count()

3513

In [39]:
# I read this as having something to do with moderators and users who post within groups having their status promoted.
# NOTE(review): it may instead mark promoted (advertising) posts - confirm against the API.
# The values are all NaNs so I'm going to drop the column anyway.
reddit["promoted"].value_counts()

Series([], Name: promoted, dtype: int64)

In [40]:
# Same idea as the promoted column. This would just be who promoted the user. The values are all NaN, so
# I'm going to have to drop it.
reddit['promoted_by'].value_counts()

Series([], Name: promoted_by, dtype: int64)

In [41]:
# Same idea as the promoted and promoted_by columns. This would be the display name of the promoted user.
# The values are all NaN, so I'm going to have to drop it.
reddit['promoted_display_name'].value_counts()

Series([], Name: promoted_display_name, dtype: int64)

In [42]:
# I couldn't find anything in the API or online about what this is. The column is all NaN anyway, so it's
# getting dropped.
reddit['promoted_url'].value_counts()

Series([], Name: promoted_url, dtype: int64)

In [43]:
# A quarantined subreddit isn't a good thing. It means the administrators have deemed the content to be
# inappropriate for public consumption and have placed restrictions on it, such as no images in its CSS,
# requiring a verified email address, and over-18 status. It seems this is reddit's PC way of allowing people
# to have freedom of speech while limiting it and making it harder for people to have access to it.
# Either way, none of these posts are quarantined (every value is False), so the column carries no
# information and shall be dropped.
reddit['quarantine'].value_counts()

False    26688
Name: quarantine, dtype: int64

In [44]:
# This is UTC epoch time, but it is stored as a float (hence the scientific notation in the output) instead
# of an integer. So if I'm going to use this column, that will have to be changed first. Changing the dtype
# (next cell) corrects the format, so once I figure out how to convert from UTC epoch time this could be
# useful. So KEEP, for now.
reddit['retrieved_on'].value_counts()

1.484401e+09    6
1.484427e+09    4
1.484426e+09    4
1.484304e+09    4
1.484303e+09    4
1.484426e+09    4
1.484456e+09    4
1.484408e+09    4
1.484385e+09    4
1.484411e+09    4
1.484312e+09    4
1.484441e+09    4
1.484448e+09    4
1.484437e+09    4
1.484431e+09    4
1.484306e+09    4
1.484387e+09    4
1.484427e+09    4
1.484298e+09    4
1.484419e+09    4
1.484430e+09    4
1.484415e+09    4
1.484429e+09    4
1.484375e+09    4
1.484307e+09    4
1.484308e+09    4
1.484451e+09    3
1.484450e+09    3
1.484311e+09    3
1.484401e+09    3
               ..
1.484425e+09    1
1.484440e+09    1
1.484438e+09    1
1.484307e+09    1
1.484446e+09    1
1.484403e+09    1
1.484301e+09    1
1.484406e+09    1
1.484380e+09    1
1.484393e+09    1
1.484388e+09    1
1.484456e+09    1
1.484434e+09    1
1.484429e+09    1
1.484306e+09    1
1.484399e+09    1
1.484436e+09    1
1.484451e+09    1
1.484442e+09    1
1.484395e+09    1
1.484293e+09    1
1.484438e+09    1
1.484431e+09    1
1.484295e+09    1
1.484384e+

In [45]:
# Correcting the dtype: cast the float epoch seconds to int64, which also gets rid of the scientific
# notation in the display. astype() returns a new Series and has no `inplace` option (passing one raises
# a TypeError on current pandas), so the result is assigned back to the column.
reddit['retrieved_on'] = reddit['retrieved_on'].astype('int64')

In [46]:
# The API has a lot of 'saved' definitions depending on what aspect of reddit it is under; for this instance
# I'm not sure which one applies. Either way every value in the column is False, so it has no usable data.
# It shall be dropped.
reddit['saved'].value_counts()

False    26688
Name: saved, dtype: int64

In [47]:
# Score is determined by how many upvotes or downvotes a post received and (as I understand it) is also
# dependent on how new the post is. From Mashable:
#"You may also notice that posts with the highest score do not always rank at the top. This is due to Reddit's time
#decay algorithm. Posts on the front page are obviously more visible, and therefore have a higher chance of being upvoted.
#But the site wouldn't be valuable if the same content remained on the front page all day."
# So determining the actual time might be more useful than I first thought: how long a post has been around
# is factored into its score. Also, I'm assuming, relative content. This column is definitely going to be
# useful. KEEP.
reddit['score'].value_counts()

1.0        15301
0.0         3699
2.0         2784
3.0         1306
4.0          542
5.0          482
6.0          373
7.0          286
8.0          201
9.0          144
10.0         130
11.0         106
12.0          92
13.0          80
14.0          71
15.0          67
16.0          53
18.0          46
20.0          43
17.0          42
19.0          40
22.0          29
25.0          27
24.0          25
21.0          25
27.0          25
30.0          21
23.0          21
29.0          21
37.0          19
           ...  
102.0          1
212.0          1
342.0          1
223.0          1
1239.0         1
115.0          1
1814.0         1
26573.0        1
152.0          1
322.0          1
1061.0         1
2258.0         1
108.0          1
206.0          1
165.0          1
343.0          1
6300.0         1
302.0          1
563.0          1
164.0          1
320.0          1
222.0          1
86.0           1
409.0          1
1531.0         1
306.0          1
313.0          1
224.0         

In [48]:
# Same idea as media and media_embed. These are just the secure versions, and the values match between both
# sets of columns. But this still doesn't help in my modeling. It just shows that media, media_embed,
# secure_media, and secure_media_embed are related. These columns still don't add any value to my project,
# so the column shall be dropped.
reddit['secure_media'].value_counts().sum()

268

In [49]:
# See the secure_media cell above for why this column is being dropped.
reddit['secure_media_embed'].value_counts().sum()

26688

In [50]:
# This seems to hold the text of the post. Over 5000 have been deleted and over 3000 have been removed. I'm
# going to have to analyze this more because it seems to be the bulk of the data. This column will be used.
# KEEP
reddit['selftext'].value_counts()

[deleted]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [51]:
# This feature conceals certain information until you hover over it, so as not to spoil a plot line or
# something similar for other readers. All the values in the column are False, so it's not going to be very
# useful. It shall be dropped.
reddit['spoiler'].value_counts()

False    26688
Name: spoiler, dtype: int64

In [52]:
# This column I will KEEP. I think it will be useful in trying to predict from the comments what topic a
# post should be under.
reddit['subreddit'].value_counts()

techsupport         11423
learnprogramming     3448
technology           2321
learnpython          1724
gamedev              1281
web_design            864
Python                603
javascript            546
linux                 545
javahelp              536
engineering           525
csshelp               393
iOSProgramming        330
swift                 261
programming           254
PHP                   192
computerscience       162
netsec                159
java                  157
compsci               154
django                127
css                   106
cpp                    86
HTML                   85
ruby                   77
flask                  71
html5                  62
jquery                 46
coding                 34
pygame                 34
perl                   24
lisp                   16
programmer             13
dailyprogrammer        10
IPython                 8
inventwithpython        6
pystats                 3
pythoncoding            2
Name: subred

In [53]:
# This id doesn't seem to have any relationship to id or name, BUT it does relate to the subreddit. For that
# reason I'm going to KEEP it for now to analyze further and see if there is any correlation with other
# columns.
reddit['subreddit_id'].value_counts()

t5_2qioo    11423
t5_2r7yd     3448
t5_2qh16     2321
t5_2r8ot     1724
t5_2qi0a     1281
t5_2qh1m      864
t5_2qh0y      603
t5_2qh30      546
t5_2qh1a      545
t5_2t1jq      536
t5_2qhpi      525
t5_2roaw      393
t5_2s61a      330
t5_2z6zi      261
t5_2fwo       254
t5_2qh38      192
t5_2qj8o      162
t5_1rqwi      159
t5_2qhd7      157
t5_2qhmr      154
t5_2qh4v      127
t5_2qifv      106
t5_2qi27       86
t5_2r6cd       85
t5_2qh21       77
t5_2s1s3       71
t5_2r7u2       62
t5_2qhs4       46
t5_2rb2y       34
t5_2rkfn       34
t5_2qh5e       24
t5_2qh35       16
t5_2qnoo       13
t5_2tj45       10
t5_2x3ey        8
t5_2tfjk        6
t5_2yw9f        3
t5_3649w        2
Name: subreddit_id, dtype: int64

In [54]:
# This column would have been more useful if there were more Trues. But as there is only one, it's not going
# to be very helpful. I shall drop it.
reddit['stickied'].value_counts()

False    26687
True         1
Name: stickied, dtype: int64

In [55]:
# Every value is NaN, so the column shall be dropped.
reddit['third_party_tracking'].value_counts()

Series([], Name: third_party_tracking, dtype: int64)

In [56]:
# Every value is NaN, so the column shall be dropped.
reddit['third_party_tracking_2'].value_counts()

Series([], Name: third_party_tracking_2, dtype: int64)

In [57]:
# I don't see how this can be of any help. I'm not working on image recognition, so whether there is a
# thumbnail and what it is doesn't matter for my purposes. It shall be dropped.
reddit['thumbnail'].value_counts()

self       18192
default     8480
nsfw          16
Name: thumbnail, dtype: int64

In [58]:
# I'm not sure how helpful these titles will be. The subreddit posts have much more text and insight into
# what category the topic would go under. I'll KEEP it for now but might drop it later.
reddit['title'].value_counts()

MakerBot Releases IPad App For Easy 3D Printing                                                                                                                7
Facebook Customer service Number – A Way to Resolution 1-866-224-8319                                                                                          6
facebook customer care number Issue Can Be Dealt Easily 1-866-224-8319                                                                                         6
Facebook customer service– A Panacea to Your Problems 1-866-224-8319                                                                                           6
Flush Your Worries of Facebook Hacked account with 1-866-224-8319                                                                                              5
Tackle Technical Troubles with Facebook Customer Service at 1-866-224-8319                                                                                     5
PLEASE INCLUDE THE SUBREDDIT YOU A

In [59]:
# This column seems to be the same as score, so I'm not sure I need both. I will KEEP it for now until I can
# perform a deeper analysis on the data, but I might end up dropping the column if it is in fact duplicate
# data.
reddit['ups'].value_counts()

1.0        15301
0.0         3699
2.0         2784
3.0         1306
4.0          542
5.0          482
6.0          373
7.0          286
8.0          201
9.0          144
10.0         130
11.0         106
12.0          92
13.0          80
14.0          71
15.0          67
16.0          53
18.0          46
20.0          43
17.0          42
19.0          40
22.0          29
25.0          27
24.0          25
21.0          25
27.0          25
30.0          21
23.0          21
29.0          21
37.0          19
           ...  
102.0          1
212.0          1
342.0          1
223.0          1
1239.0         1
115.0          1
1814.0         1
26573.0        1
152.0          1
322.0          1
1061.0         1
2258.0         1
108.0          1
206.0          1
165.0          1
343.0          1
6300.0         1
302.0          1
563.0          1
164.0          1
320.0          1
222.0          1
86.0           1
409.0          1
1531.0         1
306.0          1
313.0          1
224.0         

In [60]:
# I don't think the URLs will be helpful. There is better text data in other columns. This shall be dropped.
reddit['url'].value_counts().sum()

26688

In [61]:
# Drop the 37 columns judged unhelpful in the exploration above (all-NaN, constant, or irrelevant to the
# text modeling). The result is assigned back instead of mutating in place, and errors='ignore' lets the
# cell be re-run after the columns are already gone instead of raising a KeyError.
reddit = reddit.drop(
    columns=[
        'adserver_click_url', 'adserver_imp_pixel', 'archived',
        'author_flair_css_class', 'author_flair_text', 'contest_mode',
        'disable_comments', 'distinguished', 'downs', 'gilded',
        'hide_score', 'href_url', 'id', 'imp_pixel',
        'link_flair_css_class', 'link_flair_text', 'media',
        'media_embed', 'mobile_ad_url', 'name', 'original_link',
        'post_hint', 'preview', 'promoted', 'promoted_by',
        'promoted_display_name', 'promoted_url', 'quarantine',
        'saved', 'secure_media', 'secure_media_embed',
        'spoiler', 'stickied', 'third_party_tracking',
        'third_party_tracking_2', 'thumbnail', 'url',
    ],
    errors='ignore',
)

In [62]:
# Check which columns remain after the drop, along with their dtypes and non-null counts.
reddit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26688 entries, 0 to 26687
Data columns (total 16 columns):
author          26688 non-null object
created_utc     26688 non-null int64
domain          26688 non-null object
edited          26688 non-null object
is_self         26688 non-null bool
locked          26688 non-null bool
num_comments    26688 non-null float64
over_18         26688 non-null bool
permalink       26688 non-null object
retrieved_on    26688 non-null int64
score           26688 non-null float64
selftext        26688 non-null object
subreddit       26688 non-null object
subreddit_id    26688 non-null object
title           26688 non-null object
ups             26688 non-null float64
dtypes: bool(3), float64(3), int64(2), object(8)
memory usage: 1.9+ MB


#### Summary of initial analysis
  <h4> After extensive research with the API and internet queries, I was able to limit the number of columns from 53 to 16. There are still a few columns that I want to analyze more deeply to determine their relevance, e.g. whether some are duplicates, how they relate to other columns, what explains the strange numerical scoring, and whether it is possible to convert UTC to datetime without knowledge of the local time zone (if that is really necessary; I need to research the matter).</h4> 
  <h4> Once those final analyses have been completed, I intend to build multiple models looking at different aspects of the data, including HashingVectorizer/CountVectorizer, Term Frequency-Inverse Document Frequency, and Latent Dirichlet Allocation.</h4>

In [63]:
# First issue: converting UTC/epoch time.
# created_utc and retrieved_on are converted with unit='s', i.e. treated as seconds since the epoch.
for epoch_col in ('created_utc', 'retrieved_on'):
    reddit[epoch_col] = pd.to_datetime(reddit[epoch_col], unit='s')
# 'edited' is left as-is: because the False values are mixed in with the timestamps, I can't convert it.
# retrieved_on looks like it is when the data was pulled. Not sure how helpful this will be now; might drop it.

In [72]:
# Second issue: convert the True/False columns to 0/1 indicator values.
# All three columns are non-null bool (see the info() output above), so a vectorized cast gives the same
# 0/1 result as the per-element `1 if x == True else 0` lambda, and re-running the cell is a no-op.
for flag_col in ('is_self', 'locked', 'over_18'):
    reddit[flag_col] = reddit[flag_col].astype('int64')

In [76]:
# Third issue: whether there is duplicate information.
# As I thought, these look like duplicate columns. I'll drop the ups column.
# NOTE(review): this is judged by eye from the displayed rows only; confirm with
# reddit['score'].equals(reddit['ups']) before relying on it.
reddit[['score','ups']]

Unnamed: 0,score,ups
0,1.0,1.0
1,23.0,23.0
2,12.0,12.0
3,0.0,0.0
4,6.0,6.0
5,1.0,1.0
6,0.0,0.0
7,8.0,8.0
8,2.0,2.0
9,1.0,1.0


In [77]:
# Drop the ups column, which appears to duplicate score.
reddit.drop(columns=['ups'], inplace=True)

In [81]:
# Fourth issue: relationship between domain, subreddit, and subreddit_id.
# The subreddit is specific and tied to the subreddit_id, while the domain has a relationship with the
# subreddit but can be multiple things. For example, the subreddit technology can have a domain of
# self.technology or of a specific website like amazon, forbes, etc. So there is relevance, but a domain
# could be part of another subreddit, whereas the subreddit_id is tied to one subreddit.
# Obviously there is a relationship between domain and subreddit, but I think the id is more strongly
# correlated with the subreddit than the domain is.
reddit.loc[:, ['domain', 'subreddit', 'subreddit_id']].head(20)

Unnamed: 0,domain,subreddit,subreddit_id
0,self.techsupport,techsupport,t5_2qioo
1,self.learnprogramming,learnprogramming,t5_2r7yd
2,self.gamedev,gamedev,t5_2qi0a
3,self.learnprogramming,learnprogramming,t5_2r7yd
4,self.learnpython,learnpython,t5_2r8ot
5,self.learnprogramming,learnprogramming,t5_2r7yd
6,self.Python,Python,t5_2qh0y
7,self.learnpython,learnpython,t5_2r8ot
8,self.linux,linux,t5_2qh1a
9,self.techsupport,techsupport,t5_2qioo


In [91]:
# Fifth issue: over 500 comments vs. over 500 score.
# I thought the comment count would be the same as the score, but that is not the case. If I'm understanding
# this correctly, the score is based on the up and down votes. Some of these posts have a lot of comments but
# a lower score, which I'm assuming means they were down voted, either because of what a commenter wrote or
# because someone read the post and down voted it. I'm not sure if there is a correlation between the two.
print('Over 500 comments', reddit.loc[reddit['num_comments'] > 500, 'num_comments'])
print('Over 500 score', reddit.loc[reddit['score'] > 500, 'score'])

Over 500 comments 1781      560.0
3772      854.0
4635     2624.0
11400     653.0
11443     572.0
11467     577.0
11690    2757.0
21487     700.0
21737    6109.0
26075     651.0
Name: num_comments, dtype: float64
Over 500 score 91         795.0
671       1061.0
1175       716.0
2181       606.0
2633      1083.0
2646      1032.0
3465      3319.0
3518       919.0
3560       578.0
3772     19530.0
3998       537.0
4635      6300.0
4728       765.0
4970       590.0
6013      1168.0
7133      1814.0
7627      3478.0
7708       725.0
7770       662.0
10817      510.0
11330     4792.0
11400     3927.0
11443     2082.0
11467     2258.0
11673      857.0
12006     1161.0
12481      708.0
12618     1239.0
13364     1043.0
13568      816.0
14323      632.0
15528      563.0
17370      620.0
17847      627.0
19337      863.0
20512      621.0
21035      544.0
21487     1531.0
21737    26573.0
22491      536.0
23753     3458.0
24644      634.0
25850      844.0
26075     1770.0
Name: score, dtype: floa

In [92]:
# There doesn't seem to be a clear connection between score and number of comments. All of these are from
# different authors (though that's hard to tell because some of the author names have been deleted), the
# creation dates are different, and the domain and the content are different. With more knowledge of how
# reddit works this might be an interesting rabbit hole to go down, but for now the conclusion has to be that
# there could be a relationship between comments, score, and date of creation, and that exploring it is not
# within the scope of this project.
reddit.loc[reddit['num_comments'] > 500].sort_values(by='score')

Unnamed: 0,author,created_utc,domain,edited,is_self,locked,num_comments,over_18,permalink,retrieved_on,score,selftext,subreddit,subreddit_id,title
11690,iNeverQuiteWas,2016-12-14 21:40:54,self.Python,False,1,0,2757.0,0,/r/Python/comments/5idecg/hey_rpython_i_finall...,2017-01-14 14:10:16,209.0,Comment below with \n\nSimulateMe! Username\n\...,Python,t5_2qh0y,Hey /r/Python. I finally got my bot /u/Simulat...
1781,[deleted],2016-12-04 16:46:35,self.linux,False,1,0,560.0,0,/r/linux/comments/5gg9iu/python_c_or_java/,2017-01-13 10:33:59,320.0,[deleted],linux,t5_2qh1a,"Python, C++ or Java?"
21487,[deleted],2016-12-24 09:25:25,geekwire.com,False,0,0,700.0,0,/r/technology/comments/5k2bz4/oculus_engineeri...,2017-01-14 23:32:41,1531.0,[deleted],technology,t5_2qh16,Oculus engineering leader arrested near Seattl...
26075,lank3y,2016-12-29 19:45:07,self.linux,False,1,0,651.0,0,/r/linux/comments/5kys8k/this_just_in_richard_...,2017-01-15 04:23:06,1770.0,This just in from the man himself : \n\n Co...,linux,t5_2qh1a,THIS JUST IN : Richard Stallman needs money. T...
11443,[deleted],2016-12-14 16:39:28,thehill.com,False,0,0,572.0,0,/r/technology/comments/5ibkji/trump_names_elon...,2017-01-14 13:52:38,2082.0,[deleted],technology,t5_2qh16,"Trump names Elon Musk, Uber CEO to advisory team"
11467,PowerWisdomCourage,2016-12-14 17:05:45,self.technology,1481754594,1,0,577.0,0,/r/technology/comments/5ibq7y/psa_samsung_now_...,2017-01-14 13:54:07,2258.0,[removed],technology,t5_2qh16,"PSA: Samsung now requires you to opt-in to ""In..."
11400,[deleted],2016-12-14 15:41:22,bostonglobe.com,False,0,0,653.0,0,/r/technology/comments/5ib8cl/elon_musk_and_ub...,2017-01-14 13:49:27,3927.0,[deleted],technology,t5_2qh16,Elon Musk and Uber CEO added to Trump's econom...
4635,rp4187135,2016-12-07 10:56:38,self.technology,1481121478,1,0,2624.0,0,/r/technology/comments/5gzall/why_dont_mobile_...,2017-01-14 06:25:33,6300.0,[removed],technology,t5_2qh16,Why don't mobile companies realize we don't wa...
3772,[deleted],2016-12-06 15:34:16,thehill.com,False,0,0,854.0,0,/r/technology/comments/5gtgrj/samsung_triumphs...,2017-01-13 12:32:58,19530.0,[deleted],technology,t5_2qh16,Samsung triumphs over Apple at the Supreme Cou...
21737,BryAallDay,2016-12-24 18:39:13,self.technology,1482623455,1,0,6109.0,0,/r/technology/comments/5k4dn8/im_becoming_scar...,2017-01-14 23:51:17,26573.0,"**Edit 2: It's Christmas Eve, everyone; let's...",technology,t5_2qh16,I'm becoming scared of Facebook.


## Modeling
#### 1. CountVectorizer and HashingVectorizer
   The difference between the two and the results that are returned, what they mean and how they relate.