In [1]:
import numpy as np
import pandas as pd
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

### import comment data

In [2]:
df = pd.read_csv('comments.csv')

In [3]:
df.head()

Unnamed: 0,body,subreddit
0,Your skin is flawless. Maybe you can bottle Fi...,cats
1,Such a beautiful cat ^^,cats
2,this is someone's OC,cats
3,It totally makes my day to meet a random cat o...,cats
4,Looks just like my kitty Blue! He does the sam...,cats


### label target value

In [4]:
# Target is 'subreddit'. Current values: 'cats', 'dogs'.
# Need to transform to 0 or 1 values. Set cats = 1, dogs = 0

df['target'] = df['subreddit'].map({'cats': 1, 'dogs': 0})
df.drop('subreddit', axis=1, inplace=True)
df.head()

Unnamed: 0,body,target
0,Your skin is flawless. Maybe you can bottle Fi...,1
1,Such a beautiful cat ^^,1
2,this is someone's OC,1
3,It totally makes my day to meet a random cat o...,1
4,Looks just like my kitty Blue! He does the sam...,1


### drop duplicates

In [5]:
# ***to do***
# maybe: remove the duplicate mod comments, [removed] comments, etc but keep repetitive comments i.e. Thank you!
# remove all of the RemindMe! posts?

In [6]:
# there are some duplicate comments, mostly automated comments by moderators

df.duplicated().sum()

1649

In [7]:
df[df.duplicated()].head()

Unnamed: 0,body,target
26,PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGIN...,1
45,PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGIN...,1
47,PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGIN...,1
50,PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGIN...,1
60,PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGIN...,1


In [8]:
df[df.duplicated()].tail()

Unnamed: 0,body,target
19962,Your post has been automatically removed becau...,0
19985,Your post has been automatically removed becau...,0
19986,Your post has been automatically removed becau...,0
19989,Your post has been automatically removed becau...,0
19996,Your post has been automatically removed becau...,0


In [9]:
catmodpost = list(df[df.duplicated()]['body'])[0]
catmodpost

"PLEASE READ THE ENTIRE MESSAGE BEFORE MESSAGING THE MODTEAM.\nBecause your account is new (under 10 days old) **OR your account has very low comment karma**, your submission has been removed.\nThis action is **NOT** directed at you personally; /r/cats requires all accounts to be at least **10 days old** AND have **at least 25 comment karma** in order to create new threads.\nComment karma is **not** the same thing as link karma. If you have less than 25 comment karma, it is easy to get- simply participate in a few discussions and you'll have the amount you need in no time at all.\nQuestions? [Message the mods](http://www.reddit.com/message/compose?to=%2Fr%2Fcats) and let us know.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/cats) if you have any questions or concerns.*"

In [10]:
# number of duplicate cat mod posts
len(df[df['body'] == catmodpost])

612

In [11]:
dogmodpost = list(df[df.duplicated()]['body'])[-1]
dogmodpost

'Your post has been automatically removed because you did not include one of the required title tags. Please see the [subreddit rules](https://www.reddit.com/r/dogs/wiki/index#wiki_tags_and_descriptions) for more information.  Potential title tags include: [Breeds], [Help], [Vent], [RIP], [Fluff], [Discussion], [Link], [Meta], [Survey], and [Update].  Please resubmit your post with one of the title tags beginning the submission title.  You must physically type the tag into the title, and the tag must use square brackets.  Example: "[Discussion] What foods are toxic to dogs?"  It\'s possible you may experience a delay of up to 1 hour upon trying to repost with the corrected title.  This is a Reddit-imposed waiting period and there is nothing the /r/dogs moderators can do to reduce the waiting period.  We apologize for the inconvenience.\n\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/dogs) if you ha

In [12]:
# number of duplicate dog mod posts
len(df[df['body'] == dogmodpost])

630

In [13]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != catmodpost) & (df['body'] != dogmodpost)].head(10)

Unnamed: 0,body,target
241,😂,1
501,Thank you!,1
698,❤️,1
699,❤️,1
795,"I’m kinda envious, finding a good vet here in ...",1
825,This post was removed as it matched the keywor...,1
960,[removed],1
1079,❤️🌈❤️,1
1105,❤️🌈❤️,1
1106,Gorgeous!,1


In [14]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != catmodpost) & (df['body'] != dogmodpost)].tail(10)

Unnamed: 0,body,target
19417,RemindMe! 18 hours,0
19432,Following,0
19441,RemindMe! 18 hours,0
19451,RemindMe! 18 hours,0
19466,Remind me! 2 days,0
19664,I would add up to a minimum of one hour per da...,0
19706,Can I ask how expensive the Cytopoint is for y...,0
19844,It seems like you may be asking about breeds t...,0
19901,Thank you,0
19951,It seems like you may be asking about breeds t...,0


In [15]:
df.shape

(20000, 2)

In [16]:
df.drop_duplicates(inplace=True)

In [17]:
df.shape

(18351, 2)

In [18]:
df.tail()

Unnamed: 0,body,target
19994,I'd cancel. I had a dog live just short of 18 ...,0
19995,"Well, if she didn't have ANY KIND OF IDENTIFIC...",0
19997,"This is a little late, but here goes:\n\nOur d...",0
19998,"Lots of cuteness, over time.",0
19999,"I’ve got one that’s embroidered now, but none ...",0


In [19]:
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,body,target
18346,I'd cancel. I had a dog live just short of 18 ...,0
18347,"Well, if she didn't have ANY KIND OF IDENTIFIC...",0
18348,"This is a little late, but here goes:\n\nOur d...",0
18349,"Lots of cuteness, over time.",0
18350,"I’ve got one that’s embroidered now, but none ...",0


### clean text

In [20]:
# I used these links as a reference: 
# https://towardsdatascience.com/the-real-world-as-seen-on-twitter-sentiment-analysis-part-one-5ac2d06b63fb
# https://stackoverflow.com/questions/4328500/how-can-i-strip-all-punctuation-from-a-string-in-javascript-using-regex

In [21]:
# ***to do***
# remove numbers and words that start w/numbers i.e. 
#'3am', '3ish', '3keywords', '3lb', '3oz', '3rd', '3yo', '3yrs', '400', '4000', '40701psc', '40a37711', '40ish', '40lbs', '40pounds', '41kg'

In [22]:
# some of these are redundant with the default functions of CountVectorizer but that's OK

def cleaner(text):
    # Make lowercase
    text = text.lower()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    
    return text

In [23]:
df['body'] = df['body'].apply(cleaner)

In [24]:
df.shape

(18351, 2)

In [25]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [26]:
df.shape

(18270, 2)

### NLP pre-processing and exploration

### lemmatize

In [27]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words    

In [28]:
df['body'] = df['body'].apply(lemmatize_words)

In [29]:
df.shape

(18270, 2)

In [30]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [31]:
df.shape

(18228, 2)

In [32]:
df.to_csv('comments_clean.csv', index=False)

### CountVectorizer

### most frequent cat words:

In [33]:
# Easy way to get most frequently used words: change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 35) 

# input for CountVectorizer is an array of strings
vector_input_cats = df[df['target'] == 1]['body']

# fit_transform the vectorizer
cat_words = count_vect.fit_transform(vector_input_cats)

# convert output to a Numpy array
cat_words = cat_words.toarray()

In [34]:
# get the words
cat_word_list = count_vect.get_feature_names()
print(cat_word_list)

['beautiful', 'best', 'cat', 'cute', 'day', 'doe', 'don', 'food', 'good', 'got', 'ha', 'home', 'just', 'kitten', 'kitty', 'know', 'life', 'like', 'little', 'lol', 'look', 'love', 'make', 'old', 'really', 'sorry', 'sure', 'thank', 'thing', 'think', 'time', 'vet', 'wa', 'want', 'year']


### most frequent dog words:

In [35]:
# Easy way to get most frequently used words: change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 36) 

# input for CountVectorizer is an array of strings
vector_input_dogs = df[df['target'] == 0]['body']

# fit_transform the vectorizer
dog_words = count_vect.fit_transform(vector_input_dogs)

# convert output to a Numpy array
dog_words = dog_words.toarray()

In [36]:
# get the words
dog_word_list = count_vect.get_feature_names()
print(dog_word_list)

['breed', 'breeder', 'day', 'doesn', 'dog', 'don', 'food', 'going', 'good', 'got', 'ha', 'help', 'home', 'just', 'know', 'like', 'look', 'lot', 'love', 'make', 'need', 'people', 'pet', 'puppy', 'really', 'sure', 'thing', 'think', 'time', 'training', 'vet', 'wa', 'want', 'way', 'work', 'year']


### edit stop words

In [37]:
# add non-meaningful words from the "most frequent" lists above to the stop words dictionary

In [38]:
from sklearn.feature_extraction import text

text.ENGLISH_STOP_WORDS

add_stop_words = ['did', 'doe', 'don', 'doesn', 'getting', 'going', 'got', 'ha', 'isn', 'wa']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

stop_words

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### cat vs dog top 100 words: []

In [39]:
# ***to do***
# out of 100, how many are same & how many different?
# make word clouds?

### word/n-gram frequency:

In [40]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# CountVectorizer transforms the body text from the reddit comments into features (i.e. words)
# and creates columns (vectors) with word counts for each comment

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_cats = df[df['target'] == 1]['body']

# fit_transform the vectorizer
cat_words = count_vect.fit_transform(vector_input_cats)

# convert output to a Numpy array
cat_words = cat_words.toarray()

In [41]:
# ***to do*** (optional)
# if the array is very large, it's faster to do a sum of the array first (which creates a vector of the sums)
# and then combine that with the feature names in a dataframe

cat_matrix = pd.DataFrame(cat_words, columns=count_vect.get_feature_names())

cat_matrix.sum().sort_values(ascending=False).head(50)

cat          3030
like         1026
just          843
love          680
kitty         601
look          587
time          500
good          407
little        406
know          384
vet           377
day           367
food          366
thank         365
sorry         349
year          346
cute          342
beautiful     340
thing         333
think         319
kitten        315
make          297
really        292
want          273
sure          258
lol           251
home          247
life          244
best          233
old           221
right         214
look like     214
lot           213
need          212
eye           206
pet           202
baby          198
help          196
hope          195
happy         193
way           190
say           188
loss          184
great         184
try           182
boy           177
long          174
thanks        174
pretty        172
sweet         169
dtype: int64

In [42]:
cat_matrix.mean().sort_values(ascending=False).head(50)

cat          0.335362
like         0.113558
just         0.093304
love         0.075263
kitty        0.066519
look         0.064970
time         0.055340
good         0.045047
little       0.044936
know         0.042501
vet          0.041727
day          0.040620
food         0.040509
thank        0.040398
sorry        0.038628
year         0.038296
cute         0.037853
beautiful    0.037631
thing        0.036857
think        0.035307
kitten       0.034864
make         0.032872
really       0.032319
want         0.030216
sure         0.028556
lol          0.027781
home         0.027338
life         0.027006
best         0.025789
old          0.024460
right        0.023686
look like    0.023686
lot          0.023575
need         0.023464
eye          0.022800
pet          0.022357
baby         0.021915
help         0.021693
hope         0.021583
happy        0.021361
way          0.021029
say          0.020808
loss         0.020365
great        0.020365
try          0.020144
boy       

In [43]:
count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_dogs = df[df['target'] == 0]['body']

# fit_transform the vectorizer
dog_words = count_vect.fit_transform(vector_input_dogs)

# convert output to a Numpy array
dog_words = dog_words.toarray()

In [44]:
dog_matrix = pd.DataFrame(dog_words, columns=count_vect.get_feature_names())

dog_matrix.sum().sort_values(ascending=False).head(50)

dog         10783
just         2612
like         2481
time         1857
know         1462
people       1369
think        1304
good         1279
puppy        1255
day          1188
thing        1145
really       1126
breed        1106
make         1081
vet          1048
want         1044
need         1011
work          919
lot           851
year          846
food          843
love          829
way           822
home          785
breeder       690
training      688
sure          668
look          665
help          652
pet           648
life          644
say           636
right         616
walk          594
try           587
come          577
animal        570
issue         569
best          565
owner         561
great         559
month         544
long          537
old           532
let           525
little        520
feel          511
didn          506
week          502
shelter       501
dtype: int64

In [45]:
dog_matrix.mean().sort_values(ascending=False).head(50)

dog         1.172958
just        0.284129
like        0.269879
time        0.202002
know        0.159034
people      0.148918
think       0.141847
good        0.139128
puppy       0.136517
day         0.129229
thing       0.124551
really      0.122484
breed       0.120309
make        0.117589
vet         0.114000
want        0.113565
need        0.109975
work        0.099967
lot         0.092570
year        0.092027
food        0.091700
love        0.090177
way         0.089416
home        0.085391
breeder     0.075057
training    0.074840
sure        0.072664
look        0.072338
help        0.070924
pet         0.070488
life        0.070053
say         0.069183
right       0.067008
walk        0.064614
try         0.063853
come        0.062765
animal      0.062004
issue       0.061895
best        0.061460
owner       0.061025
great       0.060807
month       0.059175
long        0.058414
old         0.057870
let         0.057109
little      0.056565
feel        0.055586
didn        0

### TF-IDF Vectorizer

In [46]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

cat_tf_words = tvec.fit_transform(vector_input_cats)

cat_tf_words = cat_tf_words.toarray()

cat_matrix = pd.DataFrame(cat_tf_words, columns=tvec.get_feature_names())

cat_matrix.sum().sort_values(ascending=False).head(50)

cat           323.546218
like          150.042255
love          148.794817
look          126.013153
cute          125.693852
thank         117.198481
beautiful     115.932920
kitty         115.234998
just          112.678251
sorry          99.064591
good           80.264965
little         78.539472
lol            72.069564
know           71.078370
time           70.687266
adorable       70.011383
eye            64.779981
thanks         60.235855
look like      58.835983
think          56.964162
want           55.660938
pretty         54.594326
sweet          54.539193
right          54.296826
best           53.828006
loss           53.652515
thing          53.228171
kitten         51.525262
sure           50.449468
really         50.191682
yes            49.957459
day            49.916682
sorry loss     49.396057
make           49.178146
great          48.878039
year           48.332306
baby           47.900873
vet            47.825943
food           47.238175
gorgeous       46.372627


In [47]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

dog_tf_words = tvec.fit_transform(vector_input_dogs)

dog_tf_words = dog_tf_words.toarray()

dog_matrix = pd.DataFrame(dog_tf_words, columns=tvec.get_feature_names())

dog_matrix.sum().sort_values(ascending=False).head(50)

dog           462.779419
like          186.892332
just          183.657466
time          136.848902
know          125.705360
good          118.816490
think         114.379939
vet           111.545706
thank         110.007991
people        106.517715
love          104.359127
puppy         102.901687
day            98.700575
really         97.671905
make           95.745516
thing          94.670714
breed          89.637384
need           87.865439
want           86.471091
thanks         83.227666
look           82.925065
work           82.545906
food           79.328188
way            76.331721
year           75.754686
sure           75.416406
lot            74.076969
home           69.949664
right          69.046597
help           68.516200
sound          68.157207
try            66.881656
life           65.164926
say            64.271097
best           63.432643
great          62.749292
little         62.700242
pet            60.998215
animal         60.556549
walk           60.229329
