## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import re

## Cleaning

In [2]:
# Load the saved pets dataset, then discard the 'Unnamed: 0' column
# (presumably the row index written out when the file was saved - it
# carries no information for the analysis).
pet_df = pd.read_csv('./data/pets.csv')
pet_df = pet_df.drop('Unnamed: 0', axis='columns')

In [3]:
# preview the first 5 rows (all from the 'dogs' subreddit)
pet_df.head(n=5)

Unnamed: 0,subreddit,selftext,title,created_utc
0,dogs,So my miniature poodle can't last more than 10...,Barking when he is alone and awake,1673991330
1,dogs,Genuinely curious. My vet won’t answer this qu...,When is an appropriate age to start taking you...,1673991096
2,dogs,My 2.5 year mixed breed male dog has to have k...,Knee surgery,1673990893
3,dogs,I’m new to reddit and came across this sub. I ...,Positive and Negative Punishment,1673990776
4,dogs,Looking for advice on which dog washing statio...,HOA Agreed to purchase dog washing station,1673990697


In [4]:
# preview the last 5 rows (all from the 'cats' subreddit)
pet_df.tail(n=5)

Unnamed: 0,subreddit,selftext,title,created_utc
1995,cats,,My sleepy boi,1673913111
1996,cats,,Safe to say my cat has accepted the new kitten.,1673913101
1997,cats,,She is a bit special for sleeping,1673913055
1998,cats,,Can anyone see me? I am hiding inside this…plate…,1673913046
1999,cats,,Cat nap yoga: the goat yoga of 2023?,1673913041


##### The 'cats' subreddit appears to have a few missing values in the 'selftext' column. Let's check for missing values in the whole dataset.

In [5]:
# inspect the dtype pandas inferred for each column
pet_df.dtypes

subreddit      object
selftext       object
title          object
created_utc     int64
dtype: object

In [6]:
# check shape - expecting 2000 rows in total (1000 posts per subreddit);
# note that the shape alone does not show how the rows split by subreddit
pet_df.shape

(2000, 4)

In [7]:
# number of missing values in each column, across the whole dataset
pet_df.isna().sum()

subreddit        0
selftext       828
title            0
created_utc      0
dtype: int64

In [8]:
# missing values per column, restricted to posts from the 'dogs' subreddit
pet_df.loc[pet_df['subreddit'] == 'dogs'].isna().sum()

subreddit      0
selftext       8
title          0
created_utc    0
dtype: int64

In [9]:
# missing values per column, restricted to posts from the 'cats' subreddit
pet_df.loc[pet_df['subreddit'] == 'cats'].isna().sum()

subreddit        0
selftext       820
title            0
created_utc      0
dtype: int64

##### Most of the 'selftext' values for the 'cats' subreddit are missing (820 of 1,000 posts).
##### Based on this, I will replace NaN in 'selftext' with a blank string (' ') and use the 'title' column for the rest of this project.

In [10]:
# Fill the missing post bodies with a single-space string so that 'selftext'
# remains a plain text (object) column. Only 'selftext' is targeted: it is
# the only column with missing values, and a blanket fill over every column
# would silently paper over unexpected gaps elsewhere (e.g. in 'title').
# Reassignment is used instead of inplace=True to avoid in-place mutation.
pet_df = pet_df.fillna({'selftext': ' '})

# confirm that no missing values remain
pet_df.isnull().sum()

subreddit      0
selftext       0
title          0
created_utc    0
dtype: int64

In [11]:
# re-inspect the last 5 rows now that the missing 'selftext' values are filled
pet_df.tail(n=5)

Unnamed: 0,subreddit,selftext,title,created_utc
1995,cats,,My sleepy boi,1673913111
1996,cats,,Safe to say my cat has accepted the new kitten.,1673913101
1997,cats,,She is a bit special for sleeping,1673913055
1998,cats,,Can anyone see me? I am hiding inside this…plate…,1673913046
1999,cats,,Cat nap yoga: the goat yoga of 2023?,1673913041


In [12]:
# Persist the cleaned frame. The row index is written as well, so reading
# this file back yields an extra unnamed first column that has to be dropped
# or consumed with index_col=0.
pet_df.to_csv('./data/cleaned_pet.csv', index=True)

### Question

* 82% of the posts from the 'cats' subreddit have no 'selftext', which is significant. Could this be an indicator of who is a cat or dog person? If we're using this dataset to predict who is a cat vs. dog person, could an empty 'selftext' be indicative of cat people?

##### Especially since 'selftext' is the body of the post, we could ask the questions: 
* Are cat people more likely to post gifs, videos, or images than dog people are?
* Are dog people more likely to include text in their posts than cat people are?