# Cleaning and EDA for r/News and r/TheOnion subreddits

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the combined CSV holding posts from both subreddits, keeping only the
# columns that the cleaning steps below actually use.
wanted_columns = [
    'title', 'created_utc', 'selftext', 'subreddit', 'author', 'permalink'
]
reddits = pd.read_csv('data/onion_news.csv', usecols=wanted_columns, low_memory=False)

In [3]:
# (rows, columns) of the freshly loaded frame
reddits.shape

(29978, 6)

In [4]:
# preview the first two rows to sanity-check the loaded columns
reddits.head(2)

Unnamed: 0,author,created_utc,permalink,selftext,subreddit,title
0,dwaxe,1601660299,/r/TheOnion/comments/j3z3ds/real_estate_expert...,,TheOnion,Real Estate Experts Confirm Having George Cloo...
1,kc9283,1601604158,/r/TheOnion/comments/j3m23g/subway_bread_isnt_...,,TheOnion,"Subway bread isn't bread, Irish court says"


In [5]:
# column dtypes and non-null counts -- shows which columns contain missing values
reddits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29978 entries, 0 to 29977
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       29978 non-null  object
 1   created_utc  29978 non-null  int64 
 2   permalink    29978 non-null  object
 3   selftext     387 non-null    object
 4   subreddit    29978 non-null  object
 5   title        29978 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.4+ MB


In [6]:
# per-column count of missing values, restricted to r/TheOnion posts
onion_rows = reddits['subreddit'].eq('TheOnion')
reddits[onion_rows].isna().sum()

author             0
created_utc        0
permalink          0
selftext       14744
subreddit          0
title              0
dtype: int64

In [7]:
# per-column count of missing values, restricted to r/news posts
news_rows = reddits['subreddit'].eq('news')
reddits[news_rows].isna().sum()

author             0
created_utc        0
permalink          0
selftext       14847
subreddit          0
title              0
dtype: int64

In [8]:
# Replace NaNs in the selftext column with an empty string '' so that it can be
# concatenated with the title column (string + NaN would yield NaN).
reddits['selftext'] = reddits['selftext'].fillna('')

In [9]:
# Combine the selftext and title columns into a single `text` column.
# A single space is placed between the two parts: plain `selftext + title`
# glues the last word of the body onto the first word of the title, creating a
# bogus token for any later text analysis. The final strip removes the padding
# space left on rows whose selftext is empty (and any other leading/trailing
# whitespace in the combined string).
reddits['text'] = (reddits['selftext'] + ' ' + reddits['title']).str.strip()

In [10]:
# selftext and title now live in `text`, so remove the two source columns
reddits = reddits.drop(columns=['selftext', 'title'])

In [11]:
# double-check that no column has NaNs left after the fill/combine/drop steps
reddits.isna().sum()

author         0
created_utc    0
permalink      0
subreddit      0
text           0
dtype: int64

In [16]:
# number of rows per subreddit -- a quick check of how balanced the two classes are
reddits.groupby('subreddit').size()

subreddit
TheOnion    14978
news        15000
dtype: int64

In [14]:
# The data are reasonably clean, so write them to file for use in the analysis.
# index=False keeps the RangeIndex out of the CSV.
# NOTE(review): 'clean_reddts.csv' looks like a typo of 'clean_reddits.csv' --
# confirm what downstream notebooks read before renaming the file.
reddits.to_csv('data/clean_reddts.csv', index=False)