# Project 6 Capstone - Part 2: Dataset Munging and Cleaning: Budgetfood

In [1]:
# Imports
import pandas as pd
import nltk
from nltk.corpus import stopwords

In [None]:
# Data Source: 

# Name: BudgetFood

# Link: https://www.reddit.com/r/budgetfood/

# Description: Dataset pulled using PRAW from subreddit /budgetfood 

# Data Dictionary: Created_UTC - Timestamp of post; Title - Title of Post; Self_Text - Content of the Post; Subreddit - Name of the subreddit

|Feature|Type|Dataset|Description|
|---|---|---|---|
|**budgetfood.csv**|*dtypes: float64(1), object(3)*|Reddit|Dataset pulled using PRAW from subreddit /budgetfood|

In [2]:
df = pd.read_csv('../data/PRAW/budgetfood.csv')

In [3]:
df.shape

(888, 4)

In [4]:
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit
0,1735956000.0,Beginner budgeter here. Need assistance.,I think we can all agree that the prices at th...,budgetfood
1,1735900000.0,cold lunches ideas,"Hello, so my boyfriend is a geodesist, so they...",budgetfood
2,1735877000.0,Ideas for fast and cheap cake/cookies/treats? ...,"Hi, I need to prepare about \~50 treats for a ...",budgetfood
3,1735870000.0,Easy soup recipe!!,I was in a pinch and was craving soup!\nI boil...,budgetfood
4,1735700000.0,Easy budget soup,I made this video for a cooking page I’m part ...,budgetfood


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   created_utc  888 non-null    float64
 1   title        888 non-null    object 
 2   self_text    783 non-null    object 
 3   subreddit    888 non-null    object 
dtypes: float64(1), object(3)
memory usage: 27.9+ KB


In [6]:
# Merge 'title' and 'self_text' into a single 'text' column
df['title'] = df['title'].fillna('')
df['self_text'] = df['self_text'].fillna('')
df['text'] = df['title'] + ' ' + df['self_text']

In [7]:
df.shape

(888, 5)

In [8]:
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text
0,1735956000.0,Beginner budgeter here. Need assistance.,I think we can all agree that the prices at th...,budgetfood,Beginner budgeter here. Need assistance. I th...
1,1735900000.0,cold lunches ideas,"Hello, so my boyfriend is a geodesist, so they...",budgetfood,"cold lunches ideas Hello, so my boyfriend is a..."
2,1735877000.0,Ideas for fast and cheap cake/cookies/treats? ...,"Hi, I need to prepare about \~50 treats for a ...",budgetfood,Ideas for fast and cheap cake/cookies/treats? ...
3,1735870000.0,Easy soup recipe!!,I was in a pinch and was craving soup!\nI boil...,budgetfood,Easy soup recipe!! I was in a pinch and was cr...
4,1735700000.0,Easy budget soup,I made this video for a cooking page I’m part ...,budgetfood,Easy budget soup I made this video for a cooki...


In [9]:
# Remove stop words
# download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gfranksjr24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def remove_stopwords(text):
    if pd.isna(text):
        return ""
    words = str(text).split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [12]:
df['clean_text'] = df['text'].apply(remove_stopwords)

In [26]:
df.shape

(888, 6)

In [28]:
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text,clean_text
0,1735956000.0,Beginner budgeter here. Need assistance.,I think we can all agree that the prices at th...,budgetfood,Beginner budgeter here. Need assistance. I th...,Beginner budgeter here. Need assistance. think...
1,1735900000.0,cold lunches ideas,"Hello, so my boyfriend is a geodesist, so they...",budgetfood,"cold lunches ideas Hello, so my boyfriend is a...","cold lunches ideas Hello, boyfriend geodesist,..."
2,1735877000.0,Ideas for fast and cheap cake/cookies/treats? ...,"Hi, I need to prepare about \~50 treats for a ...",budgetfood,Ideas for fast and cheap cake/cookies/treats? ...,Ideas fast cheap cake/cookies/treats? Need mak...
3,1735870000.0,Easy soup recipe!!,I was in a pinch and was craving soup!\nI boil...,budgetfood,Easy soup recipe!! I was in a pinch and was cr...,Easy soup recipe!! pinch craving soup! boiled ...
4,1735700000.0,Easy budget soup,I made this video for a cooking page I’m part ...,budgetfood,Easy budget soup I made this video for a cooki...,Easy budget soup made video cooking page I’m p...


In [30]:
# Change all characters to lowercase
df['clean_text'] = df['clean_text'].str.lower()

In [32]:
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text,clean_text
0,1735956000.0,Beginner budgeter here. Need assistance.,I think we can all agree that the prices at th...,budgetfood,Beginner budgeter here. Need assistance. I th...,beginner budgeter here. need assistance. think...
1,1735900000.0,cold lunches ideas,"Hello, so my boyfriend is a geodesist, so they...",budgetfood,"cold lunches ideas Hello, so my boyfriend is a...","cold lunches ideas hello, boyfriend geodesist,..."
2,1735877000.0,Ideas for fast and cheap cake/cookies/treats? ...,"Hi, I need to prepare about \~50 treats for a ...",budgetfood,Ideas for fast and cheap cake/cookies/treats? ...,ideas fast cheap cake/cookies/treats? need mak...
3,1735870000.0,Easy soup recipe!!,I was in a pinch and was craving soup!\nI boil...,budgetfood,Easy soup recipe!! I was in a pinch and was cr...,easy soup recipe!! pinch craving soup! boiled ...
4,1735700000.0,Easy budget soup,I made this video for a cooking page I’m part ...,budgetfood,Easy budget soup I made this video for a cooki...,easy budget soup made video cooking page i’m p...
