# Project 6 Capstone - Part 2: Dataset Munging and Cleaning: FineDining

In [1]:
# Imports
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords

In [None]:
# Data Source: 

# Name: FineDining

# Link: https://www.reddit.com/r/finedining/ 

# Description: Posts pulled with PRAW (the Python Reddit API Wrapper) from the r/finedining subreddit

# Data Dictionary: created_utc - timestamp of the post (float); title - title of the post; self_text - body text of the post (may be missing); subreddit - name of the subreddit

|File|Dtypes|Source|Description|
|---|---|---|---|
|**finedining.csv**|*float64(1), object(3)*|Reddit|Posts pulled with PRAW from the r/finedining subreddit|

In [2]:
# Load the r/finedining posts that were scraped with PRAW (relative path).
df = pd.read_csv('../data/PRAW/finedining.csv')

In [3]:
# Row and column count of the freshly loaded data.
df.shape

(996, 4)

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit
0,1735992000.0,Ambience photos,Love the restaurant review posts on here. As a...,finedining
1,1735991000.0,Andrew Fairlie** (Scotland),"Dined at 2 starred Andrew Fairlie, located at ...",finedining
2,1735967000.0,"cépages - Gunma, Japan",Not quite sure how I originally found this pla...,finedining
3,1735966000.0,Californios ** San Francisco,Had the opportunity to dine at Californios in ...,finedining
4,1735966000.0,"Valhalla, Chicago (winter menu 2024)",Visited Valhalla on a layover in Chicago after...,finedining


In [5]:
# Inspect dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996 entries, 0 to 995
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   created_utc  996 non-null    float64
 1   title        996 non-null    object 
 2   self_text    879 non-null    object 
 3   subreddit    996 non-null    object 
dtypes: float64(1), object(3)
memory usage: 31.3+ KB


In [6]:
# Combine the post title and body into a single 'text' column.
# Missing values are replaced with empty strings first so the
# concatenation never produces NaN.
df[['title', 'self_text']] = df[['title', 'self_text']].fillna('')
df['text'] = df['title'].str.cat(df['self_text'], sep=' ')

In [7]:
# Confirm the merge added exactly one column ('text').
df.shape

(996, 5)

In [8]:
# Check that 'text' holds the title followed by the post body.
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text
0,1735992000.0,Ambience photos,Love the restaurant review posts on here. As a...,finedining,Ambience photos Love the restaurant review po...
1,1735991000.0,Andrew Fairlie** (Scotland),"Dined at 2 starred Andrew Fairlie, located at ...",finedining,Andrew Fairlie** (Scotland) Dined at 2 starre...
2,1735967000.0,"cépages - Gunma, Japan",Not quite sure how I originally found this pla...,finedining,"cépages - Gunma, Japan Not quite sure how I or..."
3,1735966000.0,Californios ** San Francisco,Had the opportunity to dine at Californios in ...,finedining,Californios ** San Francisco Had the opportun...
4,1735966000.0,"Valhalla, Chicago (winter menu 2024)",Visited Valhalla on a layover in Chicago after...,finedining,"Valhalla, Chicago (winter menu 2024) Visited V..."


In [9]:
# Remove stop words
# Step 1: make sure the NLTK stop-word corpus is available locally.
# nltk.download prints its status and returns True (see the output below).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gfranksjr24/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# English stop-word list from NLTK, held in a set so the per-token
# membership test in remove_stopwords is fast.
stop_words = set(stopwords.words('english'))

In [11]:
def remove_stopwords(text, stop_word_set=None):
    """Drop stop words from a piece of text.

    The text is split on whitespace and every token is compared against the
    stop-word set case-insensitively. The comparison is made both on the raw
    token and on the token with leading/trailing ASCII punctuation removed,
    so stop words glued to punctuation (e.g. "The," or "(and)") are filtered
    out as well. Tokens that are kept are returned unchanged (original case
    and punctuation) and joined with single spaces.

    Parameters
    ----------
    text : object
        Value to clean. Missing values (as detected by ``pd.isna``) yield an
        empty string; anything else is converted with ``str()``.
    stop_word_set : collection of str, optional
        Lower-case words to remove. When omitted, the notebook-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The text without stop words; ``""`` for missing or empty input.
    """
    if pd.isna(text):
        return ""
    if stop_word_set is None:
        # Resolved at call time so the function picks up the notebook's set.
        stop_word_set = stop_words

    kept = []
    for token in str(text).split():
        lowered = token.lower()
        # Check the raw form first so everything that matched before the
        # punctuation-aware comparison was added still matches.
        if lowered in stop_word_set:
            continue
        if lowered.strip(string.punctuation) in stop_word_set:
            continue
        kept.append(token)
    return ' '.join(kept)

In [12]:
# Build the stop-word-free column from the merged text, one row at a time.
df['clean_text'] = df['text'].map(remove_stopwords)

In [13]:
# Confirm 'clean_text' was added as one more column.
df.shape

(996, 6)

In [14]:
# Compare 'text' and 'clean_text' to check the stop-word removal.
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text,clean_text
0,1735992000.0,Ambience photos,Love the restaurant review posts on here. As a...,finedining,Ambience photos Love the restaurant review po...,Ambience photos Love restaurant review posts h...
1,1735991000.0,Andrew Fairlie** (Scotland),"Dined at 2 starred Andrew Fairlie, located at ...",finedining,Andrew Fairlie** (Scotland) Dined at 2 starre...,Andrew Fairlie** (Scotland) Dined 2 starred An...
2,1735967000.0,"cépages - Gunma, Japan",Not quite sure how I originally found this pla...,finedining,"cépages - Gunma, Japan Not quite sure how I or...","cépages - Gunma, Japan quite sure originally f..."
3,1735966000.0,Californios ** San Francisco,Had the opportunity to dine at Californios in ...,finedining,Californios ** San Francisco Had the opportun...,Californios ** San Francisco opportunity dine ...
4,1735966000.0,"Valhalla, Chicago (winter menu 2024)",Visited Valhalla on a layover in Chicago after...,finedining,"Valhalla, Chicago (winter menu 2024) Visited V...","Valhalla, Chicago (winter menu 2024) Visited V..."


In [31]:
# Change all characters to lowercase.
# Doing this after stop-word removal is safe because remove_stopwords
# already compares tokens case-insensitively.
df['clean_text'] = df['clean_text'].str.lower()

In [34]:
# Verify that 'clean_text' is now entirely lowercase.
df.head()

Unnamed: 0,created_utc,title,self_text,subreddit,text,clean_text
0,1735992000.0,Ambience photos,Love the restaurant review posts on here. As a...,finedining,Ambience photos Love the restaurant review po...,ambience photos love restaurant review posts h...
1,1735991000.0,Andrew Fairlie** (Scotland),"Dined at 2 starred Andrew Fairlie, located at ...",finedining,Andrew Fairlie** (Scotland) Dined at 2 starre...,andrew fairlie** (scotland) dined 2 starred an...
2,1735967000.0,"cépages - Gunma, Japan",Not quite sure how I originally found this pla...,finedining,"cépages - Gunma, Japan Not quite sure how I or...","cépages - gunma, japan quite sure originally f..."
3,1735966000.0,Californios ** San Francisco,Had the opportunity to dine at Californios in ...,finedining,Californios ** San Francisco Had the opportun...,californios ** san francisco opportunity dine ...
4,1735966000.0,"Valhalla, Chicago (winter menu 2024)",Visited Valhalla on a layover in Chicago after...,finedining,"Valhalla, Chicago (winter menu 2024) Visited V...","valhalla, chicago (winter menu 2024) visited v..."
