In [20]:
import pandas as pd
import numpy as np

# Cleaning r/Bodyweight Fitness

In [13]:
bodyweight_df = pd.read_csv('../Data/bodyweight.csv', index_col=0)

In [14]:
bodyweight_df.head(2)

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,Luciferswife4life,1610570566,0,0,1,[removed],bodyweightfitness,What lower body excercises can I do with bad k...
1,J22Charles,1610570222,0,0,1,Been doing cardio and resistance for year swit...,bodyweightfitness,Looking for supplement for lean muscle


In [15]:
bodyweight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          10000 non-null  object
 1   created_utc     10000 non-null  int64 
 2   num_comments    10000 non-null  int64 
 3   num_crossposts  10000 non-null  int64 
 4   score           10000 non-null  int64 
 5   selftext        9909 non-null   object
 6   subreddit       10000 non-null  object
 7   title           10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 703.1+ KB


In [18]:
# drop nulls
bodyweight_df.dropna(inplace=True)

In [22]:
# drop rows where post is removed or deleted
bodyweight_df = bodyweight_df.loc[bodyweight_df['selftext'] != '[removed]']
bodyweight_df = bodyweight_df.loc[bodyweight_df['selftext'] != '[deleted]']

In [25]:
bodyweight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7722 entries, 1 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          7722 non-null   object
 1   created_utc     7722 non-null   int64 
 2   num_comments    7722 non-null   int64 
 3   num_crossposts  7722 non-null   int64 
 4   score           7722 non-null   int64 
 5   selftext        7722 non-null   object
 6   subreddit       7722 non-null   object
 7   title           7722 non-null   object
dtypes: int64(4), object(4)
memory usage: 543.0+ KB


In [61]:
# drop rows containing the subreddit name in the post
bodyweight_df = bodyweight_df[~bodyweight_df['selftext'].str.contains(f"r/{bodyweight_df.iloc[0,6]}")]

In [62]:
bodyweight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7392 entries, 1 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          7392 non-null   object
 1   created_utc     7392 non-null   int64 
 2   num_comments    7392 non-null   int64 
 3   num_crossposts  7392 non-null   int64 
 4   score           7392 non-null   int64 
 5   selftext        7392 non-null   object
 6   subreddit       7392 non-null   object
 7   title           7392 non-null   object
dtypes: int64(4), object(4)
memory usage: 519.8+ KB


In [85]:
# drop rows where post is shorter than title
# this gets rid of posts that were just media with little collateral
bodyweight_df = bodyweight_df[bodyweight_df['selftext'].str.len() >= bodyweight_df['title'].str.len()]

In [86]:
bodyweight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7052 entries, 1 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          7052 non-null   object
 1   created_utc     7052 non-null   int64 
 2   num_comments    7052 non-null   int64 
 3   num_crossposts  7052 non-null   int64 
 4   score           7052 non-null   int64 
 5   selftext        7052 non-null   object
 6   subreddit       7052 non-null   object
 7   title           7052 non-null   object
dtypes: int64(4), object(4)
memory usage: 495.8+ KB


In [87]:
#saving cleaned data
bodyweight_df.to_csv('../Data/bodyweight_clean.csv')

In [91]:
# function with all the steps used to clean r/bodyweightfitness
def clean_data(df):
    """Remove unwanted rows from a dataframe of subreddit posts.
    
    For this function, the dataframe must have a column named
    'selftext' with the text of the subreddit post.  It must
    also contain the name of the subreddit in column of index 6.
    This function will drop rows with the following conditions:
    - Rows with null values
    - Rows with removed or deleted posts
    - Rows containing the subreddit name in the post
    - Rows where the post is shorter than the title
    
    args:
        df (pandas dataframe): dataframe to clean
        
    returns:
        pandas dataframe: cleaned dataframe
    """
    # drop nulls
    df.dropna(inplace=True)
    
    # drop rows with removed or deleted posts
    df = df.loc[df['selftext'] != '[removed]']
    df = df.loc[df['selftext'] != '[deleted]']
    
    # drop rows containing the subreddit name in the post
    df = df[~df['selftext'].str.contains(f"r/{df.iloc[0,6]}")]
    
    # drop rows where post is shorter than title
    df = df[df['selftext'].str.len() >= df['title'].str.len()]
    
    # reset the index
    df.reset_index(drop=True)
    
    return df

# Cleaning r/Fitness

In [94]:
fitness_df = pd.read_csv('../Data/fitness.csv', index_col=0)

In [95]:
fitness_df.head(2)

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,bigjungus11,1610566951,1,0,1,[removed],Fitness,"Skinnyfat, should I eat more? Less?"
1,Oz390,1610566886,0,0,1,[removed],Fitness,Starting a new fitness programme


In [96]:
fitness_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          26000 non-null  object
 1   created_utc     26000 non-null  int64 
 2   num_comments    26000 non-null  int64 
 3   num_crossposts  26000 non-null  int64 
 4   score           26000 non-null  int64 
 5   selftext        25622 non-null  object
 6   subreddit       26000 non-null  object
 7   title           26000 non-null  object
dtypes: int64(4), object(4)
memory usage: 1.8+ MB


In [97]:
# apply same cleaning as r/bodyweightfitness using function
fitness_df_clean = clean_data(fitness_df)

In [98]:
fitness_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6711 entries, 49 to 98
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          6711 non-null   object
 1   created_utc     6711 non-null   int64 
 2   num_comments    6711 non-null   int64 
 3   num_crossposts  6711 non-null   int64 
 4   score           6711 non-null   int64 
 5   selftext        6711 non-null   object
 6   subreddit       6711 non-null   object
 7   title           6711 non-null   object
dtypes: int64(4), object(4)
memory usage: 471.9+ KB


In [99]:
# saving cleaned data
fitness_df_clean.to_csv('../Data/fitness_clean.csv')