This notebook contains the preprocessing steps for the text column of a dataset that I scraped from three subreddits (r/Jobs, r/resume, and r/careers) on the Reddit social media platform. The data will be analyzed to gain insight into the discussions in these communities.

This is the first of a series of Notebooks that will be uploaded as this project progresses.

In [None]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:
# Fetch the NLTK data used further down: the stop-word list, the WordNet corpus
# (needed by WordNetLemmatizer) and the Punkt models (needed by word_tokenize)
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
nltk.download('punkt')  # kept as the last expression so the cell still displays its result


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# Load the combined CSV export into a pandas dataframe
df_Jobs = pd.read_csv(filepath_or_buffer='/content/combined_data.csv')


In [None]:
# Summarize the dataframe: number of rows, column names, non-null counts and dtypes
df_Jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2840 entries, 0 to 2839
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2840 non-null   object 
 1   title         2832 non-null   object 
 2   Text          2318 non-null   object 
 3   score         2831 non-null   object 
 4   num_comments  2831 non-null   float64
 5   post_id       2830 non-null   object 
 6   post_time     2830 non-null   float64
dtypes: float64(2), object(5)
memory usage: 155.4+ KB


In [None]:
# Keep only the rows in which every column holds a value
df_Jobs = df_Jobs.dropna(axis=0, how='any')

In [None]:
# Re-inspect the dataframe to confirm that the rows with missing values are gone
# (every column should now report the same non-null count)
df_Jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2317 entries, 0 to 2839
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2317 non-null   object 
 1   title         2317 non-null   object 
 2   Text          2317 non-null   object 
 3   score         2317 non-null   object 
 4   num_comments  2317 non-null   float64
 5   post_id       2317 non-null   object 
 6   post_time     2317 non-null   float64
dtypes: float64(2), object(5)
memory usage: 144.8+ KB


In [None]:
# Discard the 'Unnamed: 0' column and the 'post_id' column (the latter to keep
# the posts anonymous), then confirm which columns remain
columns_to_remove = ['Unnamed: 0', 'post_id']
df_Jobs = df_Jobs.drop(columns=columns_to_remove)

df_Jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2317 entries, 0 to 2839
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         2317 non-null   object 
 1   Text          2317 non-null   object 
 2   score         2317 non-null   object 
 3   num_comments  2317 non-null   float64
 4   post_time     2317 non-null   float64
dtypes: float64(2), object(3)
memory usage: 108.6+ KB


In [None]:
# Helpers for preprocess_text. Everything that does not depend on the input text
# is built once here instead of on every call, because the function is applied
# to each row of the dataframe.

# "http\S+" already matches every "https..." URL, so no separate alternative is needed.
_URL_PATTERN = re.compile(r"http\S+|www\S+")

_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # NOTE: very wide range; it also covers e.g. CJK text
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # every code point outside the Basic Multilingual Plane
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)

# Tokens shorter than this are dropped (i.e. one- and two-character tokens).
_MIN_TOKEN_LENGTH = 3

# Filled on first use so that defining the function does not require the NLTK
# data to be available yet.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on the first call."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one piece of text for later analysis.

    Steps, in order: lowercase, strip URLs, strip emojis/symbols, tokenize with
    NLTK, drop English stop words, lemmatize with WordNet, and drop tokens
    shorter than three characters.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing value, or None) is treated as empty.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives or when ``text`` is not a string.

    NOTE(review): punctuation is not filtered explicitly, so punctuation-only
    tokens of three or more characters (e.g. "...") stay in the result.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove emojis (they are deleted, not converted to text)
    text = _EMOJI_PATTERN.sub('', text)

    # Nothing left to tokenize; the result would be empty anyway.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _nltk_resources()

    kept_tokens = []
    for token in nltk.word_tokenize(text):
        # Remove stopwords (checked before lemmatization)
        if token in stop_words:
            continue
        # Lemmatization
        lemma = lemmatizer.lemmatize(token)
        # Remove one- and two-character tokens
        if len(lemma) >= _MIN_TOKEN_LENGTH:
            kept_tokens.append(lemma)

    # Join the tokens back into a single string
    return ' '.join(kept_tokens)




In [None]:
# Run the cleaning function over every post body and store the result in a new
# 'preprocessed_text' column
df_Jobs = df_Jobs.assign(preprocessed_text=df_Jobs['Text'].apply(preprocess_text))

In [None]:
# Preview the first five rows to compare 'Text' with the new 'preprocessed_text' column
df_Jobs.head()

Unnamed: 0,title,Text,score,num_comments,post_time,preprocessed_text
0,Success and Disappointment Megathread for the ...,This is the weekly success and disappointment...,17,75.0,1686524000.0,weekly success disappointment megathread week ...
1,New moderator recruitment,Are you a current active community member? Int...,4,0.0,1688062000.0,current active community member interested hel...
2,"What are these ""I finish work in 2 hours and j...",I'm currently in a business development role w...,2297,941.0,1688123000.0,currently business development role constant w...
3,Nobody wants to help you anymore,"Decades ago, when you started a new job, you w...",242,59.0,1688148000.0,decade ago started new job would trained also ...
4,after six months - FINALLY! keep going!,I was laid off from a very large mortgage comp...,44,5.0,1688152000.0,laid large mortgage company get trouble lot .....


In [None]:
# Write the dataframe, including 'preprocessed_text', to a CSV file for further analysis.
# The row index is written too (the pandas default), as an unnamed first column.
df_Jobs.to_csv(path_or_buf='Jobs.csv', index=True)