In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import html


# Language Detection Imports:
# SOURCE:  https://spacy.io/
# SOURCE:  https://pypi.org/project/spacy-langdetect/
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language

# Code provided from the following source:  https://stackoverflow.com/questions/66433496/how-do-i-fix-valueerror-when-doing-nlp-add-pipelanguagedetector-name-langua
#  The documentation was providing code that was not working, and the above stack overflow post managed to fix the issue
# SOURCE:  https://pypi.org/project/spacy-langdetect/

nlp = spacy.load("en_core_web_sm")

def create_lang_detector(nlp, name):
    return LanguageDetector()

Language.factory("language_detector", func=create_lang_detector)
nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x2756d555700>

In [2]:
df_onion = pd.read_csv('../data/theonion_1682378516.csv')
df_wldnws = pd.read_csv('../data/worldnews_1682378600.csv')

In [25]:
def onion_wldnws_cleaner(onion_df, wldnws_df, file_name):
    
    # DROP DUPLICATE TITLES
    onion_df.drop_duplicates(subset = 'title', inplace=True)
    wldnws_df.drop_duplicates(subset = 'title', inplace=True)
    
    # DROP UNNEEDED COLUMNS
    onion_df = onion_df[['subreddit', 'title']]
    wldnws_df = wldnws_df[['subreddit', 'title']]
    
    # FIX HTML ERRORS
    wldnws_df['title'] = wldnws_df['title'].apply(html.unescape)
    onion_df['title'] = onion_df['title'].apply(html.unescape)
    
    # REMOVE LOWERCASE ARTICLES
    bad_onion_dfs = list(onion_df[onion_df['title'].str.islower() == True].index)
    bad_news_dfs = list(wldnws_df[wldnws_df['title'].str.islower() == True].index)
    onion_df.drop(index=bad_onion_dfs, inplace=True)
    wldnws_df.drop(index=bad_news_dfs, inplace=True)
    
    # REMOVE NON_ENGLISH
    wldnws_df['lang'] = wldnws_df['title'].apply(lambda x: nlp(x)._.language['language'])
    wldnws_df = wldnws_df[wldnws_df['lang'] == 'en']
    onion_df['lang'] = onion_df['title'].apply(lambda x: nlp(x)._.language['language'])
    onion_df = onion_df[onion_df['lang'] == 'en']
    
    # CREATE CONCATENATED DATAFRAME
    reddit_data = pd.concat([onion_df[['subreddit', 'title']], wldnws_df[['subreddit', 'title']]]).reset_index(drop = True)

    
# ==============================================================================================    
    
    # DROP TITLES REFERENCING SUBREDDIT NAME
    # Find all titles with some variation of World News
        # The regex code here was created with the help of the following sources:
        # Source:  https://stackoverflow.com/questions/18402416/regular-expression-to-match-a-word-or-its-prefix
        # Source:  https://stackoverflow.com/questions/20462834/python-using-str-replace-with-a-wildcard
        # Source:  https://regex101.com/
        # Source:  https://stackoverflow.com/questions/5633533/regular-expression-for-matching-parentheses
        # Source:  https://stackoverflow.com/questions/4007302/regex-how-to-match-an-optional-character
        # Source:  https://www.regular-expressions.info/optional.html
        # Source:  https://stackoverflow.com/questions/9655164/regex-ignore-case-sensitivity
        # Source:  https://stackoverflow.com/questions/34583904/javascript-regex-ignore-case-for-specific-capture-group
        # Source:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation
    '''
    The regex expression below will (in order or characters left to right):
    - outer () - says to match all inside the parentheses
    - /{0,1}r{0,1}/{0,1} - find a string with or without each of  /, r, and /
    - [wW]orld[nN]ews - find worldnew optionally capitalized
    - ( Live Thread: )? - it may have this text after it, if so find it, if not, don't
    - | either or
    - outer () - says to match all inside the parenthese
    - \( - backslash tells it to treat left parenthesis as character
    - Thread # - where it says 'Thread #'
    - \d{1,} - \d any digit, {1,} at least one character long
    - \) - backslash tells it to treat right parenthesis as character
    '''
    # Create the regex string
    regex_string = '(/{0,1}r{0,1}/{0,1}[wW]orld[nN]ews( Live Thread: )?)|( \(Thread #\d{1,}\))'
    bad_titles_ind = list(reddit_data[reddit_data['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0].index)
    # Append the full list of articles containing 'onion' identified above to the bad_titles list
    for title in list(reddit_data[reddit_data['title'].str.lower().str.find('onion') > -1].index):
        bad_titles_ind.append(title)
    reddit_data.drop(index=bad_titles_ind, inplace=True);
    
    # REMOVE HASHTAGS
    reddit_data['title'] = reddit_data['title'].apply(lambda title: title.replace('&#x27;',''))
    # Find all titles with some variation of World News
    '''
    The regex string below looks to find and # followed by a word and not followed by a number
    '''
    # Create the regex string from before
    regex_string = '#\D\w+'
    reddit_data[reddit_data['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0]
    reddit_data['title'] = reddit_data['title'].apply(lambda title: re.sub(regex_string, '', title));
    
    # REMOVED NEW ARTICLE TITLES USUALLY AFTER PIPES AND DASHES
    # For the PIPES
    regex_string = '\| [\w \W]+'
    reddit_data['title'] = reddit_data['title'].apply(lambda title: re.sub(regex_string, '', title))
    # For the DASHES
    regex_string = '\- [\w \W]+'
    reddit_data['title'] = reddit_data['title'].apply(lambda title: re.sub(regex_string, '', title))
    
    #  REMOVE EMOJIS AND SPECIAL CHARACTERS
    # Test the regex expression to remove emojis:
        # This expression will remove emojis and most punctuation which won't be necessary for the algorithms used.
        # Source to help with this code:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation
    '''
    This regex will find all non word items and exclude all of the following characters (a space is the first character):  ' &$%\-#/'
    The characters selected to be excluded above were iteratively chosen by examining the outputs of the find all code below.  
    The word vectorizers will do the vast majority of the punctuation removal, but here, these characters are being skipped 
    as replacing them with nothing may change the meaning of some words or combine two words that shoud not be together.
    '''
    regex_string = "((?=[^ &$%\-#/])\W)"
    reddit_data['title'] = reddit_data['title'].apply(lambda title: re.sub(regex_string, '', title))
    
    # Remove Starting and Trailing Spaces
    reddit_data['title'] = reddit_data['title'].apply(lambda title: title.strip())
    
    # REMOVE TITLES WITH TWO OR FEWER WORDS
    reddit_data.drop(index = reddit_data[reddit_data['title'].apply(lambda x: len(x.strip().split())) <= 2].index,
           inplace=True)
    
    # Reset the index:
    reddit_data.reset_index(drop = True, inplace = True)
    
    # Export to a csv
    reddit_data.to_csv(f'../data/{file_name}.csv', index=False)
        
    return reddit_data

In [26]:
df_onion_test = pd.read_csv('../data/theonion_1578009619.csv')
df_wldnws_test = pd.read_csv('../data/worldnews_1680688103.csv')

df = onion_wldnws_cleaner(df_onion_test, df_wldnws_test, 'reddit_holdout_2')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wldnws_df['title'] = wldnws_df['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion_df['title'] = onion_df['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion_df.drop(index=bad_onion_dfs, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the doc

Unnamed: 0,subreddit,title
0,TheOnion,US Schools Trail World In Child Soldier Aptitude
1,TheOnion,Is Your Flamingo Sick Enough To Make A Movie A...
2,TheOnion,More American Workers Outsourcing Own Jobs Ove...
3,TheOnion,So People Could Be Listening To This Conversat...
4,TheOnion,Destroyed substitute teachers
...,...,...
1727,worldnews,For the first time renewable energy generation...
1728,worldnews,Chinas loans to Africa worry World Bank Presid...
1729,worldnews,In Ukraine where even the corpses are booby tr...
1730,worldnews,Russian aggression killed 262 Ukrainian athletes


# <font color = 'red'> DONT FORGET TO RESET INDICES

In [14]:
df = onion_wldnws_cleaner(df_onion, df_wldnws, )
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wldnws['title'] = wldnws['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion['title'] = onion['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion.drop(index=bad_onions, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

Unnamed: 0,subreddit,title
0,TheOnion,Idiot Tornado Tears Harmlessly Through Empty F...
1,TheOnion,New Texas Law Requires Schools To Display Imag...
2,TheOnion,New Poll Finds Americans Would Respect Biden M...
3,TheOnion,Could You Pass Racial Discrimination Training ...
4,TheOnion,Dog And Owner Having Public Fight
...,...,...
10360,worldnews,Diners in Japan arrested for dipping own chops...
10361,worldnews,200 Russian Journalists Sign Letter Demanding ...
10362,worldnews,Foxconn founder Gou to run for Taiwan presiden...
10363,worldnews,A wartime NATO struggles to replace its chief


In [10]:
df = onion_wldnws_cleaner(df_onion, df_wldnws, )
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wldnws['title'] = wldnws['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion['title'] = onion['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion.drop(index=bad_onions, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

Unnamed: 0,subreddit,title
0,TheOnion,Idiot Tornado Tears Harmlessly Through Empty F...
1,TheOnion,New Texas Law Requires Schools To Display Imag...
2,TheOnion,New Poll Finds Americans Would Respect Biden M...
3,TheOnion,Could You Pass Racial Discrimination Training ...
4,TheOnion,Dog And Owner Having Public Fight
...,...,...
10366,worldnews,Diners in Japan arrested for dipping own chops...
10367,worldnews,200 Russian Journalists Sign Letter Demanding ...
10368,worldnews,Foxconn founder Gou to run for Taiwan presiden...
10369,worldnews,A wartime NATO struggles to replace its chief


In [18]:
df_onion_test = pd.read_csv('../data/theonion_1578009619.csv')
df_wldnws_test = pd.read_csv('../data/worldnews_1680688103.csv')

df = onion_wldnws_cleaner(df_onion_test, df_wldnws_test, 'reddit_holdout_2')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wldnws['title'] = wldnws['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion['title'] = onion['title'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  onion.drop(index=bad_onions, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

Unnamed: 0,subreddit,title
0,TheOnion,US Schools Trail World In Child Soldier Aptitude
1,TheOnion,Is Your Flamingo Sick Enough To Make A Movie A...
2,TheOnion,More American Workers Outsourcing Own Jobs Ove...
3,TheOnion,So People Could Be Listening To This Conversat...
4,TheOnion,Destroyed substitute teachers
...,...,...
1724,worldnews,For the first time renewable energy generation...
1725,worldnews,Chinas loans to Africa worry World Bank Presid...
1726,worldnews,In Ukraine where even the corpses are booby tr...
1727,worldnews,Russian aggression killed 262 Ukrainian athletes


In [12]:
df.to_csv('../data/holdout_data.csv')

In [None]:
# 2 - 1.1.2 - DROP DUPLICATE TITLES

onion.drop_duplicates(subset = 'title', inplace=True)
wldnws.drop_duplicates(subset = 'title', inplace=True)
onion.shape, wldnws.shape

In [None]:
# 2 - 1.3 - DROP UNNEEDED COLUMNS
onion = onion[['subreddit', 'title']]
wldnws = wldnws[['subreddit', 'title']]

In [None]:
# 2 - 2.1 - FIX HTML ERRORS
wldnws['title'] = wldnws['title'].apply(html.unescape)
onion['title'] = onion['title'].apply(html.unescape)

In [None]:
# 2 - 2.4 - REMOVE LOWERCASE ARTICLES
bad_onions = list(onion[onion['title'].str.islower() == True].index)
bad_news = list(wldnws[wldnws['title'].str.islower() == True].index)

onion.drop(index=bad_onions, inplace=True)
wldnws.drop(index=bad_news, inplace=True)

In [None]:
# 2 - 3 - Remove non-English

# Code provided from the following source:  https://stackoverflow.com/questions/66433496/how-do-i-fix-valueerror-when-doing-nlp-add-pipelanguagedetector-name-langua
#  The documentation was providing code that was not working, and the above stack overflow post managed to fix the issue
# SOURCE:  https://pypi.org/project/spacy-langdetect/

nlp = spacy.load("en_core_web_sm")

def create_lang_detector(nlp, name):
    return LanguageDetector()

Language.factory("language_detector", func=create_lang_detector)
nlp.add_pipe('language_detector', last=True)

wldnws['lang'] = wldnws['title'].apply(lambda x: nlp(x)._.language['language'])
wldnws.head(2)

wldnws = wldnws[wldnws['lang'] == 'en']

onion['lang'] = onion['title'].apply(lambda x: nlp(x)._.language['language'])
onion.head(2)

onion = onion[onion['lang'] == 'en']

In [None]:
# 2 - 4.3 - CREATE CONCATENATED DATAFRAME
reddit_df = pd.concat([onion[['subreddit', 'title']], wldnws[['subreddit', 'title']]]).reset_index(drop = True)
print(reddit_df.shape)
reddit_df.head()

In [None]:
# 3 - 4.1 - DROP TITLES REFERENCING SUBREDDIT NAME

# Find all titles with some variation of World News

# The regex code here was created with the help of the following sources:
# Source:  https://stackoverflow.com/questions/18402416/regular-expression-to-match-a-word-or-its-prefix
# Source:  https://stackoverflow.com/questions/20462834/python-using-str-replace-with-a-wildcard
# Source:  https://regex101.com/
# Source:  https://stackoverflow.com/questions/5633533/regular-expression-for-matching-parentheses
# Source:  https://stackoverflow.com/questions/4007302/regex-how-to-match-an-optional-character
# Source:  https://www.regular-expressions.info/optional.html
# Source:  https://stackoverflow.com/questions/9655164/regex-ignore-case-sensitivity
# Source:  https://stackoverflow.com/questions/34583904/javascript-regex-ignore-case-for-specific-capture-group
# Source:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation

'''
The regex expression below will (in order or characters left to right):
- outer () - says to match all inside the parentheses
- /{0,1}r{0,1}/{0,1} - find a string with or without each of  /, r, and /
- [wW]orld[nN]ews - find worldnew optionally capitalized
- ( Live Thread: )? - it may have this text after it, if so find it, if not, don't
- | either or
- outer () - says to match all inside the parenthese
- \( - backslash tells it to treat left parenthesis as character
- Thread # - where it says 'Thread #'
- \d{1,} - \d any digit, {1,} at least one character long
- \) - backslash tells it to treat right parenthesis as character
'''

# Create the regex string
regex_string = '(/{0,1}r{0,1}/{0,1}[wW]orld[nN]ews( Live Thread: )?)|( \(Thread #\d{1,}\))'
bad_titles = list(reddit[reddit['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0].index)

# Append the full list of articles containing 'onion' identified above to the bad_titles list

for title in list(reddit[reddit['title'].str.lower().str.find('onion') > -1].index):
    bad_titles.append(title)

reddit.drop(index=bad_titles, inplace=True);

In [None]:
# 3 - 4.2 - Remove Hashtags:
reddit['title'] = reddit['title'].apply(lambda title: title.replace('&#x27;',''))

# Find all titles with some variation of World News

'''
The regex string below looks to find and # followed by a word and not followed by a number

'''

# Create the regex string from before
regex_string = '#\D\w+'

reddit[reddit['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0]

reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title));

In [None]:
# 3 - 4.3 - REMOVED NEW ARTICLE TITLES USUALLY AFTER PIPES AND DASHES

# For the PIPES
regex_string = '\| [\w \W]+'

reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))

# For the DASHES
regex_string = '\- [\w \W]+'

reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))

In [None]:
# 3 - 4.4 - REMOVE EMOJIS AND SPECIAL CHARACTERS

# Test the regex expression to remove emojis:

# This expression will remove emojis and most punctuation which won't be necessary for the algorithms used.
# Source to help with this code:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation
'''
This regex will find all non word items and exclude all of the following characters (a space is the first character):  ' &$%\-#/'

The characters selected to be excluded above were iteratively chosen by examining the outputs of the find all code below.  
The word vectorizers will do the vast majority of the punctuation removal, but here, these characters are being skipped 
as replacing them with nothing may change the meaning of some words or combine two words that shoud not be together.

'''
#regex_string2 = "((?=[^ &$%\-#/])\W)"
regex_string = "((?=[^ &$%\-#/])\W)"
[re.findall(regex_string, x) for x in reddit.title if len(re.findall(regex_string, x))>0]

reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))



In [None]:
# 3 - 4.5 - Remove Starting and Trailing Spaces
reddit['title'] = reddit['title'].apply(lambda title: title.strip())

In [17]:
def onion_wldnws_cleaner(onion_df, wldnws, file_name):
    
    # DROP DUPLICATE TITLES
    onion_df.drop_duplicates(subset = 'title', inplace=True)
    wldnws.drop_duplicates(subset = 'title', inplace=True)
    
    # DROP UNNEEDED COLUMNS
    onion_df = onion_df[['subreddit', 'title']]
    wldnws = wldnws[['subreddit', 'title']]
    
    # FIX HTML ERRORS
    wldnws['title'] = wldnws['title'].apply(html.unescape)
    onion_df['title'] = onion_df['title'].apply(html.unescape)
    
    # REMOVE LOWERCASE ARTICLES
    bad_onion_dfs = list(onion_df[onion_df['title'].str.islower() == True].index)
    bad_news = list(wldnws[wldnws['title'].str.islower() == True].index)
    onion_df.drop(index=bad_onion_dfs, inplace=True)
    wldnws.drop(index=bad_news, inplace=True)
    
    # REMOVE NON_ENGLISH
    wldnws['lang'] = wldnws['title'].apply(lambda x: nlp(x)._.language['language'])
    wldnws.head(2)

    wldnws = wldnws[wldnws['lang'] == 'en']

    onion_df['lang'] = onion_df['title'].apply(lambda x: nlp(x)._.language['language'])
    onion_df.head(2)

    onion_df = onion_df[onion_df['lang'] == 'en']
    
    # CREATE CONCATENATED DATAFRAME
    reddit = pd.concat([onion_df[['subreddit', 'title']], wldnws[['subreddit', 'title']]]).reset_index(drop = True)

    
# ==============================================================================================    
    
    # DROP TITLES REFERENCING SUBREDDIT NAME
    # Find all titles with some variation of World News
        # The regex code here was created with the help of the following sources:
        # Source:  https://stackoverflow.com/questions/18402416/regular-expression-to-match-a-word-or-its-prefix
        # Source:  https://stackoverflow.com/questions/20462834/python-using-str-replace-with-a-wildcard
        # Source:  https://regex101.com/
        # Source:  https://stackoverflow.com/questions/5633533/regular-expression-for-matching-parentheses
        # Source:  https://stackoverflow.com/questions/4007302/regex-how-to-match-an-optional-character
        # Source:  https://www.regular-expressions.info/optional.html
        # Source:  https://stackoverflow.com/questions/9655164/regex-ignore-case-sensitivity
        # Source:  https://stackoverflow.com/questions/34583904/javascript-regex-ignore-case-for-specific-capture-group
        # Source:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation
    '''
    The regex expression below will (in order or characters left to right):
    - outer () - says to match all inside the parentheses
    - /{0,1}r{0,1}/{0,1} - find a string with or without each of  /, r, and /
    - [wW]orld[nN]ews - find worldnew optionally capitalized
    - ( Live Thread: )? - it may have this text after it, if so find it, if not, don't
    - | either or
    - outer () - says to match all inside the parenthese
    - \( - backslash tells it to treat left parenthesis as character
    - Thread # - where it says 'Thread #'
    - \d{1,} - \d any digit, {1,} at least one character long
    - \) - backslash tells it to treat right parenthesis as character
    '''
    # Create the regex string
    regex_string = '(/{0,1}r{0,1}/{0,1}[wW]orld[nN]ews( Live Thread: )?)|( \(Thread #\d{1,}\))'
    bad_titles = list(reddit[reddit['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0].index)
    # Append the full list of articles containing 'onion' identified above to the bad_titles list
    for title in list(reddit[reddit['title'].str.lower().str.find('onion') > -1].index):
        bad_titles.append(title)
    reddit.drop(index=bad_titles, inplace=True);
    
    # REMOVE HASHTAGS
    reddit['title'] = reddit['title'].apply(lambda title: title.replace('&#x27;',''))
    # Find all titles with some variation of World News
    '''
    The regex string below looks to find and # followed by a word and not followed by a number
    '''
    # Create the regex string from before
    regex_string = '#\D\w+'
    reddit[reddit['title'].apply(lambda title: len(re.findall(regex_string, title))) > 0]
    reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title));
    
    # REMOVED NEW ARTICLE TITLES USUALLY AFTER PIPES AND DASHES
    # For the PIPES
    regex_string = '\| [\w \W]+'
    reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))
    # For the DASHES
    regex_string = '\- [\w \W]+'
    reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))
    
    #  REMOVE EMOJIS AND SPECIAL CHARACTERS
    # Test the regex expression to remove emojis:
        # This expression will remove emojis and most punctuation which won't be necessary for the algorithms used.
        # Source to help with this code:  https://stackoverflow.com/questions/7548787/regex-for-and-not-operation
    '''
    This regex will find all non word items and exclude all of the following characters (a space is the first character):  ' &$%\-#/'
    The characters selected to be excluded above were iteratively chosen by examining the outputs of the find all code below.  
    The word vectorizers will do the vast majority of the punctuation removal, but here, these characters are being skipped 
    as replacing them with nothing may change the meaning of some words or combine two words that shoud not be together.
    '''
    regex_string = "((?=[^ &$%\-#/])\W)"
    reddit['title'] = reddit['title'].apply(lambda title: re.sub(regex_string, '', title))
    
    # Remove Starting and Trailing Spaces
    reddit['title'] = reddit['title'].apply(lambda title: title.strip())
    
    # REMOVE TITLES WITH TWO OR FEWER WORDS
    reddit.drop(index = reddit[reddit['title'].apply(lambda x: len(x.strip().split())) <= 2].index,
           inplace=True)
    
    # Reset the index:
    reddit.reset_index(drop = True, inplace = True)
    
    # Export to a csv
    reddit.to_csv(f'../data/{file_name}.csv', index=False)
        
    return reddit