In [3]:
import json
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import RegexpStemmer
import emot


def text_cleaning(month, year, joy, rm_urls, rm_tags, rm_punc, rm_stop, lower):
    
    # determine if looking at joy or no joy tweets
    if joy == True: 
        path = 'Joy-NoJoy/Joy-'
    
    else:
        path = 'Joy-NoJoy/NoJoy-'
        
        
    file = '/joy/joyData/' + path + str(month) + '-' + str(year) + '.csv'
    df = pd.read_csv(file)
    
    
    clean_tweets = []
    clean_stems = []
    emoji_meanings = []
    
    for tweet in df['content']:
        

        # remove URLs
        if rm_urls == True:
        
            url_regex = r"\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b"
            tweet = re.sub(url_regex, '', tweet)
        
        # remove hashtags
        if rm_tags == True:
            
            tag_regex = '/#\w+\s*/'
            tweet = re.sub(tag_regex, '', tweet)
            
        # remove punctuation
        if rm_punc == True:
            
            tweet = "".join([char for char in tweet if char not in string.punctuation and char not in ['\n']])
        
        # convert to lower case
        if lower == True:
            
            tweet = tweet.lower()
            
        # tokenization (second one removes emojis)
        word_tokens = nltk.word_tokenize(tweet)
        #word_tokens = RegexpTokenizer(r'\w+').tokenize(tweet)
        
        # remove stop words
        if rm_stop == True:
            
            #nltk.download('stopwords')
            
            non_stopwords = [word for word in word_tokens if word not in stopwords.words('english')]
            tweet = " ".join(non_stopwords)
        
        clean_tweets.append(tweet)
          
            
        # stemming
        regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
            
        stems = []
        for word in word_tokens:
            stems.append(regexp.stem(word))
                             
        stem = " ".join(stems)
    
        clean_stems.append(stem)
        
        
        # find emoji meanings
        emot_obj = emot.core.emot() 
        emojis = emot_obj.emoji(tweet)
        tweet_emojis = []
    
        # if there are emojis in the tweet replace them with their meanings
        if len(emojis['value']) > 0:
                
            for i in range(len(emojis['value'])):
                    
                meaning = emojis['mean'][i]
                clean_meaning = meaning.replace("_", " ")
                clean_meaning = meaning.replace(":", "")
                tweet_emojis.append(clean_meaning)
        
        emoji_meanings.append(tweet_emojis)
                            
        
        
    # write df to file with new column that includes the cleaned tweets
    df['clean_tweet'] = clean_tweets
    df['clean_stems'] = clean_stems
    df['emoji_meanings'] = emoji_meanings
    df.to_csv(path + str(month) + '-' + str(year) + '.csv', index = False, mode = 'w')
    
    return clean_tweets
    


In [4]:
# create a list of months to loop through for counting joy tweets 
dates = pd.date_range(start = '09/01/2019', end = '01/31/2022', freq = 'M')


months = []
for date in dates:
    date = str(date).split('-')
    year = date[0]
    month = date[1]
    months.append([month, year])
    
    
# loop through months and clean both joy and no joy tweets

for month in months:
    
    text_cleaning(month = month[0], year = month[1], joy = True, rm_urls = True, rm_tags = False, rm_punc = True, rm_stop = True, lower = True)
    text_cleaning(month = month[0], year = month[1], joy = False, rm_urls = True, rm_tags = False, rm_punc = True, rm_stop = True, lower = True)
    print(month)

['09', '2019']
['10', '2019']
['11', '2019']
['12', '2019']
['01', '2020']
['02', '2020']
['03', '2020']
['04', '2020']
['05', '2020']
['06', '2020']
['07', '2020']
['08', '2020']
['09', '2020']
['10', '2020']
['11', '2020']
['12', '2020']
['01', '2021']
['02', '2021']
['03', '2021']
['04', '2021']
['05', '2021']
['06', '2021']
['07', '2021']
['08', '2021']
['09', '2021']
['10', '2021']
['11', '2021']
['12', '2021']
['01', '2022']


In [19]:
# Look for Stemmer/Lemmatizer
import nltk 
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import RegexpStemmer
from nltk.stem import SnowballStemmer


# create an object for each of the stemmers
porter = PorterStemmer()
lancaster=LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
snowball = SnowballStemmer("english")


words = ['joy', 'joys', 'enjoy', 'joyful', 'joyous']

stems = []
for word in words:
  stems.append([word, porter.stem(word), lancaster.stem(word), regexp.stem(word), snowball.stem(word), wordnet_lemmatizer.lemmatize(word, pos='v')])


df = pd.DataFrame(stems, columns=['Word', 'Porter', 'Lancaster', 'RegExp', 'Snowball', 'Lemma'])
print(df)

# Lancaster includes the most variations of joy
# Using regular expressions to stem or word net lemmatizer includes the least
# Snowball and Porter include joy and joyful but not joyous
# None include enjoy

     Word Porter Lancaster  RegExp Snowball   Lemma
0     joy    joy       joy     joy      joy     joy
1    joys    joy       joy     joy      joy     joy
2   enjoy  enjoy     enjoy   enjoy    enjoy   enjoy
3  joyful    joy       joy  joyful      joy  joyful
4  joyous  joyou       joy   joyou   joyous  joyous


In [32]:
jan2020_tweets = text_cleaning(month = '01', year = '2020', joy = True, rm_urls = True, rm_tags = False, rm_punc = True, rm_stop = True, lower = True, stem = True)
print(jan2020_tweets)

['wanted wish yall happy 2020 thank making 2019 wonderful journey may new year bring much joy peace happiness love prosperity good health wealth blessingshappynewyear…', 'sun sets first day 2020flaming style happy new year everyone may lives light joy promise loveamen gratefulheart newyearsday2020…', 'sending love joy mary meyer school families friends happy new year2020 mary meyer school', 'audrinalane happy happy joy joy chicagos south side illinois', '2020 taking joy back tell satan want know 2020 thehand virgin hotels chicago', 'bad stuff still lot good ’ really loved getting deeper podcasting world ariynbf crew listeners kind fun supportive truly joy thank alisonrosen', 'teachers worked hard bring much joy love learning laughter fairviewsd72 school year ’ wait see ’ store 2020 happy new year', 'much turning new leafhope 2020 brings joy happiness seek may learn grow better together happy new year', 'hope video brings tears joy well love technology boy receives new 3dprinted hands c

In [33]:
# replace emojis with words
example = jan2020_tweets[12]

emot_obj = emot.core.emot() 
emojis = emot_obj.emoji(example)
    
# if there are emojis in the tweet replace them with their meanings
if len(emojis['value']) > 0:
    for i in range(len(emojis['value'])):
        emoji = emojis['value'][i]
        meaning = emojis['mean'][i]
        tweet = example.replace(emoji, meaning)

print(tweet)

happy new year therichardsons may year bring love joy peace prosperity :kiss_mark: metropolitan club


In [34]:
# examples of punctuation not being removed
print(jan2020_tweets[56])
print(jan2020_tweets[51])

“ pain parting nothing joy meeting ” ✨
see later 2019 good riddance onward upward 2020 may year bring hope happiness joy love light laughter relish simple pleasures find joy in…


In [33]:
# strange example
print(jan2020_tweets[16])
df = pd.read_csv('/joy/joyData/Joy-NoJoy/Joy-01-2020.csv')
print(list(df['content'])[16])

┏━━┓┏━━┓┏━━┓┏━━┓┗━┓┃┃┏┓┃┗━┓┃┃┏┓┃┏━┛┃┃┃┃┃┏━┛┃┃┃┃┃may year filled joy amp love┃┏━┛┃┃┃┃┃┏━┛┃┃┃┃┃┗━┓┃┗┛┃┃┗━┓┃┗┛┃┗━━┛┗━━┛┗━━┛┗━━┛happy new year
┏━━┓┏━━┓┏━━┓┏━━┓
┗━┓┃┃┏┓┃┗━┓┃┃┏┓┃
┏━┛┃┃┃┃┃┏━┛┃┃┃┃┃
May this year be filled with joy &amp; love!
┃┏━┛┃┃┃┃┃┏━┛┃┃┃┃
┃┗━┓┃┗┛┃┃┗━┓┃┗┛┃
┗━━┛┗━━┛┗━━┛┗━━┛
Happy New Year!!!


In [32]:
print(string.punctuation)
print(stopwords.words('english'))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only