In [None]:
import json
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import RegexpStemmer
import emot

In [3]:
# function for cleaning and stemming tweets for a given month and recording emoji meanings
# parameters for removing URLs, hashtags, punctuation, stop words, and converting to lowercase

def text_cleaning(month, year, joy, rm_urls, rm_tags, rm_punc, rm_stop, lower):
    """Clean and stem one month of tweets and record their emoji meanings.

    Reads ``/joy/joyData/Joy-NoJoy/<Joy|NoJoy>-<month>-<year>.csv``, processes
    every entry of its ``content`` column, adds the columns ``clean_tweet``,
    ``clean_stems`` and ``emoji_meanings`` to the frame, and writes it to
    ``Joy-NoJoy/<Joy|NoJoy>-<month>-<year>.csv``. The output path is relative,
    so it resolves against the current working directory.

    Parameters
    ----------
    month, year : values passed through ``str()`` to build the file name.
    joy : truthy selects the ``Joy-`` file, falsy the ``NoJoy-`` file.
    rm_urls : if truthy, strip URLs / bare host names / IP addresses.
    rm_tags : if truthy, strip hashtags (``#word`` plus trailing whitespace).
    rm_punc : if truthy, drop ``string.punctuation`` characters and newlines.
    rm_stop : if truthy, drop NLTK English stop words from the cleaned tweet.
    lower : if truthy, convert the tweet to lower case.

    Returns
    -------
    list of str
        The cleaned tweets, in the same order as the input rows.

    Notes
    -----
    * Stems are built from every token, stop words included, even when
      ``rm_stop`` is set.
    * Emoji meanings are looked up on the cleaned tweet text.
    """

    # determine if looking at joy or no joy tweets
    path = 'Joy-NoJoy/Joy-' if joy else 'Joy-NoJoy/NoJoy-'

    file = '/joy/joyData/' + path + str(month) + '-' + str(year) + '.csv'
    df = pd.read_csv(file)

    # --- loop-invariant setup (previously rebuilt for every tweet / every token) ---

    url_regex = r"\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b"
    url_pattern = re.compile(url_regex)

    # Python regexes take no /.../ delimiters; with the slashes the pattern only
    # matched hashtags literally wrapped in '/' characters, i.e. never.
    tag_pattern = re.compile(r'#\w+\s*')

    # set membership is O(1) per character / token
    drop_chars = set(string.punctuation) | {'\n'}

    # only touch the stop word corpus when it is actually needed
    #nltk.download('stopwords')
    stop_words = set(stopwords.words('english')) if rm_stop else set()

    regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
    emot_obj = emot.core.emot()

    clean_tweets = []
    clean_stems = []
    emoji_meanings = []

    for tweet in df['content']:

        # remove URLs
        if rm_urls:
            tweet = url_pattern.sub('', tweet)

        # remove hashtags (must run before punctuation removal, which deletes '#')
        if rm_tags:
            tweet = tag_pattern.sub('', tweet)

        # remove punctuation
        if rm_punc:
            tweet = "".join(char for char in tweet if char not in drop_chars)

        # convert to lower case
        if lower:
            tweet = tweet.lower()

        # tokenization (word_tokenize keeps emojis as tokens)
        word_tokens = nltk.word_tokenize(tweet)

        # remove stop words
        if rm_stop:
            tweet = " ".join(word for word in word_tokens if word not in stop_words)

        clean_tweets.append(tweet)

        # stemming (over all tokens, stop words included)
        clean_stems.append(" ".join(regexp.stem(word) for word in word_tokens))

        # find emoji meanings; the list stays empty when the tweet has no emojis
        emojis = emot_obj.emoji(tweet)
        tweet_emojis = [
            # chain the replacements: previously the second replace restarted
            # from `meaning` and discarded the underscore -> space conversion
            meaning.replace("_", " ").replace(":", "")
            for _, meaning in zip(emojis['value'], emojis['mean'])
        ]
        emoji_meanings.append(tweet_emojis)

    # write df to file with new columns that include the cleaned tweets, stems, and emoji meanings
    df['clean_tweet'] = clean_tweets
    df['clean_stems'] = clean_stems
    df['emoji_meanings'] = emoji_meanings
    df.to_csv(path + str(month) + '-' + str(year) + '.csv', index = False, mode = 'w')

    return clean_tweets
    


In [4]:
# month-end timestamps spanning the data set; each one becomes a
# [zero-padded month, four-digit year] pair of strings used to build file names
dates = pd.date_range(start = '09/01/2019', end = '01/31/2022', freq = 'M')

months = [[stamp.strftime('%m'), stamp.strftime('%Y')] for stamp in dates]


# for every month clean the joy tweets first, then the no joy tweets,
# and print the pair once both files have been processed
for month in months:

    mm, yyyy = month

    for is_joy in (True, False):
        text_cleaning(month = mm, year = yyyy, joy = is_joy, rm_urls = True, rm_tags = False, rm_punc = True, rm_stop = True, lower = True)

    print(month)

['09', '2019']
['10', '2019']
['11', '2019']
['12', '2019']
['01', '2020']
['02', '2020']
['03', '2020']
['04', '2020']
['05', '2020']
['06', '2020']
['07', '2020']
['08', '2020']
['09', '2020']
['10', '2020']
['11', '2020']
['12', '2020']
['01', '2021']
['02', '2021']
['03', '2021']
['04', '2021']
['05', '2021']
['06', '2021']
['07', '2021']
['08', '2021']
['09', '2021']
['10', '2021']
['11', '2021']
['12', '2021']
['01', '2022']


In [19]:
# Determining the Best Stemmer/Lemmatizer for above function 

# which ones convert words like "enjoy", "joyous", or "joyful" to "joy"?


import nltk 
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem import RegexpStemmer
from nltk.stem import SnowballStemmer


# instantiate every stemmer / lemmatizer under comparison
porter = PorterStemmer()
lancaster = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
snowball = SnowballStemmer("english")


# joy-related words to push through each tool
words = ['joy', 'joys', 'enjoy', 'joyful', 'joyous']

# one row per word: the word itself, then each tool's result
# (order must line up with the column names given to the DataFrame below)
stems = [
    [
        word,
        porter.stem(word),
        lancaster.stem(word),
        regexp.stem(word),
        snowball.stem(word),
        wordnet_lemmatizer.lemmatize(word, pos='v'),
    ]
    for word in words
]


df = pd.DataFrame(stems, columns=['Word', 'Porter', 'Lancaster', 'RegExp', 'Snowball', 'Lemma'])
print(df)

# Lancaster includes the most variations of joy
# Using regular expressions to stem or word net lemmatizer includes the least
# Snowball and Porter include joy and joyful but not joyous
# None include enjoy

     Word Porter Lancaster  RegExp Snowball   Lemma
0     joy    joy       joy     joy      joy     joy
1    joys    joy       joy     joy      joy     joy
2   enjoy  enjoy     enjoy   enjoy    enjoy   enjoy
3  joyful    joy       joy  joyful      joy  joyful
4  joyous  joyou       joy   joyou   joyous  joyous
