In [2]:
import re
import string
import ekphrasis
import emoji
import nltk
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.corpus import wordnet
# Characters deleted by remove_punctuation().
PUNCT_TO_REMOVE = string.punctuation
# Shared lemmatizer instance used by lemmatize_words().
lemmatizer = WordNetLemmatizer()
# Disabled: optional extra stop words (see the matching extend() call below).
# new_stopwords = ["rt"]
# English stop-word list consulted by remove_stopwords(). The negations
# "nor", "no" and "not" are taken out of the list so that they survive
# stop-word filtering.
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.remove("nor")
stpwrd.remove("no")
stpwrd.remove("not")
# stpwrd.extend(new_stopwords)

In [None]:
# Hashtags that remove_annotation_hashtag() strips from a tweet.
# NOTE(review): presumably these are the tags that were used to assign the
# stance labels — confirm against the annotation procedure.
# Each tag is listed exactly once ('#ClimateHoax' used to appear twice).
annotation_hashtag = [
    '#ClimateHoax',
    '#YellowVests',
    '#Qanon',
    '#GlobalWarmingHoax',
    '#ClimateChangeHoax',
    '#ClimateDenial',
    '#SaveClimate',
    '#ActOnClimate',
    '#ClimateChangeIsReal',
    '#ClimateActionNow',
    '#FactsMatter',
    '#ScienceMatters',
    '#ScienceIsReal',
]

In [None]:
def remove_urls(text):
    """Strip http(s):// links and bare www. addresses out of ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_punctuation(text):
    """Delete every character contained in PUNCT_TO_REMOVE from ``text``."""
    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

def remove_white_space(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading/trailing whitespace is reduced to one space, not stripped.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions report with a warning.
    return re.sub(r'\s+', ' ', text)

def remove_unwanted(text):
    """Replace non-word characters and stand-alone RT / cc markers with spaces.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every run of non-word characters turned into one space
        and every whole-token "RT" or "cc" (any letter case) replaced by a
        space. Whitespace is not collapsed afterwards.
    """
    # Collapse each run of non-word characters into a single space.
    text = re.sub(r'\W+', ' ', text)
    # \b limits the match to whole tokens so that words merely containing the
    # letters (e.g. "account", "START") are left intact. IGNORECASE is needed
    # because the pipeline lowercases the text before this step runs.
    return re.sub(r'\b(?:RT|cc)\b', ' ', text, flags=re.IGNORECASE)

def remove_stopwords(text):
    """Tokenize ``text`` and rejoin only the tokens that are not in ``stpwrd``."""
    kept_tokens = filter(lambda token: token not in stpwrd, word_tokenize(text))
    return " ".join(kept_tokens)

# ekphrasis text pre-processor
#
# Configuration based on the example in the ekphrasis README:
# https://github.com/cbaziotis/ekphrasis
text_processor = TextPreProcessor(
    # Term categories that will be normalized.
    normalize=[
        'url', 'email', 'percent', 'money',
        'phone', 'time', 'date', 'number',
    ],

    # Term annotation is currently disabled:
    # annotate={"hashtag", "allcaps", "elongated", "repeated",
    #     'emphasis', 'censored'},

    # Fix HTML tokens.
    fix_html=True,

    # Corpus whose word statistics are used for word segmentation.
    segmenter="twitter",

    # Corpus whose word statistics are used for spell correction.
    corrector="twitter",

    # Perform word segmentation on hashtags.
    unpack_hashtags=True,
    # Unpack contractions (can't -> can not).
    unpack_contractions=True,
    # No spell correction for elongated words.
    spell_correct_elong=False,

    # Tokenizer callable: takes a string and returns a list of tokens.
    # SocialTokenizer is used here; a custom one could be passed instead.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # Dictionaries used to replace tokens extracted from the text with other
    # expressions. More than one dictionary can be supplied.
    dicts=[emoticons],
)

# Small wrapper so the ekphrasis step can be called like the other
# string -> string helpers.
def ekphrasis_processor(text, text_processor=text_processor):
    """Run ``text`` through ekphrasis and return its tokens joined by spaces.

    ``text_processor`` defaults to the module-level processor that exists at
    the time this function is defined; pass another one to override it.
    """
    processed_tokens = text_processor.pre_process_doc(text)
    return ' '.join(processed_tokens)


# Stand-alone helper; preprocessing_pipeline does not call it.
def tokenize_text(text):
    """Tokenize ``text`` with word_tokenize and rejoin the tokens with spaces."""
    return " ".join(word_tokenize(text))

# Plain lower-casing step.
def lowerCase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, for use with lemmatize().

    The word is tagged on its own with nltk.pos_tag and the first letter of
    the resulting tag selects the constant; anything unrecognised is treated
    as a noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN
def lemmatize_words(text):
    """Lemmatize each token of ``text`` using its own POS tag; return them space-joined."""
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


def stemming_words(text):
    """Porter-stem every token of ``text`` and return the stems joined by spaces."""
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    return " ".join(porter.stem(token) for token in tokens)

def emoji_annotation(text):
    """Replace emoji in ``text`` with their textual names.

    Empty delimiters are used, so a name appears bare in the result
    (e.g. ``face_with_tears_of_joy``).
    """
    no_delimiters = ("", "")
    return emoji.demojize(text, delimiters=no_delimiters)
def remove_annotation_hashtag(text, hashtags=None):
    """Drop the annotation hashtags from ``text``.

    The text is split on whitespace and every token that equals one of the
    hashtags is removed. The comparison ignores letter case and any trailing
    punctuation, so ``#climatehoax`` and ``#ClimateHoax!`` are removed just
    like ``#ClimateHoax``.

    Args:
        text: The tweet text.
        hashtags: Optional iterable of hashtags (including the leading ``#``)
            to remove. Defaults to the module-level ``annotation_hashtag``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if hashtags is None:
        hashtags = annotation_hashtag
    # Case-folded set gives case-insensitive, constant-time membership tests.
    blocked = {tag.casefold() for tag in hashtags}
    kept_tokens = [
        token for token in text.split()
        # rstrip only touches the right end, so the leading '#' is preserved.
        if token.rstrip(string.punctuation).casefold() not in blocked
    ]
    return " ".join(kept_tokens)

def remove_mention(text):
    """Delete @-mentions (``@`` followed by letters, digits or underscores) from ``text``."""
    mention_pattern = re.compile("@[A-Za-z0-9_]+")
    return mention_pattern.sub("", text)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [None]:
# Sample tweets used to exercise preprocessing_pipeline. Together they
# contain hashtags, emoticons, an emoji, elongated words, a money amount,
# a date, a user mention, an annotation hashtag and a URL.
sentences = [
    "CAN'T WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! 😂 WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for #GlobalWarmingHoax the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/."
]

In [None]:
def preprocessing_pipeline(text):
    """Clean one raw tweet by running it through the helper steps in order.

    The order matters: annotation hashtags are removed before lower-casing,
    and emoji are converted to text only after punctuation has been removed.
    """
    ordered_steps = (
        remove_urls,
        remove_mention,             # remove user mentions
        remove_annotation_hashtag,
        lowerCase,
        ekphrasis_processor,        # segment hashtags
        remove_punctuation,
        emoji_annotation,           # convert emoji into text
        remove_unwanted,            # remove special characters
        remove_white_space,
        remove_stopwords,
        lemmatize_words,
    )
    for step in ordered_steps:
        text = step(text)
    return text

In [None]:
# Smoke-test: print the cleaned version of each sample tweet.
for sent in sentences:
  print(preprocessing_pipeline(sent))

not wait new season twin peak david lynch tv series happy
saw new john doe movie suuuuucks face_with_tears_of_joy waisted money bad movie annoyed
not wait date sentiment talk yaaaaaay


In [None]:
import pandas as pd

In [None]:
# Load the annotated tweets into df0.
# NOTE(review): the absolute /content/drive/... path presumably requires
# Google Drive to be mounted in Colab — confirm before running elsewhere.
df0 = pd.read_csv('/content/drive/MyDrive/nlp_course_project/tweetdata_with_annotation.csv')

In [None]:
# Preview the loaded frame (raw tweet_text plus the stance and sent columns).
df0

Unnamed: 0,tweetid,stance,sent,tweet_text
0,921191642844954624,believer,positive,@FriendsOScience @craigthomler @12zodiac_signs...
1,1422548442665177089,deny,neutral,@GretaThunberg Tweeting &amp; travelling to pa...
2,953945068502593536,believer,neutral,#THERMOMETER IN #WORLD’S #COLDEST VILLAGE BREA...
3,1435016842326708224,deny,positive,#global #canadanews #globalwarming #climatecha...
4,1425192287177527304,believer,neutral,#ClimateActionNow \nWe don't need ANY more #fo...
...,...,...,...,...
62758,953883848852488198,believer,positive,@1_TMF_ @climatefrauds @AtomsksSanakan @Mark_F...
62759,1454369889267879941,believer,positive,#Women in action\n#WorldClimateMarch #GlobalCl...
62760,953662087171100672,believer,positive,Via @euronews: A year of extremes: 2017 hottes...
62761,1499054675051438088,believer,neutral,Excellent thread on steps for #ClimateActionNo...


In [None]:
# Run every raw tweet through preprocessing_pipeline and store the result in
# a new 'cleaned_text' column (df0 is modified in place).
df0['cleaned_text'] = df0['tweet_text'].map(preprocessing_pipeline)

In [None]:
# Preview the frame again to inspect the new cleaned_text column.
df0

Unnamed: 0,tweetid,stance,sent,tweet_text,cleaned_text
0,921191642844954624,believer,positive,@FriendsOScience @craigthomler @12zodiac_signs...,excellent video climate change global warm cli...
1,1422548442665177089,deny,neutral,@GretaThunberg Tweeting &amp; travelling to pa...,tweet travel paid public relation event do not...
2,953945068502593536,believer,neutral,#THERMOMETER IN #WORLD’S #COLDEST VILLAGE BREA...,thermometer world coldest village break temper...
3,1435016842326708224,deny,positive,#global #canadanews #globalwarming #climatecha...,global canada news global warm climate change ...
4,1425192287177527304,believer,neutral,#ClimateActionNow \nWe don't need ANY more #fo...,climate action not need fossil fuel drilling m...
...,...,...,...,...,...
62758,953883848852488198,believer,positive,@1_TMF_ @climatefrauds @AtomsksSanakan @Mark_F...,yep climate change global warm climate change ...
62759,1454369889267879941,believer,positive,#Women in action\n#WorldClimateMarch #GlobalCl...,woman action world climate march global climat...
62760,953662087171100672,believer,positive,Via @euronews: A year of extremes: 2017 hottes...,via year extreme number hottest year ever reco...
62761,1499054675051438088,believer,neutral,Excellent thread on steps for #ClimateActionNo...,excellent thread step climate action ip report...


In [None]:
# Persist the cleaned tweets. `index` must be the boolean False: the string
# 'False' is truthy, so pandas would still write the row index as an extra
# unnamed first column.
df0.to_csv('cleaned_tweet_data.csv', index=False)