In [12]:
import pandas as pd
import numpy as np
import re
import string 
from stop_words import get_stop_words
from nltk.corpus import stopwords
# Build one combined English stop-word list: start from the `stop_words`
# package list, then append NLTK's list. `extend` does not de-duplicate, so
# words present in both sources appear twice in `stop_words`.
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
from contraction_map import CONTRACTION_MAP

In [10]:
# Load the collected tweets and keep only the columns used downstream
cols = ['id', 'date', 'tweet', 'hashtags',
        'username', 'place', 'geo', 'timezone']
data = pd.read_csv("Task1-Data_collection/tweets_1_drop_duplicates.csv")
data = data.loc[:, cols]

# Drop duplicates: rows are compared by their string form, and the surviving
# index values are then used to select the rows to keep
unique_rows = data.astype(str).drop_duplicates().index
data = data.iloc[unique_rows]
data.shape

(508, 8)

In [14]:
# Clean thoroughly 
def remove_hyperlinks(text):
    """Strip URLs from ``text``.

    Three shapes are removed:
      * anything starting with ``http://`` or ``https://`` up to the next whitespace,
      * anything starting with ``www.`` up to the next whitespace,
      * bare ``something.com`` domains, plus any path or suffix glued to them.

    The dots before ``com`` and after ``www`` are escaped so they match a
    literal dot only. Unescaped, ``.com`` matched any character followed by
    "com", which mangled ordinary words ("income" -> "e", "the company" -> "pany").

    Parameters
    ----------
    text : str
        Raw text, any case.

    Returns
    -------
    str
        ``text`` with every matched URL replaced by the empty string.
        Surrounding whitespace is left in place.
    """
    ptn = r'(https?://\S+)|(www\.\S+)|([\w./-]+\.com\b[\w./-]*)'
    # IGNORECASE so the function also works when called on text that has not
    # been lowercased first.
    return re.sub(ptn, '', text, flags=re.IGNORECASE)

def remove_mentions(text):
    """Strip ``@username`` mentions (and stray ``@`` signs) from ``text``.

    The pattern matches an ``@`` followed by zero or more word characters, so
    a mention is removed wherever it sits: at the start, in the middle, at the
    very end of the text, or directly followed by punctuation.

    The earlier pattern contained literal spaces inside its alternatives, so a
    mention was only removed when followed by a space; at the end of the text
    just the " @" was deleted and the username was glued onto the preceding
    word ("hello @user" -> "hellouser").

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without mentions. Whitespace around a removed mention is kept.
    """
    ptn = r'@\w*'
    return re.sub(ptn, '', text)

def remove_stopwords(text):
    """Drop every whitespace-separated token that appears in ``stop_words``.

    The comparison is exact (case-sensitive) against the module-level
    ``stop_words`` collection. Surviving tokens are re-joined with single
    spaces, so runs of whitespace in the input are collapsed.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def remove_punctuations(text): 
    text = re.sub("[^a-zA-Z]", " ", text) 
    text = text.translate(str.maketrans('', '', string.punctuation))
    #text = text.translate(None, string.punctuation)
    return text 

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and then drop remaining apostrophes.

    Parameters
    ----------
    text : str
        Input text; apostrophes must still be present for contractions to match.
    contraction_mapping : Mapping[str, str]
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Lookup tries the matched text as-is first, then its lowercase form.

    Returns
    -------
    str
        ``text`` with each matched contraction replaced by its expansion. The
        first character of the original match is kept so capitalisation
        survives. Every remaining ``'`` is removed afterwards.
    """
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first; otherwise "can't" would shadow "can't've".
    keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    if not keys:
        # An empty alternation would match the empty string everywhere.
        return re.sub("'", "", text)

    # Escape keys so they are matched literally, not interpreted as regex.
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(k) for k in keys)),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        if not expanded_contraction:
            # The pattern is case-insensitive, so a key stored in mixed case
            # can match text whose exact/lowercase form is not a key. Leave
            # the text unchanged instead of failing on a missing expansion.
            return match
        # Keep the original first character to preserve the input's casing.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
    
def remove_alphabets(text):
    """Delete every stand-alone single ASCII letter from ``text``.

    A letter counts as stand-alone when it has a word boundary on both sides
    (e.g. the "a" in "a cat", or the "s" left over in "it's"). Only the letter
    is removed; the whitespace around it stays.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

def remove_digits(text):
    """Return ``text`` with every character for which ``str.isdigit`` is true removed."""
    non_digits = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(non_digits)

def clean_text(text, hyperlink=True, mention=True, stopwords=True,
               punctuations=True, contractions=True, digits=True,
               lowercase=True, alphabets=True):
    """Run the tweet-cleaning pipeline over one string.

    Each keyword flag switches one stage on or off. Enabled stages always run
    in this order: lowercase, hyperlinks, mentions, contractions, punctuation,
    stop words, single letters, digits.

    Contractions are expanded *before* punctuation is stripped. The
    punctuation stage replaces apostrophes with spaces, so running it first
    turns "can't" into "can t" and the contraction stage has nothing to match.

    Parameters
    ----------
    text : str
        The raw tweet text.
    hyperlink, mention, stopwords, punctuations, contractions, digits,
    lowercase, alphabets : bool
        Enable (True) or skip (False) the corresponding stage.
        NOTE: ``remove_punctuations`` replaces every non a-zA-Z character, so
        with ``punctuations=True`` digits are removed even if ``digits=False``.

    Returns
    -------
    str
        The cleaned text.
    """
    # The `stopwords` parameter shadows the nltk `stopwords` import inside
    # this function; here it is only used as a boolean flag.
    if lowercase:  # Transform to lowercase
        text = text.lower()
    if hyperlink:  # Remove Hyperlinks
        text = remove_hyperlinks(text)
    if mention:  # Remove Mentions
        text = remove_mentions(text)
    if contractions:  # Expand Contractions e.g. can't -> cannot (needs apostrophes intact)
        text = expand_contractions(text)
    if punctuations:  # Remove Punctuations
        text = remove_punctuations(text)
    if stopwords:  # Remove english stopwords
        text = remove_stopwords(text)
    if alphabets:  # Remove single alphabets
        text = remove_alphabets(text)
    if digits:  # Remove all numbers
        text = remove_digits(text)
    return text

# Overwrite the raw tweet text with its cleaned form (all clean_text stages
# enabled via the defaults). The original text is not kept in `data`, so
# re-running this cell cleans already-cleaned text.
data['tweet'] = data.tweet.map(clean_text)


In [16]:
# Persist the cleaned tweets to the working directory, then preview them.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra leading column — confirm that is intended.
data.to_csv("cleaned_tweets_data.csv")
data.head()

Unnamed: 0,id,date,tweet,hashtags,username,place,geo,timezone
0,1353492222344392707,2021-01-25 07:57:47,idk bothering alcohol inflation time high rn,[],MphoKgosidialwa,,,800
1,1353492195827965952,2021-01-25 07:57:40,inflation much money means prices rise eventua...,[],AllenWi92526840,,,800
2,1353492090311720960,2021-01-25 07:57:15,exerts daily fix talk inflation roll deafening...,"['fomc', 'trading']",ChrisWeston_PS,,,800
3,1353492059370479617,2021-01-25 07:57:08,regards michellins explains recent price rises...,[],morejunkfromu,,,800
4,1353492005008113665,2021-01-25 07:56:55,oh mike since clearly unaware inflation levels...,[],apecapital,,,800


### Text lemmatization

In [18]:
import spacy 
from nltk.tokenize.toktok import ToktokTokenizer
import nltk
#nlp = spacy.load('en_core')
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
# NOTE(review): `tokenizer` is not referenced by any cell visible here —
# confirm it is still needed.
tokenizer = ToktokTokenizer()

# spaCy pipeline used by lemmatize_text; the 'en_core_web_sm' model must be
# installed in the environment for this load to succeed.
nlp = spacy.load('en_core_web_sm')


In [19]:
def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    Parameters
    ----------
    text : str or missing value
        Cleaned tweet text. Missing values (``None`` or a NaN float) are
        treated as empty text.

    Returns
    -------
    str
        Space-joined lemmas, or ``''`` for missing input. Tokens whose lemma
        is the ``'-PRON-'`` placeholder (used by spaCy 2.x for pronouns) keep
        their original text instead.
    """
    # The column can contain float NaN for missing tweets. Calling str() on
    # those would inject the literal word "nan" into the corpus, so they are
    # short-circuited to an empty string.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    doc = nlp(str(text))  # str() keeps other non-string values working
    return ' '.join(tok.lemma_ if tok.lemma_ != '-PRON-' else tok.text for tok in doc)

# Store the lemmatized text in a new column; the cleaned `tweet` column is
# left as is.
data["parse_tweet"] = data.tweet.map(lemmatize_text)

In [20]:
# Preview the frame, now including the parse_tweet column
data.head()

Unnamed: 0,id,date,tweet,hashtags,username,place,geo,timezone,parse_tweet
0,1353492222344392707,2021-01-25 07:57:47,idk bothering alcohol inflation time high rn,[],MphoKgosidialwa,,,800,idk bother alcohol inflation time high rn
1,1353492195827965952,2021-01-25 07:57:40,inflation much money means prices rise eventua...,[],AllenWi92526840,,,800,inflation much money mean price rise eventuall...
2,1353492090311720960,2021-01-25 07:57:15,exerts daily fix talk inflation roll deafening...,"['fomc', 'trading']",ChrisWeston_PS,,,800,exert daily fix talk inflation roll deafen exu...
3,1353492059370479617,2021-01-25 07:57:08,regards michellins explains recent price rises...,[],morejunkfromu,,,800,regard michellin explain recent price rise inf...
4,1353492005008113665,2021-01-25 07:56:55,oh mike since clearly unaware inflation levels...,[],apecapital,,,800,oh mike since clearly unaware inflation level ...


In [21]:
# Persist the cleaned + lemmatized tweets to the working directory.
# NOTE(review): as with the earlier export, no index=False is passed, so the
# row index is presumably written as a leading column — confirm.
data.to_csv("cleaned_lemmatized_tweets_data.csv")