In [None]:
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages before any of the helpers below are called.
nltk.download('stopwords')  # backs stopwords.words('english') in removeStopWords
nltk.download('punkt')  # NOTE(review): no use of punkt is visible in this notebook - confirm it is still needed
nltk.download('wordnet')  # presumably the data WordNetLemmatizer reads - confirm

In [None]:
def removePunctuation(tweet):
    """
    Swap every ASCII punctuation character (including '#') for one space.

    The result has the same length as the input: each character listed in
    ``string.punctuation`` becomes a blank, all other characters are kept.
    """
    blank_for = dict.fromkeys(map(ord, string.punctuation), ' ')
    return tweet.translate(blank_for)

def removeNums(tweet):
    """
    Strip digit characters out of the given string.

    Every character for which ``str.isdigit()`` is true is dropped; the
    remaining characters are returned in their original order.
    """
    kept = filter(lambda ch: not ch.isdigit(), tweet)
    return ''.join(kept)

In [None]:
def stemming(tweet_list):
    """
    Stemming - reduces the word-forms by removing suffixes.

    Args:
        tweet_list: iterable of word tokens (strings).

    Returns:
        list holding the Porter stem of each token, in input order.
    """
    # One stemmer serves the whole tweet; building a new one per word is wasted work.
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in tweet_list]

def lemmatization(tweet_list):
    """
    Lemmatization - reduces the word-forms to linguistically valid lemmas.

    Args:
        tweet_list: iterable of word tokens (strings).

    Returns:
        list holding the WordNet lemma of each token, in input order.
    """
    # One lemmatizer serves the whole tweet; building a new one per word is wasted work.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tweet_list]


def removeStopWords(tweet_list):
    """
    Removes stop-words from the given tweet.

    Args:
        tweet_list: iterable of word tokens (strings).

    Returns:
        list of the tokens that are not in NLTK's English stop-word list,
        in input order. Matching is exact (case-sensitive).
    """
    # Load the stop-word list once per call and keep it in a set: the original
    # re-evaluated stopwords.words('english') and scanned the list for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in tweet_list if word not in english_stopwords]

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict

# Shared ekphrasis pipeline applied to every raw tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized ('url' was listed twice; once is enough)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated (currently disabled)
    #annotate={"hashtag", "allcaps", "elongated", "repeated",
    #    'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons, ] # slangdict
)

In [None]:
# Read the training tweets and key the frame by its 'id' column.
train_df = pd.read_csv('../dataset/train.csv').set_index('id')
train_df

In [None]:
# Placeholder tokens stripped from the text before punctuation removal;
# they mirror the terms listed in text_processor's `normalize` option.
NORMALIZED_TAGS = ['<url>', '<email>', '<percent>', '<money>', '<phone>',
                   '<user>', '<time>', '<date>', '<number>']

# Columns added to train_df, in creation order.
DERIVED_COLUMNS = ['ekphrasis', 'ekphrasis_no_symtags', 'ekphrasis_rm',
                   'lemmatization', 'stemming']


def _preprocessTweet(text):
    """
    Build every derived text variant for one raw tweet.

    Args:
        text: raw tweet string.

    Returns:
        dict keyed by DERIVED_COLUMNS:
          - 'ekphrasis': tokens from text_processor joined by single spaces
          - 'ekphrasis_no_symtags': 'ekphrasis' with every '<' and '>' removed
          - 'ekphrasis_rm': 'ekphrasis' with the NORMALIZED_TAGS dropped,
            punctuation replaced by spaces and digits removed
          - 'lemmatization': stop-word-free tokens of 'ekphrasis_rm', lemmatized
          - 'stemming': stop-word-free tokens of 'ekphrasis_rm', stemmed
    """
    ekphrasis = ' '.join(text_processor.pre_process_doc(text))

    # str has no .remove(); .replace() is the method that deletes a substring.
    no_symtags = ekphrasis.replace('>', '').replace('<', '')

    # Work from the freshly computed string: a row yielded by iterrows() is a
    # snapshot and never sees columns written during the loop.
    without_tags = ekphrasis
    for tag in NORMALIZED_TAGS:
        without_tags = without_tags.replace(tag, '')
    cleaned = removeNums(removePunctuation(without_tags))

    tokens = removeStopWords(cleaned.split())
    return {
        'ekphrasis': ekphrasis,
        'ekphrasis_no_symtags': no_symtags,
        'ekphrasis_rm': cleaned,
        'lemmatization': ' '.join(lemmatization(tokens)),
        'stemming': ' '.join(stemming(tokens)),
    }


derived_rows = []
for index, text in train_df['text'].items():
    derived_rows.append(_preprocessTweet(text))
    print("record #{} processing finished".format(index))

# Assign whole columns positionally: this also works for an empty frame or a
# non-unique index, and avoids growing the frame one cell at a time.
for column in DERIVED_COLUMNS:
    train_df[column] = [derived[column] for derived in derived_rows]

In [None]:
# Display the frame to inspect the columns added by the preprocessing cell.
train_df

In [None]:
# Save the processed frame as CSV; the path is relative to the working directory.
train_df.to_csv('train_ekphrasis.csv')