In [46]:
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/george/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/george/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/george/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
def removePunctuation(tweet):
    """
    Removes punctuations (removes # as well).

    """
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    return tweet.translate(translator)

def removeNums(tweet):
    """
    Removes numeric values from the given string.  
    """
    return ''.join([char for char in tweet if not char.isdigit()])

In [44]:
def stemming(tweet_list):
    """
    Stemming - reduces the word-forms by removing suffixes.

    """
    return [PorterStemmer().stem(word) for word in tweet_list]

def lemmatization(tweet_list):
    """
    Lemmatization - reduces the word-forms to linguistically valid lemmas.

    """
    return [WordNetLemmatizer().lemmatize(word) for word in tweet_list]


def removeStopWords(tweet_list):
    """
    Removes stop-words from the given tweet.

    """
    return [word for word in tweet_list if word not in stopwords.words('english')]

In [22]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.dicts.noslang.slangdict import slangdict

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    #annotate={"hashtag", "allcaps", "elongated", "repeated",
    #    'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons, ] # slangdict
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [23]:
train_df = pd.read_csv('../dataset/train.csv')
train_df.set_index('id', inplace=True)
train_df

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [24]:
for index, row in train_df.iterrows():
    train_df.at[index, 'ekphrasis'] = ' '.join(text_processor.pre_process_doc(row['text']))
    train_df.at[index, 'ekphrasis_no_symtags'] = train_df.at[index, 'ekphrasis'].remove('>', '').remove('<', '')
    tags = ['<url>', '<email>', '<percent>', '<money>', '<phone>', '<user>', '<time>', '<date>', '<number>']
    text = row['ekphrasis']
    for t in tags:
       text  = text.replace(t, '')
    train_df.at[index, 'ekphrasis_rm'] = removeNums(removePunctuation(text))

    text = removeStopWords(row['ekphrasis_rm'].split())
    train_df.at[index, 'lemmatization'] = ' '.join(lemmatization(text))
    train_df.at[index, 'stemming'] = ' '.join(stemming(text))

    print("record #{} processing finished".format(index))

record #1 processing finished
record #4 processing finished
record #5 processing finished
record #6 processing finished
record #7 processing finished
record #8 processing finished
record #10 processing finished
record #13 processing finished
record #14 processing finished
record #15 processing finished
record #16 processing finished
record #17 processing finished
record #18 processing finished
record #19 processing finished
record #20 processing finished
record #23 processing finished
record #24 processing finished
record #25 processing finished
record #26 processing finished
record #28 processing finished
record #31 processing finished
record #32 processing finished
record #33 processing finished
record #34 processing finished
record #36 processing finished
record #37 processing finished
record #38 processing finished
record #39 processing finished
record #40 processing finished
record #41 processing finished
record #44 processing finished
record #48 processing finished
record #49 pro

In [25]:
train_df

Unnamed: 0_level_0,keyword,location,text,target,ekphrasis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask . canada
5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to ' shelter in place ' ar...
6,,,"13,000 people receive #wildfires evacuation or...",1,<number> people receive wildfires evacuation o...
7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...
...,...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1,two giant cranes holding a bridge collapse int...
10870,,,@aria_ahrary @TheTawniest The out of control w...,1,<user> <user> the out of control wild fires in...
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m1 . <number> [ <time> utc ] ? 5 km s of volca...
10872,,,Police investigating after an e-bike collided ...,1,police investigating after an e - bike collide...


In [51]:
train_df.to_csv('train_ekphrasis.csv')