In [None]:
# This is the first step of the tweets preprocessing procedure

In [1]:
# import libraries

import pandas as pd
#https://pypi.org/project/tweet-preprocessor/
import preprocessor as preproc
from greek_stemmer import GreekStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

# Show full cell contents and up to 500 columns when displaying DataFrames.
# None is the supported "no truncation" value for max_colwidth; the old -1
# sentinel is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', 500)

In [2]:
def delete_tonous(df, column_to_process, processed_column='Text_only'):
    """Strip the Greek tonos (stress accent) from the text of a DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text. It is modified in place.
    column_to_process : str
        Name of the column with the source text.
    processed_column : str, default 'Text_only'
        Name of the column that receives the unaccented text. When it equals
        ``column_to_process`` the source column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    # Accented vowel -> plain vowel, for both cases. The dialytika+tonos
    # letters keep their dialytika: if left untouched, 'ΐ'.upper() expands to
    # a base letter followed by combining marks, which a \w+ tokenizer splits on.
    accent_map = {
        'ά': 'α', 'έ': 'ε', 'ή': 'η', 'ί': 'ι', 'ό': 'ο', 'ύ': 'υ', 'ώ': 'ω',
        'Ά': 'Α', 'Έ': 'Ε', 'Ή': 'Η', 'Ί': 'Ι', 'Ό': 'Ο', 'Ύ': 'Υ', 'Ώ': 'Ω',
        'ΐ': 'ϊ', 'ΰ': 'ϋ',
    }

    # regex=True makes the single-character keys match inside longer strings
    # instead of requiring the whole cell to equal the key.
    df[processed_column] = df[column_to_process].replace(accent_map, regex=True)

    return df

## Fix the gathered tweets

### - Read the tweets

In [3]:
# Load the DataFrame of gathered tweets from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files
# produced by this project.
tweets_path = 'data/tweets.pkl'
df_tweets = pd.read_pickle(tweets_path)

In [4]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df_tweets.shape

(8352, 11)

### Preprocessing Text

<li> Extract only the text, without URLs, mentions, hashtags, emojis, smileys and numbers
<li> Convert to lowercase and remove tonous (accent marks)
<li> Convert to uppercase
<li> Tokenize
<li> Stem the tokens

In [6]:
# Keep only the plain text of each tweet: with these options the cleaner
# handles URLs, mentions, hashtags, emojis, smileys and numbers.
cleaning_options = (
    preproc.OPT.URL,
    preproc.OPT.MENTION,
    preproc.OPT.HASHTAG,
    preproc.OPT.EMOJI,
    preproc.OPT.SMILEY,
    preproc.OPT.NUMBER,
)
preproc.set_options(*cleaning_options)

df_tweets['Text_only'] = df_tweets['text'].apply(preproc.clean)

In [7]:
# Lowercase the cleaned text so the accent removal that follows only has to
# deal with lowercase letters.
df_tweets['Text_only2'] = df_tweets['Text_only'].map(lambda tweet: tweet.lower())

In [8]:
# Remove the accents from the lowercased text; the result goes to 'Text_only3'.
df_tweets = delete_tonous(df_tweets, 'Text_only2', processed_column='Text_only3')

In [9]:
# Uppercase the unaccented text of every tweet.
df_tweets['Tweet_upper'] = df_tweets['Text_only3'].map(lambda tweet: tweet.upper())

In [10]:
# Tokenize on runs of word characters, which also drops the punctuation.
# Each tweet's tokens are stored as a list in the new 'Tokens' column.
tokenizer = RegexpTokenizer(r'\w+')
df_tweets['Tokens'] = df_tweets['Tweet_upper'].apply(tokenizer.tokenize)


In [11]:
# Stem every token of every tweet; one list of stems per row.
stemmer = GreekStemmer()
df_tweets['Tokens_stem'] = df_tweets['Tokens'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)


In [12]:
# Pair each token with its successor: one list of 2-tuples per tweet.
df_tweets['Bigrams'] = df_tweets['Tokens'].apply(
    lambda tokens: list(ngrams(tokens, 2))
)
# Flatten every tuple into a single space-separated string.
df_tweets['Bigrams2'] = df_tweets['Bigrams'].apply(
    lambda pairs: [first + ' ' + second for first, second in pairs]
)

  from ipykernel import kernelapp as app


### Create new attributes

In [13]:
# 'N_exclam': number of exclamation marks in the cleaned text ('Text_only').
df_tweets['N_exclam'] = df_tweets['Text_only'].map(lambda tweet: tweet.count('!'))

In [14]:
# 'N_hash': number of '#' characters in the raw tweet text.
df_tweets['N_hash'] = df_tweets['text'].map(lambda tweet: tweet.count('#'))

In [15]:
# 'N_mentions': number of '@' characters in the raw tweet text.
df_tweets['N_mentions'] = df_tweets['text'].map(lambda tweet: tweet.count('@'))

In [16]:
# create a column 'N_uppers' for the number of capital words
def count_caps(x):
    """Count the whitespace-separated words of ``x`` written in capitals.

    A word is counted when ``str.isupper()`` is true for it, i.e. it has at
    least one cased character and none of them is lowercase.

    Parameters
    ----------
    x : str
        Text to inspect.

    Returns
    -------
    int
        Number of all-capital words; 0 for an empty string.
    """
    # split() without an argument breaks on any run of whitespace, so words
    # separated by newlines or tabs are judged one at a time instead of being
    # glued to their neighbours.
    return sum(word.isupper() for word in x.split())

# Apply the capital-word counter to the raw text of every tweet.
df_tweets['N_uppers'] = df_tweets['text'].apply(count_caps)

In [17]:
# 'has_url': 1 when the raw tweet text contains 'http', otherwise 0.
df_tweets['has_url'] = df_tweets['text'].map(lambda tweet: int('http' in tweet))

In [18]:
# 'N_words': number of tokens in each tweet.
df_tweets['N_words'] = df_tweets['Tokens'].apply(len)

In [20]:
# List every column now present: the original ones plus those added above.
df_tweets.columns

Index(['date', 'favorite_count', 'hashtags', 'id', 'location', 'mentions',
       'retweet_count', 'searchParam', 'text', 'user_id', 'user_name',
       'Text_only', 'Text_only2', 'Text_only3', 'Tweet_upper', 'Tokens',
       'Tokens_stem', 'Bigrams', 'Bigrams2', 'N_exclam', 'N_hash',
       'N_mentions', 'N_uppers', 'has_url', 'N_words'],
      dtype='object')

In [21]:
# Persist the enriched DataFrame as a pickle file for later use.
output_path = 'results/tweets_step_1_v1.pkl'
df_tweets.to_pickle(output_path)