## Clean text

In [1]:
import pandas as pd
from ftfy import fix_text
import html
import re
import string

# Word segmenter loaded with ekphrasis' "twitter" corpus; used further down to
# split hashtags into separate words.
from ekphrasis.classes.segmenter import Segmenter
seg_tw = Segmenter(corpus="twitter")

from tqdm import tqdm, tqdm_notebook
pd.__version__ # does not work with pandas 0.25, see https://github.com/tqdm/tqdm/issues/780
# Hooks tqdm into pandas so that `progress_apply` (used when cleaning the text
# column) is available.
# NOTE(review): this instantiates a bar just to call .pandas() on it, which is
# presumably why an empty progress widget shows up in this cell's output.
tqdm_notebook().pandas()

Reading twitter - 1grams ...
Reading twitter - 2grams ...


'0.24.2'

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
# Path of the gzip-compressed pickle holding the DataFrame to clean.
INPUT_FILE_GZIP = './_sources_final/pickles/pression_sociale_rounds_1_2_3_corrected_search_str_opt.pickle.gzip'

In [3]:
# Load the full DataFrame into memory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it was produced by a trusted step of this project.
df = pd.read_pickle(INPUT_FILE_GZIP, compression='gzip')

In [4]:
# Number of rows loaded (sanity check).
len(df)

15883627

In [5]:
# Check types are OK
# NOTE(review): date_end, date_start and datetime are reported as `object`
# below rather than datetime64 - confirm this is intended.
df.dtypes

id                  int64
conversation_id     int64
date_end           object
date_start         object
datetime           object
is_quote             bool
is_reply             bool
is_retweet           bool
nbr_favorite        int64
nbr_reply           int64
nbr_retweet         int64
search_string      object
text               object
user_id             int64
user_name          object
string_date        object
dtype: object

In [7]:
# Regex helpers for stripping links, mentions and hashtags from a tweet.
# Patterns are compiled once because each helper runs on every row.
# re.MULTILINE is harmless here: none of the patterns use ^ or $.

# The host part accepts a single character before the dot ({1,256}) so that
# one-letter hosts such as https://t.co/... are matched.
_LINK_HTTP_RE = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
    flags=re.MULTILINE,
)
# Scheme-less variant. It keeps the 2-character minimum before the dot to
# limit false positives on short dotted tokens.
_LINK_NO_HTTP_RE = re.compile(
    r'[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
    flags=re.MULTILINE,
)
_MENTION_RE = re.compile(r"@(\w+)", flags=re.MULTILINE)
_HASHTAG_RE = re.compile(r"#(\w+)", flags=re.MULTILINE)
# One leading hashtag followed by up to nine more whitespace-separated hashtags.
_HASHTAG_RUN_RE = re.compile(
    r"#(\w+)(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?(\s+#\w+)?"
)


def no_links_http(tweet):
    """Return `tweet` with every http:// or https:// link removed.

    The TLD part of the pattern is lowercase-only, so the text is expected to
    be lowercased before this is called.
    """
    return _LINK_HTTP_RE.sub('', tweet)


def no_links_no_http(tweet):
    """Return `tweet` with scheme-less links (e.g. www.tt.com) removed.

    Any `xx.yy`-shaped token of allowed characters matches, not only real
    domain names.
    """
    return _LINK_NO_HTTP_RE.sub('', tweet)


def no_mentions(tweet):
    """Return `tweet` with every @mention replaced by a single space."""
    return _MENTION_RE.sub(' ', tweet)


def no_hasgtags(tweet):
    """Return `tweet` with every #hashtag replaced by a single space."""
    return _HASHTAG_RE.sub(' ', tweet)


def all_hasgtags(tweet):
    """Return one 10-tuple per run of consecutive hashtags found in `tweet`.

    Element 0 is the first hashtag of the run without its '#'. Elements 1-9
    are the following hashtags with their leading whitespace and '#', or ''
    when the run is shorter.
    """
    return _HASHTAG_RUN_RE.findall(tweet)

In [8]:
def get_first_hasgtag_only(tweet):
    """Keep the first hashtag of each hashtag run as plain words; drop the rest.

    For every run reported by `all_hasgtags`, the leading hashtag is split
    into words with `seg_tw.segment` and substituted for its `#tag` form.
    Every hashtag still carrying a '#' afterwards is replaced by a space via
    `no_hasgtags`.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with leading hashtags expanded and all others removed.
    """
    for run in all_hasgtags(tweet):
        lead_tag = run[0]
        segmented = seg_tw.segment(lead_tag)
        # (?!\w) stops '#foo' from being rewritten inside a longer tag such as
        # '#foobar'; a plain str.replace would strip that tag's '#' and leave
        # it unsegmented. A callable replacement inserts the text literally.
        tweet = re.sub(
            r'#{}(?!\w)'.format(re.escape(lead_tag)),
            lambda _match: segmented,
            tweet,
        )
    # Remove all the remaining hashtags.
    return no_hasgtags(tweet)

In [9]:
# Built once instead of on every call: maps each ASCII punctuation character
# and each digit to a single space.
_PUNCT_DIGITS_TO_SPACE = str.maketrans(
    string.punctuation + string.digits,
    ' ' * (len(string.punctuation) + len(string.digits)),
)


def clean(tweet):
    """Normalize one tweet into lowercase, space-separated plain words.

    Steps, in order: unescape HTML entities, repair unicode with ftfy,
    lowercase and strip, remove links (with and without scheme), remove
    leftover 'https'/'http'/'www' fragments, remove @mentions, expand the
    first hashtag of each run and drop the others, blank out ASCII
    punctuation and digits, then collapse whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    tweet = html.unescape(tweet)  # decode HTML entities
    tweet = fix_text(tweet)  # repair broken unicode
    tweet = tweet.lower().strip()  # lowercase, no leading/trailing whitespace
    tweet = no_links_http(tweet)  # links with http(s)://
    tweet = no_links_no_http(tweet)  # links without scheme (e.g. www.tt.com)
    # These fragments often survive the link removal above.
    # NOTE(review): plain substring replacement, so it also hits these letters
    # inside ordinary words (e.g. 'awww').
    tweet = tweet.replace('https', ' ').replace('http', ' ').replace('www', ' ')
    tweet = no_mentions(tweet)  # no @mentions
    tweet = get_first_hasgtag_only(tweet)

    tweet = tweet.translate(_PUNCT_DIGITS_TO_SPACE)
    return ' '.join(tweet.split())  # collapse runs of whitespace

In [10]:
# Run `clean` on every row of `text` (with a tqdm progress bar) and store the
# result in a new `text_clean` column; the raw `text` column is left in place.
df.loc[:,'text_clean'] = df.text.progress_apply(clean)
df.head(2)

HBox(children=(IntProgress(value=0, max=15883627), HTML(value='')))




Unnamed: 0,id,conversation_id,date_end,date_start,datetime,is_quote,is_reply,is_retweet,nbr_favorite,nbr_reply,nbr_retweet,search_string,text,user_id,user_name,string_date,text_clean
0,940606059626090496,940606059626090496,2018-02-25,2017-12-11,2017-12-12 16:35:34,False,False,False,3,0,2,Accord Healthcare,We talk exclusively to Accord Healthcare 's E...,60939437,SupplyChainD,accordhealthcare_2017-12-11,we talk exclusively to accord healthcare s eu ...
1,940254856471896064,940254856471896064,2018-02-25,2017-12-11,2017-12-11 17:20:01,False,False,False,0,0,0,Accord Healthcare,"""The equitable/legal title conundrum and claim...",498639837,VennerShipley,accordhealthcare_2017-12-11,the equitable legal title conundrum and claimi...


In [11]:
# df.drop(['text'], axis=1, inplace=True)
# df.head(2)
# Better to keep the raw `text` column, even though it takes more space.

In [12]:
# Destination of the gzip-compressed pickle that includes the `text_clean` column.
OUTPUT_FILE_GZIP = './_sources_final/pickles/pression_sociale_rounds_1_2_3_corrected_search_str_opt_clean_text.pickle.gzip'

In [13]:
# Persist the DataFrame (raw and cleaned text) as a gzip-compressed pickle.
df.to_pickle(OUTPUT_FILE_GZIP, compression='gzip')