In [1]:
import pandas as pd
# Widen pandas' display limits so long/wide frames are shown without truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Text-repair and string utilities used by the cleaning helpers below.
from ftfy import fix_text
import html
import string
import unidecode
import re
import sys

# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas and the other libraries.
import warnings
warnings.filterwarnings('ignore')

# Word segmenter built on the "twitter" corpus; constructing it prints the
# "Reading twitter - 1grams/2grams" messages seen in this cell's output.
from ekphrasis.classes.segmenter import Segmenter
seg_tw = Segmenter(corpus="twitter")

# Imported for the `.swifter` accessor used when applying clean_text to the frame.
import swifter

Reading twitter - 1grams ...
Reading twitter - 2grams ...


In [2]:
# Load the tweets DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('_outputs/harassement.pickle')

In [3]:
# Show the first two rows to check the loaded columns.
df.head(2)

Unnamed: 0,index,username_tweet,id,text,url,nbr_retweet,nbr_favorite,nbr_reply,datetime,has_media,is_reply,is_retweet,is_quote,user_id
0,0,herveresse,1050078551687524358,"Un an après #metoo ou #balancetonporc , Rapp...",/herveresse/status/1050078551687524358,0,0,0,2018-10-10 19:40:10,1.0,0,0,0,18739382
1,1,LettreAudio,1044325242766807042,Hâte de participer à cette table ronde à #Wash...,/LettreAudio/status/1044325242766807042,0,2,0,2018-09-24 22:38:35,,0,0,0,189869909


In [4]:
# Number of rows in the loaded frame.
len(df)

199827

In [5]:
# A backslash followed by any run of non-space characters (escaped byte residue).
# https://regex101.com/r/aIYcKR/1/
_ENCODING_ERROR_RE = re.compile(r"\\\S+", flags=re.MULTILINE)

# Complete http(s) links, with an optional "www." prefix and trailing path/query.
_HTTP_LINK_RE = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', flags=re.MULTILINE)

# "www" plus whatever non-space characters follow (covers links cut right after www).
_WWW_RE = re.compile(r"www\S*", flags=re.MULTILINE)

# "http" plus whatever non-space characters follow (covers links cut right after http).
# https://regex101.com/r/M0MrfJ/2
_HTTP_RE = re.compile(r"http\S*", flags=re.MULTILINE)

# Any token containing one of ? / & = % followed by at least one non-space character.
# https://regex101.com/r/ORoXn5/5
_URL_QUERY_RE = re.compile(r"([\S]*[?/&=%]\S+)", flags=re.MULTILINE)

# A bare "http" or "https".
# https://regex101.com/r/Cr9IzK/2/
_BARE_HTTP_RE = re.compile(r"https?", flags=re.MULTILINE)


def remove_encoding_errors(tweet):
    """Delete every backslash-led run of non-space characters from ``tweet``."""
    return _ENCODING_ERROR_RE.sub(r'', tweet)


def no_links_http(tweet):
    """Delete complete http(s) links from ``tweet``."""
    return _HTTP_LINK_RE.sub('', tweet)


def no_www(tweet):
    """Delete every ``www`` and the non-space characters attached after it."""
    return _WWW_RE.sub('', tweet)


def no_http(tweet):
    """Delete every ``http`` and the non-space characters attached after it."""
    return _HTTP_RE.sub('', tweet)


def no_url_queries(tweet):
    """Delete tokens that contain ``?``, ``/``, ``&``, ``=`` or ``%`` followed by more text."""
    return _URL_QUERY_RE.sub('', tweet)


def no_remaining_http(tweet):
    """Delete any leftover bare ``http`` / ``https``."""
    return _BARE_HTTP_RE.sub('', tweet)

def punct_as_separators(tweet):
    """Turn a ``#``, ``,``, ``.``, ``+`` or ``*`` glued between two non-space characters into a space.

    Meant for lists typed without spaces, e.g. "win,food,clothes,groceries".

    The neighbouring characters are checked with lookarounds rather than
    captured: a capturing version consumes the character after the separator,
    so the very next separator is skipped ("a,b,c" came out as "a b,c").

    Parameters
    ----------
    tweet : str
        Text to process.

    Returns
    -------
    str
        ``tweet`` with each qualifying separator replaced by one space.
    """
    # https://regex101.com/r/9xjwh0/1/ (playground link kept from the original cell)
    return re.sub(r"(?<=\S)[#,.+*](?=\S)", ' ', tweet, flags=re.MULTILINE)

_HASHTAG_RE = re.compile(r"#(\w+)")

# "#" or "@" separated from the following token by whitespace.
# https://regex101.com/r/wB8CSx/2/
_SPACED_TAG_OR_MENTION_RE = re.compile(r"[@#]\s+\S+", flags=re.MULTILINE)


def all_hasgtags(tweet):
    """Return the hashtags found in ``tweet``, without their leading ``#``."""
    return _HASHTAG_RE.findall(tweet)


def all_mentions_hashtags_with_spaces(tweet):
    """Return every ``#``/``@`` that is separated from the next token by whitespace, token included."""
    return _SPACED_TAG_OR_MENTION_RE.findall(tweet)

# no_mentions = lambda tweet: re.sub(r"@(\w+)", ' ', tweet, flags=re.MULTILINE)

# Translation table for str.translate: every ASCII digit maps to a space.
no_digits = dict.fromkeys(map(ord, string.digits), ord(' '))

# https://stackoverflow.com/questions/34860982/replace-the-punctuation-with-whitespace
_PUNCTUATION_RE = re.compile(r"[(){}\"'’,.;@#?!&%$/\\]+\ *")

_EXTENSIONS_AND_TOO_RE = re.compile(r'\bhtml\b|\bphp\b|\btoo\b')


def punctuation_to_space(tweet):
    """Replace each run of listed punctuation (plus the spaces right after it) by one space."""
    return _PUNCTUATION_RE.sub(" ", tweet)


def no_more_than_20(tweet):
    """Keep only the whitespace-separated words of at most 20 characters."""
    kept = filter(lambda word: len(word) <= 20, tweet.split())
    return ' '.join(kept)


def no_less_than_3(tweet):
    """Keep only the whitespace-separated words of at least 3 characters."""
    kept = filter(lambda word: len(word) >= 3, tweet.split())
    return ' '.join(kept)


def no_files_extensions_and_too(tweet):
    """Replace the standalone words ``html``, ``php`` and ``too`` by a space."""
    return _EXTENSIONS_AND_TOO_RE.sub(" ", tweet)

In [6]:
def clean_hashtags_mentions(tweet):
    """Re-attach a ``#`` or ``@`` to the token it was separated from.

    "# metoo" becomes "#metoo" and "@ user" becomes "@user". Any whitespace
    between the symbol and the token is removed: stripping only ASCII spaces
    would leave a tab-separated "#<TAB>metoo" unrepaired even though the
    pattern matches it.

    Parameters
    ----------
    tweet : str
        Text to process.

    Returns
    -------
    str
        ``tweet`` with the gaps after ``#``/``@`` closed.
    """
    # Same shape as the "[@#]\s+\S+" finder, with groups so the whitespace in
    # between can be dropped in a single pass.
    return re.sub(r"([@#])\s+(\S+)", r"\1\2", tweet, flags=re.MULTILINE)

In [7]:
def convert_hashtags(tweet):
    """Replace every ``#hashtag`` in ``tweet`` by its word-segmented form.

    Each hashtag occurrence is substituted individually. A plain
    ``str.replace('#me', ...)`` would also rewrite the start of a longer
    hashtag sharing that prefix (e.g. ``#metoo``), stripping its ``#`` so it
    is never segmented.

    Parameters
    ----------
    tweet : str
        Text containing zero or more hashtags.

    Returns
    -------
    str
        ``tweet`` with each ``#word`` replaced by ``seg_tw.segment(word)``.
    """
    # A callable replacement is inserted literally, so the segmenter's output
    # is never interpreted as a regex template.
    return re.sub(r"#(\w+)", lambda match: seg_tw.segment(match.group(1)), tweet)

In [8]:
# https://stackoverflow.com/a/49146722/330558
# Compiled once here rather than on every call.
# NOTE(review): the last range (U+24C2-U+1F251) is far wider than emoji: it
# also covers CJK, Hangul and most other code points above U+24C2, so text in
# those scripts is stripped too. Left unchanged to keep existing output stable.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                            u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Besides the ranges of the referenced answer, this also covers
    U+1F900-U+1F9FF and U+1FA70-U+1FAFF, which the original pattern missed
    (e.g. the "thinking face" U+1F914 was left in the text).

    Parameters
    ----------
    string : str
        Text to strip. The name shadows the ``string`` module inside this
        function only; it is kept for backward compatibility.

    Returns
    -------
    str
        The text without the matched characters.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [9]:
def clean_text(tweet):
    """Normalise one tweet into a lowercase, single-space-separated string.

    The steps run in a fixed order: encoding repair, emoji removal,
    lowercasing, link removal, punctuation splitting, hashtag expansion,
    digit and punctuation removal, then word-length filtering.
    """
    # Formatting: drop escaped byte residue (\\xe2\\x80\\xb...), fix broken
    # Unicode, then decode HTML entities.
    tweet = html.unescape(fix_text(remove_encoding_errors(tweet)))

    # Emoji out, line breaks to spaces, carriage returns dropped, trimmed, lowercased.
    tweet = remove_emoji(tweet).replace('\n', ' ').replace('\r', '').strip().lower()

    # Well-formed links first (http(s), www, http), then the leftovers of
    # broken links (query-like tokens and any remaining bare http(s)).
    for strip_links in (no_links_http, no_www, no_http, no_url_queries, no_remaining_http):
        tweet = strip_links(tweet)

    # Lists typed without spaces: win,food,clothes,groceries
    tweet = punct_as_separators(tweet)

    # Hashtags: close the gap in "# tag" / "@ name", then expand each hashtag.
    # Mentions are left in place (the no_mentions step is disabled).
    tweet = convert_hashtags(clean_hashtags_mentions(tweet))

    # Digits, then the remaining punctuation, become spaces.
    tweet = punctuation_to_space(tweet.translate(no_digits))

    # Keep only words of 3 to 20 characters.
    tweet = no_less_than_3(no_more_than_20(tweet))

    # Drop file extensions and the "too" of #metoo, since hashtags are expanded.
    tweet = no_files_extensions_and_too(tweet)

    # Collapse runs of whitespace into single spaces.
    return " ".join(tweet.split())

In [10]:
# Run clean_text on every value of the text column (through the swifter
# accessor) and store the result in a new clean_text column.
df['clean_text'] = df.text.swifter.apply(clean_text)

Pandas Apply: 100%|██████████| 199827/199827 [02:40<00:00, 1242.67it/s]


In [11]:
# Show the first two rows again, now with the clean_text column.
df.head(2)

Unnamed: 0,index,username_tweet,id,text,url,nbr_retweet,nbr_favorite,nbr_reply,datetime,has_media,is_reply,is_retweet,is_quote,user_id,clean_text
0,0,herveresse,1050078551687524358,"Un an après #metoo ou #balancetonporc , Rapp...",/herveresse/status/1050078551687524358,0,0,0,2018-10-10 19:40:10,1.0,0,0,0,18739382,après balance ton porc rappel règles homme cis...
1,1,LettreAudio,1044325242766807042,Hâte de participer à cette table ronde à #Wash...,/LettreAudio/status/1044325242766807042,0,2,0,2018-09-24 22:38:35,,0,0,0,189869909,hâte participer cette table ronde washington a...


In [12]:
# Inspect the cleaned text of the first 20 rows.
df[:20]['clean_text']

0     après balance ton porc rappel règles homme cis...
1     hâte participer cette table ronde washington a...
2     est une histoire malheureusement réelle qui es...
3     juillet gouvernement baissé budget des droits ...
4     holy wood babylon tempera feuille sujet affair...
5     autant adhéré hashtag autant étais réservée su...
6     without becoming victime perpétuelle you can s...
7     french labour law how react situation sexual m...
8     balance ton porc balance ton racist esclavagel...
9     balance ton porc une bonne une mauvaise idée g...
10    mère trint ignant aimait tout simplement pas f...
11    gauguin pas drogue jeunes filles ans pour les ...
12    est marrant que meme apres morandiniblog soit ...
13    non pas pro-metoo jamais défendu mouvement par...
14    seule rencontre avec intime peut tuer démon pr...
15    vous allez surtout assister une révolution soc...
16    millions spectateurs accuse romanpolanski mill...
17    tribune editionshermann sur hashtag foncti

In [13]:
# Save the frame, including the clean_text column, to a new pickle file.
df.to_pickle('_outputs/harassement_clean.pickle')