In [1]:
from textblob import TextBlob
from textblob.taggers import NLTKTagger

import pandas as pd
import re

In [11]:
wiki = TextBlob("not a very great calculation")

In [12]:
wiki.sentiment.polarity

-0.3076923076923077

In [13]:
wiki.tags

[('not', 'RB'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('great', 'JJ'),
 ('calculation', 'NN')]

In [20]:
wiki = TextBlob("it is so coool")

In [21]:
wiki.sentiment.polarity

0.0

## Preprocessing:
- Remove emoticons, add sentiment polarity as feature
- Replace all URLs with tag ||U||
- Replace targets with tag ||T||
- Replace negations by tag "NOT"
    - not, no, never, n’t, cannot
- Replace sequence of repeated characters by 3 characters
- Convert Acronym
E.g.: cooooool -> coool
## Features:
- Emoticon Polarity


- URLs

In [23]:
text = "http://www.baidu.com\nhttps://www.baidu.com\nwww.baidu.com"

In [26]:
re.sub(r'(http\S+)|((...)*.com(...)*)', "||U||", text)

'||U||\n||U||\n||U||'

- hashtags

In [27]:
text = "#TellTheTruth aukysdh iaiudh uiashd haousdh iauhds9 oiajsd"

In [30]:
re.sub(r'\#[a-zA-Z]+', "||H||", text)

'||T|| aukysdh iaiudh uiashd haousdh iauhds9 oiajsd'

- targets

In [2]:
text = "@Donald aukysdh iaiudh uiashd haousdh iauhds9 oiajsd"

In [3]:
re.sub(r'@[a-zA-Z]+', "||T||", text)

'||T|| aukysdh iaiudh uiashd haousdh iauhds9 oiajsd'

- repeated sequence

In [15]:
text = "cooooooool"

In [139]:
for ch in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
    text = re.sub(r'' + ch + '{3,}', ch + ch + ch, text)

In [140]:
text

'coool'

- negations

In [18]:
text = "I cannot never not no cann't do this."

In [19]:
# substitute negations with NOT
for negation in ['not', 'no', 'never', 'n\'t', 'cannot']:
    text = re.sub(r'' + negation, " NOT ", text)

In [20]:
text

'I can NOT   NOT   NOT   NOT  can NOT  do this.'

In [21]:
re.sub(r'' + ' ' + '{2,}', ' ', text)

'I can NOT NOT NOT NOT can NOT do this.'

- acronym and emoticon

In [68]:
acronym_table = pd.read_csv("./dictionary/acronym_dict.csv")
acronym_dict = dict()
emoticon_table = pd.read_csv("./dictionary/emoticon_dict.csv")
emoticon_dict = dict()

for row in range(acronym_table.shape[0]):
    slang = acronym_table.iloc[row, 0]
    # pattern = r'[.^$*+?|#]'
    # slang = re.sub(pattern, '\\' + pattern, slang)
    for ch in ".^$*+?|#":
        print(ch, slang)
        slang = re.sub(r'\\' + ch, r'\\' + ch, slang)
    acronym_dict[slang] = acronym_table.iloc[row, 1]
    
for row in range(emoticon_table.shape[0]):
    emoticon_dict[emoticon_table.iloc[row, 0]] = emoticon_table.iloc[row, 1]

. *4u
^ *4u
$ *4u
* *4u
+ \**\*4\*u\*
? \+**\+*4\+*u\+*
| \?+\?*\?*\?+\?*\?4\?+\?*\?u\?+\?*\?
# \|?\|+\|?\|*\|?\|*\|?\|+\|?\|*\|?\|4\|?\|+\|?\|*\|?\|u\|?\|+\|?\|*\|?\|
. *67
^ *67
$ *67
* *67
+ \**\*6\*7\*
? \+**\+*6\+*7\+*
| \?+\?*\?*\?+\?*\?6\?+\?*\?7\?+\?*\?
# \|?\|+\|?\|*\|?\|*\|?\|+\|?\|*\|?\|6\|?\|+\|?\|*\|?\|7\|?\|+\|?\|*\|?\|
. *eg*
^ *eg*
$ *eg*
* *eg*
+ \**\*e\*g\**\*
? \+**\+*e\+*g\+**\+*
| \?+\?*\?*\?+\?*\?e\?+\?*\?g\?+\?*\?*\?+\?*\?
# \|?\|+\|?\|*\|?\|*\|?\|+\|?\|*\|?\|e\|?\|+\|?\|*\|?\|g\|?\|+\|?\|*\|?\|*\|?\|+\|?\|*\|?\|
. 7734
^ 7734
$ 7734
* 7734
+ \*7\*7\*3\*4\*
? \+*7\+*7\+*3\+*4\+*
| \?+\?*\?7\?+\?*\?7\?+\?*\?3\?+\?*\?4\?+\?*\?
# \|?\|+\|?\|*\|?\|7\|?\|+\|?\|*\|?\|7\|?\|+\|?\|*\|?\|3\|?\|+\|?\|*\|?\|4\|?\|+\|?\|*\|?\|
. 0day
^ 0day
$ 0day
* 0day
+ \*0\*d\*a\*y\*
? \+*0\+*d\+*a\+*y\+*
| \?+\?*\?0\?+\?*\?d\?+\?*\?a\?+\?*\?y\?+\?*\?
# \|?\|+\|?\|*\|?\|0\|?\|+\|?\|*\|?\|d\|?\|+\|?\|*\|?\|a\|?\|+\|?\|*\|?\|y\|?\|+\|?\|*\|?\|
. 0noe
^ 0noe
$ 0noe
* 0noe
+ \*0\*n\*o\*e\*
?

TypeError: expected string or bytes-like object

In [53]:
line = "gr8 :-)"
for slang in acronym_dict:
#     print(slang, acronym_dict[slang])
    line = line.replace(str(slang), str(acronym_dict[slang]))
    # recursive substitution issue
    
# remove emoticons and maintain an emoticon sentiment score
emoti_score = 0
for emoti in emoticon_dict:
    if emoti in line:
        line = line.replace(emoti, "")
        emoti_score += emoticon_dict[emoti]

In [54]:
line

'gato represenTowhyear oldugh Lwhyear olduckwhyeahhat '

In [55]:
line = "The top three 5G stocks to buy right now -  https://mailchi.mp/exactoptionpicks/the-option-triggers-with-100-676825¬†‚Ä¶ $FB $AMZN $NFLX $AAPL $BABA $TSLA $CGC $TLRY $DIS $EA $CMG $NVDA $GOOGL $BTC $DIS"

In [58]:
slang = "*4u"
line = re.sub(r'' + slang, acronym_dict[slang], line)
line

error: nothing to repeat at position 0

- exclamation words

- Capitalized words

In [10]:
text = "London is GOOD. Especially when it's raining!"

In [18]:
re.findall(r'[A-Z][\'A-Z]+|[A-Z][a-zA-Z]+', text)

['London', 'GOOD', 'Especially']

- POS feature

In [39]:
text = "Microsoft Build 2019 is set for May. Will we hear more about Windows Core OS?  Microsoft Build 2018 is officially set for May in Seattle, where it‚Äôs expected to discuss what‚Äôs next across its various products and platforms. It arrives in the same week and overlaps with Googl‚Ä¶"

In [42]:
nltk_tagger = NLTKTagger()
blob = TextBlob(text, pos_tagger=nltk_tagger)
pos_tuples = blob.pos_tags

In [43]:
pos_count = dict()
for _, pos_tag in pos_tuples:
    pos_count[pos_tag] = pos_count.get(pos_tag, 0) + 1
print(pos_count)

{'NNP': 11, 'CD': 2, 'VBZ': 3, 'VBN': 3, 'IN': 7, 'MD': 1, 'PRP': 2, 'VB': 2, 'RBR': 1, 'RB': 1, 'WRB': 1, 'NN': 2, 'TO': 1, 'JJ': 4, 'PRP$': 1, 'NNS': 3, 'CC': 2, 'DT': 1}


In [44]:
word = "happy"
blob = TextBlob(word)
blob.sentiment.polarity

0.8

In [7]:
df = pd.read_csv("./dictionary/emoticon_dict.csv")

In [70]:
df = pd.read_csv("./dictionary/acronym_dict.csv")

In [124]:
df.head()

Unnamed: 0,slang,translation
0,*4u,Kiss for you
1,*67,unknown
2,*eg*,evil grin
3,7734,hello
4,0day,software illegally obtained before it was rele...


In [131]:
df[df.translation.str.contains('\*')]

Unnamed: 0,slang,translation
3453,q2c,quick to c**


In [8]:
df = df.replace(to_replace=r'f\*\*king', value='fucking', regex=True)

NameError: name 'df' is not defined

In [None]:
df = df.replace(to_replace=r'f\*\*k', value='fuck', regex=True)

In [9]:
df = df.replace(to_replace=r'F\*ck', value='fuck', regex=True)

NameError: name 'df' is not defined

In [125]:
df = df.replace(to_replace=r'f\*\*', value='fag', regex=True)

In [99]:
df = df.replace(to_replace=r'b\*\*\*h', value='bitch', regex=True)

In [101]:
df = df.replace(to_replace=r'a\*\*h\*\*e', value='asshole', regex=True)

In [97]:
df = df.replace(to_replace=r's\*\*t', value='shit', regex=True)

In [122]:
df = df.replace(to_replace=r'c\*\*t', value='cunt', regex=True)

In [104]:
df = df.replace(to_replace=r'd\*\*k', value='dick', regex=True)

In [106]:
df = df.replace(to_replace=r'p\*\*\*y', value='pussy', regex=True)

In [110]:
df = df.replace(to_replace=r'c\*\*k', value='cock', regex=True)

In [112]:
df = df.replace(to_replace=r'w\*\*\*e', value='whore', regex=True)

In [108]:
df = df.replace(to_replace=r'b\*\*\*\*\*d', value='bastard', regex=True)

In [114]:
df = df.replace(to_replace=r'n\*\*\*\*r', value='nigger', regex=True)

In [116]:
df = df.replace(to_replace=r'n\*\*\*a', value='nigga', regex=True)

In [127]:
df = df.replace(to_replace=r'd\*\*\*\*ebag', value='douchebag', regex=True)

In [132]:
df.to_csv('./dictionary/acronym_dict.csv', index=False)

In [145]:
wiki = TextBlob("We Love The North Chapter 10 of #TheBig400  https://mancunianbirder.wordpress.com/2019/03/30/we-love-the-north/¬†‚Ä¶ @MarkHChampion @BirdWatchingMag @WWTMartinMere @infocusWWTMM @BillAspin @googl_phil @DarbyBug @DaveyManMcG @doneycj21 @Leighton_moss @RSPB_N_England @benhoare5 @RareBirdAlertUK @BirdGuides @fred_fearn @RSPBbirders")

In [146]:
wiki.detect_language()

'en'