In [7]:
from tensorflow.keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

import spacy
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

# stemming package
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
# lemmatization package
from nltk.stem import WordNetLemmatizer
# stopwords package
from nltk.corpus import stopwords
from nltk.tree import Tree

import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [29]:
biden_sample_df = pd.read_csv("/content/gdrive/My Drive/webA2/archive/cleaned_Biden_171236.csv",encoding = "ISO-8859-1", lineterminator='\n')

In [30]:
print("biden sample data set size: ", len(biden_sample_df))

biden sample data set size:  171236


In [77]:
import re
# Reading contractions.csv and storing it as a dict.
contractions = pd.read_csv('/content/gdrive/My Drive/webA2/archive/contractions.csv', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions.Meaning = contractions.Meaning.str.lower()
contractions_dict = contractions.to_dict()['Meaning']

# Defining regex patterns.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
hashtagPattern    = '#[^\s]+'
alphaPattern      = "[^A-Za-z0-9<>]"
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for emojis
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):

    #tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    # Replace @USERNAME to '<user>'.
    tweet = re.sub(userPattern,'<user>', tweet)
    
    # Replace #Hashtags to '<hashtags>'.
    # note that i don't remove hashtag during training, so ~ 
    #tweet = re.sub(hashtagPattern,'<hashtag>', tweet)
    
    # Replace 3 or more consecutive letters by 2 letter.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to seperate words (After replacing URLS).
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

In [74]:
# do preprocess, and store in a new column, in df
%%time
biden_sample_df['processed_text'] = biden_sample_df.tweet.apply(preprocess_apply)

CPU times: user 14.4 s, sys: 32.9 ms, total: 14.4 s
Wall time: 14.5 s


In [78]:
# have a look at processed text
print("Raw text: ")
print(biden_sample_df.tweet[15])
print("Processed text:")
print(biden_sample_df.processed_text[15])
print("Raw text: ")
print(biden_sample_df.tweet[19])
print("Processed text:")
print(biden_sample_df.processed_text[19])

Raw text: 
@TwitterSafety You donÃ¢ÂÂt usually bother, why the huge effort in response to the #Biden @nypost story?
Processed text:
<user> You don      t usually bother  why the huge effort in response to the  Biden <user> story 
Raw text: 
Has this awoken you from your slumbers yet @BBCJonSopel. Getting to the point where even you may have to file a negative story about #Biden! https://t.co/Y54y25n6dU
Processed text:
Has this awoken you from your slumbers yet <user> Getting to the point where even you may have to file a negative story about  Biden  <url>


In [79]:
def get_NER(postag_text, NER_List):
    #get NER
    # could try : https://github.com/flairNLP/flair
    # https://blog.csdn.net/qq_27713281/article/details/72819219 nltk.ne_chunk(tags)#tags是句子词性标注后的结果，同样是句子级
    chunked = nltk.ne_chunk(postag_text)
    # nltk.ne_chunk returns a nested nltk.tree.Tree object so you would have to traverse the Tree object to get to the NEs
    #http://www.itkeyword.com/doc/1722997285955701853/nltk-named-entity-recognition-to-a-python-list

    for i in chunked:
        if type(i) == Tree:

            chunk_label = i.label()
            chunk_string = " ".join([token for token, pos in i.leaves()])
            NER_List.append((chunk_string, chunk_label))
            #print(current_chunk)
        '''
        if current_chunk:
            named_entity = " ".join(current_string)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                NER_List.append()
                current_chunk = []
        else:
            continue
        '''

    return

from nltk.tokenize import RegexpTokenizer
def NLProcess(text):
	# tokenize, remove punctuation, remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
	#intermediate = tokenizer.tokenize(text)
    sent_text = sent_tokenize(text)
   # print (sent_text)
    #分词word tokenize：使用nltk.word_tokenize(text)
	#print(sent_text)
    #nltk.sent_tokenize(text) #按句子分割
    #nltk.word_tokenize(sentence) #分词
    #nltk的分词是句子级别的，所以对于一篇文档首先要将文章按句子进行分割，然后句子进行分词
    #http://www.pythontip.com/blog/post/10012/
    NER_token = []
    NER_remove_long_token =[]
    for sent in sent_text:
        word_token = tokenizer.tokenize(sent)
     #   print (word_token)
		#word_token = tokenizer.tokenize(sent)
        all_stopwords = stopwords.words('english')
        intermediate = [w for w in word_token if not w in all_stopwords]


	# ==== Stemming process =====
		#porter = PorterStemmer()
		#lancaster = LancasterStemmer()
		#word_token = [lancaster.stem(i) for i in word_token]
		#intermediate = [porter.stem(i) for i in word_token]
        #删除停用词


#==== pos taging =======
#先用NLTK包的pos_tag方法（part-of-speech tagging ）来对单词的词性进行标记，标记后的结果是二元数组格式。之后从这个二元数列中挑出我们所有需要的tag，存放进一个二元数列。

        postag_token = nltk.pos_tag(intermediate)
      #  print (postag_token)
		#print(postag_token)



        get_NER(postag_token, NER_token)
		# remove the entity mentions which contains more than 3 words,
		#but not with capital alphabet


        return NER_token

# this function is for method 2 of parse_html



In [76]:
text = " Has this awoken you from your slumbers yet <user> Getting to the point where even you may have to file a negative story about  Biden  <url>"
NLProcess(text)

[' Has this awoken you from your slumbers yet <user> Getting to the point where even you may have to file a negative story about  Biden  <url>']
['Has', 'this', 'awoken', 'you', 'from', 'your', 'slumbers', 'yet', 'user', 'Getting', 'to', 'the', 'point', 'where', 'even', 'you', 'may', 'have', 'to', 'file', 'a', 'negative', 'story', 'about', 'Biden', 'url']
[('Has', 'NNP'), ('awoken', 'VBN'), ('slumbers', 'NNS'), ('yet', 'RB'), ('user', 'VBP'), ('Getting', 'VBG'), ('point', 'NN'), ('even', 'RB'), ('may', 'MD'), ('file', 'VB'), ('negative', 'JJ'), ('story', 'NN'), ('Biden', 'NNP'), ('url', 'NN')]


[('Biden', 'PERSON')]

In [80]:
%%time
biden_sample_df["ner"] = biden_sample_df.processed_text.apply(NLProcess)
biden_sample_df.to_csv("/content/gdrive/My Drive/webA2/archive/biden_sample_ner3.csv", index=False)


CPU times: user 20min 28s, sys: 15.2 s, total: 20min 43s
Wall time: 20min 47s


In [81]:
# load model 
model = load_model("/content/gdrive/My Drive/webA2/archive/BiLSTM_gensim_0839_15epo_100wdataset.h5")

In [82]:
# load the tokenizer
import pickle
# loading tokenizer
with open('/content/gdrive/My Drive/webA2/archive/Tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [83]:
# predict function
def predict(text):
    
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=60)
    # Predict
    score = model.predict([x_test])[0]
    # Decode sentiment
    #label = -1 if score < 0.5 else 1
    out_score = round(float(score),4)

    return out_score

In [84]:
biden_sample_df1 = pd.read_csv("/content/gdrive/My Drive/webA2/archive/biden_sample_ner3.csv",encoding = "ISO-8859-1", lineterminator='\n')

In [85]:
%%time
biden_sample_df1["predict_score_bi"] = biden_sample_df1.processed_text.apply(lambda x: predict(x))
# store the result
biden_sample_df1.to_csv("/content/gdrive/My Drive/webA2/archive/biden_scored.csv", index=False)

CPU times: user 2h 48min 36s, sys: 2min 48s, total: 2h 51min 24s
Wall time: 2h 27min 56s


In [9]:
import pandas as pd
biden_sored = pd.read_csv("/content/gdrive/My Drive/webA2/archive/biden_scored.csv",encoding = "ISO-8859-1", lineterminator='\n')
alphaPattern      = "[^A-Za-z0-9<>]"
org               = "ORGANIZATION"
per               = "PERSON"
gpe               = "GPE"

def NER_process (ner):
  ner = re.sub(alphaPattern, '', ner)
  ner = re.sub(org, ',', ner)
  ner = re.sub(per, ',', ner)
  ner = re.sub(gpe, ',', ner)

  return ner




In [10]:
ner = "[('PLEASE', 'ORGANIZATION'), ('RIGHT', 'ORGANIZATION'), ('Move', 'PERSON'), ('Orange Monkey', 'ORGANIZATION'), ('Biden PLEASE', 'PERSON')]"
NER_process(ner)

'PLEASE,RIGHT,Move,OrangeMonkey,BidenPLEASE,'

In [13]:
%%time
biden_sored["entity"] = biden_sored.ner.apply(NER_process)
biden_sored.to_csv("/content/gdrive/My Drive/webA2/archive/biden_sample_ner4.csv", index=False)

CPU times: user 5.36 s, sys: 145 ms, total: 5.5 s
Wall time: 5.93 s


In [14]:
print(type(biden_sored.ner[0]))
print(biden_sored.head)

<class 'str'>
<bound method NDFrame.head of                  created_at  ...                                             entity
0       2020-10-15 00:00:01  ...  Elecciones2020EnFlorida,DonaldTrump,PembrokePi...
1       2020-10-15 00:00:20  ...                           Biden,TrumpIsNotAmerica,
2       2020-10-15 00:00:22  ...                                  HunterBidenBiden,
3       2020-10-15 00:00:25  ...  NYPost,CENSORED,USGSPJoeBidenTrump,China,Twitter,
4       2020-10-15 00:00:57  ...  FBI,HunterBidenComputerDataUkraineDealingsRepo...
...                     ...  ...                                                ...
171231  2020-11-03 23:59:45  ...  JoeBiden,Vote2020BidenHarris2020Amen,SaveOurCo...
171232  2020-11-03 23:59:49  ...                                     ElectionNight,
171233  2020-11-03 23:59:50  ...                      Biden,Elections2020Biden2020,
171234  2020-11-03 23:59:54  ...        DNC,JoeBiden,BidenFamilyBobulinski,Rosneft,
171235  2020-11-03 23:59:57  ...