In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import RegexpTokenizer
# stemming package
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
# lemmatization package
from nltk.stem import WordNetLemmatizer
# stopwords package
from nltk.corpus import stopwords
from nltk.tree import Tree

from tensorflow.keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import re



In [None]:
# Download the NLTK data packages needed by the sentence/word tokenizers,
# the stopword filter, the POS tagger and the named-entity chunker used below.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('state_union')

In [None]:
# Mount Google Drive at /content/gdrive so the data files under it can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the tweet sample from Drive.
trump_sample_df = pd.read_csv(
    "/content/gdrive/MyDrive/WDPS/cleaned_Trump_255441.csv",
    encoding="ISO-8859-1",
    lineterminator='\n',
)

In [None]:
# Report how many rows were loaded.
print("trump sample data set size: ", len(trump_sample_df))

trump sample data set size:  255441


In [None]:

# Load the contraction -> meaning table, lower-case both sides, and keep it
# as a plain {contraction: meaning} dict.
contractions = pd.read_csv(
    '/content/gdrive/MyDrive/WDPS/contractions.csv',
    index_col='Contraction',
)
contractions.index = contractions.index.str.lower()
contractions['Meaning'] = contractions['Meaning'].str.lower()
contractions_dict = contractions['Meaning'].to_dict()

# Defining regex patterns.
# Every pattern is a raw string so regex escapes such as `\s` are passed to
# the `re` module verbatim instead of being parsed as (invalid) Python string
# escapes, which raises a warning on current Python versions.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"  # http://, https:// or www. up to the next space
userPattern       = r'@[^\s]+'          # '@' followed by non-whitespace
hashtagPattern    = r'#[^\s]+'          # '#' followed by non-whitespace
alphaPattern      = r"[^A-Za-z0-9<>]"   # anything that is not alphanumeric or '<' / '>'
sequencePattern   = r"(.)\1\1+"         # the same character three or more times in a row
seqReplacePattern = r"\1\1"             # ... collapsed to exactly two

# Defining regex for emojis
# Each one is: eyes [8:=;], an optional nose ['`-], then the mouth.
smileemoji        = r"[8:=;]['`\-]?[)d]+"
sademoji          = r"[8:=;]['`\-]?\(+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"
lolemoji          = r"[8:=;]['`\-]?p+"

def preprocess_apply(tweet):
    """Normalise one raw tweet into the text used for NER and sentiment scoring.

    Steps, in order: URLs -> '<url>', @mentions -> '<user>', runs of three or
    more identical characters -> two, emoticons -> '<heart>' / '<smile>' /
    '<sadface>' / '<neutralface>' / '<lolface>', contractions expanded via
    ``contractions_dict``, and every character outside ``[A-Za-z0-9<>]``
    replaced by a space.

    The text is deliberately NOT lower-cased and hashtags are NOT replaced.
    Because ``contractions_dict`` holds lower-cased keys, contractions are
    matched case-insensitively so that "I'm" / "Don't" are expanded too.

    NOTE(review): the emoticon patterns only contain lower-case mouth letters
    (d, p, l), so ':D' / ':P' are not recognised while lower-casing is off.

    Parameters
    ----------
    tweet : str
        Raw tweet text. ``None`` / NaN are treated as empty text and any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The processed text.
    """
    if not isinstance(tweet, str):
        # Missing cells arrive from pandas as None or float NaN (NaN != NaN).
        is_missing = tweet is None or (isinstance(tweet, float) and tweet != tweet)
        tweet = "" if is_missing else str(tweet)

    # Replace all URLs with '<url>'.
    tweet = re.sub(urlPattern, '<url>', tweet)
    # Replace @USERNAME with '<user>'.
    tweet = re.sub(userPattern, '<user>', tweet)

    # Hashtags are intentionally left in place (they were not removed during
    # training either).

    # Replace 3 or more consecutive identical characters by 2.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    # Expand contractions. Keys are lower-case, the tweet is not, so match
    # case-insensitively; the cheap substring test on a lower-cased copy avoids
    # running a regex for contractions that do not occur at all.
    lowered = tweet.lower()
    for contraction, replacement in contractions_dict.items():
        if contraction in lowered:
            # A callable replacement keeps any backslash in it literal.
            tweet = re.sub(re.escape(contraction), lambda _match: replacement,
                           tweet, flags=re.IGNORECASE)
            lowered = tweet.lower()

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to separate words. With the current
    # alphaPattern every '/' has already become a space, so this is a no-op
    # kept for the case where alphaPattern is changed to allow '/'.
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

In [None]:
# do preprocess, and store in a new column, in df
%%time
trump_sample_df['processed_text'] = trump_sample_df.tweet.apply(preprocess_apply)

CPU times: user 21.1 s, sys: 60 ms, total: 21.2 s
Wall time: 21.2 s


In [None]:
# Spot-check two rows: show the raw tweet next to its processed version.
for row_id in (798, 2567):
    print("Raw text: ")
    print(trump_sample_df["tweet"][row_id])
    print("Processed text:")
    print(trump_sample_df["processed_text"][row_id])

In [None]:
def get_NER(postag_text, NER_List):
    """Append the named-entity chunks of a POS-tagged sentence to ``NER_List``.

    ``postag_text`` is passed to ``nltk.ne_chunk``; every ``Tree`` node in the
    result contributes one ``(entity text, entity label)`` tuple, where the
    entity text is the node's tokens joined by single spaces. ``NER_List`` is
    modified in place and nothing is returned.
    """
    # could try : https://github.com/flairNLP/flair
    entity_nodes = (node for node in nltk.ne_chunk(postag_text) if type(node) is Tree)
    NER_List.extend(
        (" ".join(word for word, _tag in node.leaves()), node.label())
        for node in entity_nodes
    )


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def NLProcess(text):
    """Extract named-entity mentions from a piece of text.

    The text is split into sentences; each sentence is word-tokenized,
    stopwords are dropped (case-sensitive comparison against the NLTK English
    list), the remaining tokens are POS-tagged and handed to ``get_NER``.
    Mentions of three or more words are discarded unless they are entirely
    upper-case.

    Every sentence is processed and the result is returned only after the
    loop. The earlier version returned from inside the sentence loop, so only
    one sentence was analysed and empty text produced ``None``.

    Parameters
    ----------
    text : str
        Text to analyse. Non-string values (e.g. NaN) yield an empty list.

    Returns
    -------
    list of (str, str)
        ``(mention text, entity label)`` tuples, in order of appearance.
    """
    if not isinstance(text, str):
        return []

    # Membership tests against a set are O(1); the list is read once, not per token.
    stop_words = _english_stopwords()

    NER_token = []
    for sent in sent_tokenize(text):
        word_token = word_tokenize(sent)
        intermediate = [w for w in word_token if w not in stop_words]

        # ==== pos tagging =======
        postag_token = nltk.pos_tag(intermediate)

        # get_NER appends its findings to NER_token in place.
        get_NER(postag_token, NER_token)

    # Keep short mentions (fewer than 3 words) and all-caps mentions of any length.
    return [
        mention for mention in NER_token
        if len(mention[0].split()) < 3 or mention[0].isupper()
    ]


In [None]:
%%time
trump_sample_df["ner"] =trump_sample_df.processed_text.apply(NLProcess)


CPU times: user 40min 45s, sys: 1min 24s, total: 42min 10s
Wall time: 42min 12s


In [None]:
# Persist the frame (including the new 'ner' column) without the index.
trump_sample_df.to_csv(
    "/content/gdrive/MyDrive/WDPS/trump_sample_ner1.csv",
    index=False,
)

In [None]:
# Show the extracted entities for one row.
print(trump_sample_df.loc[78934, "ner"])


[('Trump', 'PERSON')]


In [None]:
# Load the saved Keras model (.h5 file) from Drive; `predict` below reads it
# through the global name `model`.
model = load_model("/content/gdrive/MyDrive/WDPS/BiLSTM_gensim_0839_15epo_100wdataset.h5")

In [None]:
# Load the pickled tokenizer from Drive; `predict` below reads it through the
# global name `tokenizer`.
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only load a pickle file that you created or otherwise trust.
with open('/content/gdrive/MyDrive/WDPS/Tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# predict function
def predict(text, maxlen=60):
    """Score one piece of text with the loaded model.

    The text is converted to an integer sequence with the global ``tokenizer``,
    padded/truncated to ``maxlen`` and passed to the global ``model``.

    Parameters
    ----------
    text : str
        Text to score.
    maxlen : int, optional
        Sequence length handed to ``pad_sequences`` (default 60).
        NOTE(review): presumably this must equal the length the model was
        trained with - confirm before changing it.

    Returns
    -------
    float
        The model output rounded to 4 decimals.
        NOTE(review): assumes the model emits exactly one value per input;
        ``.item()`` raises ``ValueError`` otherwise.
    """
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=maxlen)
    # Predict: pass the array itself (a list is only needed for multi-input
    # models) and silence the per-call progress bar.
    score = model.predict(x_test, verbose=0)[0]
    # `.item()` extracts the single element as a Python scalar; calling float()
    # on a 1-element array is deprecated since NumPy 1.25.
    out_score = round(float(score.item()), 4)

    return out_score

In [None]:
# Reload the NER-annotated tweets that were written to Drive earlier.
trump_sample_df1 = pd.read_csv(
    "/content/gdrive/MyDrive/WDPS/trump_sample_ner1.csv",
    encoding="ISO-8859-1",
    lineterminator='\n',
)

In [None]:
# try predict the sample dataset and estimate the time, cpu runtime type
%%time
trump_sample_df1["predict_score_bi"] = trump_sample_df1.processed_text.apply(lambda x: predict(x))
# store the result
trump_sample_df1.to_csv("/content/gdrive/MyDrive/WDPS/trump_protext_scored1.csv", index=False)

CPU times: user 4h 5min 25s, sys: 3min 49s, total: 4h 9min 15s
Wall time: 3h 29min 53s


In [None]:

# Reload the scored tweets from Drive.
trump_sample_df3 = pd.read_csv(
    "/content/gdrive/MyDrive/WDPS/trump_protext_scored1.csv",
    encoding="ISO-8859-1",
    lineterminator='\n',
)

# Characters to strip (everything except alphanumerics and '<' / '>').
alphaPattern = "[^A-Za-z0-9<>]"

# Named-entity label names.
org = "ORGANIZATION"
per = "PERSON"
gpe = "GPE"

def NER_process(ner):
    """Turn a list of ``(mention, label)`` pairs into a comma-terminated name string.

    ``ner`` is normally the string form of such a list, as obtained after the
    'ner' column has been written to and re-read from CSV; an actual
    list/tuple of pairs is accepted as well. Each mention is stripped of every
    character matched by ``alphaPattern`` (so inner spaces disappear) and
    followed by a comma, e.g.
    ``"[('Joe Biden', 'PERSON'), ('US', 'GSP')]"`` -> ``'JoeBiden,US,'``.

    The pairs are parsed instead of deleting label words by regex, so every
    label is handled (not only ORGANIZATION / PERSON / GPE) and mentions that
    happen to contain a label word (e.g. 'PERSONAL') are left intact.

    Parameters
    ----------
    ner : str or list or tuple
        Mention list or its string representation.

    Returns
    -------
    str or None
        The joined names, or ``None`` when ``ner`` is missing (e.g. NaN) or
        cannot be parsed as a list of pairs.
    """
    # Local import keeps this cell self-contained.
    import ast

    if isinstance(ner, str):
        try:
            # literal_eval only accepts Python literals; it never runs code.
            mentions = ast.literal_eval(ner)
        except (ValueError, SyntaxError):
            return None
    else:
        mentions = ner

    if not isinstance(mentions, (list, tuple)):
        # Missing values (NaN / None) and anything that is not a list end up here.
        return None

    pieces = []
    for mention in mentions:
        if not isinstance(mention, (list, tuple)) or not mention:
            continue
        pieces.append(re.sub(alphaPattern, '', str(mention[0])) + ',')
    return ''.join(pieces)

In [None]:
# Manual check: run NER_process on a hand-written, stringified mention list.
ner= "[('NYPost', 'ORGANIZATION'), ('CENSORED', 'ORGANIZATION'), ('US', 'GSP'), ('JoeBiden Trump', 'ORGANIZATION'), ('China', 'GPE'), ('Twitter', 'PERSON')]"
NER_process (ner)

'NYPost,CENSORED,USGSPJoeBidenTrump,China,Twitter,'

In [None]:
%%time
trump_sample_df3["entity"] = trump_sample_df3.ner.apply(NER_process)
trump_sample_df3.to_csv("/content/gdrive/MyDrive/WDPS/trump_entity.csv", index=False)


CPU times: user 1.87 s, sys: 18.1 ms, total: 1.89 s
Wall time: 1.9 s
