# NLP

</a></span></li><li><span>
[Tokenising and Stemming for NLP Feature Engineering](#Tokenising-and-Stemming-for-NLP-Feature-Engineering)
<a href='Tokenising and Stemming for NLP Feature Engineering'></a>
</a></span></li><li><span>

[Stemming](#Stemming)
<a href='Stemming'></a>
</a></span></li><li><span>


[Lemmatization to group words for Sentiment Analysis](#Lemmatization-to-group-words-for-Sentiment-Analysis)
<a href='Lemmatization-to-group-words-for-Sentiment-Analysis'></a>
</a></span></li><li><span>

[Count Vectorizing](#Count-Vectorizing)
<a href='Count Vectorizing'></a>
</a></span></li><li><span>

[Open Close Significant Words](#Open-Close-Significant-Words)
<a href='Open Close Significant Words'></a>
</a></span></li><li><span>

[High Low Significant Words](#High-Low-Significant-Words)
<a href='High Low Significant Words'></a>


---

# Tokenising and Stemming for NLP Feature Engineering

---

In [434]:
def process_text(documents, pos=False):
    nlp = textacy.load_spacy('en_core_web_sm')
    
    texts = []
    tokenised_texts = []

    if pos: # pos can either be False or a list of parts of speech
        for document in tqdm_notebook(nlp.pipe(documents, batch_size=200)):
            assert document.is_parsed
            tokens = [token
                      for token in document 
                      if token.is_stop == False # Remove Stop Words
                      and token.pos_ in pos 
                      and token.pos_ != 'PUNCT'] # Remove Punctuation
            doc_ = ''
            for token in tokens:
                doc_ += str(token) + ' '
            
            doc_ = doc_.strip()
            texts.append(doc_)
            tokenised_texts.append(tokens)
    
    
    else:    
        for document in tqdm_notebook(nlp.pipe(documents, batch_size=200)):
            assert document.is_parsed
            tokens = [token
                      for token in document 
                      if token.is_stop == False # Remove Stop Words
                      and token.pos_ != 'PUNCT'] # Remove Puncuation
            doc_ = ''
            for token in tokens:
                doc_ += str(token) + ' '
            
            doc_ = doc_.strip()
            texts.append(doc_)
            tokenised_texts.append(tokens)
            
    return texts, tokenised_texts

In [435]:
import textacy
from tqdm import tqdm_notebook



In [436]:
# Important to turn original text all to lower case as to avoid inconsitencies in stemming

In [437]:
pos = ['NOUN', 'ADJ', 'VERB', 'ADV'] #to include only these, essentially removing stops

In [438]:
text_lower = []
for word in df_final.text_new:
    text_lower.append(word.lower())

In [439]:
df_final['text_lower'] = text_lower

In [440]:
processed_titles, tokenised_titles = process_text(df_final['text_lower'], pos=pos)
df_final['processed_title'] = processed_titles
df_final['tokenised_title'] = tokenised_titles

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Stemming

---

In [442]:
from nltk.stem import PorterStemmer

In [443]:
stemmer = PorterStemmer()

In [446]:
stemmed_titles = []
for j in range(0, df_final.shape[0]):
    stemmed_titles.append(stemmer.stem(str(df_final.tokenised_title[j]))) #Stemming all tokens

In [447]:
df_final['stemmed_titles'] = stemmed_titles

In [448]:
stemmer.stem('tariffs') #test-run

'tariff'

In [449]:
import spacy

In [450]:
sp = spacy.load('en_core_web_sm')

# Lemmatization to group words for Sentiment Analysis

---

In [451]:
lemmed_titles = []
for list in df_final.tokenised_title: #starting the conversion to lemmatized titles
    lemmed_list  =[]
    for word in list:
        lemmed_list.append(word.lemma_)
    lemmed_titles.append(str(lemmed_list))
    

In [452]:
df_final['lemmed_titles'] = lemmed_titles

In [417]:
df_test_2.pct_sig_ch_high_low

2016-01-03 23:18:00+00:00    0
2016-01-03 23:19:00+00:00    0
2016-01-04 00:26:00+00:00    0
2016-01-04 11:42:00+00:00    0
2016-01-04 17:31:00+00:00    0
                            ..
2019-08-09 12:03:00+00:00    1
2019-08-09 12:03:00+00:00    1
2019-08-09 18:44:00+00:00    0
2019-08-09 18:44:00+00:00    0
2019-08-12 04:01:00+00:00    0
Name: pct_sig_ch_high_low, Length: 7571, dtype: int64

In [419]:
import pickle #pickling the document to supress its size 

In [420]:
pickle.dump(df_test_2[['text_lower', 'pct_sig_ch_high_low']],open('test.pkl', 'wb'))

In [418]:
df_test_2[['text_lower', 'pct_sig_ch_high_low']]

Unnamed: 0,text_lower,pct_sig_ch_high_low
2016-01-03 23:18:00+00:00,the worst thing hillary could do is have her h...,0
2016-01-03 23:19:00+00:00,"by the way, hillary &amp; the msm forgot to me...",0
2016-01-04 00:26:00+00:00,"""@icareeguns: this #iowan is voting for @reald...",0
2016-01-04 11:42:00+00:00,"""@rhettriley1234: trump only wins with our vot...",0
2016-01-04 17:31:00+00:00,"i look forward to being in lowell, massachuset...",0
2016-01-04 20:00:00+00:00,"woody johnson, owner of the nyjets, is @jebbus...",0
2016-01-05 02:20:00+00:00,"the rally in lowell, massachusetts, was amazin...",0
2016-01-05 02:22:00+00:00,"""@troyconway: now 2-more it firms going over s...",0
2016-01-05 02:28:00+00:00,"""@thecybermenace: @realdonaldtrump donald trum...",0
2016-01-05 03:33:00+00:00,"""@samuelghaddad: wow! let's hear it for the fa...",0


# Count Vectorizing

Count Vecting lemmatized words to determine which words appear most in 'significant' tweets

---

In [453]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [454]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop += ['http', 'https', 'realdonaldtrump', 'co', 'people', 'trump', 'thank', 'new'\
         'today', 'amp', 'president', 'new', 'today']
        #The above stopwords were after a few trials of most common, invaluable words

In [456]:
cvec_full = CountVectorizer(strip_accents='unicode', stop_words=stop) 
X_all_full = cvec_full.fit_transform(df_final.lemmed_titles) #CountVect of FULL dataset

In [457]:
len(cvec_full.get_feature_names())

12620

In [458]:
cv_df = pd.DataFrame(X_all_full.toarray(), columns=cvec_full.get_feature_names())
freq_full = pd.DataFrame(cv_df.sum().sort_values(ascending=False))
#The above is to show the frequency to feature engineer further

In [459]:
freq_full[freq_full.index.str.contains(r'tariff')] #Trialling with tariff, suggesting
# The stemming hasn't quite worked.

Unnamed: 0,0
tariff,137
tariffed,7
tariffs,1


# 'Significant' Words

---

In [461]:
df_final.y.value_counts() # to again see distribution of explanatory values

 0.0    10511
-1.0       99
 1.0       52
Name: y, dtype: int64

In [463]:
df_subset_open_close = df_final[df_final.y.abs()==1][['lemmed_titles']]
#Looking specifically at 'significant' tweets

In [464]:
stop = stopwords.words('english')
stop += ['http', 'https', 'realdonaldtrump', 'co', 'people', 'trump', 'thank', 'new'\
         'today', 'amp', 'president', 'new', 'today']

In [465]:
cvec_open_close = CountVectorizer(strip_accents='unicode', stop_words=stop, min_df=2)
X_all_open_close = cvec_open_close.fit_transform(df_subset_open_close.lemmed_titles)

In [466]:
len(cvec_open_close.get_feature_names()) #To see the number of different words

356

In [467]:
cv_open_close = pd.DataFrame(X_all_open_close.toarray(), columns=cvec_open_close.get_feature_names())
freq_open_close = pd.DataFrame(cv_open_close.sum().sort_values(ascending=False))

In [488]:
cv_open_close.head(1)

Unnamed: 0,absolutely,act,additional,administration,affect,ago,agreement,agricultural,agriculture,allow,...,woman,wonderful,work,worker,world,worth,wrong,year,york,young
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [468]:
freq_open_close.shape

(356, 1)

In [469]:
freq_open_close[freq_open_close.index.str.contains(r'tariff')]

Unnamed: 0,0
tariff,15


In [470]:
ratios_open_close = pd.merge(freq_open_close, freq_full, left_index=True, right_index=True)

In [None]:
# merging 'sigificant' corpus with 'non-significant' corpus to look at the frequency 
# ratio between both

In [472]:
ratios_open_close.head()

Unnamed: 0,0_x,0_y
great,45,2145
china,25,315
job,23,672
country,22,802
border,20,589


In [473]:
ratios_open_close['ratio'] = ratios_open_close['0_x']/ratios_open_close['0_y']

In [474]:
ratios_open_close.columns = ['#sig', '#insig', 'ratio']

In [475]:
ratios_open_close.sort_values('ratio', ascending=False)

Unnamed: 0,#sig,#insig,ratio
edwin,2,2,1.000000
quicker,4,4,1.000000
starve,7,9,0.777778
imbalance,3,4,0.750000
manner,2,3,0.666667
automatically,2,3,0.666667
traditional,4,6,0.666667
lebanon,2,3,0.666667
phenomenal,5,8,0.625000
redo,3,5,0.600000


In [542]:
key_words_open_close = ratios_open_close[(ratios_open_close['#sig']>=4) & (ratios_open_close['ratio']>=0.06)]

In [None]:
#Setting this threshold to get relevant words that may have some predictive power in
#my model

In [543]:
key_words_open_close

Unnamed: 0,#sig,#insig,ratio
china,25,315,0.079365
dollar,18,209,0.086124
tariff,15,137,0.109489
product,13,57,0.22807
continue,10,144,0.069444
farmer,9,92,0.097826
ohio,9,128,0.070312
buy,7,57,0.122807
starve,7,9,0.777778
government,7,110,0.063636


In [None]:
# I have used words that have appeared in significant tweets more than 5 times and with a greater significant
# to non-significant ratio of 0.1 to remove the noise or outlier words

In [389]:
key_words_open_close

Unnamed: 0,#sig,#insig,ratio
china,21,159,0.132075
cruz,18,140,0.128571
iowa,13,63,0.206349
leave,11,110,0.1
bush,11,39,0.282051
jeb,9,33,0.272727
crowd,8,71,0.112676
te,8,54,0.148148
hampshire,7,38,0.184211
weak,7,61,0.114754


In [None]:
# Here are a list of key-words that we can consider for dummy variables

In [None]:
# Allowed words that have appeared more to trade-off for a higher ratio, as this will have a greater impact
# On impacting a 'significant' tweet.

In [512]:
key_words_high_low.sort_values('#sig', ascending=False).to_csv('plot_2.csv')

In [544]:
key_words_open_close.shape

(37, 3)

In [545]:
key_words_open_close.T.columns

Index(['china', 'dollar', 'tariff', 'product', 'continue', 'farmer', 'ohio',
       'buy', 'starve', 'government', 'enforcement', 'form', 'hand', 'begin',
       'easy', 'fantastic', 'phenomenal', 'meantime', 'kevin', 'kim', 'kind',
       'officer', 'quicker', 'oval', 'marine', 'send', 'faster', 'wealth',
       'approve', 'traditional', 'grant', 'waiver', 'school', 'schumer',
       'better', 'source', 'ship'],
      dtype='object')

In [493]:
cv_open_close.shape

(151, 356)

In [494]:
cvec_open_close = CountVectorizer(strip_accents='unicode', stop_words=stop, min_df=2)
X_all = cvec_open_close.fit_transform(df_final.lemmed_titles)

In [495]:
len(cvec_open_close.get_feature_names())

5700

In [496]:
cv_all = pd.DataFrame(X_all.toarray(), columns=cvec_open_close.get_feature_names())

In [546]:
binary_words = cv_all[['china', 'dollar', 'tariff', 'product', 'continue', 'farmer', 'ohio',
       'buy', 'starve', 'government', 'enforcement', 'form', 'hand', 'begin',
       'easy', 'fantastic', 'phenomenal', 'meantime', 'kevin', 'kim', 'kind',
       'officer', 'quicker', 'oval', 'marine', 'send', 'faster', 'wealth',
       'approve', 'traditional', 'grant', 'waiver', 'school', 'schumer',
       'better', 'source', 'ship'
       ]]

In [559]:
binary_words.shape

(10662, 37)

In [573]:
df_final.shape

(10662, 33)

In [574]:
df_combine = df_final[['y', 'text_new']]

In [575]:
df_combine.shape

(10662, 2)

In [549]:
binary_words.index = df_final.index

In [576]:
df_combine = pd.merge(df_combine, binary_words, left_index=True, right_index = True)

In [577]:
df_combine.shape

(12822, 39)

In [578]:
df_combine.reset_index(inplace=True)

In [579]:
df_combine.drop_duplicates(subset=['text_new', 'level_0'], inplace=True)

In [580]:
df_combine.shape

(10661, 40)

In [570]:
df_combine.to_csv('df_combine.csv')

In [557]:
df_final.shape

(10662, 33)

In [522]:
key_words_open_close

Unnamed: 0,#sig,#insig,ratio
dollar,18,209,0.086124
tariff,15,137,0.109489
product,13,57,0.22807
farmer,9,92,0.097826
buy,7,57,0.122807
starve,7,9,0.777778
form,5,30,0.166667
hand,5,49,0.102041
phenomenal,5,8,0.625
meantime,5,23,0.217391


In [500]:
binary_words.to_csv('binary_words.csv')

# Have a look at using N-grams = 2

---

In [None]:
# Using N-Grams=1

In [501]:
stop = stopwords.words('english')
stop += ['http', 'https', 'realdonaldtrump', 'co', 'people', 'trump', 'thank', 'new'\
         'today', 'amp', 'president', 'new', 'today', 'idea', '']

In [528]:
cvec_2 = CountVectorizer(strip_accents='unicode', stop_words=stop, min_df=2, ngram_range=(2,2))
X_final_2 = cvec_2.fit_transform(df_subset_open_close.lemmed_titles)

In [529]:
cv_final_2 = pd.DataFrame(X_final_2.toarray(), columns=cvec_2.get_feature_names())
freq_sig_2 = pd.DataFrame(cv_final_2.sum().sort_values(ascending=False))

In [531]:
freq_sig_2

Unnamed: 0,0
united state,11
fake news,9
great job,7
law enforcement,6
good product,5
open border,5
tariff bring,4
starve nation,4
great republican,4
deal traditional,4


In [497]:
#Full dataset

In [533]:
cvec = CountVectorizer(strip_accents='unicode', stop_words=stop, min_df=5)
X_full = cvec.fit_transform(df_final.lemmed_titles)

In [534]:
cv_full = pd.DataFrame(X_full.toarray(), columns=cvec.get_feature_names())
freq_full = pd.DataFrame(cv_full.sum().sort_values(ascending=False))

In [510]:
freq_full.shape

(2435, 1)

In [501]:
#Merging

In [535]:
ratios_final_2 = pd.merge(freq_sig_2, freq_full, right_index=True, left_index=True)

In [539]:
ratios_final_2.columns = ['#sig', '#insig']
ratios_final_2['ratio'] = ratios_final_2['#sig']/ratios_final_2['#insig']

In [541]:
ratios_final_2.sort_values('ratio', ascending=False)

Unnamed: 0,#sig,#insig,ratio


In [517]:
ratios_final.sort_values('#sig', ascending=False).head(20)

Unnamed: 0,#sig,#insig,ratio
great,92,1233,0.074615
trump2016,29,229,0.126638
china,28,159,0.176101
country,28,317,0.088328
cruz,28,140,0.2
many,26,277,0.093863
hillary,24,390,0.061538
time,24,252,0.095238
big,23,349,0.065903
one,23,227,0.101322


In [522]:
key_words = ratios_final[(ratios_final['#sig']>=10) & (ratios_final['ratio']>=0.1)]

In [524]:
key_words.sort_values('ratio', ascending=False) # Plot this graph? Are these tweets too political?

Unnamed: 0,#sig,#insig,ratio
fitn,11,17,0.647059
hampshire,16,38,0.421053
bush,15,39,0.384615
jeb,11,33,0.333333
tariffs,14,43,0.325581
iowa,17,63,0.269841
congressman,11,45,0.244444
man,11,50,0.22
highly,10,47,0.212766
office,10,49,0.204082


In [571]:
df_final.to_csv('df_final.csv')