In [83]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.append('https')
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [108]:
train = pd.read_csv('sent_train.csv')
test = pd.read_csv('sent_valid.csv')

In [66]:
train.head()

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0


In [28]:
train['text'].iloc[2]

'$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb'

In [68]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9543 entries, 0 to 9542
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9543 non-null   object
 1   label   9543 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 149.2+ KB


In [69]:
# Took a look at the value counts to see if there is a  class imbalance
train['label'].value_counts(normalize = True)

2    0.647386
1    0.201509
0    0.151106
Name: label, dtype: float64

## Tokenizing

In [70]:
corpus = [word_tokenize(doc) for doc in train['text']]

In [71]:
corpus

[['$',
  'BYND',
  '-',
  'JPMorgan',
  'reels',
  'in',
  'expectations',
  'on',
  'Beyond',
  'Meat',
  'https',
  ':',
  '//t.co/bd0xbFGjkT'],
 ['$',
  'CCL',
  '$',
  'RCL',
  '-',
  'Nomura',
  'points',
  'to',
  'bookings',
  'weakness',
  'at',
  'Carnival',
  'and',
  'Royal',
  'Caribbean',
  'https',
  ':',
  '//t.co/yGjpT2ReD3'],
 ['$',
  'CX',
  '-',
  'Cemex',
  'cut',
  'at',
  'Credit',
  'Suisse',
  ',',
  'J.P.',
  'Morgan',
  'on',
  'weak',
  'building',
  'outlook',
  'https',
  ':',
  '//t.co/KN1g4AWFIb'],
 ['$',
  'ESS',
  ':',
  'BTIG',
  'Research',
  'cuts',
  'to',
  'Neutral',
  'https',
  ':',
  '//t.co/MCyfTsXc2N'],
 ['$',
  'FNKO',
  '-',
  'Funko',
  'slides',
  'after',
  'Piper',
  'Jaffray',
  'PT',
  'cut',
  'https',
  ':',
  '//t.co/z37IJmCQzB'],
 ['$',
  'FTI',
  '-',
  'TechnipFMC',
  'downgraded',
  'at',
  'Berenberg',
  'but',
  'called',
  'Top',
  'Pick',
  'at',
  'Deutsche',
  'Bank',
  'https',
  ':',
  '//t.co/XKcPDilIuU'],
 ['$',
  'GM

In [32]:
import itertools
flattenedcorpus_tokens = pd.Series(list(itertools.chain(*corpus)))
print(flattenedcorpus_tokens.shape)

(146209,)


In [33]:
dictionary = pd.Series(
    flattenedcorpus_tokens.unique())
print(len(dictionary))

26745


In [72]:
flattenedcorpus_tokens.value_counts()

:                    7355
https                5093
$                    3310
,                    2532
to                   2338
                     ... 
Strengthened            1
ENDV                    1
Houses                  1
fiber…                  1
//t.co/fS66vRei8N       1
Length: 26745, dtype: int64

In [35]:
# creating a function to lowercase all tokenized words if they are capitals and not in the stop words
def first_step_normalizer(doc):
    norm_text = [x.lower() for x in word_tokenize(doc) if ((x.isalpha()) & (x.lower() not in stop_words))]
    return norm_text

In [36]:
train['text'] = train['text'].apply(first_step_normalizer)

In [37]:
norm_toks_flattened = pd.Series(list(
    itertools.chain(*train['text'])))
new_dictionary = norm_toks_flattened.unique()
print(len(new_dictionary))

14080


In [38]:
len(norm_toks_flattened)

72991

In [39]:
norm_toks_flattened.value_counts()

stock       911
new         413
results     373
earnings    347
stocks      347
           ... 
mnst          1
aurelia       1
hawks         1
writes        1
taxpayer      1
Length: 14080, dtype: int64

## Lemmatizing

In [59]:
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [60]:
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None

In [61]:
wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(norm_toks_flattened))) 

In [62]:
wnl = WordNetLemmatizer()
doc_lemmatized = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

In [63]:
len(doc_lemmatized)

70474

## Fully Normalized Document

In [109]:
def process_doc(doc):

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
        
    # remove stop words and punctuations, then lower case
    doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok.lower() not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
    doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]
    doc_norm = [token for token in doc_norm if len(token) >  0]
    
    return doc_norm

In [110]:
normalized_corpus = train['text'].apply(process_doc)
normalized_corpus

In [111]:
flattened_fully_norm = pd.Series(list(itertools.chain(*normalized_corpus)))
len(flattened_fully_norm.unique())

11903

In [118]:
normalized_corpus = normalized_corpus.loc[normalized_corpus.map(lambda x: len(x) > 0)]

In [119]:
fnc = normalized_corpus.apply(" ".join)

In [120]:
fnc.to_csv("finance_tweets_train.csv")

In [121]:
fnc

0                     bynd jpmorgan reel expectation meat
1       ccl rcl nomura point booking weakness carnival...
2       cx cemex cut credit suisse morgan weak buildin...
3                            es btig research cut neutral
4                         fnko funko slide jaffray pt cut
                              ...                        
9538    week gainer loser stoxx europe economy markets...
9539    tupperware brand consumer gainer unilever lead...
9540    vtv therapeutic lead healthcare gainer myomo b...
9541                         work xpo pyx amkr hour mover
9542                              yndx qd oesx tech mover
Name: text, Length: 9534, dtype: object