In [242]:
import re

import nltk
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below. 'punkt' backs word_tokenize; 'stopwords'
# backs stopwords.words(), which raises LookupError when the corpus has never
# been downloaded on this machine.
nltk.download('punkt')
nltk.download('stopwords')

# English stop words, extended with 'https': the tokenizer splits tweet URLs so
# that 'https' becomes a purely alphabetic token that would otherwise survive
# the isalpha() filter used by the normalizers.
stop_words = stopwords.words('english')
stop_words.append('https')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [243]:
# Load the training file and the validation file (the latter is bound to the name `test`).
train, test = (pd.read_csv(csv_path) for csv_path in ('sent_train.csv', 'sent_valid.csv'))

In [311]:
train.tail()

Unnamed: 0,text,label
9538,The Week's Gainers and Losers on the Stoxx Eur...,2
9539,Tupperware Brands among consumer gainers; Unil...,2
9540,vTv Therapeutics leads healthcare gainers; Myo...,2
9541,"WORK, XPO, PYX and AMKR among after hour movers",2
9542,"YNDX, I, QD and OESX among tech movers",2


In [282]:
train['text'].iloc[2]

'$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb'

In [245]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9543 entries, 0 to 9542
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9543 non-null   object
 1   label   9543 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 149.2+ KB


In [246]:
# Relative frequency of each label, to check whether the classes are imbalanced
train['label'].value_counts(normalize=True)

2    0.647386
1    0.201509
0    0.151106
Name: label, dtype: float64

## Tokenizing

In [265]:
# One token list per tweet, produced by NLTK's word_tokenize on the raw text
corpus = list(map(word_tokenize, train['text']))

In [250]:
import itertools
# Flatten the per-tweet token lists into a single Series holding every token in the corpus
flattenedcorpus_tokens = pd.Series(list(itertools.chain.from_iterable(corpus)))
print(flattenedcorpus_tokens.shape)

(146209,)


In [229]:
# Vocabulary of the raw corpus: each distinct token once, in order of first appearance
dictionary = pd.Series(flattenedcorpus_tokens.unique())
print(dictionary.size)

26745


In [230]:
flattenedcorpus_tokens.value_counts()

:                    7355
https                5093
$                    3310
,                    2532
to                   2338
                     ... 
//t.co/uiXjEqyc6e       1
Moynihan                1
//t.co/ggKzSYVcPy       1
//t.co/T2Zdt3jNiy       1
zooplus                 1
Length: 26745, dtype: int64

In [231]:
# First normalization pass: tokenize, drop non-alphabetic tokens and stop words, lowercase the rest
def first_step_normalizer(doc):
    """Return the lowercased alphabetic, non-stop-word tokens of ``doc``.

    ``doc`` is handed directly to ``word_tokenize``. A token is kept only when
    ``str.isalpha()`` is true for it and its lowercase form is not in the
    module-level ``stop_words`` list.
    """
    kept_tokens = []
    for token in word_tokenize(doc):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept_tokens.append(lowered)
    return kept_tokens

In [232]:
train['text'] = train['text'].apply(first_step_normalizer)

In [233]:
norm_toks_flattened = pd.Series(list(
    itertools.chain(*train['text'])))
new_dictionary = norm_toks_flattened.unique()
print(len(new_dictionary))

14080


In [292]:
len(norm_toks_flattened)

72991

In [293]:
norm_toks_flattened.value_counts()

stock       911
new         413
results     373
stocks      347
earnings    347
           ... 
basics        1
orr           1
gambler       1
ntpc          1
skyrizi       1
Length: 14080, dtype: int64

## Lemmatizing

In [238]:
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\somdd\AppData\Roaming\nltk_data...


True

In [239]:
def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching ``wordnet`` POS constant.

    Only the first character of the tag is inspected: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag (including an empty string) yields None.
    """
    # Attribute names are resolved lazily so that unmapped tags never touch `wordnet`.
    wordnet_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = wordnet_attr_by_initial.get(nltk_tag[:1])
    if attr_name is None:
        return None
    return getattr(wordnet, attr_name)

In [286]:
# POS-tag the flattened normalized tokens and pair each token with its WordNet POS (None when unmapped)
wordnet_tagged = [(token, pos_tagger(tag)) for token, tag in pos_tag(norm_toks_flattened)]

In [287]:
wnl = WordNetLemmatizer()
# Lemmatize every token that received a WordNet POS; tokens whose POS is None are left out
doc_lemmatized = [
    wnl.lemmatize(word, wn_pos)
    for word, wn_pos in wordnet_tagged
    if wn_pos is not None
]

In [305]:
len(doc_lemmatized)

70474

## Fully Normalized Documents

In [299]:
def process_doc(doc):
    """Fully normalize one document into a list of lemmas.

    Pipeline, in order:
      1. tokenize ``doc`` with ``word_tokenize``;
      2. keep tokens that are alphabetic and whose lowercase form is not in
         the module-level ``stop_words``; lowercase them;
      3. POS-tag the surviving tokens with ``pos_tag``;
      4. lemmatize each token with ``WordNetLemmatizer`` using its POS.

    Tokens whose tag does not start with J, V, N or R have no WordNet POS
    here and are dropped from the result.

    ``doc`` is passed straight to ``word_tokenize``, so it is expected to be
    the raw text of the document.
    """
    lemmatizer = WordNetLemmatizer()

    # First letter of the NLTK tag -> WordNet POS; any other letter has no entry.
    wordnet_pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    # Drop non-alphabetic tokens (punctuation, numbers, URL fragments) and stop words.
    cleaned_tokens = []
    for tok in word_tokenize(doc):
        lowered = tok.lower()
        if tok.isalpha() and lowered not in stop_words:
            cleaned_tokens.append(lowered)

    # The POS tags tell WordNet's lemmatizer how to lemmatize each token.
    lemmas = []
    for token, tag in pos_tag(cleaned_tokens):
        wordnet_pos = wordnet_pos_by_initial.get(tag[:1])
        if wordnet_pos is not None:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    return lemmas

In [300]:
# process_doc tokenizes its own input, so train['text'] has to hold the raw tweet strings here
normalized_corpus = train['text'].map(process_doc)

In [301]:
normalized_corpus

0               [bynd, jpmorgan, reel, expectation, meat]
1       [ccl, rcl, nomura, point, booking, weakness, c...
2       [cx, cemex, cut, credit, suisse, morgan, weak,...
3                      [es, btig, research, cut, neutral]
4                  [fnko, funko, slide, jaffray, pt, cut]
                              ...                        
9538    [week, gainer, loser, stoxx, europe, economy, ...
9539    [tupperware, brand, consumer, gainer, unilever...
9540    [vtv, therapeutic, lead, healthcare, gainer, m...
9541                  [work, xpo, pyx, amkr, hour, mover]
9542                        [yndx, qd, oesx, tech, mover]
Name: text, Length: 9543, dtype: object

In [306]:
# Vocabulary size after full normalization: flatten the per-tweet lemma lists, then count distinct lemmas
flattened_fully_norm = pd.Series(list(itertools.chain.from_iterable(normalized_corpus)))
flattened_fully_norm.nunique()

11903

In [308]:
flattened_fully_norm.value_counts()

stock        1258
market        537
say           460
report        419
new           415
             ... 
glabellar       1
fury            1
looks           1
polarize        1
slew            1
Length: 11903, dtype: int64

In [309]:
# Re-join each tweet's lemmas into one space-separated string
fnc = normalized_corpus.str.join(" ")

In [310]:
fnc

0                     bynd jpmorgan reel expectation meat
1       ccl rcl nomura point booking weakness carnival...
2       cx cemex cut credit suisse morgan weak buildin...
3                            es btig research cut neutral
4                         fnko funko slide jaffray pt cut
                              ...                        
9538    week gainer loser stoxx europe economy markets...
9539    tupperware brand consumer gainer unilever lead...
9540    vtv therapeutic lead healthcare gainer myomo b...
9541                         work xpo pyx amkr hour mover
9542                              yndx qd oesx tech mover
Name: text, Length: 9543, dtype: object

In [312]:
# Persist the normalized tweets. Only the text Series is written (its index goes
# out as the first column); the labels are not part of `fnc` and stay in `train`.
# NOTE(review): tweets that normalized to an empty string will presumably be read
# back as NaN by pd.read_csv — confirm how the downstream notebook handles them.
fnc.to_csv("finance_tweets_train.csv")

In [313]:
fnc

0                     bynd jpmorgan reel expectation meat
1       ccl rcl nomura point booking weakness carnival...
2       cx cemex cut credit suisse morgan weak buildin...
3                            es btig research cut neutral
4                         fnko funko slide jaffray pt cut
                              ...                        
9538    week gainer loser stoxx europe economy markets...
9539    tupperware brand consumer gainer unilever lead...
9540    vtv therapeutic lead healthcare gainer myomo b...
9541                         work xpo pyx amkr hour mover
9542                              yndx qd oesx tech mover
Name: text, Length: 9543, dtype: object