In [23]:
import pandas as pd
import numpy as np
import nltk
import re
from tensorflow import keras
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bobby\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bobby\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bobby\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
# read in the data and remove rows with missing values
articles = pd.read_csv('articles.csv')[['title', 'text', 'subreddit']].dropna().reset_index(drop=True)
articles

Unnamed: 0,title,text,subreddit
0,Meta's threat to close down Facebook and Insta...,Meta’s threat to close down Facebook and Insta...,nottheonion
1,Pregnant Texas woman driving in HOV lane told ...,Is an unborn fetus a human being in the eyes o...,nottheonion
2,Mark Zuckerberg Says Meta Employees “Lovingly”...,Mark Zuckerberg hasn’t always had a squeaky-cl...,nottheonion
3,Police didn't immediately confront the gunman ...,Law enforcement is getting slammed for its res...,nottheonion
4,Shaquille O'Neal says gorillas freak out when ...,Shaquille O'Neal says gorillas always freak ou...,nottheonion
...,...,...,...
1683,Elton John Awarded Medal By Joe Biden For Work...,President Biden has awarded Sir Elton John wit...,TheOnion
1684,What Republicans Are Saying About The Paul Pel...,“How many innocent people have to die before w...,TheOnion
1685,What To Say To Someone Struggling With Inflation,We may earn a commission from links on this pa...,TheOnion
1686,Herschel Walker Quietly Asking Around For D.C....,"WRIGHTSVILLE, GA—In a display of confidence ah...",TheOnion


In [25]:
articles['length'] = articles['text'].apply(lambda x: len(x.split()))
articles = articles.loc[articles['length'] > 30]
articles

Unnamed: 0,title,text,subreddit,length
0,Meta's threat to close down Facebook and Insta...,Meta’s threat to close down Facebook and Insta...,nottheonion,595
1,Pregnant Texas woman driving in HOV lane told ...,Is an unborn fetus a human being in the eyes o...,nottheonion,469
2,Mark Zuckerberg Says Meta Employees “Lovingly”...,Mark Zuckerberg hasn’t always had a squeaky-cl...,nottheonion,325
3,Police didn't immediately confront the gunman ...,Law enforcement is getting slammed for its res...,nottheonion,447
4,Shaquille O'Neal says gorillas freak out when ...,Shaquille O'Neal says gorillas always freak ou...,nottheonion,474
...,...,...,...,...
1683,Elton John Awarded Medal By Joe Biden For Work...,President Biden has awarded Sir Elton John wit...,TheOnion,77
1684,What Republicans Are Saying About The Paul Pel...,“How many innocent people have to die before w...,TheOnion,63
1685,What To Say To Someone Struggling With Inflation,We may earn a commission from links on this pa...,TheOnion,911
1686,Herschel Walker Quietly Asking Around For D.C....,"WRIGHTSVILLE, GA—In a display of confidence ah...",TheOnion,154


### Processing Text
1. expand contractions (don't --> do not)
2. remove punctuation
3. extract Proper Nouns
4. lowercase everything else
5. lemmatize (doing --> do)
6. remove stopwords
7. Counter of everything

In [26]:
import contractions
from collections import Counter

def process_text(text):
    ''' process the text of a document according to above
    args.
        text: a string
    returns.
        Counter object with word counts
    '''
    # remove line breaks
    text.replace('\n', ' ')
    
    # expand contractions
    expanded = ' '.join([contractions.fix(word) for word in text.split()])
    
    # remove punctuation and numbers
    expanded.replace('-', ' ') # replace hyphens with spaces
    expanded = re.sub(r'[^\w\s]', '', expanded)
    expanded = re.sub(r'\d+', '', expanded)
    
    # extract not proper nouns
    other_words = ' '.join([word for word, pos in pos_tag(expanded.split()) if pos != 'NNP' and pos !='NNPS'])

    # lowercase, remove stopwords, and stem words
    other_words = [word for word in other_words.lower().split() if word not in stopwords]
    lemma = WordNetLemmatizer()
    words = [lemma.lemmatize(word) for word in other_words]
    
    return Counter(words)

In [27]:
articles['word_counts'] = articles['text'].map(process_text)
articles.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles['word_counts'] = articles['text'].map(process_text)


Unnamed: 0,title,text,subreddit,length,word_counts
0,Meta's threat to close down Facebook and Insta...,Meta’s threat to close down Facebook and Insta...,nottheonion,595,"{'metas': 1, 'threat': 1, 'close': 1, 'backfir..."
1,Pregnant Texas woman driving in HOV lane told ...,Is an unborn fetus a human being in the eyes o...,nottheonion,469,"{'unborn': 3, 'fetus': 1, 'human': 2, 'eye': 1..."
2,Mark Zuckerberg Says Meta Employees “Lovingly”...,Mark Zuckerberg hasn’t always had a squeaky-cl...,nottheonion,325,"{'always': 1, 'squeakyclean': 1, 'reputation':..."
3,Police didn't immediately confront the gunman ...,Law enforcement is getting slammed for its res...,nottheonion,447,"{'enforcement': 2, 'getting': 1, 'slammed': 1,..."
4,Shaquille O'Neal says gorillas freak out when ...,Shaquille O'Neal says gorillas always freak ou...,nottheonion,474,"{'say': 2, 'gorilla': 9, 'always': 2, 'freak':..."


In [28]:
clean_articles = articles[['title', 'subreddit', 'word_counts']].copy()
clean_articles.to_csv('clean_articles.csv', index=False)

In [29]:
import csv

#Open and process data set
file = open('./go_emotions_dataset.csv')
csvreader = csv.reader(file)
header = []
GOheader = next(csvreader)

GOrows = []
for row in csvreader:
  # GOrows.append(row)
  GOrows.append(row[:len(row) - 1])

GONNmodelLabels = GOheader[3:]
#Neutral removed for more interesting results and to better capture the sole emotion of sentences without it
GONNmodelLabels.remove('neutral') 

In [30]:
#Setup Nueral Network model
GOData = []
GOTrainData = []
GOValDataX = []
GOTestData = []

for item in GOrows:
  GOData.append(item[1])

for item in GOrows[:150000]:
  GOTrainData.append(item[1])

for item in GOrows[150000:170000]:
  GOValDataX.append(item[1])

for item in GOrows[170000:]:
  GOTestData.append(item[1])

print(f'Sample GOTrainDataset entry:{GOTrainData[1]}')
print(f'GO Dataset is of length {len(GOTrainData)}')

Sample GOTrainDataset entry: >sexuality shouldn’t be a grouping category It makes you different from othet ppl so imo it fits the definition of "grouping" 
GO Dataset is of length 150000


In [31]:
GOTokenizer = Tokenizer() #tokenize and encode X value sentences for training, testing, and validation
GOTokenizer.fit_on_texts(GOData)
GOencoded = pad_sequences(GOTokenizer.texts_to_sequences(GOTrainData), maxlen=100)
GOValDataXEncoded = pad_sequences(GOTokenizer.texts_to_sequences(GOValDataX), maxlen=100)
GOTestDataEncoded = pad_sequences(GOTokenizer.texts_to_sequences(GOTestData), maxlen=100)
# len(GOTestDataEncoded)