In [1]:
import nltk
import os
import csv
import string
import json

import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer

# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Read the whole input document into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform/locale default that open() would otherwise fall back to.
# NOTE(review): assumes the input file is UTF-8 encoded — confirm.
with open('input/chatgpt.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    
# os.path.join(os.getcwd(), '..', 'input/chatgpt.txt')

In [3]:
# Word-level tokenization of the raw document (punctuation tokens are still included here).
tokens = nltk.word_tokenize(text)

In [4]:
# One analyzer instance, reused by later cells for per-sentence and per-token scores.
sia = SentimentIntensityAnalyzer()

# Polarity scores for the document as a whole; the 'neg', 'neu', 'pos' and
# 'compound' entries are read back when the general stats are assembled.
overall_sentiment = sia.polarity_scores(text=text)

In [5]:
# Split the document into sentences, then build one record per sentence:
# the analyzer's polarity scores plus the sentence text under 'sentence'.
sentences = nltk.sent_tokenize(text)
sentences_sentiment = [
    {**sia.polarity_scores(sentence), 'sentence': sentence}
    for sentence in sentences
]

In [6]:
# English stop words as a set for fast membership tests.
# The corpus reader is reached through `nltk.corpus.stopwords` rather than the
# imported name `stopwords`, because this cell rebinds `stopwords` to the set:
# going through the bare name would make the cell fail when run a second time
# (a set has no `.words`).
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = PorterStemmer()

In [7]:
def find_token_stemmed_index(searchToken):
    """Return the key of the first `tokens_stemmed_final` entry matching a token.

    Args:
        searchToken: Token text to look for.

    Returns:
        The key of the first entry (in the dict's iteration order) whose
        'token' value equals `searchToken`, or -1 when no entry matches.

    Note:
        Reads the module-level `tokens_stemmed_final` dict at call time, so
        that table must exist before this function is called.
    """
    # Search the stemmed table; the previous version searched `tokens_final`,
    # which made this function an exact duplicate of `find_token_index`.
    for index, entry in tokens_stemmed_final.items():
        if searchToken == entry['token']:
            return index
    return -1

In [8]:
def find_token_index(searchToken):
    """Look up a token in the module-level `tokens_final` table.

    Returns the key of the first entry (in iteration order) whose 'token'
    value equals `searchToken`; returns -1 if there is no such entry.
    """
    matching_keys = (
        key for key, entry in tokens_final.items()
        if searchToken == entry['token']
    )
    return next(matching_keys, -1)

In [9]:
# Lower-case the tokens, dropping those that start with an apostrophe and
# those found in string.punctuation.
# NOTE(review): `token not in string.punctuation` is a substring test, so
# multi-character punctuation tokens (e.g. '...') are not removed by it —
# confirm whether that is intended.
tokens_clean = [token.lower() for token in tokens if not token.startswith("'") and token not in string.punctuation]
fdistClean = FreqDist(tokens_clean)
tokens_unique = list(set(tokens_clean))
tokens_hapaxes = set(fdistClean.hapaxes())
tokens_tags = nltk.pos_tag(tokens_clean)

# Score each distinct token once instead of once per occurrence.
sentiment_by_token = {token: sia.polarity_scores(token) for token in tokens_unique}

# One record per token occurrence, keyed by its position in tokens_clean.
# pos_tag returns one (token, tag) pair per input token in input order, so the
# tag is taken positionally. The previous first-match lookup only ever tagged
# the first occurrence of each word (leaving repeats with an empty tag) and
# rescanned the whole table for every token.
tokens_final = {
    position: {
        'token': token,
        'count': fdistClean[token],
        'tag': tag,
        'frequency': fdistClean.freq(token),
        # Copy so that records do not share one mutable scores dict.
        'sentiment': dict(sentiment_by_token[token]),
    }
    for position, (token, tag) in enumerate(tokens_tags)
}

In [10]:
# Stem every cleaned token that is not a stop word.
tokens_stemmed = [stemmer.stem(token) for token in tokens_clean if token.lower() not in stopwords]
fdistStemmed = FreqDist(tokens_stemmed)
tokens_stemmed_unique = list(set(tokens_stemmed))
tokens_stemmed_hapaxes = set(fdistStemmed.hapaxes())
tokens_stemmed_tags = nltk.pos_tag(tokens_stemmed)

# Score each distinct stem once instead of once per occurrence.
# NOTE(review): sentiment is scored on the stem itself, not on the original
# word — confirm that this is the intended input for the analyzer.
sentiment_by_stem = {stem: sia.polarity_scores(stem) for stem in tokens_stemmed_unique}

# One record per stemmed-token occurrence, keyed by its position in
# tokens_stemmed. The table is built from the stemmed tokens, their own
# frequency distribution and their own tags; it was previously built from
# tokens_clean / fdistClean / tokens_tags, which made it a copy of the clean
# table and left tokens_stemmed_tags unused.
tokens_stemmed_final = {
    position: {
        'token': stem,
        'count': fdistStemmed[stem],
        'tag': tag,
        'frequency': fdistStemmed.freq(stem),
        # Copy so that records do not share one mutable scores dict.
        'sentiment': dict(sentiment_by_stem[stem]),
    }
    for position, (stem, tag) in enumerate(tokens_stemmed_tags)
}

In [11]:
# general stats
# Size of each token collection, the sentence count, and the document-level
# polarity scores, gathered into one JSON-serialisable dict.
txtSpecs = dict(
    totalTokens=len(tokens),
    totalTokensClean=len(tokens_clean),
    totalTokensUnique=len(tokens_unique),
    totalTokensHapaxes=len(tokens_hapaxes),
    totalFilteredStemmed=len(tokens_stemmed),
    totalFilteredStemmedUnique=len(tokens_stemmed_unique),
    totalFilteredStemmedHapaxes=len(tokens_stemmed_hapaxes),
    totalSentences=len(sentences),
    sentiment={
        score_name: overall_sentiment[score_name]
        for score_name in ('neg', 'neu', 'pos', 'compound')
    },
)

In [12]:
# THINGS TO SAVE:
# txtSpecs

# tokens_final
# tokens_unique
# tokens_hapaxes

# tokens_stemmed_final
# tokens_stemmed_unique
# tokens_stemmed_hapaxes

# sentences_sentiment

In [14]:
# print(tokens_clean_fdist)
# tokensFiltered = list(fdistFiltered.most_common(31))

In [17]:
# Serialise the general stats as JSON into the sibling landigest/input folder.
specs_path = os.path.join(os.getcwd(), '..', 'landigest/input/txtSpecs.txt')
with open(specs_path, "w") as file:
    file.write(json.dumps(txtSpecs))

In [18]:
# WRITE CLEAN BATCH #
# One CSV row per entry of tokens_final. The 'count' field of each entry is
# not part of the written columns.
# UTF-8 is requested explicitly so that tokens with non-ASCII characters are
# written the same way on every platform instead of depending on the locale default.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensFinal.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, entry in tokens_final.items():
        scores = entry['sentiment']
        writer.writerow([
            index,
            entry['token'],
            entry['tag'],
            entry['frequency'],
            scores['neg'],
            scores['neu'],
            scores['pos'],
            scores['compound'],
        ])
        

In [19]:
# One row per distinct cleaned token.
# Each item is the token string itself, so the whole string is written;
# `item[0]` would write only its first character.
# Rows are sorted so the file is identical from run to run (tokens_unique is
# derived from a set, whose order is arbitrary), and UTF-8 is explicit so the
# output does not depend on the platform's default encoding.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensUnique.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Token"])
    for item in sorted(tokens_unique):
        writer.writerow([item])

In [20]:
# One row per cleaned token that occurs exactly once (hapax).
# Each item is the token string itself, so the whole string is written;
# `item[0]` would write only its first character.
# Rows are sorted so the file is identical from run to run (tokens_hapaxes is
# a set, whose order is arbitrary), and UTF-8 is explicit so the output does
# not depend on the platform's default encoding.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensHapaxes.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Token"])
    for item in sorted(tokens_hapaxes):
        writer.writerow([item])

In [22]:
# WRITE STEMMED BATCH #
# One CSV row per entry of tokens_stemmed_final. The 'count' field of each
# entry is not part of the written columns.
# UTF-8 is requested explicitly so that tokens with non-ASCII characters are
# written the same way on every platform instead of depending on the locale default.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedFinal.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, entry in tokens_stemmed_final.items():
        scores = entry['sentiment']
        writer.writerow([
            index,
            entry['token'],
            entry['tag'],
            entry['frequency'],
            scores['neg'],
            scores['neu'],
            scores['pos'],
            scores['compound'],
        ])


In [23]:
# One row per distinct stemmed token.
# Each item is the stem string itself, so the whole string is written;
# `item[0]` would write only its first character.
# Rows are sorted so the file is identical from run to run
# (tokens_stemmed_unique is derived from a set, whose order is arbitrary), and
# UTF-8 is explicit so the output does not depend on the platform's default encoding.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedUnique.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Token"])
    for item in sorted(tokens_stemmed_unique):
        writer.writerow([item])

In [24]:
# One row per stemmed token that occurs exactly once (hapax).
# Each item is the stem string itself, so the whole string is written;
# `item[0]` would write only its first character.
# Rows are sorted so the file is identical from run to run
# (tokens_stemmed_hapaxes is a set, whose order is arbitrary), and UTF-8 is
# explicit so the output does not depend on the platform's default encoding.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedHapaxes.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Token"])
    for item in sorted(tokens_stemmed_hapaxes):
        writer.writerow([item])

In [25]:
# One CSV row per sentence: its four polarity scores followed by the sentence text.
# The rows carry raw sentence text, so UTF-8 is requested explicitly; with the
# platform default encoding, characters it cannot represent would make the
# write fail.
SENTENCE_COLUMNS = ['neg', 'neu', 'pos', 'compound', 'sentence']
with open(os.path.join(os.getcwd(), '..', 'landigest/input/sentencesSentiment.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(SENTENCE_COLUMNS)
    for item in sentences_sentiment:
        writer.writerow([item[column] for column in SENTENCE_COLUMNS])