In [1]:
import nltk
import os
import csv
import string
import json

import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer

# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Load the whole input document into memory as one string.
# The encoding is given explicitly because open() otherwise falls back to the
# platform's locale encoding, which makes the decoded text machine-dependent.
# NOTE(review): assumes input/chatgpt.txt is stored as UTF-8 — confirm.
with open('input/chatgpt.txt', 'r', encoding='utf-8') as source_file:
    text = source_file.read()
    
# os.path.join(os.getcwd(), '..', 'input/chatgpt.txt')

In [3]:
# Tokenise the raw text with NLTK's word_tokenize; the resulting list still
# contains punctuation tokens, which a later cell filters out.
tokens = word_tokenize(text)

In [4]:
# Create the sentiment analyser once; it is reused for the sentence-level and
# token-level scores computed in later cells.
sia = SentimentIntensityAnalyzer()
# Score the entire document in one call. The returned mapping's 'neg', 'neu',
# 'pos' and 'compound' entries are copied into txtSpecs further down.
overall_sentiment = sia.polarity_scores(text)

In [5]:
# Split the document into sentences and score each one individually.
sentences = nltk.sent_tokenize(text)

# One record per sentence: the analyser's scores followed by the sentence text
# itself under the 'sentence' key.
sentences_sentiment = [
    {**sia.polarity_scores(sentence_text), 'sentence': sentence_text}
    for sentence_text in sentences
]

In [6]:
# Build the English stop-word set.
# The corpus reader is reached through the `nltk` package instead of through
# the imported name `stopwords`, because this very assignment rebinds
# `stopwords` to a plain set: with the original `stopwords.words(...)` call the
# cell raised AttributeError as soon as it was executed a second time.
# The name `stopwords` is kept for the set because later cells test membership
# against it.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Porter stemmer shared by the stemming cell below.
stemmer = PorterStemmer()

In [7]:
def find_index(searchToken, structure):
    """Return the key of the first record whose 'token' equals ``searchToken``.

    Parameters
    ----------
    searchToken
        Value to look for (normally a token string).
    structure
        Mapping of key -> record, where every record exposes a 'token' entry.
        Keys are visited in the mapping's iteration order.

    Returns
    -------
    The key of the first matching record, or -1 when no record matches.
    A diagnostic line is printed in the not-found case.
    """
    for key in structure:
        if searchToken == structure[key]['token']:
            return key
    # An f-string formats any value; the previous '+' concatenation raised
    # TypeError for non-str tokens instead of reporting the miss.
    print(f'ERROR! searchToken: {searchToken}')
    return -1

In [8]:
def find_tag(searchToken, structure):
    """Look up the tag paired with ``searchToken`` in a tagged sequence.

    ``structure`` is an iterable of indexable entries whose element 0 is the
    token and element 1 is its tag. The tag of the first entry whose token
    equals ``searchToken`` is returned; the string 'error' is returned when no
    entry matches.
    """
    matching_tags = (entry[1] for entry in structure if searchToken == entry[0])
    return next(matching_tags, 'error')

In [29]:
# Normalise the raw tokens: lowercase them and drop fragments that start with
# an apostrophe as well as tokens found in string.punctuation.
# NOTE(review): `token not in string.punctuation` is a substring test against
# the punctuation string, so multi-character punctuation such as '...' is kept
# — confirm this is intended.
tokens_clean = [
    token.lower()
    for token in tokens
    if not token.startswith("'") and token not in string.punctuation
]
fdistClean = FreqDist(tokens_clean)
tokens_unique = list(set(tokens_clean))
tokens_hapaxes = set(fdistClean.hapaxes())
tokens_tags = nltk.pos_tag(tokens_clean)

# Lookup tables built in a single pass each. They replace a linear scan per
# token (quadratic overall) and give the same answers: the tag and the
# position of the FIRST occurrence of every distinct token.
tokens_first_tag = {}
for word, tag in tokens_tags:
    tokens_first_tag.setdefault(word, tag)

tokens_first_index = {}
for position, token in enumerate(tokens_clean):
    tokens_first_index.setdefault(token, position)

# One record per token occurrence, keyed by its position in tokens_clean.
# Every occurrence carries the tag of the token's first occurrence
# ('error' if the tagger output has no entry for it).
tokens_final = {
    position: {
        'token': token,
        'count': fdistClean[token],
        'tag': tokens_first_tag.get(token, 'error'),
        'frequency': fdistClean.freq(token),
        'sentiment': sia.polarity_scores(token),
    }
    for position, token in enumerate(tokens_clean)
}

# One record per distinct token, keyed by the position of its first occurrence.
# The record is a copy of the one already computed for that position, so the
# sentiment scores are not recomputed.
tokens_unique_final = {
    tokens_first_index[token]: dict(tokens_final[tokens_first_index[token]])
    for token in tokens_unique
}
print(tokens_unique_final)

# Same layout, restricted to tokens that occur exactly once.
tokens_hapaxes_final = {
    tokens_first_index[token]: dict(tokens_final[tokens_first_index[token]])
    for token in tokens_hapaxes
}

# Distinct tokens ordered by first appearance in the text.
tokens_unique_final_sorted = dict(sorted(tokens_unique_final.items()))


{1190: {'token': 'user', 'count': 1, 'tag': 'NN', 'frequency': 0.000683526999316473, 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 831: {'token': 'fractioned', 'count': 2, 'tag': 'VBN', 'frequency': 0.001367053998632946, 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 266: {'token': 'advisable', 'count': 1, 'tag': 'JJ', 'frequency': 0.000683526999316473, 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 871: {'token': 'ledger', 'count': 1, 'tag': 'NN', 'frequency': 0.000683526999316473, 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 702: {'token': 'specialize', 'count': 1, 'tag': 'VBP', 'frequency': 0.000683526999316473, 'sentiment': {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}}, 25: {'token': 'energy', 'count': 9, 'tag': 'NN', 'frequency': 0.006151742993848257, 'sentiment': {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.2732}}, 853: {'token': 'its', 'count': 2, 'tag': 'PRP$', 'frequency':

In [10]:
# Stem every cleaned token that is not in the `stopwords` set.
tokens_stemmed = [stemmer.stem(token) for token in tokens_clean if token.lower() not in stopwords]
fdistStemmed = FreqDist(tokens_stemmed)
tokens_stemmed_unique = list(set(tokens_stemmed))
tokens_stemmed_hapaxes = set(fdistStemmed.hapaxes())
tokens_stemmed_tags = nltk.pos_tag(tokens_stemmed)

# Lookup tables built in a single pass each. They replace a linear scan per
# stem (quadratic overall) and give the same answers: the tag and the position
# of the FIRST occurrence of every distinct stem.
stemmed_first_tag = {}
for word, tag in tokens_stemmed_tags:
    stemmed_first_tag.setdefault(word, tag)

stemmed_first_index = {}
for position, token in enumerate(tokens_stemmed):
    stemmed_first_index.setdefault(token, position)

# One record per stem occurrence, keyed by its position in tokens_stemmed.
# Every occurrence carries the tag of the stem's first occurrence
# ('error' if the tagger output has no entry for it).
tokens_stemmed_final = {
    position: {
        'token': token,
        'count': fdistStemmed[token],
        'tag': stemmed_first_tag.get(token, 'error'),
        'frequency': fdistStemmed.freq(token),
        'sentiment': sia.polarity_scores(token),
    }
    for position, token in enumerate(tokens_stemmed)
}

# One record per distinct stem, keyed by the position of its first occurrence.
# The record is a copy of the one already computed for that position, so the
# sentiment scores are not recomputed.
tokens_stemmed_unique_final = {
    stemmed_first_index[token]: dict(tokens_stemmed_final[stemmed_first_index[token]])
    for token in tokens_stemmed_unique
}

# Same layout, restricted to stems that occur exactly once.
tokens_stemmed_hapaxes_final = {
    stemmed_first_index[token]: dict(tokens_stemmed_final[stemmed_first_index[token]])
    for token in tokens_stemmed_hapaxes
}

# Distinct stems ordered by first appearance in the text.
tokens_stemmed_unique_final_sorted = dict(sorted(tokens_stemmed_unique_final.items()))


In [11]:
# Document-level summary: sizes of every token collection built above, the
# sentence count, and the four whole-text sentiment scores.
txtSpecs = dict(
    totalTokens=len(tokens),
    totalTokensClean=len(tokens_clean),
    totalTokensUnique=len(tokens_unique),
    totalTokensHapaxes=len(tokens_hapaxes),
    totalFilteredStemmed=len(tokens_stemmed),
    totalFilteredStemmedUnique=len(tokens_stemmed_unique),
    totalFilteredStemmedHapaxes=len(tokens_stemmed_hapaxes),
    totalSentences=len(sentences),
    sentiment={
        score_name: overall_sentiment[score_name]
        for score_name in ('neg', 'neu', 'pos', 'compound')
    },
)

In [12]:
# THINGS TO SAVE:
# txtSpecs
# tokens_final
# tokens_unique_final
# tokens_unique_final_sorted
# tokens_hapaxes_final

# tokens_stemmed_final
# tokens_stemmed_unique_final
# tokens_stemmed_unique_final_sorted
# tokens_stemmed_hapaxes_final

# sentences_sentiment
# print(tokens_unique_final)

In [13]:
# Serialise the summary statistics as JSON into the landigest input folder,
# which is resolved relative to the parent of the current working directory.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/txtSpecs.txt'), "w") as specs_file:
    specs_file.write(json.dumps(txtSpecs))

In [14]:
# WRITE CLEAN BATCH #
# Export one CSV row per cleaned token occurrence (tokens_final).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensFinal.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])
        

In [15]:
# Export one CSV row per distinct cleaned token (tokens_unique_final), in the
# dictionary's insertion order.
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensUnique.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_unique_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])
        

In [16]:
# Export one CSV row per distinct cleaned token, ordered by the index key
# (tokens_unique_final_sorted).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensUniqueSorted.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_unique_final_sorted.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])
        

In [17]:
# Export one CSV row per cleaned token that occurs exactly once
# (tokens_hapaxes_final).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensHapaxes.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_hapaxes_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])


In [20]:
# WRITE STEMMED BATCH #
# Export one CSV row per stemmed token occurrence (tokens_stemmed_final).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedFinal.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_stemmed_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])


In [21]:
# Export one CSV row per distinct stem (tokens_stemmed_unique_final), in the
# dictionary's insertion order.
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedUnique.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_stemmed_unique_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])


In [22]:
# Export one CSV row per distinct stem, ordered by the index key
# (tokens_stemmed_unique_final_sorted).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedUniqueSorted.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_stemmed_unique_final_sorted.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])


In [23]:
# Export one CSV row per stem that occurs exactly once
# (tokens_stemmed_hapaxes_final).
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII tokens
# can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/tokensStemmedHapaxes.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['index', 'token', 'count', 'tag', 'frequency', 'neg', 'neu', 'pos', 'compound'])
    for index, record in tokens_stemmed_hapaxes_final.items():
        scores = record['sentiment']
        writer.writerow([
            index, record['token'], record['count'], record['tag'], record['frequency'],
            scores['neg'], scores['neu'], scores['pos'], scores['compound'],
        ])


In [24]:
# Export one CSV row per sentence: its four sentiment scores followed by the
# sentence text.
# UTF-8 is requested explicitly: the default encoding of open() depends on the
# platform locale, which makes the file's bytes (and whether non-ASCII
# sentence text can be written at all) machine-dependent.
with open(os.path.join(os.getcwd(), '..', 'landigest/input/sentencesSentiment.csv'), "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    columns = ['neg', 'neu', 'pos', 'compound', 'sentence']
    writer.writerow(columns)
    for record in sentences_sentiment:
        # The header names double as the record keys, in the same order.
        writer.writerow([record[column] for column in columns])

In [33]:
# Sanity check: number of raw tokens, then number of stemmed-token records,
# one value per line.
for collection_size in (len(tokens), len(tokens_stemmed_final)):
    print(collection_size)


1600
803
