In [None]:
# https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
from pathlib import Path
import pandas as pd
import numpy as np

In [None]:
import os
from os import listdir

In [None]:
DATA = "/kaggle/input"

In [None]:
# explore the input data
# for dirname, _, filenames in os.walk(DATA):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [None]:
train_directory = os.path.join(DATA,'feedback-prize-2021','train')

In [None]:
test_directory = os.path.join(DATA,'feedback-prize-2021','test')

In [None]:
glove_directory = os.path.join(DATA,'glovedata')

In [None]:
root_directory = os.path.join(DATA,'feedback-prize-2021')

In [None]:
# explore the word2vec datasets
for dirname, _, filenames in os.walk(glove_directory):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
train_filename = "train.csv"

In [None]:
train_filepath = os.path.join(root_directory,train_filename)

In [None]:
essay_files = listdir(train_directory)

In [None]:
len(essay_files)

In [None]:
# to bring in the training set spreadsheet (.csv)

In [None]:
train_df = pd.read_csv(train_filepath)

In [None]:
train_df.head()

In [None]:
print(len(train_df))

In [None]:
def retrieveWordage(essay_id,directory):
  """Load one essay file and return its text lines as a one-column DataFrame.

  The file ``<directory>/<essay_id>.txt`` is read in full and split on
  newline characters. Lines of at most one character are discarded, except
  that whatever follows the final newline is always kept as the last row,
  even when it is empty (i.e. when the file ends with a newline).

  Args:
    essay_id: File name without the ``.txt`` extension.
    directory: Folder that contains the essay file.

  Returns:
    A pandas DataFrame with a single ``narrative`` column, one row per kept
    line; it always has at least one row.
  """
  source_path = os.path.join(directory, essay_id + ".txt")
  with open(source_path) as handle:
    segments = handle.read().split('\n')
  # Segments terminated by a newline are filtered by length; the tail segment
  # (after the last newline) is appended unconditionally.
  kept_lines = [segment for segment in segments[:-1] if len(segment) > 1]
  kept_lines.append(segments[-1])
  return pd.DataFrame(kept_lines, columns=['narrative'])

In [None]:
wordage_test = retrieveWordage("0027FC00C35B",train_directory)
print(wordage_test)

In [None]:
def cleanText2(item):
  """Reduce text to lowercase ASCII letters, digits and spaces.

  Every character other than ``' '``, ``a-z``, ``A-Z`` and ``0-9`` is removed
  (nothing is substituted for it, so text on either side of a removed
  character is joined); ``A-Z`` is converted to lowercase.

  Args:
    item: The text to clean.

  Returns:
    The cleaned string.
  """
  def _is_kept(symbol):
    return (symbol == ' ' or 'a' <= symbol <= 'z'
            or 'A' <= symbol <= 'Z' or '0' <= symbol <= '9')

  return "".join(
      symbol.lower() if 'A' <= symbol <= 'Z' else symbol
      for symbol in item
      if _is_kept(symbol)
  )

In [None]:
def retrieveEssay(essay_id,directory):
  """Return one essay as a single cleaned, space-separated string.

  The essay's lines come from ``retrieveWordage``; each line is passed
  through ``cleanText2`` and stripped of surrounding spaces, and the
  non-empty results are joined with a single space.

  Args:
    essay_id: File name of the essay without the ``.txt`` extension.
    directory: Folder that contains the essay file.

  Returns:
    The cleaned essay text, or ``None`` if ``retrieveWordage`` returned
    ``None``.
  """
  essay_text = retrieveWordage(essay_id,directory)
  if essay_text is None:
    return None
  # Use every row: the final row is real text whenever the file does not end
  # with a newline, and when it is empty the filter below discards it.
  cleaned_lines = (cleanText2(line).strip() for line in essay_text.iloc[:, 0])
  # Joining with a space keeps the last word of one line from fusing with the
  # first word of the next, which would shift every later word index.
  return " ".join(line for line in cleaned_lines if line)

In [None]:
def retrieveRawEssay(essay_id,directory):
  """Return the essay's kept lines concatenated without any separator.

  The lines come from ``retrieveWordage``, which removes newline characters
  and lines of at most one character, so character positions in the returned
  string do not match positions in the file on disk.

  Args:
    essay_id: File name of the essay without the ``.txt`` extension.
    directory: Folder that contains the essay file.

  Returns:
    The concatenated text, or ``None`` if ``retrieveWordage`` returned
    ``None``.
  """
  essay_text = retrieveWordage(essay_id,directory)
  if essay_text is None:
      return None
  # Include the final row as well: it holds real text when the file has no
  # trailing newline, and contributes nothing when it is empty.
  return "".join(essay_text.iloc[:, 0])

In [None]:
essay_test = retrieveEssay("423A1CA112E2",train_directory)
print(essay_test)

In [None]:
from tensorflow.data import Dataset

In [None]:
from tensorflow.data import TextLineDataset

In [None]:
def retrieveDataset(essay_id,directory):
  """Build a ``tf.data`` dataset with one cleaned string per essay line.

  Args:
    essay_id: File name of the essay without the ``.txt`` extension.
    directory: Folder that contains the essay file.

  Returns:
    ``Dataset.from_tensor_slices`` over the cleaned lines, or ``None`` if
    ``retrieveWordage`` returned ``None``.
  """
  essay_text = retrieveWordage(essay_id,directory)
  if essay_text is None:
    return None
  lines = list(essay_text.iloc[:, 0])
  # retrieveWordage always appends whatever follows the last newline; that
  # row is empty when the file ends with a newline and real text otherwise,
  # so only an empty final row is discarded.
  if lines and lines[-1] == "":
    lines.pop()
  essay_array = [cleanText2(line) for line in lines]
  return Dataset.from_tensor_slices(essay_array)

In [None]:
test_dataset = retrieveDataset("423A1CA112E2",train_directory)
print(test_dataset)
for element in test_dataset:
    print(element)

In [None]:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Create the layer.
vectorize_layer = TextVectorization(
 max_tokens=5000,
 output_mode='int',
 output_sequence_length=500)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow import string as tf_string

In [None]:
def retrieveDiscourse(essay_id,discourse_id,discourse_start,discourse_end,directory):
  """Return the cleaned text of one discourse span of an essay.

  The span is cut from the essay file exactly as read from disk, so that
  ``discourse_start`` / ``discourse_end`` are applied to the same character
  positions they were recorded against. (Slicing a copy of the essay with
  newlines and short lines removed makes every span drift by the number of
  characters removed before it.)

  Args:
    essay_id: File name of the essay without the ``.txt`` extension.
    discourse_id: Identifier of the discourse; accepted for interface
      compatibility and not used.
    discourse_start: Character position where the span begins (int or an
      integral float).
    discourse_end: Character position where the span ends.
      NOTE(review): treated as exclusive -- confirm against the
      ``discourse_text`` column of train.csv.
    directory: Folder that contains the essay file.

  Returns:
    The span passed through ``cleanText2``.
  """
  essay_filepath = os.path.join(directory, essay_id + ".txt")
  with open(essay_filepath) as file:
    essay_text = file.read()
  # Clamp so a negative start or an end before the start cannot wrap around
  # through Python's negative-index slicing.
  span_start = max(0, int(discourse_start))
  span_end = max(span_start, int(discourse_end))
  return cleanText2(essay_text[span_start:span_end])

In [None]:
test_discourse = retrieveDiscourse("423A1CA112E2",0,8,229,train_directory)
print(test_discourse)
print(len(test_discourse))

In [None]:
# https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/pretrained_word_embeddings.ipynb#scrollTo=Q6j-LRMFc-AR

In [None]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant
from tensorflow.keras import layers
from tensorflow.keras import Input
from tensorflow.keras import Model

In [None]:
# Group the discourse rows by essay. The key is passed as a scalar rather than
# a one-element list: with a list, pandas >= 2.0 yields 1-tuples such as
# ('423A1CA112E2',) when the groups are iterated, and the loops below need the
# plain id string to build file names.
essays_group = train_df.groupby('id')

In [None]:
# Build the training set: one cleaned text sample and one integer label per
# discourse row. Labels are numbered in order of first appearance and recorded
# in class_dict (lowercased discourse type -> index).
class_dict = {}
samples = []
labels = []
class_index = 0
for essay_id, essay_pd in essays_group:
  # Grouping by a one-element list makes newer pandas yield 1-tuples as keys.
  if isinstance(essay_id, tuple):
    essay_id = essay_id[0]
  # The essay is not loaded here: retrieveDiscourse reads what it needs, so a
  # separate full read and clean per essay would only add I/O.
  for i in range(len(essay_pd)):
    item = essay_pd.iloc[i]
    discourse_type = item['discourse_type'].lower()
    if discourse_type not in class_dict:
      class_dict[discourse_type] = class_index
      class_index += 1
    class_name_index = class_dict[discourse_type]
    discourse = retrieveDiscourse(essay_id,item['discourse_id'],item['discourse_start'],item['discourse_end'],train_directory)
    samples.append(discourse)
    labels.append(class_name_index)

In [None]:
# Text-only dataset used to fit the vectorizer's vocabulary: every training
# sample except the empty ones.
essay_array = [sample_text for sample_text in samples if sample_text != ""]
vocab = Dataset.from_tensor_slices(essay_array)

In [None]:
# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
vectorize_layer.adapt(vocab)

In [None]:
voc = vectorize_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [None]:
print(len(word_index))

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

In [None]:
path_to_glove_file = os.path.join(glove_directory,"glove.6B.100d.txt")

# Map each token in the GloVe file to its coefficient vector. Every line is
# "<token> <coef> <coef> ..."; only the first whitespace run is split off so
# the remainder can be parsed in one call.
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        # "f" is NumPy's type code for 32-bit floats; values are space-separated.
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Prepare embedding matrix: row i holds the GloVe vector of the vocabulary
# entry with index i.
# The counters are reset here so that re-running this cell does not add to the
# totals left over from a previous run.
hits = 0
misses = 0
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

In [None]:
class_names = []
for key in class_dict:
  print(key)
  class_names.append(key)

In [None]:
print(class_dict)
print(class_names)

In [None]:
# Extract a training & validation split: the last `validation_split` share of
# the samples is held out.
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice from an explicit positive index. With a negative count of 0,
# samples[:-0] is empty and samples[-0:] is everything, which would swap the
# two sets whenever there are fewer than 1 / validation_split samples.
split_index = len(samples) - num_validation_samples
train_samples = samples[:split_index]
test_samples = samples[split_index:]

In [None]:
print(len(train_samples))
print(len(test_samples))

In [None]:
train_labels = labels[0:len(train_samples)]
test_labels = labels[len(train_samples):(len(train_samples)+len(test_samples))]

In [None]:
# The class names should be the categories we are looking for.

In [None]:
# Functional-API classifier: variable-length sequences of int64 token ids are
# mapped through the non-trainable embedding_layer, then three Conv1D stages
# (the first two followed by max pooling, the last by global max pooling),
# then a dense layer with dropout and a softmax output.
int_sequences_input = Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# One output unit per entry in class_names.
preds = layers.Dense(len(class_names), activation="softmax")(x)
model = Model(int_sequences_input, preds)
model.summary()

In [None]:
x_train = vectorize_layer(np.array([[s] for s in train_samples])).numpy()
x_test = vectorize_layer(np.array([[s] for s in test_samples])).numpy()

In [None]:
y_train = np.array([[s] for s in train_labels])
y_test = np.array([[s] for s in test_labels])

In [None]:
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

In [None]:
#model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_test, y_test))
model.fit(x_train, y_train, batch_size=128, epochs=20)
#model.fit(x_train,y_train)

In [None]:
# Kaggle has a read-only file system so we might have to find another way to persist!
# path_to_model = os.path.join(DATA,'feedback-prize-2021','model')
# model.save(path_to_model)

In [None]:
y_result = model.predict(x_test)

In [None]:
print(len(y_result))

In [None]:
yt0 = y_test[1]

In [None]:
y_rounded = np.round(y_result,1)

In [None]:
y0 = y_rounded[1]

In [None]:
print(y0)

In [None]:
print(yt0)

In [None]:
print(len(y_rounded))
print(len(y_test))

In [None]:
for i in range(0,10):
    y_out = y_rounded[i]
    yt = y_test[i]
    print(y_out)
    print(yt)

In [None]:
# Create Statistics about Where Discourses are in Essays
# To do this we first create a pandas dataframe 'essay_statistics', per essay it lists
# essay_id,essay_length (in characters),discourse_count
#
# class_dict / samples / labels are deliberately left untouched here: they
# belong to the training-set cell, and resetting them in this cell would leave
# class_dict empty for anything run afterwards.
stats = []
for essay_id, essay_pd in essays_group:
  # Grouping by a one-element list makes newer pandas yield 1-tuples as keys.
  if isinstance(essay_id, tuple):
    essay_id = essay_id[0]
  essay = retrieveEssay(essay_id,train_directory)
  if essay is None:
    # Skip just this essay instead of abandoning all the remaining ones.
    continue
  stats.append([essay_id, len(essay), len(essay_pd)])
essay_statistics = pd.DataFrame(stats)

In [None]:
essay_statistics.head()

In [None]:
essay_statistics.columns = ['essay_id','essay_length','no_of_discourses']

In [None]:
essay_statistics.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
x = essay_statistics['essay_length']
y = essay_statistics['no_of_discourses']

In [None]:
plt.plot(x,y)

In [None]:
# https://realpython.com/numpy-scipy-pandas-correlation-python/#visualization-of-correlation

In [None]:
x.corr(y)

In [None]:
y.corr(x)

In [None]:
from scipy import stats as sci_stats

In [None]:
#https://realpython.com/numpy-scipy-pandas-correlation-python/#visualization-of-correlation
slope, intercept, r, p, stderr = sci_stats.linregress(x, y)

In [None]:
print(slope)
print(intercept)
print(r)
print(p)
print(stderr)

In [None]:
def returnExpectedDiscourses(essay_length):
    """Estimate the number of discourses for an essay of the given length.

    Evaluates the straight line described by the module-level ``intercept``
    and ``slope`` at ``essay_length``.

    Args:
        essay_length: Essay length (the callers pass a character count).

    Returns:
        ``intercept + essay_length * slope``; not rounded.
    """
    return intercept + essay_length * slope

In [None]:
discourse_num_err = np.round(r,0)

In [None]:
print(discourse_num_err)

In [None]:
print(returnExpectedDiscourses(1000))

In [None]:
sample_x = []
sample_y = []
for i in range(0,8000,500):
    yy = returnExpectedDiscourses(i)
    sample_x.append(i)
    sample_y.append(yy)

In [None]:
plt.plot(sample_x,sample_y)

In [None]:
essay_test = retrieveEssay("423A1CA112E2",train_directory)
print(essay_test)

In [None]:
length_of_essay = len(essay_test)
expected_discourses = returnExpectedDiscourses(length_of_essay)
print(length_of_essay)
print(expected_discourses)

In [None]:
expected_discourses = 13

In [None]:
# Start out with all discourses of even length
test_discourse = []
length_of_discourses = round(length_of_essay/expected_discourses,0)
print(length_of_discourses)

In [None]:
text_start = 0
discourse_element = []
test_discourse = []
for i in range(1,int(expected_discourses)):
    #print(i)
    text_end = text_start + length_of_discourses
    discourse_element.append(text_start)
    discourse_element.append(text_end)
    text_start = text_end + 1
    test_discourse.append(discourse_element)
    discourse_element = []
text_start = text_end + 1
text_end = length_of_essay
discourse_element.append(text_start)
discourse_element.append(text_end)
test_discourse.append(discourse_element)

In [None]:
# Turns a character range of an essay into a string of word numbers.
def returnPredictionStringFromCharacterOffsets(essay_text,start_offset,end_offset):
    """Return word numbers associated with a character range, as text.

    The text is scanned one character at a time. A word counter starts at 1
    and goes up by one at every space. Recording is switched on after a
    character whose position lies in ``[start_offset, end_offset]`` and off
    after a character beyond ``end_offset``, so it takes effect from the next
    character onwards. While recording, the current word number is emitted
    once for each space seen since the previous emission (a space seen before
    recording began also counts).

    Args:
        essay_text: The text to scan.
        start_offset: First character position of the range.
        end_offset: Last character position of the range (inclusive).

    Returns:
        The emitted word numbers, each preceded by a single space (so a
        non-empty result starts with a space); ``""`` if nothing was emitted.
    """
    emitted = []
    current_word = 1
    recording = False
    space_pending = False
    for position, symbol in enumerate(essay_text):
        if symbol == ' ':
            current_word += 1
            space_pending = True
        if recording and space_pending:
            emitted.append(' ' + str(current_word))
            space_pending = False
        # The switch is updated last, so it only affects the next character.
        if start_offset <= position <= end_offset:
            recording = True
        elif position > end_offset:
            recording = False
    return "".join(emitted)

In [None]:
essay_test = retrieveEssay("423A1CA112E2",train_directory)
print(essay_test)

In [None]:
print(len(essay_test.split()))

In [None]:
wordage_test = retrieveWordage("423A1CA112E2",train_directory)

In [None]:
essay_count = len(wordage_test)

In [None]:
total = 0
essay = ""
for i in range(1,essay_count-1):
    words = wordage_test.iloc[i,0]
    len_words = len(words.split(' '))
    print(len_words)
    total+=len_words
    essay += words
    essay += " "
print(total)
print(len(essay.split()))

In [None]:
print(len(essay.split(' ')))

In [None]:
# Iterate over the column's values: looping over the DataFrame itself yields
# its column labels, which would print only the word "narrative".
for item in wordage_test['narrative']:
    print(item)

In [None]:
prediction_string_test = returnPredictionStringFromCharacterOffsets(essay_test,25,50)
print(prediction_string_test)

In [None]:
def returnPredictionStringsGivenExpectedNumberOfDiscourses(essay_text,number_of_discourses):
    """Split an essay's word numbers into contiguous, near-equal segments.

    Words are obtained with ``essay_text.split(' ')`` and numbered from 1.
    The numbers ``1..word_count`` are divided into ``number_of_discourses``
    consecutive segments with no gaps, no overlap and no number beyond the
    last word; when the words do not divide evenly, the leading segments are
    one word longer.

    Args:
        essay_text: The essay as a single string.
        number_of_discourses: How many segments to produce (an int).

    Returns:
        A list of ``number_of_discourses`` strings, each holding
        space-separated word numbers. Segments are empty strings when there
        are more segments than words. An empty list is returned when
        ``number_of_discourses`` is zero or negative.
    """
    if number_of_discourses <= 0:
        return []
    essay_word_count = len(essay_text.split(' '))
    base_length, remainder = divmod(essay_word_count, number_of_discourses)
    prediction_strings = []
    word_start = 1
    for segment in range(number_of_discourses):
        # The first `remainder` segments absorb the words left over by the
        # integer division.
        segment_length = base_length + (1 if segment < remainder else 0)
        word_end = word_start + segment_length
        prediction_strings.append(" ".join(str(j) for j in range(word_start, word_end)))
        # range() excludes word_end, so the next segment begins exactly there.
        word_start = word_end
    return prediction_strings

In [None]:
prediction_strings = returnPredictionStringsGivenExpectedNumberOfDiscourses(essay_test,10)
#print(prediction_strings)
#print(len(prediction_strings))
for prediction_string in prediction_strings:
    #print(prediction_string)
    print(len(prediction_string.split(' ')))

In [None]:
print(prediction_strings)

In [None]:
# Given a prediction string and a score, how do we know if the string can be improved?
# Do we remove a word and add another?
# The final test has to adjust the discourses appropriately to create the correct answer.
# Should we use the example prediction strings to somehow infer whether we are selecting the correct words?
# Haven't we already used the discourses to build the model?
#
# Certainly we could feed the proper discourses back into our model using the prediction string
# and see if we achieve the highest score that way.
#
# Using the training set, can we create a set of adjustments that increase the score by adjusting the
# prediction strings from the equal set, where we have chopped the essay into the appropriate number of discourses,
# to the prediction strings indicated by the high score?

In [None]:
def returnDiscoursesFromPredictionStrings(essay_id,directory,prediction_strings,index_base=1):
    """Rebuild discourse texts from strings of word numbers.

    The essay is loaded with ``retrieveEssay`` and split on single spaces.
    Each prediction string is a whitespace-separated list of word numbers;
    every number is replaced by the corresponding essay word.

    Args:
        essay_id: File name of the essay without the ``.txt`` extension.
        directory: Folder that contains the essay file.
        prediction_strings: Iterable of word-number strings.
        index_base: Number that refers to the first word of the essay. The
            default of 1 matches the numbering produced by
            ``returnPredictionStringsGivenExpectedNumberOfDiscourses``; pass 0
            for strings that count from zero.

    Returns:
        A list with one string per prediction string, in which every word is
        followed by a single space, or ``None`` if the essay could not be
        retrieved. A number that does not refer to a word of the essay
        contributes an empty word.

    Raises:
        ValueError: If a token in a prediction string is not an integer.
    """
    essay = retrieveEssay(essay_id,directory)
    if essay is None:
        return None
    essay_words = essay.split(' ')
    word_total = len(essay_words)
    discourses = []
    for prediction_string in prediction_strings:
        pieces = []
        for token in prediction_string.split():
            position = int(token) - index_base
            # Explicit bounds check: a negative position must not wrap around
            # to the end of the essay, as Python list indexing would do.
            if 0 <= position < word_total:
                pieces.append(essay_words[position])
            else:
                pieces.append("")
        discourses.append("".join(piece + " " for piece in pieces))
    return discourses

In [None]:
discourses_test = returnDiscoursesFromPredictionStrings("423A1CA112E2",train_directory,prediction_strings)
print(len(discourses_test))
for discourse in discourses_test:
    print(discourse)
    print(" ")

In [None]:
def returnPredictions(essay_id,directory,prediction_strings):
    """Run the classifier on the discourses selected by the prediction strings.

    Args:
        essay_id: File name of the essay without the ``.txt`` extension.
        directory: Folder that contains the essay file.
        prediction_strings: Word-number strings, one per discourse.

    Returns:
        Whatever ``model.predict`` returns for the vectorized discourses
        (one row of class scores per discourse).
    """
    segment_texts = returnDiscoursesFromPredictionStrings(essay_id,directory,prediction_strings)
    # Wrap each text in its own row so the vectorizer receives one string per row.
    text_column = np.array([[text] for text in segment_texts])
    token_ids = vectorize_layer(text_column).numpy()
    return model.predict(token_ids)

In [None]:
y_result = returnPredictions("423A1CA112E2",train_directory,prediction_strings)

In [None]:
# We wish to have a good, non-arbitrary way to decide if a given 'fit' is good
# enough, given the results of model.predict for the discourses selected by a
# set of prediction strings. The statistic used is the mean confidence of the
# winning class.
def returnScore(y_result):
    """Return the mean, over rows, of each row's largest value.

    Args:
        y_result: Iterable of rows of scores, e.g. the output of
            ``model.predict`` (one row per discourse, one score per class).

    Returns:
        The average of the per-row maxima, where a row's maximum is never
        taken to be below 0 (an empty row counts as 0). For rows of softmax
        probabilities the result lies between 0 and 1. Returns ``0.0`` when
        ``y_result`` contains no rows, rather than dividing by zero.
    """
    cumulative_score = 0
    number_of_items = 0
    for item in y_result:
        cumulative_score += max(0, max(item, default=0))
        number_of_items += 1
    if number_of_items == 0:
        return 0.0
    return cumulative_score / number_of_items

In [None]:
y_score = returnScore(y_result)
print(y_score)

In [None]:
def returnTrainingStrings(essay_id,directory):
    """Return the prediction strings recorded in ``train_df`` for one essay.

    Args:
        essay_id: Value of the ``id`` column to select.
        directory: Not used.

    Returns:
        A list with one entry per matching row of the module-level
        ``train_df``: the row's ``predictionstring`` value with leading
        whitespace removed.
    """
    essay_rows = train_df[train_df['id']==essay_id]
    return ["".join(raw_string).lstrip() for raw_string in essay_rows['predictionstring']]

In [None]:
training_strings = returnTrainingStrings("423A1CA112E2",train_directory)
print(training_strings)

In [None]:
print(len(prediction_strings))
print(len(training_strings))
print(prediction_strings[5])
print(training_strings[5])

In [None]:
y_training = returnPredictions("423A1CA112E2",train_directory,training_strings)

In [None]:
y_score_training = returnScore(y_training)
print(y_score_training)

In [None]:
def returnPredictionsFromResult(essay_id,result,strings):
    """Turn per-discourse class scores into labelled prediction rows.

    For every row of ``result`` the class with the highest score is looked up
    in the module-level ``class_names`` (the first class is used when no
    score is above 0) and paired with the prediction string at the same
    position in ``strings``.

    Args:
        essay_id: Identifier stored in every returned row.
        result: Iterable of score rows, one per discourse.
        strings: Sequence of prediction strings, aligned with ``result``.

    Returns:
        A list of ``[essay_id, class_name, prediction_string]`` lists. The
        class name is title-cased so that a multi-word name that was
        lowercased when ``class_names`` was built gets a capital on every
        word. NOTE(review): assumes the original labels are title-cased
        (e.g. "Concluding Statement") -- confirm against train.csv. A missing
        class name becomes "Na" and a missing string "NA".
    """
    string_results = []
    for i, discourse_result in enumerate(result):
        best_index = 0
        best_score = 0
        for j, score in enumerate(discourse_result):
            if score > best_score:
                best_score = score
                best_index = j
        # Only a failed lookup is tolerated; any other error should surface.
        try:
            class_name = class_names[best_index]
        except LookupError:
            class_name = "NA"
        try:
            stringlist = strings[i]
        except (LookupError, TypeError):
            stringlist = "NA"
        string_results.append([essay_id, class_name.title(), stringlist])
    return string_results

In [None]:
returnPredictionsFromResult("423A1CA112E2",y_result,prediction_strings)

In [None]:
returnPredictionsFromResult("423A1CA112E2",y_training,training_strings)

In [None]:
# We are ready to run the test!

In [None]:
# Classify every essay found in a folder and collect the predictions.
def processDirectory(directory):
    """Predict discourse classes for every ``.txt`` essay directly in a folder.

    For each essay the expected number of discourses is estimated from its
    length, the essay is cut into that many equal word segments, each segment
    is classified, and one row per segment is collected.

    Args:
        directory: Folder that contains the essay ``.txt`` files. Files in
            sub-folders and files with other extensions are ignored.

    Returns:
        A pandas DataFrame with the columns ``id``, ``class`` and
        ``predictionstring``; it has no rows when the folder is missing or
        holds no essays.
        NOTE(review): the word numbers in ``predictionstring`` count from 1,
        as produced by returnPredictionStringsGivenExpectedNumberOfDiscourses
        -- confirm the base expected by the consumer of the CSV.
    """
    essay_results = []
    # Only the top level of the tree is used; the default covers a folder that
    # does not exist, for which os.walk yields nothing.
    _, _, filenames = next(os.walk(directory), (directory, [], []))
    for filename in sorted(filenames):
        essay_id, extension = os.path.splitext(filename)
        if extension != ".txt":
            continue
        # Read from the folder that was passed in, not a module-level path.
        essay_text = retrieveEssay(essay_id,directory)
        essay_length = len(essay_text)
        # At least one segment, even if the fitted line gives a value <= 0.
        number_of_discourses = max(1, int(round(returnExpectedDiscourses(essay_length))+1))
        X_strings = returnPredictionStringsGivenExpectedNumberOfDiscourses(essay_text,number_of_discourses)
        y = returnPredictions(essay_id,directory,X_strings)
        # Pair each prediction with the strings computed for THIS essay.
        for result in returnPredictionsFromResult(essay_id,y,X_strings):
            if len(result) == 3:
                essay_results.append(result)
    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(essay_results, columns=['id','class','predictionstring'])

In [None]:
predictions = processDirectory(test_directory)

In [None]:
print(predictions)

In [None]:
predictions.to_csv('submission.csv',index=False)