Based on scripts/data_nyt.py 

This will create the data set containing the full corpus, so the topics of the full corpus can be determined

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle
import random
from scipy import sparse
import itertools
from scipy.io import savemat, loadmat
import pandas as pd

In [2]:
# Maximum / minimum document frequency
max_df = 0.75
min_df = 3  # choose desired value for min_df

# Train/Test Proportions
TrProp = 0.95
TsProp = 1.00

# Read stopwords
with open('scripts/stops.txt', 'r') as f:
    stops = f.read().split('\n')


stops.extend(['inspiration', 'challenges', 'accomplishments', 'hackathon', 'eu'])
print(stops)

['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'b', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'both', 'brief', 'but', 'by', 'c', 'came', 'can', 'cannot', 'cant', 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'course', 'currentl

In [3]:
# Read data
print('reading text file...')

# The file with all the project data is in "data/all_data.tsv"
tsvFile = "data/all_data.tsv"
df = pd.read_csv(tsvFile, sep="\t")

df = df.assign(oldIndex = df.index)

print(df.head())
df.info()


reading text file...
  Challenge SubChallenge                                            ProjURL  \
0    Health    Equipment                  https://devpost.com/software/evam   
1    Health    Equipment            https://devpost.com/software/nanomaskcz   
2    Health    Equipment  https://devpost.com/software/ecological-medica...   
3    Health    Equipment  https://devpost.com/software/ecological-medica...   
4    Health    Equipment  https://devpost.com/software/innovative-respir...   

                     title                                               text  \
0                    EVAM   Inspiration\nThere is a huge shortage in the s...   
1               NanomaskCZ  Inspiration\nThe story of Technical University...   
2  Ecological medical coat  Inspiration\nWhat it does\nEconomic medical co...   
3  Ecological medical coat  Inspiration\nThe simplicity and the economical...   
4           Respire Action  Inspiration\n• A recent study shows that over ...   

   oldIndex  
0  

In [4]:
# drop rows with NA text, keep track of old Index Numbers
newDF = df[df['text'].notnull()].reset_index(drop = True)

print(newDF.info())
print(newDF.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 6 columns):
Challenge       2069 non-null object
SubChallenge    2069 non-null object
ProjURL         2069 non-null object
title           2069 non-null object
text            2069 non-null object
oldIndex        2069 non-null int64
dtypes: int64(1), object(5)
memory usage: 97.1+ KB
None
  Challenge SubChallenge                                            ProjURL  \
0    Health    Equipment                  https://devpost.com/software/evam   
1    Health    Equipment            https://devpost.com/software/nanomaskcz   
2    Health    Equipment  https://devpost.com/software/ecological-medica...   
3    Health    Equipment  https://devpost.com/software/ecological-medica...   
4    Health    Equipment  https://devpost.com/software/innovative-respir...   

                     title                                               text  \
0                    EVAM   Inspiration\nThere is a huge shor

In [5]:
# How many rows have been dropped?
len(set(df.oldIndex).difference(set(newDF.oldIndex)))

90

In [6]:
docs = newDF.text.copy()

In [7]:
# split the corpus into sentences before saving the corpus:
from nltk.tokenize import sent_tokenize
from itertools import chain

sent1 = [sent_tokenize(doc) for doc in docs]

# flatten the list:
sent1 = list(chain.from_iterable(sent1))


In [8]:
# Many people forgot to punctuate between sentences, and used EOL instead.
# So lets also split sentences by EOL

import re
import string


def clean_sentence(s):
    newS = ' '.join(word.strip(string.punctuation) for word in s.split())  # get rid of extra punctuation
    yield newS
    
sentences = [clean_sentence(s) for l in sent1 for s in re.split("\n", l)]

# Again, flatten the list
sentences = list(chain.from_iterable(sentences))

In [9]:
# Write the sentences to a .txt file
with open("data/fullcorpus.txt", 'w') as f:
    for v in sentences:
        f.write(v + '\n')

In [10]:
# Create count vectorizer
print('counting document frequency of words...')
cvectorizer = CountVectorizer(min_df=min_df, max_df=max_df, stop_words=None)
cvz = cvectorizer.fit_transform(docs).sign()

counting document frequency of words...


In [11]:
# Get vocabulary
print('building the vocabulary...')
sum_counts = cvz.sum(axis=0)
v_size = sum_counts.shape[1]
sum_counts_np = np.zeros(v_size, dtype=int)
for v in range(v_size):
    sum_counts_np[v] = sum_counts[0,v]
word2id = dict([(w, cvectorizer.vocabulary_.get(w)) for w in cvectorizer.vocabulary_])
id2word = dict([(cvectorizer.vocabulary_.get(w), w) for w in cvectorizer.vocabulary_])
del cvectorizer
print('  initial vocabulary size: {}'.format(v_size))

building the vocabulary...
  initial vocabulary size: 12247


In [12]:
# Sort elements in vocabulary
idx_sort = np.argsort(sum_counts_np)
vocab_aux = [id2word[idx_sort[cc]] for cc in range(v_size)]

# Filter out stopwords (if any)
vocab_aux = [w for w in vocab_aux if w not in stops]
print('  vocabulary size after removing stopwords from list: {}'.format(len(vocab_aux)))

  vocabulary size after removing stopwords from list: 11805


In [13]:
# Create dictionary and inverse dictionary
vocab = vocab_aux
# del vocab_aux
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])


In [14]:
# add a column to newDF, with the text converted into word IDs
docs_full = [[word2id[w] for w in doc.split() if w in word2id] for doc in docs]
newDF = newDF.assign(fullDS = docs_full)
print(newDF.head())

  Challenge SubChallenge                                            ProjURL  \
0    Health    Equipment                  https://devpost.com/software/evam   
1    Health    Equipment            https://devpost.com/software/nanomaskcz   
2    Health    Equipment  https://devpost.com/software/ecological-medica...   
3    Health    Equipment  https://devpost.com/software/ecological-medica...   
4    Health    Equipment  https://devpost.com/software/innovative-respir...   

                     title                                               text  \
0                    EVAM   Inspiration\nThere is a huge shortage in the s...   
1               NanomaskCZ  Inspiration\nThe story of Technical University...   
2  Ecological medical coat  Inspiration\nWhat it does\nEconomic medical co...   
3  Ecological medical coat  Inspiration\nThe simplicity and the economical...   
4           Respire Action  Inspiration\n• A recent study shows that over ...   

   oldIndex                           

In [15]:
# get rid of rows (documents) that contain too few words
def not_empty(in_docs, minDocLen = 3):
    num_docs = len(in_docs)
    return[j for j in range(num_docs) if len(in_docs[j]) > minDocLen]

keepIdx = not_empty(newDF.fullDS)
removeIdx = list(set(newDF.index).difference(keepIdx))

prevLen = newDF.shape[0]
tempLen = len(set(newDF.index).difference(keepIdx))

newDF = newDF.loc[keepIdx].reset_index(drop = True)

print('  current number of rows: {} [this should be equal to {}]'.format(newDF.shape[0], prevLen - tempLen))


del prevLen
del tempLen
del keepIdx
del removeIdx

  current number of rows: 1961 [this should be equal to 1961]


In [16]:
# Split in train/test/valid
print('tokenizing documents and splitting into train/test/valid...')
num_docs = newDF.shape[0]
print(num_docs)
trSize = int(np.floor(TrProp*num_docs))
tsSize = int(np.floor(TsProp*num_docs))
vaSize = int(num_docs - trSize)
del cvz
idx_permute = np.random.permutation(num_docs).astype(int)

print('  training set size: {}'.format(trSize))
print('  validation set size: {}'.format(vaSize))
print('  test set size: {}'.format(tsSize))
print('  total documents size: {} (this should be equal to {})'.format(num_docs, trSize + vaSize))

tokenizing documents and splitting into train/test/valid...
1961
  training set size: 1862
  validation set size: 99
  test set size: 1961
  total documents size: 1961 (this should be equal to 1961)


In [17]:
newDF = newDF.assign(newIndex = newDF.index)
newDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 8 columns):
Challenge       1961 non-null object
SubChallenge    1961 non-null object
ProjURL         1961 non-null object
title           1961 non-null object
text            1961 non-null object
oldIndex        1961 non-null int64
fullDS          1961 non-null object
newIndex        1961 non-null int64
dtypes: int64(2), object(6)
memory usage: 122.6+ KB


In [18]:
#newDF = newDF.assign(permuteIdx = newDF.apply(lambda x: np.where(x.newIndex == idx_permute)[0][0], axis = 1))
print(len(idx_permute))
def get_permuteIndex(n):
    N = np.where(n == idx_permute)
    return N[0][0]
    
newDF = newDF.assign(permuteIdx = newDF.apply(lambda x: get_permuteIndex(x.newIndex), axis = 1))
newDF.head()

1961


Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,oldIndex,fullDS,newIndex,permuteIdx
0,Health,Equipment,https://devpost.com/software/evam,EVAM,Inspiration\nThere is a huge shortage in the s...,0,"[11434, 10677, 11466, 11152, 11753, 10756, 393...",0,1003
1,Health,Equipment,https://devpost.com/software/nanomaskcz,NanomaskCZ,Inspiration\nThe story of Technical University...,1,"[10628, 11725, 9162, 11460, 11640, 11651, 3967...",1,1327
2,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nWhat it does\nEconomic medical co...,2,"[11746, 10822, 11491, 9551, 11438, 11756, 9138...",2,1434
3,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nThe simplicity and the economical...,3,"[9389, 9740, 10793, 11621, 10928, 5426, 9138, ...",3,60
4,Health,Equipment,https://devpost.com/software/innovative-respir...,Respire Action,Inspiration\n• A recent study shows that over ...,4,"[11037, 11349, 11093, 11747, 11561, 11669, 114...",4,706


In [19]:
# Remove words not in train_data
vocab = list(set([w for idx_d in range(trSize) for w in newDF.text[idx_permute[idx_d]].split() if w in word2id]))
word2id = dict([(w, j) for j, w in enumerate(vocab)])
id2word = dict([(j, w) for j, w in enumerate(vocab)])
print('  vocabulary after removing words not in train: {}'.format(len(vocab)))


  vocabulary after removing words not in train: 10684


In [20]:
docs_vocab = [[i for i in doc if i in id2word] for doc in newDF.fullDS]
newDF = newDF.assign(vocabDS = docs_vocab)

In [21]:
keepIdx = not_empty(newDF.vocabDS, minDocLen = 1)
removeIdx = list(set(range(len(docs_vocab))).difference(keepIdx))


In [22]:
print(len(keepIdx))
len(removeIdx)

1945


16

In [None]:
idx_tr = [idx_permute[idx_d] for idx_d in range(trSize) if idx_permute[idx_d] in keepIdx]
idx_va = [idx_permute[idx_d] for idx_d in range(trSize, trSize + vaSize) if idx_permute[idx_d] in keepIdx]
idx_ts = [idx_permute[idx_d] for idx_d in range(num_docs) if idx_permute[idx_d] in keepIdx]

docs_tr = [newDF.vocabDS[idx_d] for idx_d in idx_tr]
docs_ts = [newDF.vocabDS[idx_d] for idx_d in idx_ts]
docs_va = [newDF.vocabDS[idx_d] for idx_d in idx_va]

print('  number of documents (train): {} [this should be less than {} and equal {}]'.format(len(docs_tr), trSize, len(idx_tr)))
print('  number of documents (test): {} [this should be less than {} and equal {}]'.format(len(docs_ts), tsSize, len(idx_ts)))
print('  number of documents (valid): {} [this should be less than {} and equal {}]'.format(len(docs_va), vaSize, len(idx_va)))


In [24]:
def assign_type(Idx):
    pIdx = newDF.permuteIdx[Idx]
    if Idx in removeIdx:
        dsType = "NA"
    elif (pIdx < trSize):
        dsType = "Tr"
    else:
        dsType = "Va"
        
    return dsType


newDF = newDF.assign(dsType = [assign_type(Idx) for Idx in newDF.index])

print(newDF.head())

print(newDF.info())


  Challenge SubChallenge                                            ProjURL  \
0    Health    Equipment                  https://devpost.com/software/evam   
1    Health    Equipment            https://devpost.com/software/nanomaskcz   
2    Health    Equipment  https://devpost.com/software/ecological-medica...   
3    Health    Equipment  https://devpost.com/software/ecological-medica...   
4    Health    Equipment  https://devpost.com/software/innovative-respir...   

                     title                                               text  \
0                    EVAM   Inspiration\nThere is a huge shortage in the s...   
1               NanomaskCZ  Inspiration\nThe story of Technical University...   
2  Ecological medical coat  Inspiration\nWhat it does\nEconomic medical co...   
3  Ecological medical coat  Inspiration\nThe simplicity and the economical...   
4           Respire Action  Inspiration\n• A recent study shows that over ...   

   oldIndex                           

In [25]:
trSize1 = sum(newDF.dsType == "Tr")
vaSize1 = sum(newDF.dsType == "Va")
tsSize1 = sum(newDF.dsType != "NA")

print('  number of documents (train): {} [this should be equal to {}]'.format(len(docs_tr), trSize1))
print('  number of documents (test): {} [this should be equal to {}]'.format(len(docs_ts), tsSize1))
print('  number of documents (valid): {} [this should be equal to {}]'.format(len(docs_va), vaSize1))


  number of documents (train): 1848 [this should be equal to 1848]
  number of documents (test): 1945 [this should be equal to 1945]
  number of documents (valid): 97 [this should be equal to 97]


In [None]:
# Split test set in 2 halves
print('splitting test documents in 2 halves...')
docs_ts_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_ts]
docs_ts_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_ts]

In [None]:
# Getting lists of words and doc_indices
print('creating lists of words...')

def create_list_words(in_docs):
    return [x for y in in_docs for x in y]

words_tr = create_list_words(docs_tr)
words_ts = create_list_words(docs_ts)
words_ts_h1 = create_list_words(docs_ts_h1)
words_ts_h2 = create_list_words(docs_ts_h2)
words_va = create_list_words(docs_va)

print('  len(words_tr): ', len(words_tr))
print('  len(words_ts): ', len(words_ts))
print('  len(words_ts_h1): ', len(words_ts_h1))
print('  len(words_ts_h2): ', len(words_ts_h2))
print('  len(words_va): ', len(words_va))


In [None]:
# Get doc indices
print('getting doc indices...')

def create_doc_indices(in_docs):
    aux = [[j for i in range(len(doc))] for j, doc in enumerate(in_docs)]
    return [int(x) for y in aux for x in y]

doc_indices_tr = create_doc_indices(docs_tr)
doc_indices_ts = create_doc_indices(docs_ts)
doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
doc_indices_va = create_doc_indices(docs_va)

print('  len(np.unique(doc_indices_tr)): {} [this should be {}]'.format(len(np.unique(doc_indices_tr)), len(docs_tr)))
print('  len(np.unique(doc_indices_ts)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts)), len(docs_ts)))
print('  len(np.unique(doc_indices_ts_h1)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h1)), len(docs_ts_h1)))
print('  len(np.unique(doc_indices_ts_h2)): {} [this should be {}]'.format(len(np.unique(doc_indices_ts_h2)), len(docs_ts_h2)))
print('  len(np.unique(doc_indices_va)): {} [this should be {}]'.format(len(np.unique(doc_indices_va)), len(docs_va)))

In [None]:
# Number of documents in each set
n_docs_tr = len(docs_tr)
n_docs_ts = len(docs_ts)
n_docs_ts_h1 = len(docs_ts_h1)
n_docs_ts_h2 = len(docs_ts_h2)
n_docs_va = len(docs_va)

# Remove unused variables
del docs_tr
del docs_ts
del docs_ts_h1
del docs_ts_h2
del docs_va


In [None]:
print(len(doc_indices_tr))
print(len(words_tr))
print(n_docs_tr)
print(len(vocab))

In [None]:
# Create bow representation
print('creating bow representation...')

def create_bow(doc_indices, words, n_docs, vocab_size):
    return sparse.coo_matrix(([1]*len(doc_indices),(doc_indices, words)), shape=(n_docs, vocab_size)).tocsr()

bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, len(vocab))
bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, len(vocab))
bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1, len(vocab))
bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2, len(vocab))
bow_va = create_bow(doc_indices_va, words_va, n_docs_va, len(vocab))

del words_tr
del words_ts
del words_ts_h1
del words_ts_h2
del words_va
del doc_indices_tr
del doc_indices_ts
del doc_indices_ts_h1
del doc_indices_ts_h2
del doc_indices_va


In [None]:
# Save vocabulary to file
import os

path_save = './min_df_' + str(min_df) + '/'
if not os.path.isdir(path_save):
    os.system('mkdir -p ' + path_save)

with open(path_save + 'vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
del vocab


In [None]:
with open(path_save + 'dataDF.pkl', 'wb') as f:
    pickle.dump(newDF, f)

In [None]:
# Split bow into token/value pairs
print('splitting bow into token/value pairs and saving to disk...')

def split_bow(bow_in, n_docs):
    indices = [[w for w in bow_in[doc,:].indices] for doc in range(n_docs)]
    counts = [[c for c in bow_in[doc,:].data] for doc in range(n_docs)]
    return indices, counts

bow_tr_tokens, bow_tr_counts = split_bow(bow_tr, n_docs_tr)
savemat(path_save + 'bow_tr_tokens', {'tokens': bow_tr_tokens}, do_compression=True)
savemat(path_save + 'bow_tr_counts', {'counts': bow_tr_counts}, do_compression=True)
del bow_tr
del bow_tr_tokens
del bow_tr_counts

bow_ts_tokens, bow_ts_counts = split_bow(bow_ts, n_docs_ts)
savemat(path_save + 'bow_ts_tokens', {'tokens': bow_ts_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_counts', {'counts': bow_ts_counts}, do_compression=True)
del bow_ts
del bow_ts_tokens
del bow_ts_counts

bow_ts_h1_tokens, bow_ts_h1_counts = split_bow(bow_ts_h1, n_docs_ts_h1)
savemat(path_save + 'bow_ts_h1_tokens', {'tokens': bow_ts_h1_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h1_counts', {'counts': bow_ts_h1_counts}, do_compression=True)
del bow_ts_h1
del bow_ts_h1_tokens
del bow_ts_h1_counts

bow_ts_h2_tokens, bow_ts_h2_counts = split_bow(bow_ts_h2, n_docs_ts_h2)
savemat(path_save + 'bow_ts_h2_tokens', {'tokens': bow_ts_h2_tokens}, do_compression=True)
savemat(path_save + 'bow_ts_h2_counts', {'counts': bow_ts_h2_counts}, do_compression=True)
del bow_ts_h2
del bow_ts_h2_tokens
del bow_ts_h2_counts

bow_va_tokens, bow_va_counts = split_bow(bow_va, n_docs_va)
savemat(path_save + 'bow_va_tokens', {'tokens': bow_va_tokens}, do_compression=True)
savemat(path_save + 'bow_va_counts', {'counts': bow_va_counts}, do_compression=True)
del bow_va
del bow_va_tokens
del bow_va_counts

print('Data ready !!')
print('*************')

In [None]:
# need to get the word embeddings for the data used, and save it in data/embeddings.txt
# this is done in preProcessText2_test

Good resources for pre-trained models:

https://www.analyticsvidhya.com/blog/2019/03/pretrained-models-get-started-nlp/