In [1]:
import pandas as pd
import gensim as gs

1. Read in the file with project description data as a pandas DataFrame
2. Extract the text column
3. Preprocess, and create dictionary
4. Create TFIDF model
5. Experiment with other models
6. Model to dataframe

In [2]:
# All project data lives in a single tab-separated file.
tsvFile = "Data/all_data.tsv"
df = pd.read_csv(tsvFile, delimiter="\t")

# Preview the first five rows.
df.head(5)

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text
0,Health,Equipment,https://devpost.com/software/evam,EVAM,Inspiration\nThere is a huge shortage in the s...
1,Health,Equipment,https://devpost.com/software/nanomaskcz,NanomaskCZ,Inspiration\nThe story of Technical University...
2,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nWhat it does\nEconomic medical co...
3,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nThe simplicity and the economical...
4,Health,Equipment,https://devpost.com/software/innovative-respir...,Respire Action,Inspiration\n• A recent study shows that over ...


In [3]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text
count,2159,2159,2159,2159,2069
unique,6,32,2159,2145,2064
top,Health,Other,https://devpost.com/software/roots_to_leaves-9...,UVC_safecase,Built With\npython
freq,896,328,1,2,3


In [4]:
# Column dtypes and non-null counts; shows how many rows are missing `text`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 5 columns):
Challenge       2159 non-null object
SubChallenge    2159 non-null object
ProjURL         2159 non-null object
title           2159 non-null object
text            2069 non-null object
dtypes: object(5)
memory usage: 84.4+ KB


In [10]:
# Drop rows whose text is missing. After re-indexing, the original row label
# is kept in an "oldIndex" column so rows can be traced back to `df`.
filtered_df = (
    df[df['text'].notnull()]
    .reset_index(drop=False)
    .rename(columns={"index": "oldIndex"})
)
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
filtered_df.info()
print(filtered_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 6 columns):
oldIndex        2069 non-null int64
Challenge       2069 non-null object
SubChallenge    2069 non-null object
ProjURL         2069 non-null object
title           2069 non-null object
text            2069 non-null object
dtypes: int64(1), object(5)
memory usage: 97.1+ KB
None
   oldIndex Challenge SubChallenge  \
0         0    Health    Equipment   
1         1    Health    Equipment   
2         2    Health    Equipment   
3         3    Health    Equipment   
4         4    Health    Equipment   

                                             ProjURL                    title  \
0                  https://devpost.com/software/evam                    EVAM    
1            https://devpost.com/software/nanomaskcz               NanomaskCZ   
2  https://devpost.com/software/ecological-medica...  Ecological medical coat   
3  https://devpost.com/software/ecological-medica...  Ecological

In [11]:
# Re-inspect the filtered frame: schema, first rows and row count.
# info() prints its own report and returns None, so it is not wrapped in print().
filtered_df.info()
print(filtered_df.head())
print(len(filtered_df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 6 columns):
oldIndex        2069 non-null int64
Challenge       2069 non-null object
SubChallenge    2069 non-null object
ProjURL         2069 non-null object
title           2069 non-null object
text            2069 non-null object
dtypes: int64(1), object(5)
memory usage: 97.1+ KB
None
   oldIndex Challenge SubChallenge  \
0         0    Health    Equipment   
1         1    Health    Equipment   
2         2    Health    Equipment   
3         3    Health    Equipment   
4         4    Health    Equipment   

                                             ProjURL                    title  \
0                  https://devpost.com/software/evam                    EVAM    
1            https://devpost.com/software/nanomaskcz               NanomaskCZ   
2  https://devpost.com/software/ecological-medica...  Ecological medical coat   
3  https://devpost.com/software/ecological-medica...  Ecological

Pre-process the data:
1. tokenize
2. remove stopwords and standalone numbers
3. remove words that occur in only one document, or in more than 75% of documents

In [12]:
from nltk.corpus import stopwords

# English stop words plus a few words added for this data set.
extra_stop_words = ['inspiration', 'challenges', 'accomplishments', 'hackathon', 'eu']
stopWords = stopwords.words('english') + extra_stop_words
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of word characters; punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

def tokenize_docs(d):
    """Turn one raw description into a list of cleaned, lower-case tokens.

    Parameters
    ----------
    d : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The tokens of ``d`` in their original order, excluding purely numeric
        tokens (tokens that merely contain digits are kept), tokens found in
        the module-level ``stopWords`` list, and single-character tokens.
    """
    # Build a set once per call so each membership test is a hash lookup
    # rather than a linear scan of the stop-word list for every token.
    stop_set = set(stopWords)

    return [
        token
        for token in tokenizer.tokenize(d.lower())
        if len(token) > 1
        and not token.isnumeric()
        and token not in stop_set
    ]
    

In [16]:
# Work on a copy of filtered_df; the token columns are added to this copy below.
newdf = filtered_df.copy()
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 6 columns):
oldIndex        2069 non-null int64
Challenge       2069 non-null object
SubChallenge    2069 non-null object
ProjURL         2069 non-null object
title           2069 non-null object
text            2069 non-null object
dtypes: int64(1), object(5)
memory usage: 97.1+ KB


In [17]:

# Tokenize every description. Mapping over the text column alone avoids
# building a full row object for each record, as a row-wise apply would.
newdf = newdf.assign(docs=filtered_df['text'].map(tokenize_docs))
# Number of tokens left in each document after preprocessing.
newdf = newdf.assign(docLen=newdf['docs'].map(len))

# info() prints its own report and returns None, so it is not wrapped in print().
newdf.info()
newdf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2069 entries, 0 to 2068
Data columns (total 8 columns):
oldIndex        2069 non-null int64
Challenge       2069 non-null object
SubChallenge    2069 non-null object
ProjURL         2069 non-null object
title           2069 non-null object
text            2069 non-null object
docs            2069 non-null object
docLen          2069 non-null int64
dtypes: int64(2), object(6)
memory usage: 129.4+ KB
None


Unnamed: 0,oldIndex,Challenge,SubChallenge,ProjURL,title,text,docs,docLen
0,0,Health,Equipment,https://devpost.com/software/evam,EVAM,Inspiration\nThere is a huge shortage in the s...,"[huge, shortage, supply, chain, ppe, since, vi...",138
1,1,Health,Equipment,https://devpost.com/software/nanomaskcz,NanomaskCZ,Inspiration\nThe story of Technical University...,"[story, technical, university, liberec, tul, m...",110
2,2,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nWhat it does\nEconomic medical co...,"[economic, medical, combination, single, patte...",77
3,3,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,Inspiration\nThe simplicity and the economical...,"[simplicity, economical, way, designing, garme...",70
4,4,Health,Equipment,https://devpost.com/software/innovative-respir...,Respire Action,Inspiration\n• A recent study shows that over ...,"[recent, study, shows, caregivers, nurses, doc...",340


In [18]:
# Keep only documents with more than 4 tokens after preprocessing and renumber the rows.
newdf1 = newdf[newdf.docLen > 4].reset_index(drop=True)
# info() prints its own report and returns None, so it is not wrapped in print().
newdf1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979 entries, 0 to 1978
Data columns (total 8 columns):
oldIndex        1979 non-null int64
Challenge       1979 non-null object
SubChallenge    1979 non-null object
ProjURL         1979 non-null object
title           1979 non-null object
text            1979 non-null object
docs            1979 non-null object
docLen          1979 non-null int64
dtypes: int64(2), object(6)
memory usage: 123.8+ KB
None


In [19]:
# Token lists with fewer than 5 tokens, i.e. the documents that get dropped.
out = [tokens for tokens in newdf.docs if len(tokens) <= 4]
# Print every short document followed by its length and a separator line.
for d in out:
    print(d, len(d), "+++++++++++++++", sep="\n")

# Number of short documents.
print(len(out))

['built', 'fullfacecovermaskreusableuv']
2
+++++++++++++++
['built', 'laser', 'cutplastic']
3
+++++++++++++++
['built', 'language']
2
+++++++++++++++
['built', 'hardware']
2
+++++++++++++++
['built', 'itgoesbeyondsoftwares']
2
+++++++++++++++
[]
0
+++++++++++++++
['built', 'automationcoolersheat', 'exchangersvalves']
3
+++++++++++++++
['built', 'asp', 'net']
3
+++++++++++++++
['built', 'pythonsklearn']
2
+++++++++++++++
['built', 'drupaljavascript']
2
+++++++++++++++
['built', 'androidasteriskc', 'ibmjavajavascriptopensipsperconaphp']
3
+++++++++++++++
['built', 'android']
2
+++++++++++++++
['built', 'apibledp', '3tnode', 'jsoauth2']
4
+++++++++++++++
['built', 'python']
2
+++++++++++++++
['built', 'python']
2
+++++++++++++++
['built', 'netandroidmysql', 'etc']
3
+++++++++++++++
['built', 'angular', 'jselasticsearchfirebaseml']
3
+++++++++++++++
['built', 'react', 'nativeunity']
3
+++++++++++++++
['built', 'androidiosjava']
2
+++++++++++++++
['built', 'angular', 'jsbootstrapcss3firebas

In [20]:
# Token lists used for modelling: only documents with at least 5 tokens.
docs = list(filter(lambda doc: len(doc) > 4, newdf1.docs))

# Show the first two documents, each with its length and a separator line.
for d in docs[:2]:
    print(d, len(d), "+++++++++++++++", sep="\n")

print(len(docs))

['huge', 'shortage', 'supply', 'chain', 'ppe', 'since', 'virus', 'initially', 'struck', 'lot', 'intense', 'disjointed', 'efforts', 'hospitals', 'frontline', 'workers', 'know', 'request', 'ppe', 'local', 'manufacturers', 'enough', 'capabilities', 'supply', 'needs', 'direct', 'communication', 'hospitals', 'frontline', 'workers', 'matchmaking', 'platform', 'supply', 'needs', 'hospitals', 'frontline', 'workers', 'soon', 'possible', 'enabling', 'better', 'alignment', 'mapping', 'local', 'industry', 'determine', 'real', 'time', 'manufacturing', 'capabilities', 'evam', 'platform', 'assume', 'leadership', 'position', 'marshal', 'tremendous', 'unified', 'activity', 'around', 'supply', 'certified', 'ppe', 'built', 'application', 'built', 'based', 'gentelella', 'open', 'sourced', 'dashboard', 'admin', 'panel', 'website', 'built', 'using', 'jquery', 'bootstrap', 'mongodb', 'python', 'backend', 'use', 'owncloud', 'distribute', 'repository', 'files', 'proud', 'evam', 'helps', 'reducing', 'supply', '

In [21]:
# Build the vocabulary, then prune rare and very common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents (token <-> integer id).
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 2 documents, or in more than 75% of the documents.
# The call is made for its effect on `dictionary`; its return value is not used.
dictionary.filter_extremes(no_below=2, no_above=0.75)

In [22]:
# Convert every token list into its bag-of-words form using the dictionary.
corpus = list(map(dictionary.doc2bow, docs))


In [23]:
# Vocabulary size; also reused later as the number of matrix columns.
num_terms = len(dictionary)
print('Number of unique tokens: %d' % num_terms)

# Corpus size.
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 15736
Number of documents: 1979


In [24]:
# Fit a TF-IDF model on the bag-of-words corpus.
from gensim.models import TfidfModel, LdaModel

tfidf = TfidfModel(corpus=corpus)

In [25]:
# Apply the TF-IDF model to the bag-of-words corpus and print the first two weighted documents.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf[:2]:
    print(doc)

[(0, 0.026685043004652877), (1, 0.05273554470221744), (2, 0.05996049168516476), (3, 0.08546358894237012), (4, 0.11463773435528961), (5, 0.049427510421978946), (6, 0.029709975318643903), (7, 0.06028028479826235), (8, 0.09125289274942194), (9, 0.04842493161916068), (10, 0.016644741844073874), (11, 0.028750789927342323), (12, 0.08817217398473383), (13, 0.028307402205632395), (14, 0.12817429967860766), (15, 0.07379028725900502), (16, 0.15340415524714046), (17, 0.11737232392649141), (18, 0.03693523330576522), (19, 0.10207210194725623), (20, 0.02442004053299428), (21, 0.06636554863462223), (22, 0.06989963799003192), (23, 0.049737226157912616), (24, 0.15486996393014718), (25, 0.07670207762357023), (26, 0.05661656324624962), (27, 0.056900795330827515), (28, 0.12439828918938549), (29, 0.06509422601527302), (30, 0.046275921420060415), (31, 0.07536113984603159), (32, 0.05408196643016936), (33, 0.03595756647893225), (34, 0.07379028725900502), (35, 0.3650115709976878), (36, 0.07015625832440933), (3

In [26]:
# Look up the token stored under id 0 in the dictionary.
dictionary.get(0)

'access'

In [27]:

# Densify the TF-IDF corpus, then flip it so that each row is a document
# and each column is a term.
dense_tfidf = gs.matutils.corpus2dense(corpus_tfidf, num_terms=num_terms)
matrix_tfidf = dense_tfidf.T
print(matrix_tfidf)

[[0.02668504 0.05273554 0.05996049 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.01758248 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [28]:
# Sanity check on the layout: documents are rows, terms are columns.
matrix_tfidf.shape

(1979, 15736)

In [33]:
from sklearn.cluster import KMeans

num_clusters = 200

km = KMeans(n_clusters=num_clusters)

%time km.fit(matrix_tfidf)


CPU times: user 8min 45s, sys: 2min 57s, total: 11min 42s
Wall time: 3min 14s


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=200, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [34]:
# One cluster label per row of matrix_tfidf; print how many there are and the first ten.
clusters = km.labels_.tolist()
print(len(clusters))
for c in clusters[:10]:
    print(c)

1979
189
53
38
38
53
53
68
53
8
53


In [35]:
# Attach the cluster label to each project row. `clusters` is a plain list, so this
# relies on the corpus having been built from newdf1's documents in the same order.
clusterDF = newdf1.assign(Clusters = clusters)
print(clusterDF.head())

   oldIndex Challenge SubChallenge  \
0         0    Health    Equipment   
1         1    Health    Equipment   
2         2    Health    Equipment   
3         3    Health    Equipment   
4         4    Health    Equipment   

                                             ProjURL                    title  \
0                  https://devpost.com/software/evam                    EVAM    
1            https://devpost.com/software/nanomaskcz               NanomaskCZ   
2  https://devpost.com/software/ecological-medica...  Ecological medical coat   
3  https://devpost.com/software/ecological-medica...  Ecological medical coat   
4  https://devpost.com/software/innovative-respir...           Respire Action   

                                                text  \
0  Inspiration\nThere is a huge shortage in the s...   
1  Inspiration\nThe story of Technical University...   
2  Inspiration\nWhat it does\nEconomic medical co...   
3  Inspiration\nThe simplicity and the economical...   
4  I

In [36]:
# Number of projects in each cluster, largest first.
clusterDF["Clusters"].value_counts()

72     58
121    55
73     45
143    31
88     28
1      28
2      27
51     26
91     25
53     24
18     24
33     23
97     22
189    22
44     22
105    21
132    21
152    20
69     20
40     20
187    20
156    20
59     19
116    19
139    18
155    18
119    18
19     18
161    17
29     17
       ..
6       3
174     3
168     3
167     3
166     3
188     3
164     3
46      3
151     3
76      3
27      3
114     3
108     3
128     3
176     2
50      2
186     2
184     2
95      2
183     2
21      2
104     2
124     2
158     2
125     2
107     2
110     1
28      1
162     1
78      1
Name: Clusters, Length: 200, dtype: int64

In [37]:
import pprint
# Select the projects assigned to cluster 72 and show rows 15-19 of that subset.
in_cluster_72 = clusterDF["Clusters"] == 72
cluster72 = clusterDF.loc[in_cluster_72]
pprint.pprint(cluster72.iloc[15:20])

     oldIndex Challenge             SubChallenge  \
386       417    Health  CommunicationPrevention   
401       433    Health  CommunicationPrevention   
408       440    Health  CommunicationPrevention   
454       493    Health  CommunicationPrevention   
458       498    Health  CommunicationPrevention   

                                             ProjURL  \
386            https://devpost.com/software/covics19   
401        https://devpost.com/software/covid-safety   
408            https://devpost.com/software/aaron-ai   
454            https://devpost.com/software/covid360   
458  https://devpost.com/software/medilinkup-ehealth   

                                                 title  \
386                                           Covics19   
401                                       COVID SAFETY   
408                                      COVID BUSTERS   
454                                           Covid360   
458  MediLinkUp:eHEALTH SYSTEM 4DETECTION,DIAGNOSIS...   

 

In [39]:
import pprint
# Select the projects assigned to cluster 128; show the full rows, then just their URLs.
cluster128 = clusterDF.loc[clusterDF["Clusters"] == 128]
for view in (cluster128, cluster128.ProjURL):
    pprint.pprint(view)

      oldIndex Challenge SubChallenge  \
528        573    Health   RapidTests   
541        588    Health   RapidTests   
1169      1270  Business        Other   

                                       ProjURL         title  \
528   https://devpost.com/software/prevencovid   Prevencovid   
541    https://devpost.com/software/ddtvsvirus  DDT vs VIRUS   
1169    https://devpost.com/software/walkmydog     WalkMyDog   

                                                   text  \
528   Inspiration\nDogs love help us, my work is let...   
541   Detection Dogs Ticino in overview\nDetection D...   
1169  Inspiration\nI had have an Idea while “Collie ...   

                                                   docs  docLen  Clusters  
528   [dogs, love, help, us, work, let, people, know...      90       128  
541   [detection, dogs, ticino, overview, detection,...     965       128  
1169  [idea, collie, midday, walking, moment, plenty...     699       128  
528     https://devpost.com/software/

In [41]:
# Save the clustered projects to a CSV file whose name includes the number of clusters.

fName = f'Data/TFIDF_{num_clusters}clusters.csv'
print(fName)
clusterDF.to_csv(fName)

Data/TFIDF_200clusters.csv


In [146]:
# Train LDA model.

# Set training parameters.
num_topics = 100
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# random_state pins the random initialisation so the learned topics are the
# same on every run; without it each training run yields different topics.
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=42
)

In [147]:
# For each of the num_topics topics, determine the top (20, by default) words in the topic
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the
# number of topics actually returned (each entry is a (word list, coherence) pair).
avg_topic_coherence = sum(coherence for _, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Import the module rather than the function: `from pprint import pprint` would rebind
# the name `pprint` and break the earlier cells that call `pprint.pprint(...)` on re-run.
import pprint
pprint.pprint(top_topics)

Average topic coherence: -3.8863.
[([(0.00858227, 'solution'),
   (0.0068623223, 'also'),
   (0.005707453, 'project'),
   (0.005237375, 'crisis'),
   (0.0051258174, 'team'),
   (0.0048869634, 'covid'),
   (0.0046775066, 'time'),
   (0.0042104283, 'would'),
   (0.0040072636, 'business'),
   (0.00394939, 'use'),
   (0.003876881, 'order'),
   (0.003845131, 'based'),
   (0.0037838111, 'well'),
   (0.0037745114, 'need'),
   (0.0037551026, 'development'),
   (0.003535463, 'platform'),
   (0.0034644823, 'impact'),
   (0.0033837864, 'support'),
   (0.003304356, 'provide'),
   (0.003300399, 'new')],
  -0.5875600642290859),
 ([(0.012116218, 'community'),
   (0.011626942, 'work'),
   (0.009488415, 'us'),
   (0.008477686, 'help'),
   (0.007341419, 'get'),
   (0.006831982, 'social'),
   (0.0067313686, 'team'),
   (0.0065870564, 'also'),
   (0.006518814, 'idea'),
   (0.0063450546, 'together'),
   (0.0056585595, 'crisis'),
   (0.0052640014, 'working'),
   (0.004785875, 'solution'),
   (0.004664495, '

In [152]:
# Visualize the topics using pyLDAvis

import pyLDAvis
import pyLDAvis.gensim

# Switch pyLDAvis to notebook display mode, then build the visualisation from the
# trained model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

In [156]:
# Assign the topic distribution to the corpus (i.e. which topics does each document consist of)
# Indexing the model with the corpus gives a gensim TransformedCorpus (see the printed type).
doc_lda = model[corpus]

print(type(doc_lda))
# Each printed entry is a list of (topic id, weight) pairs for one document.
for i in range(3):
    print(doc_lda[i])
    
# NOTE(review): the saved output of this cell reports 2069 documents, while the corpus
# built earlier in this notebook has 1979 — this cell looks to have been last run against
# an older corpus. Re-run the notebook top to bottom before relying on these numbers.
print(len(doc_lda))

<class 'gensim.interfaces.TransformedCorpus'>
[(0, 0.048877906), (16, 0.03479093), (31, 0.034578357), (39, 0.02626111), (41, 0.03106575), (48, 0.17453487), (55, 0.05675604), (56, 0.031938918), (68, 0.0178188), (70, 0.0691536), (90, 0.3640898), (94, 0.08867111)]
[(0, 0.06321582), (8, 0.017902354), (15, 0.033481665), (17, 0.08714294), (22, 0.038842563), (26, 0.011292393), (30, 0.031809334), (41, 0.036081225), (46, 0.019989822), (48, 0.23483726), (59, 0.03953643), (62, 0.021020861), (63, 0.038522113), (70, 0.10134835), (83, 0.02694071), (93, 0.026458945), (95, 0.04159373), (98, 0.10386344)]
[(3, 0.12771735), (22, 0.101763986), (37, 0.027848227), (41, 0.054141544), (48, 0.12023762), (59, 0.08670301), (63, 0.023022313), (87, 0.034366142), (89, 0.08438267), (90, 0.2894129), (98, 0.026466504)]
2069


Some interesting websites:
https://www.tutorialspoint.com/gensim/gensim_quick_guide.htm