In [2]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import re
#import gensim
import sklearn; # sklearn.get_config()['working_memory']
#from sklearn.metrics import pairwise_distances, pairwise_distances_chunked # not used
#from sklearn.metrics.pairwise import cosine_similarity, cosine_distances # not used
#from scipy.spatial.distance import pdist, cosine # not used

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import AgglomerativeClustering
# Visualisation
import seaborn as sns
import plotly.express as px

#import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [3]:
data_dir = os.path.join(os.environ["HOME"],"Dropbox","EUvsVirus")

data_import = pd.read_csv(os.path.join(data_dir,"all_Data.tsv"), sep="\t")

In [4]:
stopwords = stopwords.words('english')

In [5]:
data = (data_import.dropna(subset=pd.Index(["text"]))
                   .assign(uniqueID = lambda x: x.ProjURL.str.extract(r"(?<=https://devpost.com/software/)(.+$)"),
                           text = lambda x: x.text.str.lower(), # not removing non-word chars yet because later we need the '\n' to separate on lines
                           ) 
)

Calculating the headings which should be removed

In [6]:
headings = (data.text.str.split('\n', expand=True) #splitting into lines
              .melt(value_name='lines') # melting into one column
                .lines.value_counts() # selecting the 'lines' column and counting frequency
                .reset_index()
)
headings.columns = ['headings','counts']


Visual inspection of the 200 most common lines showed that lines present 6+ times were not very defining. 

These will be removed by creating a variable called headings6plus and removing these headings from the text. Note: This is not removing the word from the entire text, just the line

In [7]:
headings.iloc[100:150]

Unnamed: 0,headings,counts
100,what we have learned,7
101,what inspired us?,7
102,funding,7
103,no,7
104,... coming soon,7
105,design,7
106,challenge,7
107,backend,7
108,accomplishments that i'm proud of:,7
109,mitigation or risk avoidance measures,7


In [8]:
headings6plus = headings.loc[headings.counts >= 6,'headings'] # counts of 6+
headings6plus

0                                         
1                              inspiration
2                             what it does
3        accomplishments that i'm proud of
4                           how i built it
                      ...                 
129                           development:
130                               progress
131    our solution’s impact to the crisis
132                                sources
133                               frontend
Name: headings, Length: 134, dtype: object

In [9]:
# lol = list of lines
# low = list of words
data = (data.assign(headings_rmvd = data.text.str.split('\n') # splitting into lines
                                             .apply(lambda lol: [line for line in lol if line not in list(headings6plus)]) # filtering out headings
                                             .apply(lambda lol: ' '.join(lol)), # joining the lines back together
                    words = lambda x: x.headings_rmvd.str.replace('\W',' ').str.split() # spliting on non-word chars and white space
                                                         .apply(lambda low: [word for word in low if word not in stopwords]) # removing stopwords
                   )
)



In [10]:
print(data.shape)
data.head()

(2069, 8)


Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,uniqueID,headings_rmvd,words
0,Health,Equipment,https://devpost.com/software/evam,EVAM,inspiration\nthere is a huge shortage in the s...,evam,there is a huge shortage in the supply chain f...,"[huge, shortage, supply, chain, ppe, since, vi..."
1,Health,Equipment,https://devpost.com/software/nanomaskcz,NanomaskCZ,inspiration\nthe story of technical university...,nanomaskcz,the story of technical university liberec (tul...,"[story, technical, university, liberec, tul, m..."
2,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,inspiration\nwhat it does\neconomic medical co...,ecological-medical-coat-bfe2jp,economic medical combination in a single patte...,"[economic, medical, combination, single, patte..."
3,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,inspiration\nthe simplicity and the economical...,ecological-medical-coat,the simplicity and the economical way of desig...,"[simplicity, economical, way, designing, garme..."
4,Health,Equipment,https://devpost.com/software/innovative-respir...,Respire Action,inspiration\n• a recent study shows that over ...,innovative-respiratory-mask-with-nanocellulose...,• a recent study shows that over 20% of caregi...,"[recent, study, shows, 20, caregivers, nurses,..."


Checking the word count of the submissions now.

45 submissions have only 1 word (after heading removal). These are not so useful.
After visual inspection I will remove entries with 10 words or less after headings removal. I don't think you could anyway get meaningful information from these submissions.

In [11]:
data.words.apply(len).value_counts().sort_index().head(15)

0      5
1     45
2     31
3     10
4      7
5      8
6      2
7      1
8      2
9      2
10     4
11     3
12     2
13     1
16     1
Name: words, dtype: int64

for x in data.text[data.words.apply(len) == 10]:
    print(x)
    print('---\n---\n')

In [12]:
data = data.loc[data.words.apply(len) > 10]

In [13]:
data

Unnamed: 0,Challenge,SubChallenge,ProjURL,title,text,uniqueID,headings_rmvd,words
0,Health,Equipment,https://devpost.com/software/evam,EVAM,inspiration\nthere is a huge shortage in the s...,evam,there is a huge shortage in the supply chain f...,"[huge, shortage, supply, chain, ppe, since, vi..."
1,Health,Equipment,https://devpost.com/software/nanomaskcz,NanomaskCZ,inspiration\nthe story of technical university...,nanomaskcz,the story of technical university liberec (tul...,"[story, technical, university, liberec, tul, m..."
2,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,inspiration\nwhat it does\neconomic medical co...,ecological-medical-coat-bfe2jp,economic medical combination in a single patte...,"[economic, medical, combination, single, patte..."
3,Health,Equipment,https://devpost.com/software/ecological-medica...,Ecological medical coat,inspiration\nthe simplicity and the economical...,ecological-medical-coat,the simplicity and the economical way of desig...,"[simplicity, economical, way, designing, garme..."
4,Health,Equipment,https://devpost.com/software/innovative-respir...,Respire Action,inspiration\n• a recent study shows that over ...,innovative-respiratory-mask-with-nanocellulose...,• a recent study shows that over 20% of caregi...,"[recent, study, shows, 20, caregivers, nurses,..."
...,...,...,...,...,...,...,...,...
2154,Other,Other,https://devpost.com/software/computer-modeling...,Computer discovery of novel anti COVID-19 drugs,inspiration\nthe methods of virtual screening ...,computer-modeling-of-new-drugs-against-covid-19,the methods of virtual screening and molecular...,"[methods, virtual, screening, molecular, model..."
2155,Other,Other,https://devpost.com/software/beahero,BeAHero,"inspiration\nwe live among heroes. in fact, we...",beahero,"we live among heroes. in fact, we are all hero...","[live, among, heroes, fact, heroes, stay, home..."
2156,Other,Other,https://devpost.com/software/solution-blockcha...,Solution Blockchain API,inspiration\nthe future of companies will be c...,solution-blockchain-api,"the future of companies will be collaboration,...","[future, companies, collaboration, distributio..."
2157,Other,Other,https://devpost.com/software/civictechhub-rgp8zk,CivicTechHub,please join us in our slack channel: https://j...,civictechhub-rgp8zk,please join us in our slack channel: https://j...,"[please, join, us, slack, channel, https, join..."


In [14]:
#from nltk.stem.porter import PorterStemmer
#from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.lancaster import LancasterStemmer
ps = PorterStemmer()
ss = SnowballStemmer('english')
ls = LancasterStemmer()
# low = list of words
data = data.assign(stemmed_words = (data.words.apply(lambda low: [ps.stem(word) for word in low])
                                              #.apply(lambda low: [ss.stem(word) for word in low])
                                              #.apply(lambda low: [ls.stem(word) for word in low])
                                    )
                  )

In [15]:
data.stemmed_words

0       [huge, shortag, suppli, chain, ppe, sinc, viru...
1       [stori, technic, univers, liberec, tul, made, ...
2       [econom, medic, combin, singl, pattern, 180x16...
3       [simplic, econom, way, design, garment, quickl...
4       [recent, studi, show, 20, caregiv, nurs, docto...
                              ...                        
2154    [method, virtual, screen, molecular, model, pl...
2155    [live, among, hero, fact, hero, stay, home, aw...
2156    [futur, compani, collabor, distribut, consensu...
2157    [pleas, join, us, slack, channel, http, join, ...
2158    [media, convers, fundament, crisi, precaut, al...
Name: stemmed_words, Length: 1952, dtype: object

In [16]:
# low = list of words
tfidf_model = TfidfVectorizer(strip_accents='unicode', stop_words='english')
tfidf = tfidf_model.fit_transform(data.stemmed_words.apply(lambda low: ' '.join(low)))

There are way too many 'words' which are just numbers. These also need to be excluded.

In [19]:
tfidf_model.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '0000000000000952',
 '0001',
 '0002',
 '000eur',
 '000gbp',
 '000usd',
 '001']

In [20]:
tfidf_vocab = pd.DataFrame({'words': tfidf_model.get_feature_names()})
print(len(tfidf_vocab))
tfidf_vocab[:1000]

25445


Unnamed: 0,words
0,00
1,000
2,0000
3,0000000000000952
4,0001
...,...
995,3976
996,39800840840
997,3a03de0452
998,3a05


In [21]:
tfidf_vocab = tfidf_vocab.assign(start_end_letter = lambda df: df.words.apply(lambda word: False if re.search('^[a-z].*[a-z]$',word) is None else True)
) # boolean for if the word starts and ends in a letter and contains zero or more characters between the first and last letter

In [22]:
tfidf_vocab.loc[tfidf_vocab.start_end_letter]

Unnamed: 0,words,start_end_letter
1708,a2w,True
1712,a63f,True
1714,a7891b8a6li,True
1717,a879124b,True
1718,a9rad,True
...,...,...
25424,zwischen,True
25425,zxol,True
25426,zytokin,True
25427,zzp,True


In [23]:
# low = list of words
tfidf_model = TfidfVectorizer(strip_accents='unicode', stop_words='english', vocabulary=tfidf_vocab.loc[tfidf_vocab.start_end_letter,'words'])
tfidf = tfidf_model.fit_transform(data.words.apply(lambda low: ' '.join(low)))

In [24]:
hc100 = AgglomerativeClustering(n_clusters=100,affinity = 'cosine', linkage='complete')
hc100.fit(tfidf.toarray())
#took approx 1min to run with sparse matrix of 1952 rows and 23084 words

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=100,
                        pooling_func='deprecated')

In [25]:
px.histogram(x=hc100.labels_, nbins = 100)

In [26]:
hc200 = AgglomerativeClustering(n_clusters=200,affinity = 'cosine', linkage='complete')
hc200.fit(tfidf.toarray())
#took approx 2min to run with sparse matrix of 1952 rows and 23084 words

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=200,
                        pooling_func='deprecated')

In [27]:
px.histogram(x=hc200.labels_, nbins = 200)

In [28]:
data = data.assign(hc100_labels = hc100.labels_,
                   hc200_labels = hc200.labels_)

In [70]:
x = data[['ProjURL','hc100_labels']]
x.columns = ['ID','cl']
x.to_csv(os.path.join(data_dir,'ja-clust100.csv'))

In [71]:
datax = data[['ProjURL','hc200_labels']]
x.columns = ['ID','cl']
x.to_csv(os.path.join(data_dir,'ja-clust200.csv'))

In [65]:
data_dir

'/Users/jillianaugustine/Dropbox/EUvsVirus'

In [157]:
def topwords(series,n):
    ''' returns the top_n words of a series as a list'''
    return series.value_counts().head(n).index.tolist()

In [158]:
split_words_and_labels.groupby('hc100_labels').word.agg(topwords, 5)

hc100_labels
0       [use, ventil, air, pressur, devic]
1           [use, task, time, work, solut]
2       [work, remot, peopl, use, teacher]
3           [risk, data, user, use, covid]
4     [european, need, develop, educ, dog]
                      ...                 
95         [bot, chatbot, use, help, user]
96     [tourism, solut, busi, peopl, need]
97     [devic, distanc, use, busi, social]
98           [help, app, need, peopl, use]
99    [student, learn, educ, use, teacher]
Name: word, Length: 100, dtype: object

In [112]:
split_words_and_labels.groupby('hc200_labels').word.agg(topwords, 5)

hc200_labels
0              [tool, data, use, develop, user]
1              [women, app, need, help, health]
2          [network, data, event, system, peer]
3          [custom, busi, merchant, need, fund]
4      [use, research, data, develop, platform]
                         ...                   
195             [app, user, use, develop, work]
196    [research, 19, remedi, develop, project]
197           [screener, dasha, set, next, way]
198    [kan, strategi, talent, opportun, thank]
199     [peopl, young, project, platform, work]
Name: word, Length: 200, dtype: object