In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import pyLDAvis

import matplotlib.pyplot as plt

In [2]:
# Load the survey responses; the preview shows a 'Hopes' and a 'Fears' text column.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved row
# index read back as data — consider index_col=0 if that is the case.
dataIn = pd.read_csv('hopesAndFearsShort.csv')
dataIn.head()

Unnamed: 0,Hopes,Fears
0,Get a good data science project that I can put...,Not being able to be productive due to online ...
1,That I get to learn advanced concepts of big d...,That I come from a background of R programming...
2,I am able to master deep learning techniques,That I do not possess the necessary skillset r...
3,"I have had a good background in statistics, d...",My biggest fear is the time is not enough to l...
4,Take away solid skills and technologies which ...,"A project which isn't relevant or impactful, a..."


# Hopes

In [3]:
# Work on the 'Hopes' column first; `data` is reused for each text column below.
data=dataIn.Hopes

In [4]:
# Normalise the raw responses before tokenising:
#   1. lower-case everything,
#   2. turn newlines and tabs into plain spaces,
#   3. replace every character that is not a word character, whitespace or an
#      apostrophe with a space,
#   4. trim leading/trailing whitespace.
# `regex` is passed explicitly: the default of Series.str.replace changed to
# regex=False in pandas 2.0, and without regex=True the character class in
# step 3 would be matched as literal text, leaving all punctuation in place.
data = (
    data.str.lower()
    .str.replace('\n', ' ', regex=False)
    .str.replace('\t', ' ', regex=False)
    .str.replace(r"[^\w\s']", ' ', regex=True)
    .str.strip()
)

In [5]:
# NLTK's English stop-word list (extended in the next cell) and a Porter
# stemmer used to reduce words to their stems.
stops=stopwords.words('english')
stemmer=PorterStemmer()

In [6]:
# Extra stop words for the Hopes text, added on top of the NLTK list.
stops.extend(["i'd",'yet','via','also','along','way'])

In [7]:
# Pick one cleaned response (row 10) to try the stemming step on.
x=data.iloc[10]

In [8]:
# Dry run of the token pipeline on the single response held in `x`:
# split on whitespace, skip stop words, stem each remaining token (so the
# different inflections of a word collapse to one form), and stitch the
# stems back together with single spaces.
' '.join(
    stemmer.stem(token)
    for token in x.split()
    if token not in stops
)

'main goal class hone data scienc skill appli solv real world problem hope achiev work fun meaning project group hope showcas data scienc chop potenti employ group project like pick new skill'

In [9]:
# Run the same stop-word removal and stemming over every response in the Series.
data = data.apply(
    lambda text: ' '.join(
        stemmer.stem(token) for token in text.split() if token not in stops
    )
)

In [10]:
# Display the stemmed, stop-word-free Hopes responses.
data

0                get good data scienc project put resum
1     get learn advanc concept big data complet data...
2                        abl master deep learn techniqu
3     good background statist data analysi machin le...
4     take away solid skill technolog give improv em...
5     understand well enough abl explain interview w...
6     know method follow solv real world big data pr...
7     biggest hope would learn neural network realli...
8     accomplish real project first experi data scie...
9     hope big learn curv whole data pipelin opportu...
10    main goal class hone data scienc skill appli s...
11    biggest hope learn work unstructur data hope g...
12    would help learn variou technolog handl huge a...
13    biggest hope cours learn data scienc techniqu ...
14    enhanc skill understand import machin learn co...
15    answer data scienc question end end test well ...
16    complet project proud well becom prepar futur ...
17    hope i'm abl contribut project i'm proud s

In [11]:
# Bag-of-words vectorizer with default settings.
vec=CountVectorizer()

In [12]:
# Learn the vocabulary from the stemmed responses, then count each term per
# response with the freshly fitted vectorizer.
counts = vec.fit(data).transform(data)

In [13]:
# Convert the vectorizer output to a dense array so it can be wrapped in a
# DataFrame in the next cell.
counts=counts.toarray()

In [15]:
# Wrap the dense count matrix in a DataFrame whose columns are the vocabulary
# terms, so individual terms can be looked up by name.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
counts=pd.DataFrame(
    counts,
    columns=(vec.get_feature_names_out() if hasattr(vec,'get_feature_names_out')
             else vec.get_feature_names()),
)
counts.shape

(32, 273)

In [16]:
# TF-IDF weights for the same stemmed responses, for comparison with the raw counts.
tfIdfVec=TfidfVectorizer()
tfIdf=tfIdfVec.fit_transform(data)

In [17]:
# Dense DataFrame of TF-IDF weights with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
tfIdf=pd.DataFrame(
    tfIdf.toarray(),
    columns=(tfIdfVec.get_feature_names_out() if hasattr(tfIdfVec,'get_feature_names_out')
             else tfIdfVec.get_feature_names()),
)

In [18]:
# Raw count vs. TF-IDF weight of the stem 'skill' in row 10
# (the "data science skills" response previewed earlier).
counts.iloc[10]['skill'],tfIdf.iloc[10]['skill']

(2, 0.2810494032626764)

In [19]:
# Raw count vs. TF-IDF weight of the stem 'data' in the same row.
counts.iloc[10]['data'],tfIdf.iloc[10]['data']

(2, 0.1452799381655528)

In [20]:
# Raw count vs. TF-IDF weight of the stem 'scienc' in the same row.
counts.iloc[10]['scienc'],tfIdf.iloc[10]['scienc']

(2, 0.21283769753750711)

Although each of these stems appears the same number of times in this document (twice), their TF-IDF weights differ because the stems are not equally common across the other documents in the dataset.

In [21]:
# Three-topic LDA on the Hopes term counts, with a fixed random_state.
# `ldaOut` holds the per-response topic weights produced by the fitted model.
lda = LDA(random_state=11, n_components=3)
ldaOut = lda.fit(counts).transform(counts)

In [22]:
# Sanity check: sum the topic weights across each row (the output shows every
# row sums to 1).
ldaOut.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [23]:
# Build the pyLDAvis visualisation data for the fitted Hopes model.
# `counts` is re-created with vec.transform(data) here, replacing the DataFrame
# built above, so prepare() receives the vectorizer's own output.
import pyLDAvis.sklearn
counts=vec.transform(data)
p=pyLDAvis.sklearn.prepare(lda, counts, vec)

In [24]:
# Save the Hopes visualisation to Hopes.html.
pyLDAvis.save_html(p,'Hopes.html')

# Fears

In [25]:
# Repeat the analysis on the 'Fears' column; `data` now refers to the Fears text.
data=dataIn.Fears

In [26]:
# Normalise the raw Fears responses before tokenising:
#   1. lower-case everything,
#   2. turn newlines and tabs into plain spaces,
#   3. replace every character that is not a word character, whitespace or an
#      apostrophe with a space,
#   4. trim leading/trailing whitespace.
# `regex` is passed explicitly: the default of Series.str.replace changed to
# regex=False in pandas 2.0, and without regex=True the character class in
# step 3 would be matched as literal text, leaving all punctuation in place.
data = (
    data.str.lower()
    .str.replace('\n', ' ', regex=False)
    .str.replace('\t', ' ', regex=False)
    .str.replace(r"[^\w\s']", ' ', regex=True)
    .str.strip()
)

In [27]:
# Start again from the unmodified NLTK English stop-word list; the extra words
# appended for the Hopes text are not carried over.
stops=stopwords.words('english')

## different stemmers

In [None]:
# Instantiate three NLTK stemmers (Porter, Snowball/English, Lancaster) so
# their output can be compared.
# EnglishStemmer and LancasterStemmer are not brought in by the notebook's
# import cell (only PorterStemmer is), so they are imported here; without
# this the cell raises NameError.
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import EnglishStemmer

stemmerPort=PorterStemmer()
stemmerSnow=EnglishStemmer()
stemmerLanc=LancasterStemmer()

In [30]:
# Extra stop word for the Fears text, added on top of the NLTK list.
stops.extend(["i'll"])

In [31]:
# Strip stop words and stem every Fears response.
# `stemmer` is the PorterStemmer instance created in the Hopes section.
data = data.apply(
    lambda text: ' '.join(
        stemmer.stem(token) for token in text.split() if token not in stops
    )
)

In [35]:
# Fresh bag-of-words model for the Fears text: learn the vocabulary, then
# count each term per response.
vec = CountVectorizer()
counts = vec.fit(data).transform(data)

In [36]:
# Convert the vectorizer output to a dense array so it can be wrapped in a
# DataFrame in the next cell.
counts=counts.toarray()

In [37]:
# Wrap the dense count matrix in a DataFrame whose columns are the vocabulary
# terms, so individual terms can be looked up by name.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
counts=pd.DataFrame(
    counts,
    columns=(vec.get_feature_names_out() if hasattr(vec,'get_feature_names_out')
             else vec.get_feature_names()),
)
counts.shape

(32, 320)

In [38]:
# TF-IDF weights for the stemmed Fears responses, for comparison with the raw counts.
tfIdfVec=TfidfVectorizer()
tfIdf=tfIdfVec.fit_transform(data)

In [39]:
# Dense DataFrame of TF-IDF weights with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
tfIdf=pd.DataFrame(
    tfIdf.toarray(),
    columns=(tfIdfVec.get_feature_names_out() if hasattr(tfIdfVec,'get_feature_names_out')
             else tfIdfVec.get_feature_names()),
)

In [62]:
# Show the cleaned and stemmed Fears response with index label 27.
data[27]

'biggest fear imt 575 lectur onlin due covid 19 pandem gener ds assign time consum intens imt 574 winter 2020 hope attend lectur remot enough us solv assign abl avail equal time attent effort cours spring quarter'

In [41]:
# Raw count vs. TF-IDF weight of the stem 'internship' in row 10.
counts.iloc[10]['internship'],tfIdf.iloc[10]['internship']

(1, 0.1438704231463137)

In [56]:
# Raw count vs. TF-IDF weight of the stem 'project' in row 29.
counts.iloc[29]['project'],tfIdf.iloc[29]['project']

(1, 0.1755100196111007)

In [55]:
# Raw count vs. TF-IDF weight of the stem 'final' in row 29.
counts.iloc[29]['final'],tfIdf.iloc[29]['final']

(1, 0.29388328711423456)

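Although these stems appear the same number of times in a document (e.g. 'project' and 'final' each occur once in document 29), their TF-IDF weights differ because the stems are not equally common across the dataset.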

In [57]:
# Three-topic LDA on the Fears term counts, with a fixed random_state.
# `ldaOut` holds the per-response topic weights produced by the fitted model.
lda = LDA(random_state=11, n_components=3)
ldaOut = lda.fit(counts).transform(counts)

In [59]:
# Build the pyLDAvis visualisation data for the fitted Fears model.
# `counts` is re-created with vec.transform(data) here, replacing the DataFrame
# built above, so prepare() receives the vectorizer's own output.
counts=vec.transform(data)
p=pyLDAvis.sklearn.prepare(lda, counts, vec)

In [60]:
# Save the Fears visualisation to Fears.html.
pyLDAvis.save_html(p,'Fears.html')

# Abstracts

In [63]:
# Load the tab-separated abstracts file; the preview below shows the columns
# 'pID' and 'abstract'.
abstracts=pd.read_csv('abstracts2.txt',delimiter='\t')

In [64]:
# Display the loaded abstracts table.
abstracts

Unnamed: 0,pID,abstract
0,4618374,The article addresses Vera's unusual foregroun...
1,1609906,"Spermatocytes of the crane-fly,Nephrotoma sutu..."
2,2983758,The problem of estimating distance to a stella...
3,4397894,The article is based on a study aimed to under...
4,1731317,Brains of rats undernourished from midgestatio...
...,...,...
9995,1129316,By using behavioral observations and sociometr...
9996,1612345,Cells with polyploid nuclei are generally larg...
9997,20063747,Old-growth bottomland hardwood-Pinus taeda L. ...
9998,3088351,"In my original study, ""Long-Run Convergence of..."


In [65]:
# Clean, stop-word-filter and stem the abstract text (same pipeline as the
# Hopes and Fears sections, applied in a single cell).
data=abstracts.abstract

# Lower-case, turn newlines/tabs into spaces, replace every character that is
# not a word character, whitespace or an apostrophe with a space, then trim.
# `regex` is passed explicitly: the default of Series.str.replace changed to
# regex=False in pandas 2.0, and without regex=True the character class would
# be matched as literal text, leaving all punctuation in place.
data = (
    data.str.lower()
    .str.replace('\n', ' ', regex=False)
    .str.replace('\t', ' ', regex=False)
    .str.replace(r"[^\w\s']", ' ', regex=True)
    .str.strip()
)

# Stop words and stemmer. `stops` stays a list as before; membership is tested
# against a set copy because this cell processes every word of every abstract.
stops=stopwords.words('english')
stemmer=PorterStemmer()
stopSet=set(stops)
data=data.apply(
    lambda text:' '.join(stemmer.stem(word) for word in text.split() if word not in stopSet)
)

0       articl address vera' unusu foreground violent ...
1       spermatocyt crane fli nephrotoma suturali atta...
2       problem estim distanc stellar system measur ap...
3       articl base studi aim understand analys tenanc...
4       brain rat undernourish midgest kill wean conta...
                              ...                        
9995    use behavior observ sociometr method stabl dom...
9996    cell polyploid nuclei gener larger cell organ ...
9997    old growth bottomland hardwood pinu taeda l fo...
9998    origin studi long run converg ethnic skill dif...
9999    scholar often use roll call vote studi legisl ...
Name: abstract, Length: 10000, dtype: object

In [70]:
# Bag-of-words counts for the abstracts as a dense DataFrame with one column
# per vocabulary term.
vec=CountVectorizer()
vec.fit(data)
counts=vec.transform(data)
counts=counts.toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
counts=pd.DataFrame(
    counts,
    columns=(vec.get_feature_names_out() if hasattr(vec,'get_feature_names_out')
             else vec.get_feature_names()),
)

In [79]:
# TF-IDF weights for the abstracts as a dense DataFrame with one column per
# vocabulary term.
tfIdfVec=TfidfVectorizer()
tfIdf=tfIdfVec.fit_transform(data)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
tfIdf=pd.DataFrame(
    tfIdf.toarray(),
    columns=(tfIdfVec.get_feature_names_out() if hasattr(tfIdfVec,'get_feature_names_out')
             else tfIdfVec.get_feature_names()),
)

In [69]:
# Show the cleaned and stemmed abstract with index label 200.
data[200]

"two laboratori experi investig mate guard sperm alloc pattern adult male virgin femal snow crab chionoecet opilio relat sex ratio although femal outnumb male treatment oper sex ratio male bias femal matur asynchron limit period sexual attract matur molt male guard femal significantli longer sex ratio increas mean time per femal 2 9 2 20 treatment compar 5 6 6 20 treatment femal injuri mortal scale posit sex ratio male guard greatest number day significantli larger experiment' end significantli smaller vasa deferentia suggest greater sperm expens male guard fewer day experi spermathec load sl quantiti ejacul store female' spermatheca independ molt date except femal bias treatment neg relat sl increas sex ratio increas mainli femal accumul ejacul howev similarli size male smaller vasa deferentia pass smaller ejacul given sex ratio mean sl 55 less one experi femal extrud clutch fertil egg median sl 3 4 mg one order magnitud smaller femal well fertil clutch 31 50 mg indic sperm limit male

In [80]:
# Raw count vs. TF-IDF weight of the stem 'femal' in row 200.
counts.iloc[200]['femal'],tfIdf.iloc[200]['femal']

(11, 0.3222182601512589)

In [81]:
# Raw count vs. TF-IDF weight of the stem 'sperm' in row 200.
counts.iloc[200]['sperm'],tfIdf.iloc[200]['sperm']

(9, 0.4324161584591311)

In [85]:
# Fit a 16-topic LDA model on the abstract term counts and save the pyLDAvis
# view to Abstract.html.
lda=LDA(n_components=16, random_state=0)
lda.fit(counts)
ldaOut=lda.transform(counts)

# At this point `counts` is the dense DataFrame built earlier in this section,
# whereas the Hopes and Fears sections hand prepare() the vectorizer's own
# output (vec.transform(data)). Do the same here so all three sections agree
# and the cell does not depend on leftover kernel state.
# NOTE(review): pyLDAvis.sklearn.prepare appears to need a scipy sparse matrix
# rather than a DataFrame — confirm against the installed pyLDAvis version.
docTermMatrix=vec.transform(data)
p=pyLDAvis.sklearn.prepare(lda, docTermMatrix, vec)
pyLDAvis.save_html(p,'Abstract.html')