In [2]:
# Module Imports
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
cluster = LocalCluster() 
cluster.scale(8) 

# Sets the number of workers 
cluster.adapt(minimum=1, maximum=8) 

# Allows the cluster to auto scale to 10 when tasks are computed 
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 59534 instead
2023-05-19 10:45:52,767 - tornado.application - ERROR - Exception in callback functools.partial(<function TCPServer._handle_connection.<locals>.<lambda> at 0x000002F5CEC36AF0>, <Task finished name='Task-2061' coro=<BaseTCPListener._handle_stream() done, defined at C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\distributed\comm\tcp.py:605> exception=ValueError('invalid operation on non-started TCPListener')>)
Traceback (most recent call last):
  File "C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\tornado\ioloop.py", line 738, in _run_callback
    ret = callback()
  File "C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\tornado\tcpserver.py", line 387, in <lambda>
    gen.convert_yielded(future), lambda f: f.result()
  File "C:\Users\gv9\AppData\Local\p

In [4]:
defined_dtypes = {'id': str, 'title': str, 'comments':  str, 'journal-ref':  str, 'id':  str}

df = dd.read_csv('without_covid_cleaned.csv', blocksize="256 MiB", dtype=defined_dtypes)
df.head(5)

Unnamed: 0,id,title,comments,journal-ref_original,categories_original,journal-ref_cleaned,first_category,category,sub_category,feild
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph,hep-ph,,physics
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co,math,co,math
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph,physics,gen-ph,physics
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co,math,co,math
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall,cond-mat,mes-hall,physics


In [5]:
titles = df['title'].compute()

In [6]:
titles = titles.str.lower()

titles = titles.str.replace('[^A-Za-z\s]', '', regex=True)
titles = titles.str.replace('  ', ' ', regex=True)

for i in range(5):
    print(titles[i])

calculation of prompt diphoton production cross sections at tevatron and lhc energies
sparsitycertifying graph decompositions
the evolution of the earthmoon system based on the dark matter field fluid model
a determinant of stirling cycle numbers counts unlabeled acyclic singlesource automata
bosonic characters of atomic cooper pairs across resonance


In [10]:
from nltk.corpus import stopwords

# get enlish stop words
english_stopwords = stopwords.words('english')

# remove any non alphabetic, non whitespace characters
my_english_stopwords = [word.replace('[^A-Za-z\s]', '') for word in english_stopwords]

my_english_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [11]:
# Make a list with each word as an item
titles_as_lists = titles.str.split()
# Make each word its own item in a series
individual_words = titles_as_lists.explode()

print(individual_words.head(10))

0    calculation
0             of
0         prompt
0       diphoton
0     production
0          cross
0       sections
0             at
0       tevatron
0            and
Name: title, dtype: object


In [14]:
import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Applying lemmatisation to series
individual_word_stems = individual_words.apply(lambda word: stemmer.stem(word))

individual_word_stems.head(10)

0      calcul
0          of
0      prompt
0    diphoton
0     product
0       cross
0     section
0          at
0    tevatron
0         and
Name: title, dtype: object

In [24]:
import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for i in ['model', 'modelling', 'modelled', 'models']:
    print(i, ':', stemmer.stem(i))

model : model
modelling : model
modelled : model
models : model


In [23]:
individual_words.value_counts().head(10)

title
of      898989
the     689104
and     540033
in      537331
a       346170
for     337990
on      217099
with    196194
to      121485
from     98387
Name: count, dtype: int64

In [19]:
# Caluclate frequency of each word in series
value_counts = individual_word_stems.value_counts()

**Word Rankings before stop words have been removed**
- of      899003
- the     689110
- and     540041
- in      537339
- a       346181
- for     337998
- on      217102
- with    196198
- to      121488
- from     98387

In [25]:
value_counts_no_stopwords = value_counts.drop(index=my_english_stopwords, errors='ignore')
value_counts_no_stopwords.head(10)

title
model      130451
quantum     98291
system      72570
theori      64817
field       58532
effect      57007
gener       56889
dynam       54976
use         53465
network     52084
Name: count, dtype: int64

Before lemmitisation

quantum     98155

model       74674

theory      52267

using       50263

systems     46438

learning    42634

models      39951

field       39216

analysis    34508

magnetic    33020


In [26]:
final_df = pd.DataFrame(value_counts_no_stopwords)

final_df['rank'] = final_df['count'].rank(ascending=False)
final_df.head(10)

Unnamed: 0_level_0,count,rank
title,Unnamed: 1_level_1,Unnamed: 2_level_1
model,130451,1.0
quantum,98291,2.0
system,72570,3.0
theori,64817,4.0
field,58532,5.0
effect,57007,6.0
gener,56889,7.0
dynam,54976,8.0
use,53465,9.0
network,52084,10.0


In [27]:
final_df.to_csv("word_rankings.csv", index=True)

In [28]:
client.close()