In [66]:
# Module Imports
import re
from calendar import month_name

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import matplotlib.pyplot as plt
import seaborn as sns

from util import clean_journal_ref

In [67]:
# Start a local Dask cluster and ask for 8 workers up front.
cluster = LocalCluster() 
cluster.scale(8) 

# Turn on adaptive scaling: the worker count may vary between 1 and 8
# (the upper bound is 8, matching the initial scale request).
cluster.adapt(minimum=1, maximum=8) 

# Connect a client to the cluster.
# NOTE(review): the recorded output ("Perhaps you already have a cluster
# running?") suggests an earlier cluster was still open when this cell ran --
# confirm the previous one is closed before re-running.
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 55990 instead


In [68]:
# Read every listed column as text. The listing also names columns that are only
# created further down in this notebook (they are absent from the preview below).
defined_dtypes = {
    column_name: str
    for column_name in (
        'id', 'title', 'comments', 'journal-ref', 'journal-ref_cleaned',
        'categories', 'categories_original', "journal-ref_original",
        'feild', 'category', 'sub_category', 'first_category',
    )
}

# Lazily load the CSV in 256 MiB partitions, then preview the first rows.
df = dd.read_csv(
    'without_covid.csv',
    blocksize="256 MiB",
    dtype=defined_dtypes,
)
df.head(5)

2023-05-18 13:08:04,536 - tornado.application - ERROR - Exception in callback functools.partial(<function TCPServer._handle_connection.<locals>.<lambda> at 0x0000020601094790>, <Task finished name='Task-1976289' coro=<BaseTCPListener._handle_stream() done, defined at C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\distributed\comm\tcp.py:605> exception=ValueError('invalid operation on non-started TCPListener')>)
Traceback (most recent call last):
  File "C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\tornado\ioloop.py", line 738, in _run_callback
    ret = callback()
  File "C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\tornado\tcpserver.py", line 387, in <lambda>
    gen.convert_yielded(future), lambda f: f.result()
  File "C:\Users\gv9\AppData\Local\pypoetry\Cache\virtualenvs\portfolio-oVyD_NMl-py3.9\lib\site-packages\distributed\comm\t

Unnamed: 0,id,title,comments,journal-ref,categories
0,704.0001,Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",hep-ph
1,704.0002,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,math.CO cs.CG
2,704.0003,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,physics.gen-ph
3,704.0004,A determinant of Stirling cycle numbers counts...,11 pages,,math.CO
4,704.0006,Bosonic characters of atomic Cooper pairs acro...,"6 pages, 4 figures, accepted by PRA",,cond-mat.mes-hall


In [69]:
# Normalise the text in every column with one chained expression per column:
# lower-case it, drop embedded newline characters, then trim surrounding
# whitespace.
for col in df.columns:
    df[col] = (
        df[col]
        .str.lower()
        .str.replace('\n', '')
        .str.strip()
    )

In [70]:
# Preview the first rows to check the text normalisation applied above.
df.head(5)

Unnamed: 0,id,title,comments,journal-ref,categories
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall


In [71]:
# Run clean_journal_ref on every journal reference and store the result in a new
# column. `meta` declares the output as a string series for Dask.
# NOTE(review): the meta name is 'journal-ref' while the result is assigned to
# 'journal-ref_cleaned' -- presumably harmless because the assignment names the
# column, but confirm.
df['journal-ref_cleaned'] = df['journal-ref'].apply(clean_journal_ref, meta=('journal-ref', str))

In [72]:
# Remove rows whose comment is missing (NaN).
df = df.dropna(subset=['comments'])
# Remove rows whose comment is the literal text "None".
# The comparison is case-insensitive because an earlier cell lower-cases every
# column, which turns "None" into "none"; comparing against 'None' would
# therefore never match and silently keep those rows.
df = df[df['comments'].str.lower() != 'none']

In [73]:
# Select only the first category mentioned: `categories` is a space-separated
# list, so keep the token before the first space.
df['first_category'] = df['categories'].str.split(' ').str[0]
# NOTE(review): debugging leftover. On a Dask dataframe this indexing stays lazy,
# so it prints the Dask Series class (see the recorded output), not the type of
# an element.
print(type(df['first_category'][0]))
df.head(5)

<class 'dask.dataframe.core.Series'>


Unnamed: 0,id,title,comments,journal-ref,categories,journal-ref_cleaned,first_category
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall


In [74]:
# Derive `category` (archive prefix) and `sub_category` from the FIRST listed
# category only.
#
# Splitting the whole `categories` string on "." goes wrong whenever the primary
# category has no dot but other categories are listed, e.g. "hep-ph hep-th"
# (category became "hep-ph hep-th") or "gr-qc math.mp" (category became
# "gr-qc math"). Isolating the first space-separated token before splitting on
# the dot avoids that.
#
# n=1 yields exactly the two columns assigned here; Dask requires an explicit
# `n` when expand=True.
df[['category', 'sub_category']] = (
    df['categories']
    .str.split(' ').str[0]
    .str.split(".", n=1, expand=True)
)
df.head(5)

Unnamed: 0,id,title,comments,journal-ref,categories,journal-ref_cleaned,first_category,category,sub_category
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph,hep-ph,
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co,math,co
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph,physics,gen-ph
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co,math,co
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall,cond-mat,mes-hall


In [75]:
# Preview a larger sample of rows to check the derived category columns.
df.head(15)

Unnamed: 0,id,title,comments,journal-ref,categories,journal-ref_cleaned,first_category,category,sub_category
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph,hep-ph,
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co,math,co
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph,physics,gen-ph
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co,math,co
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall,cond-mat,mes-hall
5,704.0007,polymer quantum mechanics and its continuum limit,"16 pages, no figures. typos corrected to match...","phys.rev.d76:044016,2007",gr-qc,physics.review,gr-qc,gr-qc,
6,704.0008,numerical solution of shock and ramp compressi...,minor corrections,"journal of applied physics, vol 104, 073536 (2...",cond-mat.mtrl-sci,journal.of.applied.physics,cond-mat.mtrl-sci,cond-mat,mtrl-sci
7,704.001,"partial cubes: structures, characterizations, ...","36 pages, 17 figures",,math.co,,math.co,math,co
8,704.0011,computing genus 2 hilbert-siegel modular forms...,14 pages; title changed; to appear in experime...,,math.nt math.ag,,math.nt,math,nt
9,704.0014,iterated integral and the loop product,"18 pages, 1 figure",,math.ca math.at,,math.ca,math,ca


In [76]:
# Map each archive prefix stored in `category` to a broad field of study.
field_dict = {
    # Physics archives
    'astro-ph': 'physics',
    'cond-mat': 'physics',
    'gr-qc': 'physics',
    'hep-ex': 'physics',
    'hep-lat': 'physics',
    'hep-ph': 'physics',
    'hep-th': 'physics',
    'math-ph': 'physics',
    'nlin': 'physics',
    'nucl-ex': 'physics',
    'nucl-th': 'physics',
    'physics': 'physics',
    'quant-ph': 'physics',
    'math': 'math',
    # Computer-science categories in the data carry the `cs` prefix
    # (e.g. "cs.cg"), so that is the key that actually matches. 'CoRR' is kept
    # only for backward compatibility; it cannot match the lower-cased data.
    'cs': 'computer science',
    'CoRR': 'computer science',
    'q-bio': 'quantative biology',
    'q-fin': 'quantative finance',
    'stat': 'statistics',
    'eess': 'electrical engineering and systems science',
    'econ': 'economics',
}

# Prefixes missing from the mapping become NaN.
# NOTE(review): the column name 'feild' and the 'quantative' labels are
# misspelled, but they are written to the output file, so they are left as-is
# here; rename only together with every consumer of that file.
df['feild'] = df['category'].map(field_dict)

In [77]:
# Mark the two source columns as the untouched originals so they are easy to
# tell apart from the cleaned/derived columns added above.
df = df.rename(columns={"categories": "categories_original", "journal-ref": "journal-ref_original"}) 

In [78]:
# Execute the lazy Dask pipeline built above and collect the result in memory.
final_df = df.compute()

In [79]:
# Preview the computed result, including the renamed and derived columns.
final_df.head(5)

Unnamed: 0,id,title,comments,journal-ref_original,categories_original,journal-ref_cleaned,first_category,category,sub_category,feild
0,704.0001,calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","phys.rev.d76:013009,2007",hep-ph,physics.review,hep-ph,hep-ph,,physics
1,704.0002,sparsity-certifying graph decompositions,to appear in graphs and combinatorics,,math.co cs.cg,,math.co,math,co,math
2,704.0003,the evolution of the earth-moon system based o...,"23 pages, 3 figures",,physics.gen-ph,,physics.gen-ph,physics,gen-ph,physics
3,704.0004,a determinant of stirling cycle numbers counts...,11 pages,,math.co,,math.co,math,co,math
4,704.0006,bosonic characters of atomic cooper pairs acro...,"6 pages, 4 figures, accepted by pra",,cond-mat.mes-hall,,cond-mat.mes-hall,cond-mat,mes-hall,physics


In [80]:
# Write the cleaned data to disk; index=False omits the row index from the file.
final_df.to_csv("without_covid_cleaned.csv", index=False)

In [81]:
# Disconnect the client created alongside the cluster before tearing the cluster
# down, so no client is left pointing at a scheduler that no longer exists.
client.close()
cluster.close()

In [82]:
# Tokens that carry no journal-name information and are dropped outright.
# Month names are included in the calendar module's own casing, in lower case,
# and as lower-case three-letter abbreviations ("jan", "feb", ...).
# NOTE: calendar.month_name follows the active locale.
_JOURNAL_STOP_WORDS = frozenset(
    ['pp', 'no', 'volume', 'vol', 'issue', 'pages']
    + list(month_name[1:])
    + [name.lower() for name in month_name[1:]]
    + [name[:3].lower() for name in month_name[1:]]
)

# Common abbreviations expanded to a canonical form.
_JOURNAL_ABBREVIATIONS = {
    'j': 'journal.of',
    'rev': 'review',
    'phys': 'physics',
    'physical': 'physics',
}

# A token is a maximal run of letters: anything that is not a word character,
# or is a digit or an underscore, acts as a separator.
_JOURNAL_TOKEN_RE = re.compile(r'[^\W\d_]+')


def clean_journal_ref(journal):
    """Reduce a free-text journal reference to a dot-separated journal name.

    Steps, applied in order:

    1. Keep only the text before the first occurrence of "erratum" (an erratum
       usually repeats the journal name with corrected page/edition numbers).
    2. Split the text into letter-only tokens; punctuation, whitespace, digits
       and underscores all act as separators.
    3. Drop filler tokens ("vol", "pp", "no", ...) and month names, both full
       names and three-letter abbreviations.
    4. Expand common abbreviations ("j" -> "journal.of", "rev" -> "review",
       "phys"/"physical" -> "physics").
    5. Drop any remaining single ASCII letters (series letters such as "d").
    6. Join the surviving tokens with ".".

    Matching is case-sensitive except for full month names, so callers are
    expected to pass lower-cased text.

    Parameters
    ----------
    journal : str or any
        The raw journal reference. Non-string values (e.g. NaN, None) are
        returned unchanged so the function can be applied to columns with
        missing entries.

    Returns
    -------
    str or any
        The cleaned reference (possibly an empty string), or the input itself
        when it is not a string.
    """
    # Leave missing values (NaN / None) untouched.
    if not isinstance(journal, str):
        return journal

    # Discard the erratum part, if any.
    journal = journal.split("erratum")[0]

    kept_tokens = []
    for token in _JOURNAL_TOKEN_RE.findall(journal):
        if token in _JOURNAL_STOP_WORDS:
            continue

        # Expand abbreviations before the single-letter check so that "j"
        # becomes "journal.of" instead of being discarded.
        token = _JOURNAL_ABBREVIATIONS.get(token, token)

        # Tokens contain letters only, so one ASCII character means one of
        # a-z / A-Z; non-ASCII single letters are kept.
        if len(token) == 1 and token.isascii():
            continue

        kept_tokens.append(token)

    # Joining tokens never produces leading, trailing or repeated dots.
    return '.'.join(kept_tokens)

Traceback (most recent call last):
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\queues.py", line 247, in _feed
    send_bytes(obj)
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\connection.py", line 205, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\connection.py", line 285, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed
2023-05-19 05:37:21,335 - distributed.nanny - ERROR - Error in Nanny killing Worker subprocess
Traceback (most recent call last):
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\asyncio\tasks.py", line 452, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\gv9\AppData\Local\pypoetry\Cac

Traceback (most recent call last):
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\queues.py", line 247, in _feed
    send_bytes(obj)
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\connection.py", line 205, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "C:\Users\gv9\.pyenv\pyenv-win\versions\3.9.10\lib\multiprocessing\connection.py", line 285, in _send_bytes
    ov, err = _winapi.WriteFile(self._handle, buf, overlapped=True)
BrokenPipeError: [WinError 232] The pipe is being closed
