In [1]:
import sys
import time
import bibtexparser
import itertools
import requests
import re
import matplotlib as plt
import pandas as pd
from bs4 import BeautifulSoup
from tabulate import tabulate
import numpy as np
import requests

In [2]:
import bibtexparser
bibtex_file = open('gap-publishednicer.bib.txt', encoding='utf-8')
bib_data = bibtexparser.load(bibtex_file)

In [3]:
bib = bib_data.entries

### Here are the 3 datasets we will start with.

In [4]:
bib_df = pd.DataFrame.from_dict(bib) # large one from the Bibliography
review_df = pd.read_csv('no_citation_text.csv', dtype='str') # MR numbers who came back with no `GAP` text found inside
corpus_df = pd.read_csv('gap_citations_corpus.csv', dtype='str') # CItations scraped from MathSciNet website

# Larger dataset from Bibliography

* how to deal with NaN missing values ? *

We only need some of these columns, hence we `drop` the rest.

In [5]:
bib_df.columns

Index(['printedkey', 'doi', 'url', 'mrreviewer', 'mrnumber', 'mrclass', 'issn',
       'fjournal', 'pages', 'year', 'volume', 'journal', 'title', 'author',
       'ENTRYTYPE', 'ID', 'number', 'school', 'booktitle', 'isbn', 'note',
       'publisher', 'day', 'keywords', 'month', 'series', 'annote', 'type',
       'address', 'institution', 'howpublished', 'editor', 'bookeditor',
       'edition', 'key', 'organization'],
      dtype='object')

In [6]:
bib_df.drop(bib_df.columns[[0, 1, 2, 3, 6, 7, 8, 10, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]], axis = 1, inplace = True)

In [7]:
bib_df.columns

Index(['mrnumber', 'mrclass', 'year', 'journal', 'author', 'ENTRYTYPE'], dtype='object')

We reorder the columns.

In [8]:
bib_df = bib_df[['mrnumber', 'author', 'journal', 'year', 'ENTRYTYPE', 'mrclass']]

Then we format the names accordingly. We change the `mrnumber` coulmn name to `MR` so we cane later merge this dataframe with the other one.

In [9]:
bib_df.columns = ['MR', 'Author', 'Journal', 'Year', 'Publication Type', 'MSC']
bib_df

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
0,4056124,"Abas, M. and Vetrík, T.",Theoret. Comput. Sci.,2020,article,05C25 (05C20 20F05)
1,3942387,"Abbas, A. and Assi, A. and García-Sánchez, P. A.",Rev. R. Acad. Cienc. Exactas Fís. Nat. Ser. A ...,2019,article,13F20 (05E15 14H50)
2,,"Abdeljaouad, I.",RAIRO-INF THEOR APPL,1999,article,
3,3354065,"Abdolghafourian, A. and Iranmanesh, M. A.",Comm. Algebra,2015,article,05C25 (20B30 20E45)
4,3646312,"Abdolghafourian, A. and Iranmanesh, M. A. and ...",J. Pure Appl. Algebra,2017,article,20G40 (05C25)
...,...,...,...,...,...,...
3362,2647300,"Zusmanovich, P.",J. Geom. Phys.,2010,article,17B60
3363,2735394,"Zusmanovich, P.",J. Algebra,2010,article,17B40
3364,3201064,"Zusmanovich, P.",J. Algebra,2014,article,17B40
3365,3598575,"Zusmanovich, P.",Linear Algebra Appl.,2017,article,17C10 (17-08 17A30 17C55)


Data types and count of non-null values for each columns.

In [10]:
bib_df.info(show_counts  = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3367 entries, 0 to 3366
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   MR                3159 non-null   object
 1   Author            3367 non-null   object
 2   Journal           3047 non-null   object
 3   Year              3367 non-null   object
 4   Publication Type  3367 non-null   object
 5   MSC               3252 non-null   object
dtypes: object(6)
memory usage: 79.0+ KB


Looking at the `MRN` columns, some cells are empty - `NaN`

In [11]:
bib_df.iloc[3274]

MR                            NaN
Author                 Wegner, A.
Journal                       NaN
Year                         1989
Publication Type    mastersthesis
MSC                        Thesis
Name: 3274, dtype: object

In [12]:
bib_df.describe()

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
count,3159,3367,3047,3367,3367,3252
unique,3158,2511,384,43,10,2268
top,3656296,"Eick, B.",J. Algebra,2017,article,Thesis
freq,2,18,387,188,2976,99


In [13]:
bib_df['MSC'].value_counts()

Thesis                       99
20C15                        36
20C20                        33
20N05                        30
20D15                        22
                             ..
05E30 (05B10)                 1
20F55 (20C20)                 1
20M14 (20M05)                 1
20J05 (20J06)                 1
20C05 (16S34 20C20 20E45)     1
Name: MSC, Length: 2268, dtype: int64

### We will look at the `year` column. There are several anomalies and we need just 4 digits in each cell.

In [14]:
bib_df.sort_values('Year', ascending=False)

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
3165,3973299,"Then, H.",,[2019] \copyright 2019,incollection,11F12 (11R06)
1546,3898507,"Greer, M.",,[2019] \copyright 2019,incollection,20N05
3133,3898514,"Stuhl, I. and Vojtěchovský, P.",,[2019] \copyright 2019,incollection,20N05 (57M27)
3147,3782458,"Swinarski, D.",,[2018] \copyright 2018,incollection,30F20 (14H37 14H45 14Q05)
1981,4167659,"Kaushik, R. and Yadav, M. K.",J. Algebra,2021,article,20D15 (20F12)
...,...,...,...,...,...,...
3274,,"Wegner, A.",,1989,mastersthesis,Thesis
2689,,"Niemeyer, A.",,1988,mastersthesis,Thesis
2673,,"Nickel, W.",,1988,mastersthesis,Thesis
3018,,"Schönert, M.",,1987,mastersthesis,Thesis


We will use `.str` and a regular expresion `(r'^(\d{4})'` which first converts all year cells to strings then takes the first 4 digits from each. We then replace the old values with the filtered ones.

In [15]:
bib_df['Year'] = bib_df['Year'].str.extract(r'^(\d{4})', expand=False)

In [16]:
bib_df['Year'].value_counts()

2017    188
2013    175
2018    168
2020    166
2019    165
2010    163
2015    162
2016    158
2014    154
2011    152
2007    142
2012    142
2008    132
2004    131
2005    128
2009    124
2006    118
2001    107
2003    101
1999     84
2002     84
2000     78
1997     76
1998     58
1995     56
2021     39
1996     34
1994     28
1993     25
1992     13
1991      5
1987      2
1988      2
1989      2
1990      1
Name: Year, dtype: int64

Now we convert the column Data type back to numeric.

bib_df['Year'] = pd.to_numeric(bib_df['Year'])

bib_df['Year'] = bib_df['Year'].astype(np.int64)

In [17]:
# we use this option to remove the trailing zeors behind the year.
# pd.options.display.float_format = '{:.0f}'.format 

We inspect the result and no more anomalies are visible. Data type is integer which is also fine.

#  Corpus and Review

What are the GAP Citation practices ???
The longer the citation is the more details it contains - measure lenght ?

To do
+ check Citations strings lenghts to isolate too short ones

+ website count citations hacing `gap.com` string to measure how many mention the website
+ remaining HTML characters

Two citations containing Singapore instead of GAP present.



In [18]:
corpus_df

Unnamed: 0,MR,Citation
0,MR4056124,"GAP – Groups, algorithms, programming - a syst..."
1,MR3942387,"Delgado, M., García-Sánchez, P.A., Morais, J.:..."
2,MR3942387,"The GAP Group: GAP—groups, algorithms, and pro..."
3,MR3354065,"The GAP – Groups, Algorithms and Programming. ..."
4,MR3646312,"The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Alg..."
...,...,...
3537,MR3988630,"M. Delgado, P. A. García-Sánchez and J. Morais..."
3538,MR1801202,"L.H. Soicher, GRAPE: a system for computing wi..."
3539,MR2558870,"L. Bartholdi, Functionally recursive groups, h..."
3540,MR2824780,"X. Sun, C. Liu, D. Li and J. Gao, On duality g..."


In [19]:
pd.options.display.max_colwidth = 157

In [20]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf

def get_citation(mrno):
    r = corpus_df[corpus_df['MR'] == mrno]
    return r.at[r.index[0],'Citation'], r.at[r.index[0],'Version']

In [21]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf

def get_c(mrno):
    r = merged_df[merged_df['MR'] == mrno]
    return r

In [22]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf

unknown_ver = []

def get_version(s):
    match = re.search("(?:(\d+\.(?:\d+\.)*\d+))", s, re.IGNORECASE)
    if match != None:
        return match.group(1)
        print('* VERSION FOUND *')
    else:
        print('* No VERSION found *', s)
        unknown_ver.append(s)
        return 'Unknown'

In [23]:
def is_package(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    if re.search("package", citation, re.IGNORECASE) != None:
        print('***Package***:', mrno, citation)
        return 'Package'
    else:
        print('***Not a Package***:', mrno, citation)
        return series['Version']

In [24]:
def website_check(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    if re.search("www|\.net|http", citation, re.IGNORECASE) != None:
        print('***Provided Website***:', mrno, citation)
        return 'Yes'
    else:
        print('***Not Provided***:', mrno, citation)
        return 'No'

* First we create the version column. Then we fill its cells with the version, where provided.

In [25]:
corpus_df.insert(loc=2, column='Version', value=' ')

In [26]:
corpus_df['Version'] = corpus_df['Citation'].map(get_version)

* No VERSION found * GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.
* No VERSION found * Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
* No VERSION found * M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.)
* No VERSION found * W. Nickel, NQ, 1998, A refereed GAP 4 package, see [10].
* No VERSION found * W. Nickel, NQ, 1998, A refereed GAP 4 package, see [8].
* No VERSION found * Gamble, G., Nickel, W., O'Brien, E.A.: ANU p-Quotient–p-Quotient and p-Group Generation Algorithms (2006). An accepted GAP 4 package, available also in MAGMA
* No VERSION found * M. Schönert et al, GAP: groups, algorithm and programming, © 1992 by Lehrstuhl D für Mathematik, distributed with the GAP so

* No VERSION found * T. Moede. "Coclass Graphs for Nilpotent Associative Algebras." A GAP 4 package, see [The GAP Group 05], available online (http://www.icm.tubs.de/~tobmoede/ccalgs/), 2015.
* No VERSION found * V. Felsch and F. Gähler, CrystCap – A libarary of crystallographic groups. A refereed GAP 4 package, 2000.
* No VERSION found * H.U. Besche, B. Eick, E. O'Brien, Small Groups – a library of groups of small order. A GAP 4 package. Webpage available at www.icm.tu-bs.de/ag_algebra/software/small/small.html, 2005.
* No VERSION found * M. Vaughan-Lee, B. Eick, LiePRing – Database and Algorithms for Lie p-Rings, 2015. A GAP 4 package, see [22].
* No VERSION found * Vaughan-Lee, M., Eick, B., 2015. LiePRing - Database and Algorithms for Lie p-rings. A GAP 4 package, see The GAP Group (2019).
* No VERSION found * G. Olteanu, Á. del Río, An algorithm to compute the Wedderburn decomposition of semisimple group algebras implemented in the GAP package wedderga, J. Symb. Comput. 44 (5) (20

* No VERSION found * P. Moravec, GAP code for computing the unramified brauer groups of finite solvable groups, www.fmf.unilj.si/moravec/b0g.g, 2010.
* No VERSION found * Moreau (A.) - Calculs explicites dans une algébre de Lie semi-simple effectués avec GAP4, arXiv: math.RT/0503019, 2004.
* No VERSION found * [GAP] The GAP group, `GAP—groups, algorithms and programming', http://www.gap-system.org.
* No VERSION found * Cooperman: Gene. Parallel GAP/MPI (ParGAP/MPI), Version 1, College of Computer Science, Northeastern University http://www.ccs.neu.edu/home/gene/pargap.html (1999) 
MR1829475
* No VERSION found * Soicher, L. H. (2003), "The Design Package for GAP," available at designtheory.org/software/gap\_design.
* No VERSION found * B. Eick and W. Nickel. Polycyclic - Computing with polycyclic groups, 2002. A GAP Package, see [8]. 
MR1829476
* No VERSION found * Greg Gamble, Werner Nickel, and Eamonn O'Brien. ANUPQ - p-quotient, 2006. A GAP Package, see [8].
* No VERSION found * W. N

In [27]:
corpus_df['Version'].value_counts()

Unknown    895
4.4        460
4.4.12     310
4.3        232
4.4.10     136
          ... 
1.13         1
3.3.0        1
4.9.0        1
4.4.1        1
3.4.0        1
Name: Version, Length: 197, dtype: int64

Then we apply the function which chekcs if it is a case of citing GAP Package. Packages are connected to GAP, but technically is a separate piece of software. In entries citing GAP package there is no version of GAP provided hence we will fill the `Version` cell with `Package`.

In [28]:
corpus_df['Version'] = corpus_df.apply(is_package,axis=1)

***Not a Package***: MR4056124 GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.
***Package***: MR3942387 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
***Not a Package***: MR3942387 The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017
***Not a Package***: MR3354065 The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org.
***Not a Package***: MR3646312 The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org.
***Not a Package***: MR1864795 M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.)
***Not a Package***: MR2287843 The G

***Package***: MR2273130 L.H. Soicher, The GRAPE package for GAP, (http://www.maths.qmul.ac.uk/$\sim$leonard/grape/).
***Not a Package***: MR2271685 The GAP Group (2004) GAP–Groups, algorithms, and programming, Version 4.4. (http://www.gap-system.org)
***Not a Package***: MR2356140 The GAP Group. (2004). GAP - Groups, Algorithms, and Programming, Version 4.4; (http://www.gap-system.org).
***Not a Package***: MR2519160 The GAPGroup, GAP—Groups Algorithms, and Programming Version 4.4, 2004. http://www.gap-system.org.
***Not a Package***: MR2408485 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.6; Aachen, St Andrews, 2005, available at http://www.gap-system.org/.
***Package***: MR2408485 L. H. Soicher, The DESIGN package for GAP, available at http://designtheory.org/software/gap\_design/.
***Not a Package***: MR2684077 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4; 2004, http://www.gap-system.org.
***Package***: MR2684077 L.H. Soicher, The GRAPE pa

***Not a Package***: MR2032439 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.2 (http://www.gap-system.org).
***Not a Package***: MR4119158 The GAP Group. GAP – Groups, Algorithms, and Programming, Version 4.10.0, 2018. URL https://www.gap-system.org.
***Not a Package***: MR2081056 The GAP Group, Gap—groups, algorithms, and programming, version 4.3 (2002, available at http://www.gap-system.org).
***Package***: MR2501513 B. Eick, W. Nickel, Polycyclic–Computing with polycyclic groups, 2002. A GAP Package, see [16]. 
MR1829476
***Not a Package***: MR2501513 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.11, http://www.gap-system.org, 2008.
***Package***: MR2501513 W. Nickel, $\roman{np}$–Nilpotent Quotients of Finitely Presented Groups, 2003. A GAP Package, see [16].
***Package***: MR2478412 B. Eick and W. Nickel. Polycyclic - Computing with polycyclic groups, 2002. A GAP Package, see [12]. 
MR1829476
***Not a Package***: MR2478412 The GAP Group. GAP

***Package***: MR3493477 Soicher, L.H.: The DESIGN package for GAP, version 1.6. http://designtheory.org/software/gap_design/(2011)
***Not a Package***: MR3493477 The GAP group: $\bold{GAP}$—Groups, algorithms, programming, Version 4.4 (2004); http://www.gap-system.org
***Not a Package***: MR3609191 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4.12; 2008, http://www.gap-system.org
***Not a Package***: MR3666831 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.7.5; 2014, http://www.gap-system.org.
***Package***: MR3666831 L. H. Soicher, The GRAPE package for GAP, Version 4.6.1, 2012, http://www.maths.qmul.ac.uk/~leonard/grape/.
***Not a Package***: MR2228630 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4, http://www.gap-system.org, 2004.
***Not a Package***: MR1775788 M. Schönert et al., GAP: Groups, algorithm and programming, Lehrstuhl D für Mathematik, RWTH Aachen, 1994.
***Package***: MR3040874 W. Nickel, NQ-nilpotent quoti

***Not a Package***: MR2825262 The GAP Group, GAP—Groups, algorithms, programming, ver. 4.4.10, 2007, http: //www.gap-system.org .
***Not a Package***: MR2861751 The GAP Group, 'GAP—groups, algorithms, and programming, version 4.4.12', Manual, 2008, http://www.gap-system.org.
***Not a Package***: MR3096504 GAP Group, GAP–groups, algorithms, and programming, version 4.4.12, 2008, http://www.gap-system.org.
***Not a Package***: MR3651588 GAP—Groups, Algorithms, and Programming, Version 4.7.7. http://www.gap-system.org (2015)
***Not a Package***: MR3914542 The GAP Group: GAP–Groups, Algorithms, and Programming. Version 4.7.7, Feb. 2105. http://www.gap-system.org.
***Not a Package***: MR3996938 GAP—Groups, Algorithms, and Programming, Version 4.7.7 (2015). http://www.gap-system.org
***Not a Package***: MR3208394 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.5.7, http://www.gap-system.org, 2012.
***Not a Package***: MR1849331 The GAP Group, Aachen, St Andrews, GAP–Groups

***Package***: MR2422322 B. Eick, W. Nickel, Polycyclic—Computing with polycyclic groups, version 2.2, a refereed GAP 4 package, see [21], available at http://www.icm.tu-bs.de/ag\_algebra/software/eick/polycyclic, 2007.
***Package***: MR2422322 W. Nickel, NQ, 1998, A refereed GAP 4 package, see [21].
***Not a Package***: MR2422322 The GAP Group, GAP—Groups, Algorithms and Programming, version 4.4, available at http://www.gap-system.org, 2005.
***Package***: MR2531220 B. Eick, W. Nickel, Polycyclic—computing with polycyclic groups, 2005. A refereed GAP 4 package, see [20].
***Package***: MR2531220 B. Eick and E. O'Brien, AutPGrp—Computing the automorphism group of a $p$-group, 2005. A refereed GAP 4 package, see [20]. 
MR2050106
***Not a Package***: MR2531220 The GAP Group, GAP—Groups, Algorithms and Programming, Version 4.4, Available from http://www.gap-system.org, 2005.
***Not a Package***: MR2430430 The GAP Group, GAP - Groups, Algorithms and Programming, Version 4.4, Availablefrom 

***Not a Package***: MR4082918 The GAP Group: GAP – Groups, Algorithms, and Programming, Version 4.8.3; 2016. (http://www.gap-system.org).
***Not a Package***: MR2275584 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3, 2002; http://www.gap-system.org.
***Not a Package***: MR2356445 The GAP Group. (2002). GAP - Groups, Algorithms, and Programming Version 4.3. (http://www.gap-system.org).
***Not a Package***: MR2505101 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4, 2006. (http://www.gap-system.org)
***Not a Package***: MR2729364 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.3, 2002, http: www.gap-system.org.
***Not a Package***: MR3449244 The GAP group, `\tt GAP - groups, algorithms, and programming', Version 4.4, 2004, http://www.gap-system.org.
***Not a Package***: MR3570078 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.2; 2013, (http://www.gap-system.org).
***Not a Package***: MR2854791 The GAP Gro

***Package***: MR3834965 Ellis G, HAP-homological algebra programming, a refereed GAP 4 package (GAP Group 2005), available at http://hamilton.nuigalway.ie/Hap/www (2008)
***Not a Package***: MR3834965 The GAP Group, GAP-Groups Algorithms and Programming, version 4.4, available at http://www.gap-system.org (2005)
***Not a Package***: MR3975686 The GAP Group, Groups algorithms and programming. Version 4.8.10, (2018), http://www.gap-system.org.
***Not a Package***: MR4208094 The GAP Group, GAP—Groups, Algorithms, and Programming. http://www.gap-system.org, Version 4.10.2 (2019)
***Package***: MR3279952 A. Distler: RadiRoot: roots of a polynomial as radicals – a GAP package, version 2.6, www.icm.tu-bs.de/agalgebra/software/radiroot, 2011.
***Not a Package***: MR1924512 M. Schönert et al. GAP–Groups, Algorithms and Programming. Lehrstuhl D für Math., Rheinisch–Westfälische Hochschule, Aachen, 5th ed. (1995).
***Package***: MR3418031 S. Gutsche, `GAP package ToricVarieties', available at ht

***Not a Package***: MR2831230 The GAP group, GAP—groups, algorithms, and programming, version 4.4.12, 2008, available at http://www.gap-system.org.
***Not a Package***: MR2831230 G. P. Nagy and P. Vojtěchovský, `Loops: computing with quasigroups and loops in GAP, version 2.1.0', available at http://www.math.du.edu/loops.
***Package***: MR2831230 L. H. Soicher, `GRAPE, Graph algorithms using permutation groups, version 4.3', package for GAP, available at http://www.maths.qmul.ac.UK/$\sim$leonard/grape/.
***Not a Package***: MR2643896 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4, 2006; http://www.gap-system.org.
***Not a Package***: MR2419170 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4, 2006; http://www.gap-system.org.
***Not a Package***: MR2595748 The GAP Group, `GAP–groups, algorithms, and programming', http://www.gap-system.org.
***Not a Package***: MR2745634 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4, 2006,

***Package***: MR3533336 J. Cramwinckel, et al. GUAVA, a GAP package for computing with error-correcting codes, Version 3.12, 2012. http://www.gapsystem.org/Packages/guava.html.
***Not a Package***: MR3764283 The GAP Group, GAP-Groups, algorithms, and programming, Version 4.8.7, 2017, available at http://www.gap-system.org
***Not a Package***: MR3683021 The GAP Group, GAP—groups, algorithms, and programming, version 4.7.6 (2014). http://www.gap-system.org
***Not a Package***: MR3158649 G. Nagy and P. Vojtěchovský, LOOPS; Computing with quasigroups and loops in GAP, http://web.cs.du.edu/petr/loops/.
***Not a Package***: MR3158649 The GAP Group, GAP—Groups, algorithms, and programming, http://www.gap-system.org.
***Not a Package***: MR3239290 The GAP Group: GAP—Groups, Algorithms, and Programming, Version 4.6.4 (2013). http://www.gap-system.org
***Not a Package***: MR3789845 The GAP group, `GAP — groups, algorithms, and programming, version 4.7.8', 2015, http://www.gap-system.org.
***Not

***Not a Package***: MR3908699 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.5, 2016. http://www.gap-system.org.
***Not a Package***: MR4145799 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.8.3, 2016, http://www.gap-system.org.
***Not a Package***: MR2175369 Martin Schönert et al. GAP—Groups, Algorithms, and Programming. Lehrstuhl D für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen, Germany, fifth edition.
***Not a Package***: MR2175369 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3; 2002, (http://www.gap-system.org).
***Not a Package***: MR3095221 GAP - groups, algorithms, and programming, Version 4.4, 2004, available at http://www.gap-system.org.
***Not a Package***: MR3841520 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.8, https://www.gap-system.org, 2017.
***Not a Package***: MR2137971 The GAP Group. GAP—Groups, Algorithms, Programming, Version 4.2 (Aachen, St Andrews, 1999)

***Not a Package***: MR4155175 The GAP Group, GAP Groups, Algorithms, and Programming, Version 4.4; 2004, http://www.gap-system.org.
***Not a Package***: MR4102105 The GAP group, GAP - Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org, 2004.
***Not a Package***: MR4201484 The GAP group, GAP - groups, algorithms, and programming, Version 4.10.0, http://www.gap-system.org, 2018.
***Not a Package***: MR4155421 The GAP Group, GAP — Groups, Algorithms and Programming, Version 4.3 (2002), http://www.gap-system.org.
***Not a Package***: MR1806297 M. Schönert et al., GAP: Groups, algorithms, and programming, 4th ed., Lehrstuhl D für Mathematik, RWTH Aachen, 1994. See http://www-gap.dcs.st-and.ac.uk/$\sim$gap.
***Package***: MR3574443 Laurent Bartholdi, FR, GAP package functionally recursive groups. http://laurentbartholdi.github.io/fr/chap0.html, 2014.
***Package***: MR3574443 Y. Muntyan and D. Savchuk. Autom Grp GAP package for computation in groups and semigroups ge

***Not a Package***: MR2258669 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.2, 2000. http://www.gap-system.org
***Not a Package***: MR2735402 The GAP Group, G.A.P.—Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org, 2009.
***Not a Package***: MR3773504 The GAP Group, GAP Groups, Algorithms, and Programming, version 4.4.10, 2007. http://www.gap-system.org.
***Not a Package***: MR3029481 The GAP Group (2008) GAP—Groups, Algorithms, and Programming. http://www.gap-system.org
***Not a Package***: MR1807654 M. Schönert, et al., "GAP—Groups, Algorithms, and Programming," 4th ed., 1994.
***Not a Package***: MR2049014 The GAP-Group. GAP—Groups, algorithms, and programming, version 4.3 (2002). http://www-gap.dcs.st-and.ac.uk/gap/gap.html
***Not a Package***: MR4150255 The GAP Group, "GAP: groups, algorithms, and programming", 2004, Available at http://www.gap-system.org. Version 4.4.
***Not a Package***: MR3854485 The GAP group: GAP-Groups, Algori

***Not a Package***: MR4077413 The {\ssf GAP} Group, {\ssf GAP} — Groups, Algorithms, and Programming, Version 4.10.2 (2019), https://www.gap-system.org.
***Not a Package***: MR3578801 The GAP Group, GAP Groups, Algorithms, and Programming, Version 4.6.4, 2013, http://www.gap-system.org.
***Not a Package***: MR3634298 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.4; 2016. (http://www.gap-system.org)
***Package***: MR3197173 G. Ellis, Hap - Homological Algebra Programming, Version 1.8, an official package for the GAP computational algebra system, http://www.gap-system.org/Packages/hap.html, 2008. 
MR2478414
***Not a Package***: MR3197173 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.9, http://www.gap-system.org, 2006.
***Not a Package***: MR4032800 The GAP group, GAP –Groups, Algorithms, and Programming, version 4.8.10, 2018. https://www.gap-system.org
***Not a Package***: MR2423808 The GAP Group, GAP: Groups, Algorithms, and Programming, 

***Not a Package***: MR1695079 M. Schönert et al., "Gap: groups, algorithms, and programming," Lehrstuhl D für Mathematik, RWTH Aachen, 3.4.4 edition, 1997.
***Not a Package***: MR1967616 Schönert M. et al., Groups, Algorithms and Programming (1997), http://www-gap.dcs.st-and.ac.uk/gap.
***Not a Package***: MR2531764 Groups, algorithms and programming, Lehrstuhl $\roman{D}$ für Mathematik, RWTH Aachen, 1994. Available at http://www.gap-system.org/.
***Not a Package***: MR2592492 The Gap Group, Gap - Groups, Algorithms, and Programming, Version 4.4.10, 2007. http://www.gap-system.org.
***Package***: MR2592492 L.H. Soicher, The Grape package for Gap, Version 4.3, 2006. http://www.maths.qmul.ac.uk/$\sim$leonard/grape/.
***Not a Package***: MR4187238 The Gap Group, Gap–groups, algorithms, and programming, http://www.gap-system.org, 2015.
***Not a Package***: MR1615333 M. Schönert (ed.), Groups, Algorithms and Programming. Lehrstuhl D für Mathe- matik, RWTH Aachen, Germany (1994) available 

In [29]:
corpus_df['Version'].value_counts()

Package    730
Unknown    539
4.4        454
4.4.12     310
4.3        212
          ... 
0.97         1
10.1109      1
1.1.3        1
4.46         1
1.0          1
Name: Version, Length: 101, dtype: int64

Now we will create a `website` coulmn to indicate if such is provided. Then we fill each cell using  a Regex to search citations for the GAP website.

In [30]:
corpus_df.insert(loc=3, column='Website', value=' ')

In [31]:
corpus_df['Website'] = corpus_df.apply(website_check, axis=1)

***Provided Website***: MR4056124 GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.
***Provided Website***: MR3942387 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
***Provided Website***: MR3942387 The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017
***Provided Website***: MR3354065 The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org.
***Provided Website***: MR3646312 The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org.
***Not Provided***: MR1864795 M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.)
***Provided Webs

***Provided Website***: MR2684077 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4; 2004, http://www.gap-system.org.
***Provided Website***: MR2684077 L.H. Soicher, The GRAPE package for GAP, Version 4.3, 2006, http://www.maths.qmul.ac.uk/$\sim$leonard/grape/.
***Provided Website***: MR2956331 The GAP Group, GAP - Groups, Algorithms, and Programming, version 4.4, 2004, http://www.gap-system.org.
***Provided Website***: MR3010114 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4, 2004. http://www.gap-system.org.
***Not Provided***: MR3337174 J. Bamberg et al., FinInG—a finite geometry package for GAP, Version 1.0, 2011; cage.ugent.be/fining/.
***Provided Website***: MR3337174 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.6.2, 2013, www.gap-system.org.
***Not Provided***: MR3337174 A. Hanaki, Elementary functions for association schemes on GAP, math.shinshu-u.ac.jp/$\sim$hanaki/as/gap/.
***Provided Website***: MR3337174 C. Pech and

***Provided Website***: MR3512657 The GAP Group. GAP – Groups, Algorithms, and Programming, Version 4.7.5, 2014, http://www.gap-system.org.
***Provided Website***: MR3537913 Bishnoi, A., De Bruyn, B.: GAP-code for "On semi-finite hexagons of order $(2, t)$ containing a subhexagon". Online available document, http://cage.ugent.be/geometry/preprints.php 
cf. MR3962866
***Provided Website***: MR3537913 The GAP Group: GAP — Groups, Algorithms, and Programming, Version 4.4.12 (2008) http://www.gap-system.org
***Provided Website***: MR3654197 Bishnoi A., De Bruyn B.: GAP-code for "Characterizations of the Suzuki tower near polygons". http://cage.ugent.be/geometry/preprints.php. 
MR2161636
***Provided Website***: MR3654197 The GAP Group: GAP—Groups, Algorithms, and Programming, Version 4.7.5. http://www.gap-system.org (2014).
***Provided Website***: MR3621727 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.5; 2014. http://www.gap-system.org.
***Not Provided***: MR3710034

***Provided Website***: MR3489755 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.12 (2008). http://www.gap-system.org.
***Provided Website***: MR3880098 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.5, http://www.gap-system.org, 2014.
***Provided Website***: MR2944381 GAP-Group, GAP - Groups, Algorithms, Programming - a System for Computational Discrete Algebra, 2010. http://www.gap-system.org/.
***Provided Website***: MR3557650 The GAP Group: GAP – Groups, Algorithms, and Programming, Version 4.8.3 (2016). http://www.gap-system.org
***Provided Website***: MR3355546 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4.12, 2008 http://www.gap-system.org.
***Provided Website***: MR2555951 GAP. Groups, algorithms, programming - a system for computational discrete algebra, at http://www.gap-system.org/.
***Provided Website***: MR2831973 The Gap Group. GAP—Groups, Algorithms, and Programming, Version 4.4.12 (2008). http://www.gap

***Provided Website***: MR3169552 The GAP Group. (2002). GAP-Groups, Algorithms and Programming, Aachen, St. Andrews. Available at http://www-gap.dcs.st-and.ac.uk/gap
***Provided Website***: MR2489325 Cohen A.M., Gijsbers D.A.H.–GBNP 0.9.4, A GAP package for computing Gröbner bases of non-commutative polynomials, Eindhoven, 2007, available at: http://www.mathdox.org/products/gbnp/.
***Provided Website***: MR3918046 The GAP Group, GAP — Groups, Algorithms, and Programming, Version 4.8.7 (2017); http://www.gap-system.org.
***Not Provided***: MR2169384 Gap, 2000. GAP Reference Manual. The GAP Group, School of Mathematical and Computational Sciences, University of St. Andrews.
***Provided Website***: MR3768257 M. Delgado, P. García-Sánchez and J. Morais, NumericalSgps, A package for numerical semigroups, Version 0.980 dev (2013), (GAP package), http://www.fc.up.pt/cmup/mdelgado/numericalsgps/. 
MR3493240
***Not Provided***: MR1655469 M. Schönert et al., "GAP: Groups, Algorithms and Program

***Provided Website***: MR2054400 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3, http://www.gap-system.org, 2002.
***Provided Website***: MR2150939 The GAP Group, GAP—groups, algorithms, and programming, (Version 4.4 3, 2004) (http://www.gap-system.org).
***Provided Website***: MR2118762 The GAP Group, GAP—Groups, Algorithms, and Programming. Version 4.3 (2002) (http://www.gap-system.org).
***Provided Website***: MR2426157 The GAP Group, GAP—Groups, Algorithms, and Programming. Version 4.4 5 (2005) (http://www.gap-system.org).
***Provided Website***: MR3015643 The GAP Group, Gap–groups, algorithms, and programming, version 4.4, (2005), http://www.gap-system.org.
***Not Provided***: MR1785438 M. Schönert et al., GAP: groups, algorithms and programming. Lehrstuhl D für Mathematik, RWTH, Aachen 1992.
***Provided Website***: MR2341997 S. Dolfi, Intersections of odd order Hall subgroups, Bull. London Math. Soc. 37 (2005), 61–66. GAP. The GAP Group, GAP – Groups, Algori

***Provided Website***: MR2151423 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.3 (2002). http://www.gap-system.org
***Provided Website***: MR4082423 E. Aichinger, F. Binder, J. Ecker, P. Mayr and C. Nöbauer, SONATA — system of near-rings and their applications, GAP package, Version 2.8 (2015), http://www.algebra.uni-linz.ac.at/Sonata/.
***Not Provided***: MR4082423 The GAP Group, GAP — Groups, algorithms, and programming, version 4.9.3 (2018).
***Provided Website***: MR3262358 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.5.5, 2012. (http://www.gap-system.org).
***Provided Website***: MR2848965 The GAP Group, GAP-Groups, Algorithms and Programming, Version 4.2, Aachen, St. Andrews, 2000, (http://www-gap.dcs.st-and.ac.uk/$\sim$gap).
***Not Provided***: MR1997750 M. Schönert et al., GAP—Groups, Algorithms, and Programming, fifth edition, Lehrstuhl D für Mathematik, (Rheinisch Westfälische Technische Hochschule, Aachen, Germany, 1995).
***Provided 

***Provided Website***: MR2911879 The GAP group: GAP—Groups, Algorithms, and Programming. Version 4.4 (2004); http://www.gap-system.org
***Provided Website***: MR2927804 The GAP group, GAP–groups, algorithms, and programming, Version 4.4 2004, http://www.gap-system.org.
***Provided Website***: MR3049563 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org, 2005.
***Provided Website***: MR3317762 GAP–Groups, Algorithms, and Programming, Version 4.4, The GAP Group, 2004, http://www.gap-system.org.
***Provided Website***: MR3318256 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4 (2004). http://www.gap-system.org
***Provided Website***: MR3405871 GAP group, Gap – groups, algorithms, programming, version 4.4, http://www.gap-system.org, 2004.
***Provided Website***: MR3626555 The GAP group: GAP—groups, algorithms, and programming. Version 4.4 (2004), http://www.gap-system.org
***Provided Website***: MR3620702 The GAP Group, GAP – 

***Provided Website***: MR2831230 L. H. Soicher, `GRAPE, Graph algorithms using permutation groups, version 4.3', package for GAP, available at http://www.maths.qmul.ac.UK/$\sim$leonard/grape/.
***Provided Website***: MR2643896 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4, 2006; http://www.gap-system.org.
***Provided Website***: MR2419170 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4, 2006; http://www.gap-system.org.
***Provided Website***: MR2595748 The GAP Group, `GAP–groups, algorithms, and programming', http://www.gap-system.org.
***Provided Website***: MR2745634 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4, 2006, http://www.gap-system.org.
***Provided Website***: MR2793220 GAP—Groups, Algorithms, and Programming, Version 4.4.12 http://www.gap-system.org.
***Provided Website***: MR2833508 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4, 2006, (http://www.gap-system.org).
***Provided Websit

***Not Provided***: MR2014018 M. Schönert et al., GAP 3.4 Manual (Groups, Algorithms and Programming), RWTH Aachen, Aachen, Germany, 1994.
***Provided Website***: MR2838908 The GAP Group, GAP—Groups, algorithms, and programming, version 4.4.12, 2008, http://www.gap-system.org.
***Provided Website***: MR3533336 J. Cramwinckel, et al. GUAVA, a GAP package for computing with error-correcting codes, Version 3.12, 2012. http://www.gapsystem.org/Packages/guava.html.
***Provided Website***: MR3764283 The GAP Group, GAP-Groups, algorithms, and programming, Version 4.8.7, 2017, available at http://www.gap-system.org
***Provided Website***: MR3683021 The GAP Group, GAP—groups, algorithms, and programming, version 4.7.6 (2014). http://www.gap-system.org
***Provided Website***: MR3158649 G. Nagy and P. Vojtěchovský, LOOPS; Computing with quasigroups and loops in GAP, http://web.cs.du.edu/petr/loops/.
***Provided Website***: MR3158649 The GAP Group, GAP—Groups, algorithms, and programming, http://w

***Provided Website***: MR3053363 GAP library, www.math.rwth-aachen.de/homes/MOC/decomposition.
***Provided Website***: MR2554934 GAP library, http://www.math.rwth-aachen.de/homes/MOC/decomposition.
***Provided Website***: MR4063323 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.9.2; 2018. (https://www.gap-system.org)
***Provided Website***: MR2823587 The GAP Group, GAP-Groups, Algorithms, and Programming (Version 4.4), 2005, http://www.gap-system.org.
***Provided Website***: MR1884468 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.1, Aachen, St. Andrews http://www.math.rwth-aachen.de/$\sim$GAP/, 1999.
***Provided Website***: MR2124813 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.4 (2004); available at http://www.gap-system.org.
***Provided Website***: MR2802525 The GAP Group, GAP - Groups, Algorithms, and Programming, version 4.4.10, 2007. Available at http://www.gap-system.org
***Provided Website***: MR2087093 The GAP Group

***Provided Website***: MR2232860 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4, 2005 (http://www.gap-system.org).
***Provided Website***: MR2275096 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4; 2005 (http://www.gap-system.org).
***Provided Website***: MR2665772 The GAP Group, GAP—Groups, Algorithms, and Programming, http://www.gap-system.org.
***Provided Website***: MR2494367 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.10; 2007 http://www.gap-system.org.
***Provided Website***: MR2673749 The GAP group, GAP - groups, algorithms, and programming, http://www.gap-system.org.
***Provided Website***: MR2777023 The GAP Group, GAP—Groups, Algorithms, and Programming, http://www.gap-system.org.
***Provided Website***: MR3248732 GAP Group, GAP—Groups, algorithms, programming (2013) Available at http://www.gap-system.org
***Provided Website***: MR4171568 The GAP group, GAP: Groups, algorithms and programming (2018) Version 4.

***Provided Website***: MR3570568 The GAP Group, GAP –Groups, Algorithms, and Programming, Version 4.6.3; 2013. (http://www.gap-system.org).
***Provided Website***: MR2950664 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.12 (2008), http://www.gap-system.org
***Provided Website***: MR3857663 GAP., GAP - Groups, Algorithms, and Programming, Version 4.8.4. The GAP Group, http://www.gap-system.org, 2016.
***Provided Website***: MR2001274 S. Egner and M. Püschel, AREP—Constructive Representation Theory and Fast Signal Transforms, GAP software share package, 1998; available online at http://www.ece.cmu.edu/$\sim$smart/arep/arep.html.
***Provided Website***: MR2001274 The GAP Team, GAP—Groups, Algorithms, and Programming, University of St. Andrews, Scotland, 1997; available online at http://www-gap.dcs.st-and.ac.uk/$\sim$gap/.
***Provided Website***: MR1943043 Egner, S., Püschel, M. (1998). AREP—A Package for Constructive Representation Theory and Fast Signal Transforms,

***Provided Website***: MR3017840 Nagy G., Vojtěchovský P., LOOPS: Computing with quasigroups and loops in GAP, Version 2.0.0, (2008), (http://www.math.du.edu/loops). 
MR2379126
***Provided Website***: MR3298668 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4.12; 2008, http://www.gap-system.org
***Provided Website***: MR3404474 GAP. GAP–Groups, Algorithms, and Programming, Version 4.7.4. The GAP Group, 2014. URL http://www.gap-system.org.
***Provided Website***: MR3100139 GAP group, Algorithms and Programming, Version 4.4.9, 2006, http://www.gap-system.org.
***Provided Website***: MR2228647 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4, http://www.gap-system.org/, 2005.
***Provided Website***: MR2228647 L.H. Soicher, The GRAPE 4.2 Package for GAP 4.4, http://www.maths.qmul.ac.uk/$\sim$leonard/grape/, 2003.
***Provided Website***: MR2652095 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4, 2006, http://www.gap-system.org/.
*

***Not Provided***: MR3976195 Kauffman, L.H.: Knots and Physics. World Scientific, Singapore (2001) 
MR1858113
***Provided Website***: MR3976195 Gap - groups, algorithms, and programming, version 4.10.0. https://www.gap-system.org
***Provided Website***: MR3451663 http://www.gap-system.org/.
***Provided Website***: MR3057352 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numericalsgps": a ${\bold{gap}}$ package on numerical semigroups. http://www.gap-system.org/Packages/numericalsgps.html
***Provided Website***: MR3593640 M. Delgado, P. A. García-Sánchez, J. Morais, "numericalsgps": A gap package on numerical semigroups, (http://www.gap-system.org/Packages/numericalsgps.html).
***Provided Website***: MR3849636 M. Delgado, P. A. García-Sánchez and J. Morais, "Numericalsgps": A gap package on numerical semigroups, http://www.gap-system.org/Packages/numericalsgps.html.
***Not Provided***: MR1658168 M. Schönert (ed.), Gap-3.4, manual, RWTH Aachen, 1994.
***Not Provided***: MR1769294 M. Sc

# Merging the two dataframes with the equivalent of SQL `join`

df = pd.DataFrame(lis_dic)
df = df[df.ID.isin(targets)]
df_filtered = df[df.mrnumber.isin(mrs)]

The `MR` column in `corpus_df` dataframe has the letters "MR" preceeding each number, first we will remove these letters, the the MR number format is the same in both datasets.

In [32]:
corpus_df['MR'] = corpus_df['MR'].str.extract('(\d+)', expand=False)
corpus_df

Unnamed: 0,MR,Citation,Version,Website
0,4056124,"GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.",Unknown,Yes
1,3942387,"Delgado, M., García-Sánchez, P.A., Morais, J.: ""Numerical Sgps"", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. A...",Package,Yes
2,3942387,"The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017",4.7.5,Yes
3,3354065,"The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org.",4.4.12,Yes
4,3646312,"The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org.",4.7.8,Yes
...,...,...,...,...
3537,3988630,"M. Delgado, P. A. García-Sánchez and J. Morais. Numericalsgps: a ${\ssf gap}$ package on numerical semigroups, (http://www.gap-system.org/Packages/numeri...",Package,Yes
3538,1801202,"L.H. Soicher, GRAPE: a system for computing with graphs and groups, in: L. Finkelstein and W.M. Kantor, eds., Groups and Computation, DIMACS Series in Di...",Unknown,Yes
3539,2558870,"L. Bartholdi, Functionally recursive groups, http://www.gap-systems.org/Manuals/pkg/fr/doc/manual.pdf.",Unknown,Yes
3540,2824780,"X. Sun, C. Liu, D. Li and J. Gao, On duality gap in binary quadratic programming, Available from: http://www.optimization-online.org/DB_FILE/2010/01/2512...",Unknown,Yes


In [33]:
corpus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3542 entries, 0 to 3541
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MR        3542 non-null   object
 1   Citation  3542 non-null   object
 2   Version   3542 non-null   object
 3   Website   3542 non-null   object
dtypes: object(4)
memory usage: 55.4+ KB


In [34]:
bib_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3367 entries, 0 to 3366
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   MR                3159 non-null   object
 1   Author            3367 non-null   object
 2   Journal           3047 non-null   object
 3   Year              3363 non-null   object
 4   Publication Type  3367 non-null   object
 5   MSC               3252 non-null   object
dtypes: object(6)
memory usage: 79.0+ KB


* With the following code we are joining the two datasets on the `MR` column and using `corpus_df` as a base. The resulting dataset will have as many lines as `corpus_df` but all columns from `bib_df` will be added, hence we will have much more information to work with.

In [139]:
merged_df = pd.merge(bib_df, corpus_df, on='MR', how='right', indicator=True)

merged_df = merged_df.dropna()
merged_df.info()

* We need to convert the `Year` column cells to integers, in order for the visualisations to work fine. We will use the `.astype` method, but before taht we have to remove all `NaN` entries, from that column.

In [36]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3543 entries, 0 to 3542
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MR                3543 non-null   object  
 1   Author            3536 non-null   object  
 2   Journal           3440 non-null   object  
 3   Year              3531 non-null   object  
 4   Publication Type  3536 non-null   object  
 5   MSC               3536 non-null   object  
 6   Citation          3543 non-null   object  
 7   Version           3543 non-null   object  
 8   Website           3543 non-null   object  
 9   _merge            3543 non-null   category
dtypes: category(1), object(9)
memory usage: 155.8+ KB


In [140]:
merged_df = merged_df.dropna(subset=['Year'])

In [141]:
type(merged_df['Year'][3])

str

In [142]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3531 entries, 0 to 3542
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MR                3531 non-null   object  
 1   Author            3531 non-null   object  
 2   Journal           3440 non-null   object  
 3   Year              3531 non-null   object  
 4   Publication Type  3531 non-null   object  
 5   MSC               3531 non-null   object  
 6   Citation          3531 non-null   object  
 7   Version           3531 non-null   object  
 8   Website           3531 non-null   object  
 9   _merge            3531 non-null   category
dtypes: category(1), object(9)
memory usage: 252.3+ KB


In [143]:
merged_df['Year'] = merged_df['Year'].astype(np.int64)

In [41]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3531 entries, 0 to 3542
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   MR                3531 non-null   object  
 1   Author            3531 non-null   object  
 2   Journal           3440 non-null   object  
 3   Year              3531 non-null   int64   
 4   Publication Type  3531 non-null   object  
 5   MSC               3531 non-null   object  
 6   Citation          3531 non-null   object  
 7   Version           3531 non-null   object  
 8   Website           3531 non-null   object  
 9   _merge            3531 non-null   category
dtypes: category(1), int64(1), object(8)
memory usage: 266.1+ KB


In [144]:
type(merged_df['Year'][3])

numpy.int64

* We can use the following iteration loop to browse the resulting merged dataframe. By borwsing the raw data we can make sure everything is alright and spot any remaining issues or anomalies. In our case there are some remaining special characters, which we will remove as best as we can.

In [43]:
for index, row in merged_df.iterrows():
    print(row['MR'], row['Citation'])

4056124 GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.
3942387 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
3942387 The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017
3354065 The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org.
3646312 The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org.
1864795 M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.)
2287843 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.3; 2002, (http://www.gap-system.org).
2175389 The GAP Group, GAP-Groups, Algorithms, and pro

3272384 John Bamberg, S.P. Glasby, Eric Swartz, AS-configurations and skew-translation generalised quadrangles (including supporting GAP code), arXiv:1405.5063v2.
3272384 GAP—groups, algorithms, and programming, version 4.7.2, http://www.gap-system.org, 2014.
3739185 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.8, 2015.
3795627 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.8.7, http://www.gap-system.org, 2017.
4125850 The GAP group, GAP — Groups, Algorithms, Programming, version 4.4, 2004, http://www.gap-system.org.
4125850 J. Michel, The development version of the CHEVIE package of GAP3, J. Algebra, 435 (2015), 308–336. 
MR3343221
4125850 M. Schönert et al., GAP – Groups, Algorithms, and Programming, sixth edition, Lehrstuhl D für Mathematik, RWTH Aachen, Germany, 1997.
2868897 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.9, 2006, http://www.gap-system.org.
2795737 M. Barakat, The homalg Package—A GAP4 meta-package fo

2646433 The GAP Group, GAP-groups, algorithms and programming, version 4.4; Aachen, St Andrews, 2006 (http://www.gap-system.org).
2646433 L. H. Soicher, The DESIGN package for GAP, Version 1.3, 2006, http://designtheory.org/software/gap\_design/.
2646433 L. H. Soicher, The GRAPE package for GAP, Version 4.3, 2006, http://www.maths.qmul.ac.uk/$\sim$leonard/grape/.
2604639 The Group GAP, GAP—groups, algorithms, and programming, version 4.4, Aachen, St. Andrews, 2006, http://www.gap-system.org.
2604639 L. H. Soicher, The package DESIGN for GAP, Version 1.3, 2006, http://designtheory.org/software/gap\_design/.
2753302 The GAP Group, GAP - groups, algorithms, and programming, version 4.4; Aachen, St. Andrews, 2006, http://www.gap-system.org.
2753302 L. H. Soicher, The DESIGN package for GAP, Version 1.3, 2006, http:// designtheory.org/software/gap\_design/.
2844687 The GAP Group, GAP—Groups, algorithms, and programming, version 4.4, Aachen, St. Andrews, 2006, http://www.gap-system.org
28446

3943860 GAP – Groups, Algorithms, and Programming, Version 4.9.2, 2018, https://www.gap-system.org.
4083408 The GAP Group: GAP–Groups, Algorithms, and Programming, Version 4.8.3, 2016, http://www.gapsystem.org
4184341 GAP Groups, Algorithms, and Programming, Version 4.7.8, The GAP Group, 2015. [Online]. Available: http://www.gap-system.org, 2015.
3271175 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4, 2005. (http://www.gap-system.org).
3449012 The GAP Group: GAP – Groups, Algorithms, and Programming, Version 4.4. (2005). The GAP Group. http://www.gap-system.org
3973105 GAP Group, "GAP—Groups, Algorithms, and Programming, Version 4.8.7." 2017, http://www.gap-system.o.
2434096 The GAP group, `GAP—groups, algorithms, and programming, version 4.4', 2006, package Grape, http://www.gap-system.org
2242478 GAP—Groups, Algorithms and Programming, Version 4.3, 2002.
3879476 The GAP Group.: GAP - Groups, Algorithms, and Programming, Version 4.8.6 (2016). https://www.gap-syste

4090490 GAP—Groups, Algorithms, and Programming. Version 4-10-0. gap-system.org
4187252 The GAP Group, GAP – groups, algorithms, and programming, available at gap-system.org.
2981138 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.3; 2002, http://www.gap-system.org.
3187647 J. Michel, The development version of the CHEVIE package of GAP3, arXiv:1310.7905. 
cf. MR3343221
2397403 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.9; http://www.gap-system.org, 2006.
2535999 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.9; 2006. (http://www.gap-system.org).
3169609 The GAP Group. (2012). GAP–Groups, Algorithms, and Programming, Version 4.5.4. Available at http://www.gap-system.org. Accessed July 16, 2012.
4069973 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.8.8, 2017, https://www.gap-system.org.
2946109 A. Distler and J. D. Mitchell. Smallsemi—a GAP package, version 0.6.4, 2011. http://tinyurl.com/jdmitchel

4019320 The GAP Group (2018). GAP – Groups, Algorithms, and Programming, Version 4.8.10. (http://www.gap-system.org)
4082053 The GAP Group, GAP – Groups, Algorithms, and Programming, version 4.9.3, http://www.gap-system.org, 2018.
3677606 The GAP Group: GAP - Groups, Algorithms, and Programming (2016). http://www.gap-system.org
3993993 The GAP Group, GAP – Groups, Algorithms, and Programming, http://www.gap-system.org, 2016.
1831507 Linton, S., G. Pfeiffer, E. Robertson and N. Ruškuc, "Monoid v2.0", GAP package, 1997; http://www-gap.dcs.st-and.ac.uk/$\sim$gap/
1831507 Pfeiffer, G., "fpmonoid.g v2.0", GAP package, 1997; http://www-gap.dcs.st-and.ac.uk/$\sim$gap/
1831507 Schönert, M., et al., "GAP–Groups, Algorithms, and Programming", Lehrstuhl D für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen, Germany, fifth edition, 1995; http://www-gap.dcs.st-and.ac.uk/$\sim$gap/
1831507 Widi, M. O., "Semigroup functions for GAP v1.25", GAP package, 1994; http://www-gap.dcs.st-and

2737675 The GAP Group: GAP—Groups, Algorithms and Programming, Version 4.4, 2006 (http://www.gap-system.org)
2326329 The GAP Group, gap—Groups, Algorithms, Programming, Version 4.4.7, 2006 (http://www.gap-system.org).
2326329 T. Breuer, Manual for the GAP Character Table Library Version 1.1 (Lehrstuhl D für Mathematik, Rheinisch West-fälische Hochschule, Aachen, 2004).
1656569 M. Schönert et al., "GAP—Groups, Algorithms and Programming," 5th ed., Lehrstuhl D für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen, Germany, 1995.
2028065 The GAP Group, Aachen, St. Andrews, GAP—Groups, Algorithms, and Programming, Version 4.2, http://www-gap.dcs.st-and.ac.uk/$\sim$gap, 2000.
2568350 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4.9, http://www.gap-system.org, 2006.
2610749 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.9, 2006, $<$http://www.gap-system.org$>$
2727434 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.

2478417 The $\ssf{GAP}$ Group, http://www.gap-system.org, $\ssf{GAP}$ – Groups, Algorithms, and Programming, Version 4.4, 2004.
2478417 Izumi Miyamoto, An improvement of $\ssf{GAP}$ normalizer function for permutation groups, Proceedings of the 31st International Symposium on Symbolic and Algebraic Computation held in Genova, July 9–12, 2006 (Jean-Guillaume Dumas, ed.), ACM Press, New York, 2006. 
MR2289125
2139260 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3, 2002 (http://www.gap-system.org).
4068978 The GAP Group, GAP – groups, algorithms, and programming, version 4.6.3, http://www.gap-system.org, 2013.
4081499 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.10.1, https://www.gap-system.org, 2019.
1952439 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.2, Aachen, St Andrews, 2000, (http://www-gap.dcs.st-and.ac.uk/$\sim$gap).
3248801 `GAP–Groups, Algorithms and Programming', www.gap-system.org.
2654520 "The GAP group," in: G

2852247 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.12, http://www.gap-system.org.
2852247 R. Wilson, R. Parker, S. Nickerson, J. Bray, T. Breuer, GAP package AtlasRep—A GAP Interface to the Atlas of Group Representations, Version 1.4.0, http://www.gap-system.org/Packages/atlasrep.html.
3003722 T. Breuer, GAP4-package CTblLib - The GAP Character Table Library, Version 1.1.3, http://www.gap-system.org/Packages/ ctbllib.html, 2004.
3003722 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.12, http://www.gap-system.org, 2008.
3003722 R. Wilson, R. Parker, S. Nickerson, J. Bray, T. Breuer, GAP4-package Atlas Rep - A GAP Interface to the Atlas of Group Representations, Version 1.4.0, http://www.gap-system.org/Packages/atlasrep.html, 2008.
3123776 H. Besche, B. Eick, E. O'Brien, GAP-package The SmallGroups Library, http://www.gap-system.org/Packages/sgl.html, 2002.
3123776 T. Breuer, GAP-package CTblLib—the GAP Character Table Library, Version 1.2.

3257828 The GAP Group, GAP—Groups, Algorithms and Programming, version 4.4.12 (2008), http://www.gap-system.org.
3928514 The GAP Group, GAP–Groups, Algorithms and Programming, version 4.4.12 (2008), http://www.gap-system.org.
1872796 (GAP, 1995): M. Schönert et al.: GAP-Groups, Algorithms and Programming, Lehrstuhl für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen, Germany.
3859967 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.2, http://www.gap-system.org/, 2016.
3184100 GAP—Groups, Algorithms and Programming. Version 4.4.12. 2008. (http://www.gap-system.org).
3476382 The GAP Group (GAP — Groups, Algorithms, and Programming, Version 4.4.12), http://www.gap-system.org.
3849532 The GAP Group, GAP—Groups, Algorithms, Programming—A System for Computational Discrete Algebra, vers. 4.8.7 (2017); http://www.gap-system.org.
3853331 The GAP Group (GAP—Groups, Algorithms, and Programming), Version 4.8.10. http://www.gap-system.org
2847514 Y. Muntyan and

3805718 The GAP Group, GAP-Groups, Algorithms and Programming, Version 4.6.4, 2013, http://www.gap-system.org/.
2945164 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4.12, 2008, http://www.gap-system.org.
2945164 Tomlib, Version 1.2.1, GAP package, 2011, http://schmidt.nuigalway.ie/tomlib.
3065337 GAP - Groups, Algorithms, and Programming, Version 4.4.12, http://www.gap-system.org, 2008.
2221127 Schönert, M., others: GAP - Groups, Algorithms, and Programming, Lehrstuhl D für Mathematik. RWTH Aachen, Germany, 3rd ed., (1993–1997)
2144975 The Gap Group, GAP: Groups, Algorithms, and Programming, Version 4.2, 2000; http://www.gap-system.org
2628806 The GAP group, GAP - groups, algorithms, and programming, Version 4.4, 2004, http://www.gap-system.org.
2520106 The GAP group, GAP - groups, algorithms, and programming, Version 4.4, 2004, http://www.gap-system.org.
3210919 The GAP group, GAP–Groups, Algorithms, and Programming, Version 4.7.4, http://www.gap-system.org, 201

3556125 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.7.8, The GAP Group, 2015, http://www.gap-system.org.
3859427 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.7.8, The GAP Group. http://www.gap-system.org (2015)
2838012 GAP - Groups, Algorithms and Programming, version 4.4.9 (2006), http://www.gap.system.org.
3924435 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.6, http://www.gapsystem.org, 2014.
1951391 The GAP Group, Aachen, St Andrews. GAP—Groups, Algorithms, and Programming, Version 4.2, 2000. http://www-gap.dcs.st-and.ac.uk/$\sim$gap.
4103836 The G.A.P. Group, GAP — Groups, Algorithms, and Programming, Version 4.3, 2002, http://www.gap-system.org.
2192253 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4; 2004, http://www.gap-system.org.
2453960 The GAP Group 2004 GAP—Groups, Algorithms, and Programming, Version 4.4 (http://www.gap-system.org)
2968928 The GAP Group. (2008). GAP - Groups, Algorithms

3592012 The GAP Group, GAP – Groups, Algorithms, and Programming, version 4.7.9, http://www.gap-system.org, 2015.
4039426 G.A.P. The, Group. GAP–groups, algorithms, and programming, software, www.gap-system.org.
4039426 G. Ellis, GAP package HAP–homological algebra programming, software, hamilton.nuigalway.ie/Hap/www.
2188351 M. Schonert et al., GAP: Groups, algorithms, and programming, Lehrstuhl D fur Mathematik, RWTH Aachen, 2002.
2485036 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.4; 2005, (http://www.gap-system.org)
1780211 M. Schönert et al., GAP: Groups, algorithms, and programming, 5th ed., Lehrstuhl D für Mathematik, RWTH Aachen, 1996. See http://www-gap.dcs.st-and.ac.uk/$\sim$gap.
1935039 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.2, 2000, http://www.gap-system.org.
4068900 The GAP Group (2018). GAP – Groups, Algorithms, and Programming, Version 4.10.0. http://www.gap-system.org.
3124212 J. Michel, Homepage of the development versio

1801202 L.H. Soicher, GRAPE: a system for computing with graphs and groups, in: L. Finkelstein and W.M. Kantor, eds., Groups and Computation, DIMACS Series in Discrete Mathematics and Theoretical Computer Science Vol. 11, AMS, (1993) pp. 287–291. GRAPE is available from http://www-gap.dcs.st-and.ac.uk/gap/Share/grape.html. 
MR1235810
2558870 L. Bartholdi, Functionally recursive groups, http://www.gap-systems.org/Manuals/pkg/fr/doc/manual.pdf.
2824780 X. Sun, C. Liu, D. Li and J. Gao, On duality gap in binary quadratic programming, Available from: http://www.optimization-online.org/DB_FILE/2010/01/2512.pdf.
1981371 Schönert M. et al., Groups, Algorithms and Programming (1997), http://www-gap.dcs.st-and.ac.uk/gap.


We use Regex to further purify the `Citation` column, removing some remaining special characters.

In [145]:
merged_df['Citation'] = merged_df['Citation'].str.replace(r'[\\\$\{\}\^]', '')
merged_df['Citation'] = merged_df['Citation'].str.replace(r'(ssf)', '')

  merged_df['Citation'] = merged_df['Citation'].str.replace(r'[\\\$\{\}\^]', '')
  merged_df['Citation'] = merged_df['Citation'].str.replace(r'(ssf)', '')


##### We remove the unnecessary `merge` column and add a `lenght` column to reflect the character lenght of each citation.

In [146]:
merged_df = merged_df.drop(['_merge'], axis=1)
merged_df['Lenght'] = merged_df['Citation'].apply(len)
merged_df = merged_df.dropna()
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3440 entries, 0 to 3542
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   MR                3440 non-null   object
 1   Author            3440 non-null   object
 2   Journal           3440 non-null   object
 3   Year              3440 non-null   int64 
 4   Publication Type  3440 non-null   object
 5   MSC               3440 non-null   object
 6   Citation          3440 non-null   object
 7   Version           3440 non-null   object
 8   Website           3440 non-null   object
 9   Lenght            3440 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 188.1+ KB


### Adding the Accuracy Score column

I have decided to give one accuracy point for:
* providing some kind of version (either GAP version or some sort of package version)
* providing a website (either the official GAP website or a package website)
* Citation longer than 90 characters (because too short citations do not contain enough information)

In [147]:
merged_df['Accuracy Score'] = 0

In [148]:
merged_df['Accuracy Score'] = merged_df['Accuracy Score'].astype(int)

In [149]:
def accuracy_calculator(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    website = series['Website']
    score = series['Accuracy Score']
    dal = series['Lenght']
    
    if version != 'Unknown':
        score += 1
        
    if website != 'No':
        score += 1
        
    if dal >= 90:
        score += 1
        
    return score

In [150]:
merged_df['Accuracy Score'] = merged_df.apply(accuracy_calculator, axis=1)

In [151]:
merged_df['Accuracy Score'].value_counts()

3    2642
2     404
1     369
0      25
Name: Accuracy Score, dtype: int64

### MSC Codes
* Looking at the `MSC` column we will only analysethe primary MSC codes, therefore we will keep the first 5 symbols only.

In [152]:
for index, row in merged_df.iterrows():
    print(row['MSC'])

05C25 (05C20 20F05)
13F20 (05E15 14H50)
13F20 (05E15 14H50)
05C25 (20B30 20E45)
20G40 (05C25)
20F45 (20D60 20F19)
20D60
20F99
20D15
20D60
20B30 (20D60)
20F05 (20F28)
16S50 (16P10 16U70)
20F45 (20F18)
20F45 (20F12)
20D45 (20D15)
20D60 (51E21)
20D45
20D60 (05C25)
05C25 (05C50 20D99)
20D60
20F45 (20F18)
20F45 (20F18)
20D60 (05C25 20D15)
20D15 (20J05 20J06)
20F45 (20F19)
05C25
05C25
20F45
20F45
20C15 (05C25 05C50)
05C25 (05C50 15A18 20D60)
05C25 (05C50 15A18 20D60)
05E30 (05C25 20B25 51E12)
20D45 (20D15)
20F10 (20F16)
20D15 (20J06)
20C07 (16S34)
20C15 (05C25)
20C07 (16S34)
20D15 (20D45)
20D15 (20D45)
20D60 (11B13 11P70 16S34 20C05)
20D15 (20E18 20F05)
05B05 (20D10 20F16)
05B05 (05B10)
20N05
20N05
05B10
05B10
05B10
20G40 (20E45)
20C15
57Q15 (52B70 55N35 55U10 57N10 57N13 57Q05)
14G35 (11F32 11G20 14H45 20B25 20C34)
14H30 (14H10 20B25)
05C25 (05C10)
51E15 (05B25)
51E21 (05B25)
11E39 (94B05)
05B25 (05B15 51E15)
11D07 (05B45 05C90 11D45)
11D07 (05B45 05C90 11D45)
05E18
05E18
05C90 (11D45 11P21

20C05 (16S34 20C08)
20F18
20F18
20D08 (20F05)
20D06 (20D60)
20D06 (20D60)
20D99 (20P05)
52B70 (57Qxx)
68W30 (68N15 68T15)
68Q19 (20B15 68Q45)
68Q19 (20B15 68Q45)
20N20
20N20
51E20 (05B25 51E21)
51E20 (05B25 51E21)
51E20 (51E21)
51E20 (51E21)
51E23
51E23
05B25 (51E12 94B25)
51A45 (05E99 51A50 51E12)
51A45 (05B25)
05Exx (51E12)
51E12 (05B25 51E20)
51E21
51E21 (05B25)
51E12 (05B25 20C15 20D08 51E14 94B27)
51E12 (20C20 51A45 94B05)
20C34 (17Axx 20D08)
17B30 (17B55)
17B30 (17B55)
20G30 (11Y16)
20G30 (11Y16)
17B20 (17B25)
17B20 (17B25)
17B08
17B08
14L30 (17B20 17B40)
14L30 (17B20 17B40)
17B08 (14L24 14L30)
17B08 (14L24 14L30)
16Z05 (13P10 16S20)
16Z05 (13P10 16S20)
05C10 (68R10 90C22 90C25)
90C27 (90C22)
90C22 (16Z05 65K05)
90C22 (16Z05 65K05)
90B22 (90C35)
20P05 (20D60)
20-04 (20D06 20D10)
17B99 (68W30)
17B99 (68W30)
14H10 (14H20 14H30)
14H50 (14H20 14H57)
14H45 (14H20 14H57)
14H50 (14H20 14H30)
14J27 (14H57 14P25 20F36)
14H30 (14H50)
14J28 (14J27 14N25 14P05)
14J28
14J28 (14J27 14N25)
11R2

20M10 (05C25)
20D15 (20D45)
20P05 (20D60)
20C15 (20C30 20C33)
20C15 (20C20 20C30 20C33)
20C15
16Y60 (16Y30)
16U60 (16S34 20C05)
16U60 (16S34 20C10 20G40)
16S34 (16U60 20C05)
20C05 (16U60 20D06)
16S34 (16U60 20D06)
05B05 (05B25 05C65 20B20 20B25 20D08 20F05 51E20)
20B15 (05B05 20B25)
94B25
20D60 (05C25)
17B20 (17B35)
30F10 (05C25 14H55)
14H57 (05E14 30F10)
05C25
20B05 (20D08)
20D60 (11B75)
05E30
20B05 (20H30)
05C25 (20D60)
05B05 (05C51)
15B52 (20-04 20C40)
20D15 (20C20)
11T22 (11Y40 20G05 20H20)
11T22 (11Y40 20G05 20H20)
20B15 (20B40 20C30)
20C33 (20C20)
20C20
20F50 (20D15 20F05)
57M27 (20F36)
05A05 (05D05 05E10 05E30)
20C30 (20C15)
20C15
05B10 (20D60)
05B05 (05B10 51E05)
20M05 (20M30 20M35 68Q70)
52B15 (20D15 20F05 20F65 51M20)
20F36
16S34 (16U60 20C05)
16U60 (20C07)
46L80 (19K14 37B50 52C23)
20E22 (20D15)
20G15 (14L30 17B30 17B45)
14L30 (17B45)
20G15
20G15
20G40 (20-04 20E45)
17B37 (17B08 17B25)
20G40 (20E45)
20G15 (20G40)
20C15 (20D06 20D20)
20C33 (20C15)
20G40 (20E45)
57Q15
57Q15
20

16U60 (05E10 16S34 20C05 20C20)
16U60 (05E10 16S34 20C05 20C20)
16U60 (05E10 16S34 20C05 20C20)
16U60 (16S34 20C05 20C10)
16U60 (16S34 20C05 20C10)
16S35 (20C05)
16S35 (16S34 20C05)
20F55 (20C15)
20F55 (20C15)
20C15 (20C33)
16S34 (16S99)
20E45 (20G40)
20B30 (20D60)
20P05 (20B25 20B30 20D06)
20D05
20C15 (20D10)
11F55 (13A50 13H10 13P10)
11F70
05E10 (06A07 16S99 82B20)
20C15 (11F80 20C33)
20C20
20C15 (20D05)
11F80 (11Y40 14G10 14G15 14G20 14H40 14Q05)
20D60 (05C25)
20F65 (20J05)
20E07 (20D10)
15A69 (17B01 20F45)
37G40 (20C40 37M20)
20M14 (13A18 14H20)
16Y30 (08A30 20D30)
57M60 (20B25 20H10)
11R29 (11R11 11R37)
11R29 (11R11 11R37)
11R29 (11R11 11R37)
11R29 (11R11 11R37)
11R29 (11R11 11R16 11Y40 13C20)
11R29 (11R11 11R16 11Y40 13C20)
11R29 (11R11 11R16 11Y40 13C20)
11R16 (11R20 11R27 11R29 11Y40)
16Y30
08A40 (20D10)
08A40 (20D10)
20F50
20D06 (20D60)
05C70 (05C25 05C38)
20G40
20G40
20G40 (20E45)
20G40 (20E45)
20C08
20C08
20C08
20C30 (20C08)
20C30 (20C08)
20C30 (05E10 20F55)
05B05 (51E14)
94

20F99


In [209]:
merged_df['MSC Primary'] = [x[:5] for x in merged_df['MSC']]
merged_df['MSC Secondary'] = [x[5:] for x in merged_df['MSC']]

In [210]:
merged_df['MSC Primary'].head()

0    05C25
1    13F20
2    13F20
3    05C25
4    20G40
Name: MSC Primary, dtype: object

In [211]:
merged_df['MSC Secondary'].head()

0     (05C20 20F05)
1     (05E15 14H50)
2     (05E15 14H50)
3     (20B30 20E45)
4           (05C25)
Name: MSC Secondary, dtype: object

* Now we need the science area names corresponding to each MSC code.

* First we create the dictionary, using this PDF https://mathscinet.ams.org/msnhtml/msc2020.pdf

In [197]:
msc_text = """00 General and overarching topics; collections
01 History and biography
03 Mathematical logic and foundations
05 Combinatorics
06 Order, lattices, ordered algebraic structures
08 General algebraic systems
11 Number theory
12 Field theory and polynomials
13 Commutative algebra
14 Algebraic geometry
15 Linear and multilinear algebra; matrix theory
16 Associative rings and algebras
17 Nonassociative rings and algebras
18 Category theory; homological algebra
19 K-theory
20 Group theory and generalizations
22 Topological groups, Lie groups
26 Real functions
28 Measure and integration
30 Functions of a complex variable
31 Potential theory
32 Several complex variables and analytic spaces
33 Special functions
34 Ordinary differential equations
35 Partial differential equations
37 Dynamical systems and ergodic theory
39 Difference and functional equations
40 Sequences, series, summability
41 Approximations and expansions
42 Harmonic analysis on Euclidean spaces
43 Abstract harmonic analysis
44 Integral transforms, operational calculus
45 Integral equations
46 Functional analysis
47 Operator theory
49 Calculus of variations and optimal control; optimization
51 Geometry
52 Convex and discrete geometry
53 Differential geometry
54 General topology
55 Algebraic topology
57 Manifolds and cell complexes
58 Global analysis, analysis on manifolds
60 Probability theory and stochastic processes
62 Statistics
65 Numerical analysis
68 Computer science
70 Mechanics of particles and systems
74 Mechanics of deformable solids
76 Fluid mechanics
78 Optics, electromagnetic theory
80 Classical thermodynamics, heat transfer
81 Quantum theory
82 Statistical mechanics, structure of matter
83 Relativity and gravitational theory
85 Astronomy and astrophysics
86 Geophysics
90 Operations research, mathematical programming
91 Game theory, economics, social and behavioral sciences
92 Biology and other natural sciences
93 Systems theory; control
94 Information and communication, circuits
97 Mathematics education"""

In [198]:
msc_dict = {l[:2]: l[2:] for l in msc_text.splitlines()}

In [187]:
msc_dict

{'00': ' General and overarching topics; collections',
 '01': ' History and biography',
 '03': ' Mathematical logic and foundations',
 '05': ' Combinatorics',
 '06': ' Order, lattices, ordered algebraic structures',
 '08': ' General algebraic systems',
 '11': ' Number theory',
 '12': ' Field theory and polynomials',
 '13': ' Commutative algebra',
 '14': ' Algebraic geometry',
 '15': ' Linear and multilinear algebra; matrix theory',
 '16': ' Associative rings and algebras',
 '17': ' Nonassociative rings and algebras',
 '18': ' Category theory; homological algebra',
 '19': ' K-theory',
 '20': ' Group theory and generalizations',
 '22': ' Topological groups, Lie groups',
 '26': ' Real functions',
 '28': ' Measure and integration',
 '30': ' Functions of a complex variable',
 '31': ' Potential theory',
 '32': ' Several complex variables and analytic spaces',
 '33': ' Special functions',
 '34': ' Ordinary differential equations',
 '35': ' Partial differential equations',
 '37': ' Dynamical s

* Secondary

In [221]:
merged_df["MSC Secondary"] = (
    merged_df["MSC Secondary"]
    .str.extractall(r"[ \(](\w{2})")[0]
    .map(msc_dict)
    .groupby(level=0).agg(list)
)

In [222]:
type(merged_df["MSC Secondary"][0])

list

In [230]:
merged_df["MSC Sec"] = merged_df["MSC Secondary"].astype(str)
type(merged_df["MSC Sec"][3])

str

* Primary

In [216]:
#merged_df['MSC Primary'] = [x[:2] for x in merged_df['MSC Primary']]
merged_df["MSC Primary"] = merged_df["MSC Primary"].map(msc_dict)
merged_df["MSC Primary"].head()

0                        Combinatorics
1                  Commutative algebra
2                  Commutative algebra
3                        Combinatorics
4     Group theory and generalizations
Name: MSC Primary, dtype: object

In [236]:
merged_df.head(35)

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC,Citation,Version,Website,Lenght,Accuracy Score,MSC Primary,MSC Secondary,MSC Sec
0,4056124,"Abas, M. and Vetrík, T.",Theoret. Comput. Sci.,2020,article,05C25 (05C20 20F05),"GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.",Unknown,Yes,104,2,Combinatorics,"[ Combinatorics, Group theory and generalizations]","[' Combinatorics', ' Group theory and generalizations']"
1,3942387,"Abbas, A. and Assi, A. and García-Sánchez, P. A.",Rev. R. Acad. Cienc. Exactas Fís. Nat. Ser. A Mat. RACSAM,2019,article,13F20 (05E15 14H50),"Delgado, M., García-Sánchez, P.A., Morais, J.: ""Numerical Sgps"", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. A...",Package,Yes,183,3,Commutative algebra,"[ Combinatorics, Algebraic geometry]","[' Combinatorics', ' Algebraic geometry']"
2,3942387,"Abbas, A. and Assi, A. and García-Sánchez, P. A.",Rev. R. Acad. Cienc. Exactas Fís. Nat. Ser. A Mat. RACSAM,2019,article,13F20 (05E15 14H50),"The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017",4.7.5,Yes,125,3,Commutative algebra,"[ Combinatorics, Algebraic geometry]","[' Combinatorics', ' Algebraic geometry']"
3,3354065,"Abdolghafourian, A. and Iranmanesh, M. A.",Comm. Algebra,2015,article,05C25 (20B30 20E45),"The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org.",4.4.12,Yes,87,2,Combinatorics,"[ Group theory and generalizations, Group theory and generalizations]","[' Group theory and generalizations', ' Group theory and generalizations']"
4,3646312,"Abdolghafourian, A. and Iranmanesh, M. A. and Niemeyer, A. C.",J. Pure Appl. Algebra,2017,article,20G40 (05C25),"The GAP Group, GAP–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org.",4.7.8,Yes,95,3,Group theory and generalizations,[ Combinatorics],[' Combinatorics']
5,1864795,"Abdollahi, A.",Houston J. Math.,2001,article,20F45 (20D60 20F19),"M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fi...",Unknown,No,168,1,Group theory and generalizations,"[ Group theory and generalizations, Group theory and generalizations]","[' Group theory and generalizations', ' Group theory and generalizations']"
6,2287843,"Abdollahi, A. and Jafarian Amiri, S. M. and Hassanabadi, A. M.",Houston J. Math.,2007,article,20D60,"The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.3; 2002, (http://www.gap-system.org).",4.3,Yes,105,3,Group theory and generalizations,,
7,2175389,"Abdollahi, A. and Ataei, M. J. and Jafarian Amiri, S. M. and Hassanabadi, A. M.",Comm. Algebra,2005,article,20F99,"The GAP Group, GAP-Groups, Algorithms, and programming, Version 4.3; 2002, (http://www.gap-system.org).",4.3,Yes,103,3,Group theory and generalizations,,
8,2149067,"Abdollahi, A. and Hassanabadi, A. M.",Comm. Algebra,2005,article,20D15,"The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.3; 2002, (http://www.gap-system.org).",4.3,Yes,105,3,Group theory and generalizations,,
9,2293309,"Abdollahi, A. and Jafarian Amiri, S. M.",J. Pure Appl. Algebra,2007,article,20D60,"The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.3, 2002, http://www.gap-system.org.",4.3,Yes,101,3,Group theory and generalizations,,


In [None]:
def rem_dup(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    msc_sec = series["MSC Secondary"]
    msc_sec = pd.unique(msc_sec).tolist()
    if re.search("www|\.net|http", citation, re.IGNORECASE) != None:
        print('***Provided Website***:', mrno, citation)
        return 'Yes'
    else:
        print('***Not Provided***:', mrno, citation)
        return 'No'
my_list = [1,2,2,3,1,4,5,1,2,6]
myFinalList = pd.unique(my_list).tolist()
print(myFinalList)



In [240]:
for index in merged_df.iterrows():
    merged_df["MSC Secondary"] = pd.unique(merged_df["MSC Secondary"]).tolist()

TypeError: unhashable type: 'list'

In [235]:
msc_uni = merged_df['MSC Primary'].unique()
second = merged_df['MSC Sec'].str.unique()
second

AttributeError: 'StringMethods' object has no attribute 'unique'

### Now we split the extended dataset in two dataframes for further analysis

### Pure GAP citations  

In [None]:
gap_df = merged_df[merged_df['Version'] != 'Package']
gap_df = gap_df.dropna()
gap_df.info()

We add two more columns that we will need later in the analysis `ReleaseYear` and `Delay`

In [None]:
release_dates = { 
    # dates from archive timestamps
    '4.8.2': 2016, # 2016/02/20
    '4.7.9': 2015, # 2015/11/29
    '4.7.8': 2015, # 2015/06/09
    '4.7.7': 2015, # 2015/02/13
    '4.7.6': 2014, # 2014/11/15
    '4.7.5': 2014, # 2014/05/24
    '4.7.4': 2014, # 2014/02/20
    '4.7.3': 2013, # 2014/02/15
    '4.7.2': 2013, # 2013/12/01
    '4.7': 2013,
    '4.6.5': 2013, # 2013/07/20
    '4.6.4': 2013, # 2013/05/04
    '4.6.3': 2013, # 2013/03/18
    '4.6.2': 2013, # 2013/02/02
    '4.5.7': 2012, # 2012/12/14
    '4.5.6': 2012, # 2012/09/16
    '4.5.5': 2012, # 2012/07/16
    '4.5.4': 2012, # 2013/06/04
    '4.5': 2012, 
    # dates below from file creation
    '4.4.12': 2008, # 2008/12/16
    '4.4.11': 2008, # 2008/12/08
    '4.4.10': 2007, # 2007/10/05
    '4.4.9': 2006,  # 2006/11/02
    '4.4.8': 2006,  # 2006/09/29
    '4.4.7': 2006,  # 2006/03/17
    '4.4.6': 2005,  # 2005/09/02
    '4.4.5': 2005,  # 2005/05/13
    '4.4.4': 2004,  # 2004/12/22
    # dates below from http://www.gap-system.org/Download/Updates/index.html
    '4.4.3': 2004,   # May 2004
    '4.4.2': 2004,  # April 2004
    # dates from http://www.gap-system.org/Doc/History/history.html
    # if not stated otherwise
    '4.4': 2004, 
    '4.3': 2002, 
    '4.2': 2000, # http://www.gap-system.org/ForumArchive/Linton.1/Steve.1/Release_.3/1.html
    '4.1': 1999, 
    '3.4': 1997, # assume http://www.gap-system.org/Doc/History/preface_3.4.4.html
}

In [None]:
def release_year(version):
    if version in release_dates.keys():
        return release_dates[version]
    else:
        return 'Unknown'

In [None]:
release_year('3.4')

In [None]:
gap_df['ReleaseYear'] = gap_df['Version'].map(release_year)

* Delai column we will use later to analyse the difference between publication year and the year of GAP release cited by this publication.

In [None]:
gap_df['Delay'] = 0

In [None]:
def set_delay(series):
    rel_year = series['ReleaseYear']
    year = series['Year']
    delay = series['Delay']
    if rel_year != 'Unknown':
        #print('***Package***:')
        delay = year - rel_year
    return delay

In [None]:
gap_df['Delay'] = gap_df.apply(set_delay, axis=1)

In [None]:
gap_df.info()

### GAP Packages Citations

In [None]:
pac_df = merged_df[merged_df['Version'] == 'Package']
pac_df = pac_df.dropna()
pac_df.info()

In [None]:
sma = gap_df[gap_df['Lenght'] < 90]
big = gap_df[gap_df['Lenght'] > 90]
print(len(sma))
print(len(big))

In [None]:
get_c('3092787')

In [None]:
merged_df.loc[354]

Exporting the pre-processed data to `CSV` files to be picked up by the final *Data Visualisations and Analysis* notebook.

In [None]:
merged_df.to_csv('full.csv', index=False, encoding='utf-8')
gap_df.to_csv('gap.csv', index=False, encoding='utf-8')
pac_df.to_csv('pac.csv', index=False, encoding='utf-8')

In [137]:
pip install pandasgui

Collecting pandasgui
  Downloading pandasgui-0.2.12.tar.gz (208 kB)
Collecting PyQt5
  Downloading PyQt5-5.15.4-cp36.cp37.cp38.cp39-none-win32.whl (5.4 MB)
Collecting PyQt5-sip
  Downloading PyQt5_sip-12.9.0-cp38-cp38-win32.whl (51 kB)
Collecting PyQtWebEngine
  Downloading PyQtWebEngine-5.15.4-cp36.cp37.cp38.cp39-none-win32.whl (157 kB)
Collecting wordcloud
  Downloading wordcloud-1.8.1-cp38-cp38-win32.whl (145 kB)



  ERROR: Command errored out with exit status 1:
   command: 'c:\users\fliqp_000\appdata\local\programs\python\python38-32\python.exe' 'C:\Users\fliqp_000\AppData\Local\Temp\pip-standalone-pip-gfzdgvzc\__env_pip__.zip\pip' install --ignore-installed --no-user --prefix 'C:\Users\fliqp_000\AppData\Local\Temp\pip-build-env-247j7ykb\overlay' --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- 'cython >= 0.29' 'numpy==1.16.6; python_version<'"'"'3.9'"'"'' 'numpy==1.19.4; python_version>='"'"'3.9'"'"'' setuptools setuptools_scm wheel
       cwd: None
  Complete output (296 lines):
  Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment
  Collecting cython>=0.29
    Downloading Cython-0.29.23-cp38-cp38-win32.whl (1.6 MB)
  Collecting numpy==1.16.6
    Downloading numpy-1.16.6.zip (5.1 MB)
  Collecting setuptools
    Downloading setuptools-57.1.0-py3-none-any.whl (818 kB)
  Collecting setuptools_scm
    Downloading setuptools

Collecting pynput
  Downloading pynput-1.7.3-py2.py3-none-any.whl (99 kB)
Collecting pyarrow
  Downloading pyarrow-4.0.1.tar.gz (711 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-4.0.0.tar.gz (710 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-3.0.0.tar.gz (682 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-2.0.0.tar.gz (58.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-1.0.1.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-1.0.0.tar.gz (1.2 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'
  Downloading pyarrow-0.

      lapack_opt_info:
      lapack_mkl_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries mkl_rt not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\libs']
        NOT AVAILABLE
  
      openblas_lapack_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries openblas not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\libs']
        NOT AVAILABLE
  
      openblas_clapack_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distuti

  Complete output (296 lines):
  Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment
  Collecting cython>=0.29
    Using cached Cython-0.29.23-cp38-cp38-win32.whl (1.6 MB)
  Collecting numpy==1.16.6
    Using cached numpy-1.16.6.zip (5.1 MB)
  Collecting setuptools
    Using cached setuptools-57.1.0-py3-none-any.whl (818 kB)
  Collecting setuptools_scm
    Using cached setuptools_scm-6.0.1-py3-none-any.whl (27 kB)
  Collecting wheel
    Using cached wheel-0.36.2-py2.py3-none-any.whl (35 kB)
  Using legacy 'setup.py install' for numpy, since package 'wheel' is not installed.
  Installing collected packages: setuptools, wheel, setuptools-scm, numpy, cython
      Running setup.py install for numpy: started
      Running setup.py install for numpy: finished with status 'error'
      ERROR: Command errored out with exit status 1:
       command: 'c:\users\fliqp_000\appdata\local\programs\python\python38-32\python.exe' -u -c 'import io, os, sys, setuptools, tokeniz

      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\lib
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries tatlas,tatlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\lib
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in C:\
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries tatlas,tatlas not found in C:\
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in c:\users\fliqp_

  
      atlas_blas_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries f77blas,cblas,atlas not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\libs']
        NOT AVAILABLE
  
      accelerate_info:
        NOT AVAILABLE
  
          Atlas (http://math-atlas.sourceforge.net/) libraries not found.
          Directories to search for the libraries can be specified in the
          numpy/distutils/site.cfg file (section [atlas]) or by setting
          the ATLAS environment variable.
        self.calc_info()
      blas_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries blas not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\f

        NOT AVAILABLE
  
      'svnversion' is not recognized as an internal or external command,
      operable program or batch file.
      non-existing path in 'numpy\\distutils': 'site.cfg'
      lapack_opt_info:
      lapack_mkl_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries mkl_rt not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\libs']
        NOT AVAILABLE
  
      openblas_lapack_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries openblas not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\us

      Running setup.py install for numpy: finished with status 'error'
      ERROR: Command errored out with exit status 1:
       command: 'c:\users\fliqp_000\appdata\local\programs\python\python38-32\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\fliqp_000\\AppData\\Local\\Temp\\pip-install-pl970w09\\numpy_3c36023c9235410cb161ef8d0c897bb2\\setup.py'"'"'; __file__='"'"'C:\\Users\\fliqp_000\\AppData\\Local\\Temp\\pip-install-pl970w09\\numpy_3c36023c9235410cb161ef8d0c897bb2\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' install --record 'C:\Users\fliqp_000\AppData\Local\Temp\pip-record-n4bk1y43\install-record.txt' --single-version-externally-managed --prefix 'C:\Users\fliqp_000\AppData\Local\Temp\pip-build-env-6wn7222r\o

        libraries lapack_atlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\lib
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries ptf77blas,ptcblas,atlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\lib
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in C:\
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries ptf77blas,ptcblas,atlas not found in C:\
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\libs
      No module named 'numpy.distutils._msvccompiler' 

  
      openblas_clapack_info:
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries openblas,lapack not found in ['c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\lib', 'C:\\', 'c:\\users\\fliqp_000\\appdata\\local\\programs\\python\\python38-32\\libs']
        NOT AVAILABLE
  
      atlas_3_10_threads_info:
      Setting PTATLAS=ATLAS
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries lapack_atlas not found in c:\users\fliqp_000\appdata\local\programs\python\python38-32\lib
      No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
      customize MSVCCompiler
        libraries tatlas,tatlas not found in c:\users\fliqp_000\ap

  copying pyarrow\include\arrow\util\compression_zlib.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\compression_zstd.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\config.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\cpu_info.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\decimal.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\formatting.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\functional.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\hash_util.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\hashing.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  copying pyarrow\include\arrow\util\int_util.h -> build\lib.win32-3.8\pyarrow\include\arrow\util
  c