# Module 2
# Data preparation and cleaning
We will start by loading all the librarieswe will need.

In [1]:
import sys
import time
import bibtexparser
import itertools
import requests
import re
import matplotlib as plt
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

In [2]:
bibtex_file = open('gap-publishednicer.bib.txt', encoding='utf-8')
bib_data = bibtexparser.load(bibtex_file)
bib = bib_data.entries # we prepare the GAP Bibliography file, ready to be loaded as Pandas DataFrame.

### Here are the 3 datasets we will start with.

In [3]:
bib_df = pd.DataFrame.from_dict(bib) # large one from the Bibliography
review_df = pd.read_csv('no_citation_text.csv', dtype='str') # MR numbers who came back with no `GAP` text found inside
corpus_df = pd.read_csv('gap_citations_corpus.csv', dtype='str') # CItations scraped from MathSciNet website

# Larger dataset from Bibliography
We will start by filtering the data, let us look at all the columns at our disposal.

In [4]:
bib_df.columns

Index(['printedkey', 'doi', 'url', 'mrreviewer', 'mrnumber', 'mrclass', 'issn',
       'fjournal', 'pages', 'year', 'volume', 'journal', 'title', 'author',
       'ENTRYTYPE', 'ID', 'number', 'school', 'booktitle', 'isbn', 'note',
       'publisher', 'day', 'keywords', 'month', 'series', 'annote', 'type',
       'address', 'institution', 'howpublished', 'editor', 'bookeditor',
       'edition', 'key', 'organization'],
      dtype='object')

We only need some of these columns, hence we `drop` the rest.

In [5]:
bib_df.drop(bib_df.columns[[0, 1, 2, 3, 6, 7, 8, 10, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]], axis = 1, inplace = True)

In [6]:
bib_df.columns

Index(['mrnumber', 'mrclass', 'year', 'journal', 'author', 'ENTRYTYPE'], dtype='object')

We reorder the columns. Then we format the names accordingly.  We change the `mrnumber` coulmn name to `MR` so we cane later merge this dataframe with the other one.

In [7]:
bib_df = bib_df[['mrnumber', 'author', 'journal', 'year', 'ENTRYTYPE', 'mrclass']]
bib_df.columns = ['MR', 'Author', 'Journal', 'Year', 'Publication Type', 'MSC']
bib_df

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
0,4056124,"Abas, M. and Vetrík, T.",Theoret. Comput. Sci.,2020,article,05C25 (05C20 20F05)
1,3942387,"Abbas, A. and Assi, A. and García-Sánchez, P. A.",Rev. R. Acad. Cienc. Exactas Fís. Nat. Ser. A ...,2019,article,13F20 (05E15 14H50)
2,,"Abdeljaouad, I.",RAIRO-INF THEOR APPL,1999,article,
3,3354065,"Abdolghafourian, A. and Iranmanesh, M. A.",Comm. Algebra,2015,article,05C25 (20B30 20E45)
4,3646312,"Abdolghafourian, A. and Iranmanesh, M. A. and ...",J. Pure Appl. Algebra,2017,article,20G40 (05C25)
...,...,...,...,...,...,...
3362,2647300,"Zusmanovich, P.",J. Geom. Phys.,2010,article,17B60
3363,2735394,"Zusmanovich, P.",J. Algebra,2010,article,17B40
3364,3201064,"Zusmanovich, P.",J. Algebra,2014,article,17B40
3365,3598575,"Zusmanovich, P.",Linear Algebra Appl.,2017,article,17C10 (17-08 17A30 17C55)


We can inspect Data types and count of non-null values for each column.

In [8]:
bib_df.info(show_counts  = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3367 entries, 0 to 3366
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   MR                3159 non-null   object
 1   Author            3367 non-null   object
 2   Journal           3047 non-null   object
 3   Year              3367 non-null   object
 4   Publication Type  3367 non-null   object
 5   MSC               3252 non-null   object
dtypes: object(6)
memory usage: 79.0+ KB


Looking at a single entry from the `MRN` column, some cells contain `NaN`

In [9]:
bib_df.iloc[3274]

MR                            NaN
Author                 Wegner, A.
Journal                       NaN
Year                         1989
Publication Type    mastersthesis
MSC                        Thesis
Name: 3274, dtype: object

* this method is used usually for numerical columns but we can try it to get an overview of our data

In [10]:
bib_df.describe() 

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
count,3159,3367,3047,3367,3367,3252
unique,3158,2511,384,43,10,2268
top,3656296,"Sambale, B.",J. Algebra,2017,article,Thesis
freq,2,18,387,188,2976,99


* this gives us an overview of a column, displaying top 5 most frequent values and the 5 least frequent, with their counts

In [11]:
bib_df['MSC'].value_counts() 

Thesis                       99
20C15                        36
20C20                        33
20N05                        30
20D15                        22
                             ..
57M25 (20H25 22E40)           1
20C30 (05E05 05E10)           1
20J05 (20-04 20E22 20F05)     1
20D10 (20D20 20D35)           1
20D60 (20F05)                 1
Name: MSC, Length: 2268, dtype: int64

### We will process the `year` column.  There are several anomalies and we need just 4 digits in each cell.

In [12]:
bib_df.sort_values('Year', ascending=False)

Unnamed: 0,MR,Author,Journal,Year,Publication Type,MSC
3165,3973299,"Then, H.",,[2019] \copyright 2019,incollection,11F12 (11R06)
1546,3898507,"Greer, M.",,[2019] \copyright 2019,incollection,20N05
3133,3898514,"Stuhl, I. and Vojtěchovský, P.",,[2019] \copyright 2019,incollection,20N05 (57M27)
3147,3782458,"Swinarski, D.",,[2018] \copyright 2018,incollection,30F20 (14H37 14H45 14Q05)
1981,4167659,"Kaushik, R. and Yadav, M. K.",J. Algebra,2021,article,20D15 (20F12)
...,...,...,...,...,...,...
3274,,"Wegner, A.",,1989,mastersthesis,Thesis
2689,,"Niemeyer, A.",,1988,mastersthesis,Thesis
2673,,"Nickel, W.",,1988,mastersthesis,Thesis
3018,,"Schönert, M.",,1987,mastersthesis,Thesis


We will use `.str` and a regular expresion `(r'^(\d{4})'` which first converts all year cells to strings then takes the first 4 digits from each. We then replace the old values with the filtered ones.

In [13]:
bib_df['Year'] = bib_df['Year'].str.extract(r'^(\d{4})', expand=False)

In [14]:
bib_df['Year'].value_counts() # to inspect the results

2017    188
2013    175
2018    168
2020    166
2019    165
2010    163
2015    162
2016    158
2014    154
2011    152
2012    142
2007    142
2008    132
2004    131
2005    128
2009    124
2006    118
2001    107
2003    101
2002     84
1999     84
2000     78
1997     76
1998     58
1995     56
2021     39
1996     34
1994     28
1993     25
1992     13
1991      5
1987      2
1988      2
1989      2
1990      1
Name: Year, dtype: int64

We inspect the result and no more anomalies are visible. Data type is integer which is exactly what we need for futher operations.

#  Now we will focus on the other two input files, produced from Module 1 - the Web-scraping tool.
We already loaded them at the beginning of the notbook. We will only work with the main data - `corpus_df`. <br>
The other file `no_citation_text.csv` containing few anomalies we will not handle in this project, in real-life scenario each entry there will be manually investigated by staff who work in the GAP Group, or whichever is the institution or company we are working with.

In [15]:
corpus_df

Unnamed: 0,MR,Citation
0,MR4056124,"GAP – Groups, algorithms, programming - a syst..."
1,MR3942387,"Delgado, M., García-Sánchez, P.A., Morais, J.:..."
2,MR3942387,"The GAP Group: GAP—groups, algorithms, and pro..."
3,MR3354065,"The GAP – Groups, Algorithms and Programming. ..."
4,MR3646312,"The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Alg..."
...,...,...
3537,MR3988630,"M. Delgado, P. A. García-Sánchez and J. Morais..."
3538,MR1801202,"L.H. Soicher, GRAPE: a system for computing wi..."
3539,MR2558870,"L. Bartholdi, Functionally recursive groups, h..."
3540,MR2824780,"X. Sun, C. Liu, D. Li and J. Gao, On duality g..."


In [16]:
pd.options.display.max_colwidth = 157 # increasing column width for better readability

We start by defining two functions, to help us browse the data by MR number. The base for the functions was borrowed from the second year Python course CS2006 by Dr Konovalov, but they were modifed to better fir this project. The first function displays just Citation text and Version. The second function displays the whole row for given MR number.

In [17]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf

def get_citation(mrno):
    r = corpus_df[corpus_df['MR'] == mrno]
    return r.at[r.index[0],'Citation'], r.at[r.index[0],'Version']

In [18]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf
# slightly modified so it can return all citations with the specified MRN
# on the other hand the result is a dataframe and if we want to read the full citation text we need to ise `.iloc` with the index
def get_c(mrno):
    r = corpus_df[corpus_df['MR'] == mrno]
    return r

# Version
Version is a very important feature and we need to have it in a separate column. We will achieve this by parsing each citation cell with a Regex and extracting the version, where provided.
* First we create the version column.

In [19]:
corpus_df.insert(loc=2, column='Version', value=' ')

This is the function that we will use to parse each citation and extract the version.<br>
It is based on the lectures from CS2006 by Dr Konovlov.<br>
 It also prints the outputs, which was used while testing and modifying the function until it worked fine for our purposes. 

In [20]:
# Python lectures by Dr Konovalov 
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf

unknown_ver = []

def get_version(s):
    match = re.search("(?:(\d+\.(?:\d+\.)*\d+))", s, re.IGNORECASE)
    if match != None:
        return match.group(1)
        print('* VERSION FOUND *')
    else:
        print('* No VERSION found *', s)
        unknown_ver.append(s)
        return 'Unknown'

We appl it to the `Citation` column.

In [21]:
corpus_df['Version'] = corpus_df['Citation'].map(get_version)

* No VERSION found * GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org.
* No VERSION found * Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
* No VERSION found * M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.)
* No VERSION found * W. Nickel, NQ, 1998, A refereed GAP 4 package, see [10].
* No VERSION found * W. Nickel, NQ, 1998, A refereed GAP 4 package, see [8].
* No VERSION found * Gamble, G., Nickel, W., O'Brien, E.A.: ANU p-Quotient–p-Quotient and p-Group Generation Algorithms (2006). An accepted GAP 4 package, available also in MAGMA
* No VERSION found * M. Schönert et al, GAP: groups, algorithm and programming, © 1992 by Lehrstuhl D für Mathematik, distributed with the GAP so

* No VERSION found * U. Jezernik and P. Moravec. "GAP Code for Computing Bogomolov Multipliers of Finite Solvable Groups." Available online (http://www.fmf.uni-lj.si/$\sim$moravec/Papers/computing-bog.g), 2014. 
MR3314282
* No VERSION found * The GAP Group, `GAP–groups, algorithms, and programming', http://www.gap-system.org.
* No VERSION found * J.-G. Dumas, F. Heckenbach, B. D. Saunders, and V. Welker, Simplicial Homology, A Share Package for GAP, manual, 2000.
* No VERSION found * J.-G. Dumas, F. Heckenbach, B. D. Saunders, and V. Welker, Simplicial Homology, a share package for GAP, manual, 2000.
* No VERSION found * The GAP Group, 1986. The GAP package. See http://www.gap-system.org/.
* No VERSION found * Joyner, D.: MAGMA code: toric.mag code, http://cadigweb.ew.usna.edu/~wdj/papers/toric.mag GAP code: toric.g code, http://cadigweb.ew.usna.edu/~wdj/papers/toric.g
* No VERSION found * GAP user manual. https://www.gap-system.org/Manuals/doc/ref/chap39.html.
* No VERSION found * M. 

* No VERSION found * F. Lutz. Gap-program BISTELLAR. Available at http://www.math.tu-berlin.de/diskregeom/stellar/BISTELLAR, 2000.
* No VERSION found * Gap: Gap system for computational discrete algebra, http://www.gap-system.org.
* No VERSION found * http://www-gap.dcs.st-and.ac.uk/gap.
* No VERSION found * http://www.gap-system.org/.
* No VERSION found * http://www.gap-system.org/.
* No VERSION found * M. Delgado, P. A. García-Sánchez and J. Morais, "Numericalsgps": A gap package on numerical semigroups, http://www.gap-system.org/Packages/numericalsgps.html.
* No VERSION found * M. Delgado, P.A. García-Sánchez, J. Morais, "NumericalSgps": A gap package on numerical semigroups, http://www.gap-system.org/Packages/numericalsgps.html.
* No VERSION found * Kauffman, L.H.: Knots and Physics. World Scientific, Singapore (2001) 
MR1858113
* No VERSION found * http://www.gap-system.org/.
* No VERSION found * Delgado, M., García-Sánchez, P.A., Morais, J.: "Numericalsgps": a ${\bold{gap}}$ pack

In [22]:
corpus_df['Version'].value_counts() # to inspect results

Unknown    895
4.4        460
4.4.12     310
4.3        232
4.4.10     136
          ... 
4.4.5        1
1.10.8       1
0.997        1
0.2.1        1
1.1.11       1
Name: Version, Length: 197, dtype: int64

Then we will further process the `Version` column by finding and labelling GAP Packages. <br>
Packages are connected to GAP, but technically is a separate piece of software, having its own Version tree. Therefore, in entries citing GAP package there is no version of GAP and we will fill the `Version` cell with the string `Package`.
We will create and apply a function which chekcs if it is a case of citing GAP Package. It will search citations for the word "package" in order to determine if they are citing GAP or a GAP Package, in the latter case the `Version` cell value will be replaced with 'Package'.
* First we create a list of all GAP Package names, adding the ones already out of use, just in case.

In [23]:
f = open('packages.txt', 'r')
pac_name = []
for line in f:
        mat = line.split(" ",1)[0]
        pac_name.append(mat)
pac_name.append('magma')
pac_name.append('anu')
pac_name.append('Carat')
pac_name.append('Citrus')
pac_name.append('Convex')
pac_name.append('Gpd')
pac_name.append('MONOID')
pac_name.append('NQL')
pac_name.append('ParGAP')
pac_name.append('PolymakeInterface')
pac_name.append('QaoS')
pac_name.append('recogbase')
pac_name.append('RAMEGA')
#-fr modules

* We use a regex expression combined with the list we compiled so the function searches citations either for the word "Package" ignoring case or fo any of the Package names.<br>
We also add a case if the citation contains "manual" - in such cases it is not package, but counts as a GAP citation and we leave the Version unchanged.
This function also prints the output, which was used in the tuning, debugging and polishing the function to perfection.

In [24]:
def is_package(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version'] 
    manu = re.search("manual", citation, re.IGNORECASE)
    m = re.search(r"(?=(\b" + '\\b|\\b'.join(pac_name) + r"\b))", citation, re.IGNORECASE)
    if re.search("package", citation, re.IGNORECASE) != None:
        print('***Package***:', mrno, citation)
        return 'Package'
    elif manu != None:
        print('& Manual &', citation, version)
        return series['Version'] 
    elif m != None:
        print('* Package *:', mrno, citation, version)
        return 'Package'
    else:
        print('***Not a Package***:', mrno, citation, version)
        return series['Version']

In [25]:
corpus_df['Version'] = corpus_df.apply(is_package,axis=1)

***Not a Package***: MR4056124 GAP – Groups, algorithms, programming - a system for computational discrete algebra, www.gap-system.org. Unknown
***Package***: MR3942387 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numerical Sgps", A GAP package for numerical semi-groups. https://gap-packages.github.io/numericalsgps. Accessed 19 Aug 2017 
MR3493240
***Not a Package***: MR3942387 The GAP Group: GAP—groups, algorithms, and programming, version 4.7.5 (2014). http://www.gap-system.org. Accessed 19 Aug 2017 4.7.5
***Not a Package***: MR3354065 The GAP – Groups, Algorithms and Programming. Version 4.4.12, 2008. www.gap-system.org. 4.4.12
***Not a Package***: MR3646312 The $\ssf{GAP}$ Group, $\ssf{GAP}$–Groups, Algorithms, and Programming, 4.7.8, 2015, http://www.gap-system.org. 4.7.8
***Not a Package***: MR1864795 M. Schönert et al. GAP - Groups, Algorithms, and Programming (Lehrsthul D für Mathematik, Reinisch-Westflische Technische Hochschule, Aachen, Germany, fifth ed., 1995.) Unknown
*

***Not a Package***: MR3358042 The GAP Group, , GAP–Groups, Algorithms, and Programming, Version 4.4.12, 2008, http://www.gap-system.org. 4.4.12
***Package***: MR3358042 L. H. Soicher, The GRAPE package for GAP, Version 4.3, 2006, http://www.maths.qmul.ac.uk/~leonard/grape/.
***Not a Package***: MR3725240 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.6; 2016. http://www.gap-system.org 4.8.6
***Package***: MR3725240 J. D. Mitchell et al., Semigroups - GAP package, Version 2.6; 2015. Available from: http://www-groups.mcs.st-andrews.ac.uk/~jamesm/semigroups.php
***Package***: MR3724696 A. Distler and J. D. Mitchell. GAP package - smallsemi, v. 0.6.8 (2014). (Available at http://www-groups.mcs.st-andrews.ac.uk/~jamesm/smallsemi.)
***Not a Package***: MR3666217 The GAP Group, GAP – Groups, Algorithms, and Programming, version 4.8.2, x2016, www.gap-system.org. 4.8.2
***Not a Package***: MR3763896 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.8.7,

***Not a Package***: MR3084710 The GAP Group. GAP—Groups, Algorithms, and Programming, Version 4.4.10, 2008. http://www.gap-system.org. 4.4.10
***Not a Package***: MR3305041 The GAP Group. "GAP—Groups, Algorithms, and Programming," Version 4.4.10. Available at http://www.gap-system.org, 2008. 4.4.10
***Not a Package***: MR3746628 The GAP Group, "GAP —Groups, Algorithms, and Programming, Version 4.5", 2016, http://www.gap-system.org. 4.5
***Not a Package***: MR3641831 The GAP Group, GAP –Groups, Algorithms, and Programming. Version 4.4.10, 2008. http://www.gap-system.org 4.4.10
***Package***: MR3840333 Bartholdi, L.: FR-GAP package "Computations with functionally recursive groups", Version 2.4.3 http://www.gap-system.org/Packages/fr.html (2017)
***Not a Package***: MR2247887 The GAP Group, Aachen, St Andrews, GAP – Groups, Algorithms, and Programming, Version 4.2, 2000, (http://www-gap.dcs.st-and.ac.uk/~gap). 4.2
***Not a Package***: MR1626409 M. Schönert et al., GAP version 3.4, 4th ed

***Not a Package***: MR2753302 The GAP Group, GAP - groups, algorithms, and programming, version 4.4; Aachen, St. Andrews, 2006, http://www.gap-system.org. 4.4
***Package***: MR2753302 L. H. Soicher, The DESIGN package for GAP, Version 1.3, 2006, http:// designtheory.org/software/gap\_design/.
***Not a Package***: MR2844687 The GAP Group, GAP—Groups, algorithms, and programming, version 4.4, Aachen, St. Andrews, 2006, http://www.gap-system.org 4.4
***Package***: MR2844687 L. H. Soicher, The DESIGN package for GAP, Version 1.3, 2006, http://designtheory.org/software/gap\_design/
***Not a Package***: MR3361261 The GAP Group, GAP – groups, algorithms, and programming, version 4.4; Aachen, St. Andrews, 2006 (http://www.gap--system.org). 4.4
***Package***: MR3361261 L. H. Soicher, The DESIGN package for GAP, Version 1.3, 2006, http://designtheory.org/software/gap_design/.
***Package***: MR3994437 M. Delgado, P. A. García-Sánchez and J. Morais, "numericalsgps": A $\ssf{GAP}$ package on numer

***Not a Package***: MR1443190 Martin Schönert et al., GAP - Groups, Algorithms, and Programming, Release 3.4, Lehrstuhl D für Mathematik, Rheinisch-Westfälische Technische Hochschule, Aachen, Germany, 1995. 3.4
* Package *: MR1482983 G. Havas, M. F. Newman, and E. A. O'Brien, ANU $p$-Quotient program (version 1.4), written in C, available from maths.anu.edu.au by anonymous ftp in the directory pub PQ, as a share library with GAP and as part of Magma (1997). 1.4
***Not a Package***: MR1482983 M. Schönert et al., "GAP—Groups, Algorithms, and Programming," Release 3.4, Lehrstuhl D für Mathematik, Rheinisch-Westfälische Technische Hochschule, Aachen, Germany, 1995. 3.4
* Package *: MR2041537 G. Havas, M. F. Newman and E. A. O'Brien, ANU $p$-Quotient Program (version 1.4), written in C, available as a share library with GAP and as part of Magma, or from http://wwwmaths.anu.edu.au/services/ftp.html, School of Mathematical Sciences, Australian National University, Canberra, 1997. 1.4
***Not 

***Not a Package***: MR2216274 The GAP Group, GAP—Groups, algorithms, and programming, Version 4.3; 2002. (http://www.gap-system.org) 4.3
***Package***: MR2241339 J. De Beule, P. Govaerts, and L. Storme. Projective Geometries, a share package for GAP. (http://cage.ugent.be/~jdebeule/pg), submitted to GAP.
***Not a Package***: MR2241339 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.3 ; 2002. (http://www.gap-system.org) 4.3
***Package***: MR2319174 J. De Beule, P. Govaerts, L. Storme, Projective geometries, a share package for GAP, http://cage.ugent.be/$\sim$jdebeule/pg.
***Not a Package***: MR2319174 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4, http://www.gap-system.org, 2004. 4.4
***Not a Package***: MR2469977 GAP—Groups, Algorithms, Programming—a System for Computational Discrete Algebra: http://www-gap.mcs.st-and.ac.uk/. Unknown
***Not a Package***: MR2679937 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.12 (http://w

***Package***: MR2946109 A. Distler and J. D. Mitchell. Smallsemi—a GAP package, version 0.6.4, 2011. http://tinyurl.com/jdmitchell/smallsemi/.
***Not a Package***: MR2946109 The GAP Group. GAP - Groups, Algorithms, and Programming, Version 4.4.12, 2008. http://www.gap-system.org. 4.4.12
***Package***: MR3169623 Distler, A., Mitchell, J. D. (2011). Smallsemi - A Library of Small Semigroups. http://tinyurl.com/jdmitchell/smallsemi/, Oct A GAP 4 package [5], Version 0.6.4.
***Not a Package***: MR3169623 The GAP Group, (2008). (http://www.gap-system.org). GAP–Groups, Algorithms, and Programming, Version 4.4.12. 4.4.12
***Package***: MR3164153 Besche, U., Eick, B., O'Brien, E.: The SmallGroups Library. http://www-public.tu-bs.de:8080/$\sim$beick/soft/small/small.html (2002). An accepted, GAP 4 package [11]
***Package***: MR3164153 Distler, A., Mitchell, J.D.: Smallsemi—a library of small semigroups. A GAP 4 package [11], Version 0.6.0 (2010). http://www-history.mcs.st-and.ac.uk/$\sim$james

***Not a Package***: MR1931508 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.2, 2000. http://www.gap-system.org. 4.2
***Not a Package***: MR3084439 The GAP Group; GAP–Groups, Algorithms, and Programming, Version 4.4.12; 2008. (http://www.gap-system.org). 4.4.12
***Not a Package***: MR1981356 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.2, 2000. (http://www.gap-system.org). 4.2
* Package *: MR2240774 Steve Linton. GAP program to study classes of permutations generated by token passing networks via finite state automata, http://www-groups.dcs.st-and.ac.uk/$\sim$sal Unknown
***Package***: MR3917964 Delgado, M., García-Sánchez, P.A., Morais, J.: "Numericalsgps": a GAP package on numerical semigroups. http://www.gap-system.org/Packages/numericalsgps.html
***Package***: MR3998824 M. Delgado, P.A. García-Sánchez, J. Morais, NumericalSgps: a GAP package on numerical semi-groups, http://www.gap-system.org/Packages/numericalsgps.html.
***Not a Package***:

***Package***: MR3250444 T. Breuer, CTblLib—GAP's character table library package, version 1.2.1, http://www.math.rwth-aachen.de/$\sim$Thomas.Breuer/ctbllib, 2012.
***Package***: MR3250444 T. Breuer, Data provided by the GAP package mfer, http://www.math.rwth-aachen.de/$\sim$mfer/data/index.html.
***Not a Package***: MR3250444 The GAP Group, GAP—groups, algorithms, and programming, version 4.6.5, http://www.gap-system.org, 2013. 4.6.5
***Package***: MR3626710 T. Breuer, CTblLib-GAP's character table library package, version 1.2.1 (2012), http://www.math.rwth-aachen.de/~Thomas.Breuer/ctbllib.
* Package *: MR3626710 T. Breuer and J. Müller, GAP file tst/mferctbl.gap, a compiled database of character tables of endomorphism rings of multiplicity-free permutation modules of the sporadic simple groups and their cyclic and bicyclic extensions. Unknown
***Not a Package***: MR3626710 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.6.5 (2013), http://www.gap-system.org. 4.6.5
*

***Not a Package***: MR2911879 The GAP group: GAP—Groups, Algorithms, and Programming. Version 4.4 (2004); http://www.gap-system.org 4.4
***Not a Package***: MR2927804 The GAP group, GAP–groups, algorithms, and programming, Version 4.4 2004, http://www.gap-system.org. 4.4
***Not a Package***: MR3049563 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org, 2005. 4.4
***Not a Package***: MR3317762 GAP–Groups, Algorithms, and Programming, Version 4.4, The GAP Group, 2004, http://www.gap-system.org. 4.4
***Not a Package***: MR3318256 The GAP Group, GAP—Groups, Algorithms, and Programming, version 4.4 (2004). http://www.gap-system.org 4.4
***Not a Package***: MR3405871 GAP group, Gap – groups, algorithms, programming, version 4.4, http://www.gap-system.org, 2004. 4.4
***Not a Package***: MR3626555 The GAP group: GAP—groups, algorithms, and programming. Version 4.4 (2004), http://www.gap-system.org 4.4
***Not a Package***: MR3620702 The GAP Group, GA

***Not a Package***: MR3339804 The GAP Group, GAP – groups, algorithms, and programming, Version 4.7.4 (2014), http://www.gap-system.org. 4.7.4
***Not a Package***: MR3724268 The GAP Group, GAP – Groups, Algorithms, and Programming, version 4.8.4, 2016 (http://www.gap-system.org). 4.8.4
***Not a Package***: MR3177516 The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4.12; 2008. (http://www.gap-system.org) 4.4.12
& Manual & I. M. Araújo, et. al., GAP Reference Manual, The GAP Group, http://www.gap-system.org. Unknown
& Manual & I. M. Araújo, et. al., GAP Reference Manual, The GAP Group, http://www.gap-system.org. Unknown
***Package***: MR2429460 J. Cramwinckel, et. al., GUAVA A GAP 4 Package for Computing with Error-Correcting Codes, http://www.gap-system.org/Packages/guava.html.
& Manual & I. M. Araújo, et. al., "GAP Reference Manual," The GAP Group, available at http://www.gap-system.org Unknown
***Package***: MR2327052 J. Cramwinckel, et. al., "GUAVA A GAP 4 Package f

MR1025760 Unknown
***Not a Package***: MR3891940 The GAP Group, `GAP—groups, algorithms, and programming, version 4.9.1', 2018, https://www.gap-system.org. 4.9.1
***Not a Package***: MR2354797 The GAP Group. GAP—Groups, Algorithms, and Programming, version 4.4 2006. Available at http://www.gap-system.org. 4.4
***Not a Package***: MR3422470 GAP: GAP – Groups, Algorithms, and Programming, Version 4.6.5. The GAP Group. www.gapsystem.org (2013) 4.6.5
***Not a Package***: MR3422470 GAP. GAP - Groups, Algorithms, and Programming, Version 4.8.4. http://www.gap-system.org (2016) 4.8.4
***Not a Package***: MR3723122 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.6.5, 2013, www.gapsystem.org. 4.6.5
***Not a Package***: MR3936683 GAP, GAP – Groups, Algorithms, and Programming, Version 4.8.4 http://www.gap-system.org, Jun 2016. 4.8.4
***Not a Package***: MR3910669 GAP, GAP — Groups, Algorithms, and Programming, Version 4.8.4. http://www.gap-system.org, Jun 2016. 4.8.4
***Packa

***Not a Package***: MR2522415 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.4.9, http://www.gap-system.org, 2006. 4.4.9
***Package***: MR2522415 L.H. Soicher, The DESIGN package for GAP, Version 1.3, http://designtheory.org/software/gap\_design/, 2006.
***Not a Package***: MR2221258 The GAP Group, `GAP - Groups, Algorithms, and Programming', Version 4.4, 2005, http://www.gap-system.org. 160 4.4
***Not a Package***: MR4217096 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.9.3, 2018, https://www.gap-system.org 4.9.3
***Not a Package***: MR4205764 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.9.3, 2018, https://www.gap-system.org. 4.9.3
***Not a Package***: MR4208918 The GAP Group, GAP – Groups, Algorithms and Programming, Version 4.9.1, 2018, http://www.gap-system.org. 4.9.1
***Not a Package***: MR2998792 The GAP Group, GAP - Groups, Algorithms, and Programming, Version 4.4.10, http://www.gap-system.org, 2007. 4.4.10
***Not a 

***Not a Package***: MR3943466 The GAP Group, GAP-Groups: Algorithms and Programming, Version 4.7.7. http://www.gap-system.org. Accessed on 13 Feb 2015 4.7.7
***Not a Package***: MR2228215 The GAP Group, GAP—groups, algorithms, and programming, Version 4.4, 2004, http://www.gap-system.org. 4.4
***Not a Package***: MR2387535 The GAP Group (2004). GAP–Groups, Algorithms, and Programming, Version 4.4. Available at http://www.gap-system.org. 4.4
***Not a Package***: MR2442005 The GAP Group Collaboration, GAP - Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org, 2004. 4.4
***Not a Package***: MR3739327 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4, preprint (2004), www.gap-system.org. 4.4
***Not a Package***: MR3687100 GAP — Groups, Algorithms, and Programming, The GAP Group, 2016, Available at http://www.gap-system.org. version 4.8.4. 4.8.4
***Package***: MR3874852 J. Michel, The development version of the CHEVIE package of GAP3, Journal of 

***Not a Package***: MR1458331 M. Schönert et. al., GAP - Groups, Algorithms, and Programming, Lehrstuhl D für Mathematik, Rheinisch-Westfälische Techn. Hochschule, Aachen, Germany, fourth edition, 1994. Unknown
***Not a Package***: MR2255121 The GAP Group, GAP-4.3—Groups, Algorithms and Programming, Aachen, St. Andrews, http://www-gap.dcs.st-and.ac.uk/gap/, 2003. 4.3
***Not a Package***: MR1779890 The GAP Group: GAP—groups, algorithms, and programming, version 4.1, Lehrstuhl D für Mathematik, RWTH Aachen, and School of Mathematical and Computational Sciences, University of St. Andrews, 1999. 4.1
***Not a Package***: MR1916920 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.2 (Aachen/St. Andrews, 2000); http://www.gap-system.org/. 18 4.2
& Manual & M. Schönert et al., GAP—Groups, Algorithms, and Programming, Version 4.2, Manual, Lehrstuhl D für Mathematik, RWTH Aachen, 2000. 4.2
***Not a Package***: MR1995123 The GAP Group, GAP—Groups, Algorithms, and Programming, Ver

***Not a Package***: MR2403651 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.4, 2006, http://www.gap-system.org. 4.4
***Not a Package***: MR2480297 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.4, 2006 (http://www.gap-system.org). 4.4
***Not a Package***: MR3021414 GAP Group, The, GAP: Groups, Algorithms, and Programming, Version 4.4 (2006; available at www.gap-system.org). 4.4
***Not a Package***: MR2228639 M. Schönert, et al., GAP—Groups, Algorithms and Programming, fourth ed., RWTH Aachen: Lehrstuhl D für Mathematik, 1994. Unknown
***Not a Package***: MR2046643 The GAP Group, Aachen, St Andrews. GAP—Groups, Algorithms, and Programming, Version 4.1, 1999 (http://www-gap.dcs.st-and.ac.uk/$\sim$gap). 4.1
***Not a Package***: MR1989461 The GAP Group, Aachen, St. Andrew, GAP—groups, algorithms, and programming, version 4.1, 1999 (http://www-gap.dcs.st-and.ac.uk/ gap). 4.1
***Not a Package***: MR3734529 The GAP Group: GAP - Groups, Algorithms, and

***Not a Package***: MR2799018 "The GAP Group, GAP—Groups, Algorithms, and Programming," 2008 [Online]. Available: http://www.gap-system.org. Version 4.4.12 4.4.12
***Not a Package***: MR4079536 The GAP, Group. (2016). GAP—Groups, algorithms, and programming, version 4.8.6. http://www.gap-system.org 4.8.6
***Not a Package***: MR4074590 The GAP Group, GAP – Groups, Algorithms, and Programming, Version 4.7.8, http://www.gap-system.org, 2015. 4.7.8
***Package***: MR3870955 T. Breuer, `CTblLib - a GAP package, version 1.1.3', 2004, http://www.math.rwth-aachen.de/Thomas.Breuer/ctbllib/.
***Not a Package***: MR3441221 The GAP Group, GAP: groups, algorithms, and programming, Version 4.7.4, 2014, Available at http://www.gap-system.org. 4.7.4
***Not a Package***: MR2205227 The GAP Group, GAP—Groups, Algorithms, and Programming. http://www.gap-system.org. Unknown
***Not a Package***: MR2684419 The GAP Group, GAP-Groups, Algorithms, and Programming, Version 4.4; 2006. http://www.gap-system.org 4.

***Not a Package***: MR2322471 The GAP Group, 2006. GAP—Groups, Algorithms, and Programming, Version 4.4, http://www.gap-system.org. 4.4
***Not a Package***: MR2067621 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3, http://www.gap-system.org, 2002. 4.3
***Not a Package***: MR1838849 The GAP Group, "GAP—Groups, Algorithms, and Programming, Version 4.1," Aachen, St Andrews, 1999 (http://www-gap.dcs.st-and.ac.uk/$\sim$gap). [The specific libraries related to this paper are available at http://www.public.iastate.edu/$\sim$petr.] 4.1
***Not a Package***: MR2206479 The GAP Group, GAP—Groups, Algorithms, and Programming, Version 4.3, Aachen, St Andrews, 1999. Visit http://www-gap.dcs.st-and.ac.uk/$\sim$gap. 4.3
***Package***: MR2206479 G.P. Nagy, P. Vojtěchovský, LOOPS version 0.997, Package for GAP 4. Available at http://www.math.du.edu/loops.
***Package***: MR2075284 Nagy G.P., Vojtěchovský P., LOOPS, a package for GAP 4.3. Download GAP at http://www-gap.dcs.st-and.ac.u

In [26]:
corpus_df['Version'].value_counts() # for overview on the results

Package     819
Unknown     493
4.4         454
4.4.12      310
4.3         212
           ... 
10.14760      1
4.08.10       1
10.1109       1
10.01.16      1
1.9.6         1
Name: Version, Length: 84, dtype: int64

### Version Filter  
We need to filter out some anomalies in the version column, such as too long versions which are usually `arXiv` numbers, dates connected with version or other organizations' serial numbers.<br>
THe following function isolates any entries with Version value longer than 6 characters, then replaces it with the string 'Not GAP citation'.<br>
It also prints the output and we can see there are not many such entries, so we will inspect them manually.

In [27]:
def version_filter(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    ind = series.name
    if version != 'Package' and version != 'Unknown' and len(version) > 6:
        print(ind, 'Too long Version *', mrno, citation)
        return 'Not GAP citation'
    else:
        return series['Version']

In [28]:
corpus_df['Version'] = corpus_df.apply(version_filter, axis=1)

125 Too long Version * MR4170882 F. Ali, M. Al-Kadhi, A. Aljouiee, M.A.F. Ibrahim, 2-Generations of finite simple groups in GAP, in: 2016 International Conference on Computational Science and Computational Intelligence (CSCI), IEEE Conf. Proc., 249, IEEE, Las Vegas, NV, 2016, pp. 1339–1344 (doi:10.1109/CSCI.2016.0250).
366 Too long Version * MR2422501 The GAP Group. (2005). GAP - Groups, Algorithms, and Programming, version 4.4.10.2007. http://www.gap-system.org.
371 Too long Version * MR3272384 John Bamberg, S.P. Glasby, Eric Swartz, AS-configurations and skew-translation generalised quadrangles (including supporting GAP code), arXiv:1405.5063v2.
645 Too long Version * MR4193641 GAP – Groups, Algorithms, and Programming. (2018). Version 4.08.10. https://www.gap-system.org.
651 Too long Version * MR2422303 T. Breuer, GAP computations concerning probabilistic generation of finite simple groups, arXiv:0710.3267.
655 Too long Version * MR2669683 T. Breuer, `GAP computations concerning Ham

We have a list of anomalies here which we inspect manually in the cell above. We will only look at the genuine GAP citations with typing errors conencting version and year - these we will fix manually with our function `fix_version`. <br> 
Others are not GAP citations but rather citing articles connected to GAP and have other organizational numbers such as `arXiv:0710.3267` which fooled our version hunter function - these we will remove from our data once we finish the manual fixing as they are not citations of GAP software or its packages.

In [29]:
# https://studres.cs.st-andrews.ac.uk/CS2006/Lectures/Python/L08-dataset.pdf
def fix_version(mrno,version):
 r = corpus_df[corpus_df['MR'] == mrno]
 corpus_df.at[r.index[0],'Version']=version

We start with MR2422501 which is version 4.4 accidentaly connected with the year, we will manually fix it below.

In [30]:
get_c('MR2422501')

Unnamed: 0,MR,Citation,Version
366,MR2422501,"The GAP Group. (2005). GAP - Groups, Algorithms, and Programming, version 4.4.10.2007. http://www.gap-system.org.",Not GAP citation


In [31]:
fix_version('MR2422501', '4.4')

Next is MR4193641 which should be 4.8.10 instead of 4.08.10. Fixed manually below.

In [32]:
get_c('MR4193641')

Unnamed: 0,MR,Citation,Version
645,MR4193641,"GAP – Groups, Algorithms, and Programming. (2018). Version 4.08.10. https://www.gap-system.org.",Not GAP citation


In [33]:
fix_version('MR4193641', '4.8.10')

Next we have version 4.4 accidentally connected with the year again, easy fix below.

In [34]:
get_c('MR2526731')

Unnamed: 0,MR,Citation,Version
1839,MR2526731,"The GAP Group, GAP–Groups, Algorithms, and Programming, Version 4.4.2006. http://www.gap-system.org.",Not GAP citation


In [35]:
fix_version('MR2526731', '4.4')

This citation has a long number before the version which was captured by our version checker and used as version. The real version is 4.4.12 which we will manually assign below.

In [36]:
get_c('MR2928559')

Unnamed: 0,MR,Citation,Version
2315,MR2928559,"L. R. Ford, Automorphic functions, Chelsea, 1951. Zbl 55.0810.04 GAP - groups, algorithms, and programming, Version 4.4.12, The GAP Group, St. Andrews, F...",Not GAP citation


In [37]:
fix_version('MR2928559', '4.4.12')

All the rest are anomalies citing other sources but not GAP.

Once we manually fixed all the genuine citations versions, we will delete all the remaining records with version labelled 'Not GAP citation' with the following line of code.

In [38]:
corpus_df = corpus_df[corpus_df['Version'] != 'Not GAP citation']

In [39]:
corpus_df.loc[corpus_df['MR'] == 'MR3957957']

Unnamed: 0,MR,Citation,Version
1150,MR3957957,"The GAP Group, GAP – Groups, Algorithms, and Programming, http://www.gap-system.org.",Unknown
1151,MR3957957,"D.F. Holt, The $\ssf{GAP}$ package $\ssf{kbmag}$, Knuth-Bendix on monoids and automatic groups, https://www.gap-system.org/Packages/kbmag.html.",Package
1152,MR3957957,"M. Neunhöffer, Á. Seress, et al., The $\ssf{GAP}$ package $\ssf{recog}$, A collection of group recognition methods, http://gap-packages.github.io/recog/.",Package


* Now we will investigate the versions a little bit more manually.

Versions from 4 onwards are fine, we will focus on the older ones between 1 and 3, as they might be anomalies which are not GAP citations at all.

In [40]:
ver_list = corpus_df['Version'].unique()
ver_list = np.sort(ver_list)
ver_list # list of versions we have in the data

array(['1.0', '1.1', '1.9.6', '3.0', '3.1', '3.2', '3.3', '3.4', '3.4.3',
       '3.4.4', '4.1', '4.10', '4.10.0', '4.10.1', '4.10.2', '4.11',
       '4.11.0', '4.2', '4.3', '4.4', '4.4.10', '4.4.11', '4.4.12',
       '4.4.2', '4.4.3', '4.4.4', '4.4.5', '4.4.6', '4.4.7', '4.4.9',
       '4.46', '4.49', '4.5', '4.5.3', '4.5.4', '4.5.5', '4.5.6', '4.5.7',
       '4.6', '4.6.1', '4.6.12', '4.6.2', '4.6.3', '4.6.4', '4.6.5',
       '4.6.9', '4.7', '4.7.2', '4.7.4', '4.7.5', '4.7.6', '4.7.7',
       '4.7.8', '4.7.9', '4.8', '4.8.1', '4.8.10', '4.8.2', '4.8.3',
       '4.8.4', '4.8.5', '4.8.6', '4.8.7', '4.8.8', '4.8.9', '4.9',
       '4.9.0', '4.9.1', '4.9.2', '4.9.3', '5.7', 'Package', 'Unknown'],
      dtype=object)

Versions 1.0, 1.0.0 and 1.1 have less than 10 records and we will check them all manually.  
We will start with 1.0, as we can see below it is a GAP manual which is early practice of GAP citation and we will keep it in the data so we can investigate how this early practice dissapeared over time.

In [41]:
corpus_df[corpus_df['Version'] == '1.0']

Unnamed: 0,MR,Citation,Version
2222,MR2111596,"Breuer, T. (2001). Manual for the GAP Character Table Library, Version 1.0. Lehrstuhl D für Mathematik; RWTH Aachen, Germany.",1.0


In [42]:
fix_version('MR2111596','Package')

All the six records with version 1.1 are actually for the "Character Table Library" which is a GAP package, but escaped the Regex expression because its full name was used here. I will fix these manually.

In [43]:
corpus_df[corpus_df['Version'] == '1.1']

Unnamed: 0,MR,Citation,Version
389,MR2308856,"Thomas Breuer, Manual for the GAP character table library, Version 1.1 (Lehrstuhl D für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen,...",1.1
735,MR3007647,"T. Breuer, Manual for the GAP character table library, version 1.1 (RWTH, Aachen, 2004).",1.1
738,MR2684423,"T. Breuer, Manual for the GAP Character Table Library, Version 1.1, RWTH Aachen, 2004.",1.1
741,MR3219555,"T. Breuer, Manual for the GAP Character Table Library, Version 1.1, RWTH Aachen, 2004.",1.1
966,MR2805443,"Breuer, T.: Manual for the GAP Character Table Library, Version 1.1, Lehrstuhl D für Mathematik, Rheinisch Westfälische Technische Hochschule, Aachen, Ge...",1.1
1742,MR2326329,"T. Breuer, Manual for the GAP Character Table Library Version 1.1 (Lehrstuhl D für Mathematik, Rheinisch West-fälische Hochschule, Aachen, 2004).",1.1


In [44]:
fix_version('MR2308856', 'Package')
fix_version('MR3007647', 'Package')
fix_version('MR2684423', 'Package')
fix_version('MR3219555', 'Package')
fix_version('MR2805443', 'Package')
#fix_version('MR2326329', 'Package')

In [45]:
corpus_df[corpus_df['MR'] == 'MR2326329']

Unnamed: 0,MR,Citation,Version
1741,MR2326329,"The GAP Group, gap—Groups, Algorithms, Programming, Version 4.4.7, 2006 (http://www.gap-system.org).",4.4.7
1742,MR2326329,"T. Breuer, Manual for the GAP Character Table Library Version 1.1 (Lehrstuhl D für Mathematik, Rheinisch West-fälische Hochschule, Aachen, 2004).",1.1


The last entry `MR2326329` has two citations and our `fix_version` function wrongly apllies itself on the firt one. Therefore, we will use the manual fix below instead.

In [46]:
corpus_df.loc[1742]['Version']='Package'

There is a single entry with version 1.9.6. After discussing with Dr Konovalov, we were both unable to access the paper and it is definitely some sort of error as there is no such early GAP release, we have decided to exclude this record from the analysis.

In [47]:
corpus_df[corpus_df['Version'] == '1.9.6']

Unnamed: 0,MR,Citation,Version
2824,MR2747149,"The GAP Group, Welcome to GAP – Groups, Algorithms and Programming: a system for computational discrete algebra. Version 1.9.6, URL www.gap-system.org/. ...",1.9.6


In [55]:
corpus_df[corpus_df['MR'] == 'MR2747149']

Unnamed: 0,MR,Citation,Version


In [56]:
corpus_df.drop(corpus_df[corpus_df['MR'] == 'MR2747149'].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [57]:
#corpus_df = corpus_df.drop(2824)

We have one entry with version 3.0 which is another example of early GAP citation practice by Martin Schönert. He is one of the initial authors, who created the GAP language.

In [58]:
corpus_df[corpus_df['Version'] == '3.0']

Unnamed: 0,MR,Citation,Version
1601,MR1195429,"M. Schönert (Editor), GAP 3.0 manual, Lehrstuhl D für Mathematik, RWTH Aachen, 1991.",3.0


Two entries with version 3.1, again by Martin Schönert, they will remain in the data to help us analyse early GAP citation practice.

In [None]:
corpus_df[corpus_df['Version'] == '3.1']

One entry for version 3.2, again it remains in the data as it is genuine early GAP citation.

In [None]:
corpus_df[corpus_df['Version'] == '3.2']

With version 3.3 two more early GAP citation practice examples.

In [None]:
corpus_df[corpus_df['Version'] == '3.3']

We have quite a few examples with version 3.4 again by Marting Schönert and we will keep them in the data, as there are no anomalies here.

In [None]:
corpus_df[corpus_df['Version'] == '3.4']

We have 3 records for 3.4.3 and 9 records for 3.4.4 all of them genuine early GAP citations, which we will gladly keep in the data.

In [None]:
corpus_df[corpus_df['Version'] == '3.4.3']

In [None]:
corpus_df[corpus_df['Version'] == '3.4.4']

There is one entry with version 5.7, after manual inspection we can see this is a typing error. We will use our function to manually fix the version of such anomalies.

In [None]:
corpus_df[corpus_df['Version'] == '5.7']

In [None]:
fix_version('MR4052374','4.5.7')

In [None]:
corpus_df[corpus_df['Version'] == '5.7'] # now the anomaly is gone

corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR4170882'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR2422501'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR3272384'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR4193641'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR2422303'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR2669683'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR3968779'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR4160887'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR3957957'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR2526731'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR2928559'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR3974806'].index, inplace=True)
corpus_df.drop(corpus_df.loc[corpus_df['MR'] == 'MR3725940'].index, inplace=True)

Versions 4.46 and 4.49 are typing errors and we will correct them to 4.4.6 and 4.4.9

In [None]:
corpus_df[corpus_df['Version'] == '4.46']

In [None]:
fix_version('MR2537368','4.4.6')

In [None]:
corpus_df[corpus_df['Version'] == '4.49']

In [None]:
fix_version('MR2548919','4.4.9')
fix_version('MR2606860','4.4.9')

Versions 1.0 and 1.1 after manual inspection are found to be manuals of packages. We will fix them using the function, below.

# Website

Now we will create a `website` coulmn to indicate if such is provided in each entry.<br>
Then we fill each cell using  a Regex to search citations for the GAP website.<br>
It will be a binary column with Yes and No cells. <br>
The function below iterates over Citation cells and searches for "www" or ".net" or "http" - these are the website characteristic strings, isolated after testing. If the search returns positive `Website` cell is populated with "Yes" nad if not then it is filled with "No". <br>
Again we add a "print" statement to teach case of the loop so we can manually inspect results.

In [None]:
def website_check(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    if re.search("www|\.net|http", citation, re.IGNORECASE) != None:
        print('***Provided Website***:', mrno, citation)
        return 'Yes'
    else:
        print('***Not Provided***:', mrno, citation)
        return 'No'

In [None]:
corpus_df.insert(loc=3, column='Website', value=' ') # we apply it to our data

In [None]:
corpus_df['Website'] = corpus_df.apply(website_check, axis=1)

# Merging the two dataframes with the equivalent of SQL `join`
* The `MR` column in `corpus_df` dataframe has the letters "MR" preceeding each number, first we will remove these letters, using Regex, so the the MR number format is the same in both datasets.

In [None]:
corpus_df['MR'] = corpus_df['MR'].str.extract('(\d+)', expand=False)
corpus_df

In [None]:
corpus_df.info()

The data from GAP Bibliography has Null values across the columns, this is indicated by the difference in the count of Non-Null entries in each coilumn. However this issue will be sorted by the merge process, as we will use `corpus_df` MR numbers as a base column to join the two data-frames on.

In [None]:
bib_df.info()

With the following code we are joining the two datasets on the `MR` column and using `corpus_df` as a base. <br>
The resulting dataset will have as many lines as `corpus_df` but all columns from `bib_df` will be added, hence we will have much more information to work with. <br>
Rows that were in `bib_df` but had no matching MR number in `corpus_df` will be left behind, because we would not have Citation text for them, hence they are not useful for further analysis.

In [None]:
merged_df = pd.merge(bib_df, corpus_df, on='MR', how='right', indicator=True)

In [None]:
merged_df.info() # to inspect for Null values and data-types of each column.

We need to remove any rows not containing Year value as they will be also of little use for our analysis. We will also correct they `Year` column data type to Integer, again.

In [None]:
type(merged_df['Year'][3])

In [None]:
merged_df = merged_df.dropna(subset=['Year'])
merged_df['Year'] = merged_df['Year'].astype(np.int64)

In [None]:
merged_df.info()

In [None]:
type(merged_df['Year'][3])

We can use the following iteration loop to browse the resulting merged dataframe. By borwsing the raw data we can make sure everything is alright and spot any remaining issues or anomalies. In our case there are some remaining special characters, which we will remove as best as we can.

In [None]:
for index, row in merged_df.iterrows():
    print(row['MR'], row['Citation'])

We use Regex to further purify the `Citation` column, removing some remaining special characters, that we noticed during manual scrolling over the data.

In [None]:
merged_df['Citation'] = merged_df['Citation'].str.replace(r'[\\\$\{\}\^]', '')
merged_df['Citation'] = merged_df['Citation'].str.replace(r'(ssf)', '')

##### We remove the unnecessary `merge` column and add a `length` column to reflect the character lenght of each citation.

In [None]:
merged_df = merged_df.drop(['_merge'], axis=1)
merged_df['Length'] = merged_df['Citation'].apply(len)
merged_df = merged_df.dropna()
merged_df.info()

### Creating the Accuracy Score column
I have decided to award each citation with one accuracy point for:
* providing some kind of version (either GAP version or some sort of package version)
* providing a website (either the official GAP website or a package website)
* Citation longer than 90 characters (because too short citations do not contain enough information)
<br>First we create the column, then we apply to it a function, which checks `Version`, `Website`, and `Length` columns and awards points accordingly.

In [None]:
merged_df['Accuracy Score'] = 0
merged_df['Accuracy Score'] = merged_df['Accuracy Score'].astype(int)

In [None]:
def accuracy_calculator(series):
    mrno = series['MR']
    citation = series['Citation']
    version = series['Version']
    website = series['Website']
    score = series['Accuracy Score']
    dal = series['Length']
    
    if version != 'Unknown':
        score += 1
        
    if website != 'No':
        score += 1
        
    if dal >= 90:
        score += 1
        
    return score

In [None]:
merged_df['Accuracy Score'] = merged_df.apply(accuracy_calculator, axis=1)

In [None]:
merged_df['Accuracy Score'].value_counts() # overview of the results

### Now we split the extended dataset in two dataframes for further analysis

### Pure GAP citations  - citing GAP software, not a Package.

In [None]:
gap_df = merged_df[merged_df['Version'] != 'Package']
gap_df = gap_df.dropna()
gap_df.info()

In [None]:
versions_cited = gap_df['Version'].unique() # the same as ver_list but for the gap_df
versions_cited = np.sort(versions_cited)
versions_cited

We will add two more columns that we will need later in the analysis `ReleaseYear` and `Delay`.
<br> Below is a dictionary we manually assembled with the help of Dr Konovalov and the GAP website. <br> The dictionary contains the release year for each version we have in the data.

In [None]:
release_dates = { 
    # dates from archive timestamps
    '4.11.1': 2021,
    '4.11.0': 2020,
    '4.11': 2020,
    '4.10.2': 2019,
    '4.10.1': 2019,
    '4.10.0': 2018,
    '4.10': 2018,
    '4.9.3': 2018,
    '4.9.2': 2018,
    '4.9.1': 2018,
    '4.9.0': 2018,
    '4.9': 2018,
    '4.8.10': 2017, # assumption
    '4.8.9': 2017,
    '4.8.8': 2017,
    '4.8.7': 2017,
    '4.8.6': 2016,
    '4.8.5': 2016,
    '4.8.4': 2016,
    '4.8.3': 2016,
    '4.8.2': 2016, # 2016/02/20
    '4.8.1': 2016,
    '4.8': 2016,
    '4.7.9': 2015, # 2015/11/29
    '4.7.8': 2015, # 2015/06/09
    '4.7.7': 2015, # 2015/02/13
    '4.7.6': 2014, # 2014/11/15
    '4.7.5': 2014, # 2014/05/24
    '4.7.4': 2014, # 2014/02/20
    '4.7.3': 2013, # 2014/02/15
    '4.7.2': 2013, # 2013/12/01
    '4.7': 2013,
    '4.6.9': 2013,
    '4.6.5': 2013, # 2013/07/20
    '4.6.4': 2013, # 2013/05/04
    '4.6.3': 2013, # 2013/03/18
    '4.6.2': 2013, # 2013/02/02
    '4.6.12': 2013,
    '4.6.1': 2013,
    '4.6': 2013,
    '4.5.7': 2012, # 2012/12/14
    '4.5.6': 2012, # 2012/09/16
    '4.5.5': 2012, # 2012/07/16
    '4.5.4': 2012, # 2013/06/04
    '4.5.3': 2012,
    '4.5': 2012, # https://www.gap-system.org/Doc/History/history.html
    # dates below from file creation
    '4.4.12': 2008, # 2008/12/16
    '4.4.11': 2008, # 2008/12/08
    '4.4.10': 2007, # 2007/10/05
    '4.4.9': 2006,  # 2006/11/02
    '4.4.8': 2006,  # 2006/09/29
    '4.4.7': 2006,  # 2006/03/17
    '4.4.6': 2005,  # 2005/09/02
    '4.4.5': 2005,  # 2005/05/13
    '4.4.4': 2004,  # 2004/12/22
    # dates below from http://www.gap-system.org/Download/Updates/index.html
    '4.4.3': 2004,   # May 2004
    '4.4.2': 2004,  # April 2004
    # dates from http://www.gap-system.org/Doc/History/history.html
    # if not stated otherwise
    '4.4': 2004, # https://www.gap-system.org/Doc/History/history.html
    '4.3': 2002, # https://www.gap-system.org/Doc/History/history.html
    '4.2': 2000, # http://www.gap-system.org/ForumArchive/Linton.1/Steve.1/Release_.3/1.html
    '4.1': 1999, # https://www.gap-system.org/Doc/History/history.html
    '3.4.4': 1997, # https://www.gap-system.org/Doc/History/history.html
    '3.4.3': 1994, # https://www.gap-system.org/ForumArchive/Schoener.1/Martin.1/Upgrade_.10/1.html
    '3.4': 1994, # https://www.gap-system.org/ForumArchive/Schoener.1/Martin.1/GAP_vers.2/1.html
    '3.3': 1993, # https://www.gap-system.org/ForumArchive/Schoener.1/Martin.1/GAP_vers.1/1.html
    '3.2': 1993, # https://www.gap-system.org/Doc/History/history.html
    '3.1': 1991, # https://www.gap-system.org/Doc/History/history.html
    '3.0': 1991, # "M. Schönert (Editor), GAP 3.0 manual, Lehrstuhl D für Mathematik, RWTH Aachen, 1991."
    
}

The following loop checks for versions that we have in the data but do not have in our Release Year dictionary.

In [None]:
for x in versions_cited:
    if not x in release_dates.keys():
        print(x)

The following function we will use to populate the cells in the `Release Year` column.

In [None]:
def release_year(version):
    if version in release_dates.keys():
        return release_dates[version]
    else:
        return 'Unknown'

In [None]:
release_year('3.4')

In [None]:
gap_df['ReleaseYear'] = gap_df['Version'].map(release_year) # applying the function to the whole data

In [None]:
gap_df.head() # inspect results

* Delay column we will use later to analyse the difference between publication year and the year of GAP release cited by this publication.

In [None]:
gap_df['Delay'] = 0 # create the column, with 0 as default value for each cell

The following function we will use to populate `Delay` column. It will give us the difference between year of publication and year when the cited GAP version was released.

In [None]:
def set_delay(series):
    rel_year = series['ReleaseYear']
    year = series['Year']
    delay = series['Delay']
    if rel_year != 'Unknown':
        #print('***Package***:')
        delay = year - rel_year
    return delay

In [None]:
gap_df['Delay'] = gap_df.apply(set_delay, axis=1) # we apply it to our data

In [None]:
gap_df.info() # we can see the new column at the bottom

### GAP Packages Citations - all rows that have "Package" in the `Version` column cell.
<br> This subset of our data we will later use to perform some specific analysis of Package citations and give a brief overview of GAP Package citation practices.

In [None]:
pac_df = merged_df[merged_df['Version'] == 'Package']
pac_df = pac_df.dropna()
pac_df.info()

Once all the data is cleaned and prepared, we can several random samples to ensure it is all good before we pass it to Module 3 for analysis and visualisation.

In [None]:
# we can see the count of citations by specified length, for example
sma = gap_df[gap_df['Length'] < 90]
big = gap_df[gap_df['Length'] > 90]
print(len(sma))
print(len(big))

In [None]:
get_c('3092787') # using this function conveniently displays all records with the specified MR number

In [None]:
merged_df.loc[354] # thus we can display a single row by specified index

Exporting the pre-processed data to `CSV` files to be picked up by the final *Data Visualisations and Analysis* notebook.

In [None]:
merged_df.to_csv('full.csv', index=False, encoding='utf-8')
gap_df.to_csv('gap.csv', index=False, encoding='utf-8')
pac_df.to_csv('pac.csv', index=False, encoding='utf-8')