# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from unidecode import unidecode
import nltk
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from itertools import combinations

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leomurta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leomurta/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Dataset

In [2]:
#reads publications from Excel file
df = pd.read_excel('publications.xlsx', keep_default_na=False)
df.head()

Unnamed: 0,title,authors,institutions,venue,year,citations
0,Towards Component-based Software Maintenance v...,Leonardo Murta; Hamilton Oliveira; Cristine Da...,UFRJ,wmswm,2004,13
1,Documentacao Essencial para Manutencao de Soft...,Sérgio Cozzetti Bertoldi de Souza; Wesley Chri...,UCB,wmswm,2004,8
2,Evolução Orientada a Aspectos de um Framework OO,Maria Tânia Francelino da Silva; Rosana T. Vac...,USP,wmswm,2004,4
3,An Evolution Process for Application Frameworks,Maria Istela Cagnin; José Carlos Maldonado; Pa...,USP; UFSCar,wmswm,2004,6
4,Uma experiencia no ensino de manutencao de sof...,Márcio Greyck Batista Dias,UNIGOIÁS,wmswm,2004,6


# Authors

In [3]:
def first_last(text):
    words = text.split(' ')
    return f'{words[0]} {words[-1]}'

authors = df['authors'].apply(unidecode) \
                       .str.split('; ') \
                       .apply(lambda x: [first_last(author) for author in x])
authors

0      [Leonardo Murta, Hamilton Oliveira, Cristine D...
1      [Sergio Souza, Wesley Neves, Nicolas Anquetil,...
2             [Maria Silva, Rosana Braga, Paulo Masiero]
3      [Maria Cagnin, Jose Maldonado, Paulo Masiero, ...
4                                          [Marcio Dias]
                             ...                        
226    [Altino Junior, Leticia Meireles, Lucas Figuei...
227         [Matheus Melo, Gabriel Menezes, Bruno Cafeo]
228    [Luan Ciribelli, Joao Lima, Heleno Junior, Gle...
229    [Humberto Damasceno, Joao Nascimento, Carla Be...
230                      [Nelson Rosa, David Cavalcanti]
Name: authors, Length: 231, dtype: object

# Top authors 

In [4]:

authors.explode().value_counts().head(20)

Claudia Werner        18
Marco Valente         17
Manoel Mendonca       12
Leonardo Murta        11
Marcelo Schots        10
Rosangela Penteado    10
Andre Hora             9
Marco                  8
Marcelo Maia           8
Claudio Sant'Anna      7
Vinicius Durelli       7
Glauco Carneiro        7
Paulo Junior           6
Valter Camargo         6
Ricardo Terra          6
Marco Gerosa           6
Renato Novais          6
Eduardo Figueiredo     6
Bruno Cafeo            5
Aline Vasconcelos      5
Name: authors, dtype: int64

# Authors colaborations

In [5]:
authors.apply(lambda x: list(combinations(sorted(x), 2))).explode().value_counts().head(20)

(Claudia Werner, Marcelo Schots)        9
(Claudia Werner, Leonardo Murta)        5
(Marco Valente, Ricardo Terra)          5
(Heitor Costa, Paulo Junior)            4
(Rafael Durelli, Vinicius Durelli)      4
(Rosangela Penteado, Valter Camargo)    4
(Claudio Sant'Anna, Marcos Dosea)       4
(Denis Pinheiro, Roberto Bigonha)       3
(Igor Steinmacher, Igor Wiese)          3
(Claudia Werner, Rodrigo Santos)        3
(Matheus Viana, Rosangela Penteado)     3
(Dalton Guerrero, Jorge Figueiredo)     3
(Marcelo Maia, Roberto Bigonha)         3
(Denis Pinheiro, Marcelo Maia)          3
(Aline Vasconcelos, Claudia Werner)     3
(Eduardo Figueiredo, Marco Valente)     3
(Glauco Carneiro, Manoel Mendonca)      3
(Marcelo Schots, Marlon Silva)          3
(Leonardo Murta, Marcelo Schots)        3
(Claudia Werner, Marlon Silva)          3
Name: authors, dtype: int64

# Institutions

In [6]:
institutions = df['institutions'].apply(unidecode).str.split('; ')
institutions

0                              [UFRJ]
1                               [UCB]
2                               [USP]
3                       [USP, UFSCar]
4                          [UNIGOIAS]
                    ...              
226                       [PUC Minas]
227                            [UFMS]
228    [UFJF, UFF, UC Irvine, UNIRIO]
229                             [UFC]
230                            [UFPE]
Name: institutions, Length: 231, dtype: object

# Top institutions

In [7]:
institutions.explode().value_counts().head(20)

UFMG         37
UFBA         26
USP          25
UFRJ         23
UFLA         17
UFSCar       15
UFMS         14
IFBA         11
UNIFACS      10
UFU          10
UFF           9
UFC           8
UFSJ          8
UFPA          8
UNIFOR        8
UFS           6
UFRN          6
UTFPR         6
PUC Minas     6
UFPE          5
Name: institutions, dtype: int64

# Institutions collaborations

In [8]:
institutions.apply(lambda x: list(combinations(sorted(x), 2))).explode().value_counts().head(20)

(IFBA, UFBA)            8
(UFLA, UFMG)            6
(UFBA, UFS)             6
(UFBA, UNIFACS)         6
(UFSCar, USP)           4
(UFMG, UFMS)            4
(UFLA, UFSJ)            4
(UFMG, UFU)             4
(CEFET-MG, UFMG)        4
(IFBA, UNIFACS)         3
(IFS, UFBA)             3
(UFF, UFRJ)             3
(USP, UTFPR)            3
(CEFET Campos, UFRJ)    3
(UFC, UNIFOR)           2
(UFPA, UTFPR)           2
(Fraunhofer, UFBA)      2
(UFF, UNIRIO)           2
(UERJ, UFRJ)            2
(IPT-SP, NAU)           2
Name: institutions, dtype: int64

# Titles

In [9]:
# @TODO: unificar o idioma
# @TODO: unificar plural e singular (stemming)
titles = df['title'].str.upper()\
                   .apply(unidecode)\
                   .apply(RegexpTokenizer(r'\w+').tokenize)\
                   .apply(lambda x: [word for word in x if word.lower() not in (stopwords.words('english') + stopwords.words('portuguese')) + ['sobre', 'atraves']])
titles

0      [TOWARDS, COMPONENT, BASED, SOFTWARE, MAINTENA...
1      [DOCUMENTACAO, ESSENCIAL, MANUTENCAO, SOFTWARE...
2         [EVOLUCAO, ORIENTADA, ASPECTOS, FRAMEWORK, OO]
3          [EVOLUTION, PROCESS, APPLICATION, FRAMEWORKS]
4            [EXPERIENCIA, ENSINO, MANUTENCAO, SOFTWARE]
                             ...                        
226    [ENTENDENDO, ENGAJAMENTO, COMUNIDADES, FRONT, ...
227           [EXPLORING, PULL, REQUESTS, CODE, SAMPLES]
228    [MERGE, NATURE, TOOL, SUPPORT, RESEARCH, MERGE...
229    [PERCEPTIONS, DIFFICULTIES, SOFTWARE, ENGINEER...
230    [USING, CONTROLLERS, ADAPT, MESSAGING, SYSTEMS...
Name: title, Length: 231, dtype: object

# Top bigrams from titles

In [10]:
                
titles.apply(lambda x: set(bigrams(x))).explode().value_counts().head(20)


(MANUTENCAO, SOFTWARE)      13
(ESTUDO, PRELIMINAR)         7
(CODE, SMELLS)               6
(CODIGO, FONTE)              6
(PROJETOS, SOFTWARE)         6
(SOFTWARE, EVOLUTION)        5
(EVOLUCAO, SOFTWARE)         5
(ORIENTADOS, OBJETOS)        4
(SISTEMAS, LEGADOS)          4
(ESTUDO, EXPLORATORIO)       4
(ESTUDO, CASO)               4
(CASOS, USO)                 4
(LARGA, ESCALA)              4
(ESTUDO, EMPIRICO)           4
(EMPIRICAL, STUDY)           3
(ARCHITECTURE, RECOVERY)     3
(DIVIDA, TECNICA)            3
(ORIENTADO, OBJETOS)         3
(ESTUDO, LARGA)              3
(ENGENHARIA, REVERSA)        3
Name: title, dtype: int64

# Top combinations from titles

In [11]:
titles.apply(lambda x: set(combinations(sorted(x), 2))).explode().value_counts().head(20)

(MANUTENCAO, SOFTWARE)         18
(PROJETOS, SOFTWARE)            9
(EVOLUCAO, SOFTWARE)            8
(ESTUDO, PRELIMINAR)            7
(SOFTWARE, VISUALIZATION)       6
(APIS, ESTUDO)                  6
(CODE, SMELLS)                  6
(ANALISE, SOFTWARE)             6
(CODIGO, FONTE)                 6
(MODELOS, SOFTWARE)             6
(EVOLUTION, SOFTWARE)           5
(SOFTWARE, VISUALIZACAO)        5
(DESENVOLVIMENTO, SOFTWARE)     5
(ESTUDO, SOFTWARE)              5
(ESCALA, LARGA)                 4
(SOFTWARE, SOFTWARE)            4
(OBJETOS, ORIENTADOS)           4
(MANUTENCAO, SISTEMAS)          4
(METRICAS, SOFTWARE)            4
(EMPIRICO, ESTUDO)              4
Name: title, dtype: int64