<a href="https://colab.research.google.com/github/cicyfan/python-vis/blob/master/IEEE-conference-abstract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IEEE Conference Abstract Analysis

In this project, I focus on a textual data set and a set of questions that require grouping documents according to their content, summarizing their content with keywords and seeing how the content changes or depends on specific factors (metadata) contained in the data set.

The dataset contains information on IEEE Visualization (IEEE VIS) publications from 1990-2018. IEEE VIS is the premier conference on data visualization, and every year it publishes papers from top researchers in the field. A lot of cutting-edge research is published there. You can find more information about the conference at ieeevis.org.

You can access the data [here](https://drive.google.com/file/d/1matX14PQugGfJLdSXUF1iZFKhDo_eDyD/view?usp=sharing).
You can gather more information about the dataset here: https://sites.google.com/site/vispubdata/home.



In [0]:
import pandas as pd
import numpy as np
import gensim, spacy
from gensim.utils import simple_preprocess
import nltk

import altair as alt

nltk.download('stopwords')
from nltk.corpus import stopwords

import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer

from gensim.matutils import corpus2dense, corpus2csc
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# Location of the vispubdata CSV export.
# NOTE(review): the path points into a Colab Google Drive mount — presumably Drive must be mounted first; confirm.
DATA_CSV_PATH = '/content/drive/My Drive/data/IEEE VIS papers 1990-2018 - Main dataset.csv'
df = pd.read_csv(DATA_CSV_PATH)

## Conference Publication Over the Years

In [0]:
# Count papers per (Year, Conference); after count() the 'Title' column holds the number of papers.
year = df.groupby(['Year', 'Conference'])['Title'].count().reset_index()
# Stacked bars normalized to 100% within each year, colored by conference.
bars = alt.Chart().mark_bar().encode(
    x=alt.X('Year:O', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Title:Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color='Conference:N',
)

# Text layer derived from `bars` (it inherits the bar encodings) that prints the raw paper count.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('Title:Q', format='.0f')
)

# Neither layer carries its own data; the shared frame is supplied when the layers are combined.
alt.layer(bars, texts, data=year).properties(
    width=500,
    height=300
)

## Define Process Functions

In [0]:
# Keep only the columns used by the text and author analyses below.
df1 = df[['Year', 'Conference', 'AuthorNames', 'Abstract', 'Title']]

# Papers from 2012-2018 across all conferences.
# The years are compared numerically: Series.isin() treats the string '2012' and the
# integer 2012 as different values, so string literals select no rows when the CSV's
# Year column is parsed as integers. pd.to_numeric also covers a string-typed column.
RECENT_YEARS = list(range(2012, 2019))
all_tracks = df1[pd.to_numeric(df1['Year'], errors='coerce').isin(RECENT_YEARS)]

# Per-conference subsets (all years), dropping papers that have no abstract.
infovis = df1[df1['Conference']=='InfoVis'].reset_index().dropna(subset=['Abstract'])
scivis = df1[df1['Conference']=='SciVis'].reset_index().dropna(subset=['Abstract'])
vast = df1[df1['Conference']=='VAST'].reset_index().dropna(subset=['Abstract'])

In [0]:
# Extra words to ignore on top of NLTK's English stop-word list.
EXTRA_STOP_WORDS = [
    'com', 'from', 'subject', 're', 'edu', 'use', 'not', 'would',
    'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do',
    'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
    'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem',
    'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come',
]
stop_words = stopwords.words('english') + EXTRA_STOP_WORDS

In [0]:
# Loaded spaCy pipelines keyed by model name, so that repeated calls to
# get_words_ready() do not reload the model every time.
_SPACY_PIPELINES = {}


def _load_spacy_pipeline(name='en'):
  """Return the spaCy pipeline `name` (parser and NER disabled), loading it only once."""
  if name not in _SPACY_PIPELINES:
    _SPACY_PIPELINES[name] = spacy.load(name, disable=['parser', 'ner'])
  return _SPACY_PIPELINES[name]


def get_words_ready(df, column, min_tfidf=0.1):
  """Tokenize, lemmatize and TF-IDF-weight the text in `df[column]`.

  Args:
    df: DataFrame holding one document per row.
    column: name of the text column to process.
    min_tfidf: a term is kept only if its TF-IDF weight exceeds this value
      in at least one document.

  Returns:
    A tuple `(id2word, corpus, X, wordtfidf)`:
      id2word   -- gensim Dictionary built from the lemmatized documents.
      corpus    -- bag-of-words corpus (one `doc2bow` list per document).
      X         -- dense TF-IDF array restricted to the kept terms, with one
                   row per term and one column per document.
      wordtfidf -- DataFrame view of `X.T`: one row per document and one
                   column per kept term, labelled with the term itself.
  """
  # The module-level stop-word list is read at call time (it is extended later in
  # the notebook); a set gives constant-time membership tests.
  stop_set = set(stop_words)
  allowed_postags = {'NOUN', 'ADJ', 'VERB', 'ADV'}

  tokenized_docs = [simple_preprocess(sentence) for sentence in df[column]]
  data_words = [[word for word in doc if word not in stop_set] for doc in tokenized_docs]

  # Only the tagger is needed for lemmatization and POS filtering.
  nlp = _load_spacy_pipeline('en')
  data_ready = []
  for sent in data_words:
    doc = nlp(" ".join(sent))
    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
  # Remove stop words once more after lemmatization.
  data_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in data_ready]

  # Dictionary and bag-of-words corpus (term-document frequencies).
  id2word = corpora.Dictionary(data_ready)
  corpus = [id2word.doc2bow(text) for text in data_ready]

  num_docs = id2word.num_docs
  num_terms = len(id2word.keys())

  model = TfIdfTransformer(dictionary=id2word)
  tfidf_corpus = model.fit_transform(corpus)

  # Dense TF-IDF array with one row per term and one column per document.
  corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

  # Keep the terms whose weight exceeds the threshold in at least one document.
  keep = corpus_tfidf_dense.max(axis=1) > min_tfidf
  X = corpus_tfidf_dense[keep]

  words = [id2word[term_id] for term_id in id2word.keys()]
  kept_words = [word for word, kept in zip(words, keep) if kept]
  wordtfidf = pd.DataFrame(data=X.T, columns=kept_words)

  return id2word, corpus, X, wordtfidf


In [0]:
def keyword_tfidf(wordtfidf):
  """Rank words by their mean TF-IDF weight.

  Args:
    wordtfidf: DataFrame with one column per word; each column is averaged
      over all rows.

  Returns:
    A list of `[word, mean_tfidf]` pairs ordered from the highest mean to
    the lowest.
  """
  mean_scores = wordtfidf.mean(axis=0)
  # sorted() is stable, so equally scored words keep their column order.
  ranked = sorted(
      zip(mean_scores.index, mean_scores.values),
      key=lambda pair: pair[1],
      reverse=True,
  )
  return [[word, score] for word, score in ranked]

## Top 10 Keywords

In [0]:
# Top-10 keywords by mean TF-IDF over the abstracts in `all_tracks`.
id2word, corpus, X, wordtfidf = get_words_ready(all_tracks, 'Abstract')
group_key_words = keyword_tfidf(wordtfidf)
keyword_df = (
    pd.DataFrame(data=group_key_words, columns=['keyword', 'tfidf'])
    .sort_values(by='tfidf', ascending=False)
)
alt.Chart(keyword_df.head(10)).mark_bar().encode(
    alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.025])),
    alt.Y('keyword:N', sort='-x'),
).properties(
    title='All Tracks',
    width=200,
)

Sorting words by their raw frequency of occurrence is not really effective; the chart below shows the raw counts for comparison with the TF-IDF ranking above.

In [0]:
# Raw term counts over the whole corpus, for comparison with the TF-IDF ranking.
# The bag-of-words corpus is iterated directly: wrapping the ragged list of documents
# in np.array() raises ValueError on NumPy >= 1.24, and yields a numeric 3-D array
# (breaking the inner loop) when all documents happen to have the same length.
word_freq = [
    [term_id, id2word[term_id], count]
    for doc in corpus
    for term_id, count in doc
]

wordfreq = pd.DataFrame(word_freq, columns=['id', 'word', 'frequency'])
# Sum only the counts; adding up the term ids per word has no meaning.
wordfreq = (
    wordfreq.groupby('word', as_index=False)['frequency']
    .sum()
    .sort_values('frequency', ascending=False)
)
alt.Chart(wordfreq.head(10)).mark_bar().encode(
    x=alt.X('frequency:Q'),
    y=alt.Y('word:N', sort='-x')
).properties(
    width=200
)

In [0]:
# Top-10 keywords by mean TF-IDF over the InfoVis abstracts.
id2word, corpus, X, wordtfidf = get_words_ready(infovis, 'Abstract')
group_key_words = keyword_tfidf(wordtfidf)
keyword_df = (
    pd.DataFrame(data=group_key_words, columns=['keyword', 'tfidf'])
    .sort_values(by='tfidf', ascending=False)
)
alt.Chart(keyword_df.head(10)).mark_bar().encode(
    alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.025])),
    alt.Y('keyword:N', sort='-x'),
).properties(
    title='InfoVis Track',
    width=200,
)

In [0]:

# Top-10 keywords by mean TF-IDF over the SciVis abstracts.
id2word, corpus, X, wordtfidf = get_words_ready(scivis, 'Abstract')
group_key_words = keyword_tfidf(wordtfidf)
keyword_df = (
    pd.DataFrame(data=group_key_words, columns=['keyword', 'tfidf'])
    .sort_values(by='tfidf', ascending=False)
)
alt.Chart(keyword_df.head(10)).mark_bar().encode(
    alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.025])),
    alt.Y('keyword:N', sort='-x'),
).properties(
    title='SciVis Track',
    width=200,
)

In [0]:
# Top-10 keywords by mean TF-IDF over the VAST abstracts.
id2word, corpus, X, wordtfidf = get_words_ready(vast, 'Abstract')
group_key_words = keyword_tfidf(wordtfidf)
keyword_df = (
    pd.DataFrame(data=group_key_words, columns=['keyword', 'tfidf'])
    .sort_values(by='tfidf', ascending=False)
)
alt.Chart(keyword_df.head(10)).mark_bar().encode(
    alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.025])),
    alt.Y('keyword:N', sort='-x'),
).properties(
    title='VAST Track',
    width=200,
)

## Top 3 Keywords Over the Years

In [0]:
# Exclude two more words from the per-year keyword rankings below.
# NOTE(review): 'datum' is presumably the lemma produced for "data" — confirm.
stop_words += ['datum', 'model']

In [0]:
# Collect the three highest-ranked keywords for every (track, year) pair.
# Rows are appended track by track, year by year, in rank order 3, 2, 1.
# The loop variables are deliberately not named `year`/`track`: `year` already
# holds the publications-per-year DataFrame built earlier in the notebook, and
# reusing the name would silently replace it.
words_year = []
for track_df in (infovis, scivis, vast):
  for yr in [2012, 2013, 2014, 2015, 2016, 2017, 2018]:
    id2word, corpus, X, wordtfidf = get_words_ready(track_df[track_df['Year']==yr], 'Abstract')
    group_key_words = keyword_tfidf(wordtfidf)
    for rank in (3, 2, 1):
      word, score = group_key_words[rank - 1]
      words_year.append([yr, str(rank), word, score])
    

In [0]:
words_year_df = pd.DataFrame(words_year, columns=['Year', "Rank", "Word", "tfidf"])
# `words_year` was filled track by track in this order, so each track owns an
# equally sized contiguous run of rows. The run length is derived from the data
# rather than hard-coded (7 years x 3 ranks = 21 in the original analysis).
track = np.array(['InfoVis', 'SciVis', 'VAST'])
rows_per_track = len(words_year_df) // len(track)
words_year_df['track'] = np.repeat(track, rows_per_track)

# Horizontal stacked bars: one bar per year, one segment per keyword rank.
base = alt.Chart().mark_bar().encode(
    y=alt.Y('Year:O', title='Year', axis=alt.Axis(labelAngle=-0)),
    x=alt.X('sum(tfidf)', title="TF-IDF", stack='zero', scale=alt.Scale(domain=[0.0, 0.13])),
    color=alt.Color('Rank:N', scale=alt.Scale(scheme='tealblues', reverse=True)),
    order=alt.Order('Rank', sort='ascending')
).properties(
    height=300,
    width=400,
    title='Top Keywords Over the Years'
)

# Keyword labels, stacked the same way as the bars and shifted left by `dx`.
texts = alt.Chart().mark_text(
    dx=-35,
    dy=0,
    align='center',
    baseline='middle',
    color='White',
    fontSize=14
).encode(
    y=alt.Y('Year:O', title='Year', axis=alt.Axis(labelAngle=-0)),
    x=alt.X('sum(tfidf)', title="TF-IDF", stack='zero'),
    detail='Rank:N',
    text='Word:N',
    order=alt.Order('Rank', sort='ascending')
)

alt.layer(base, texts, data=words_year_df).facet(column='track')

## Cluster Analysis - Abstract 

In [0]:
id2word, corpus, X, wordtfidf = get_words_ready(all_tracks, 'Abstract')
group_key_words = keyword_tfidf(wordtfidf)

# X has one row per term, so X.T gives one row per abstract. t-SNE is stochastic;
# fixing random_state keeps the embedding, and the k-means clusters computed from
# it, reproducible across runs.
result_tsne = TSNE(n_components=2, perplexity=10, random_state=1).fit_transform(X.T)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_circle().encode(
    x='x:Q',
    y='y:Q'
)


In [0]:
# Cluster the 2-D t-SNE coordinates into `cluster` groups and store each point's label.
cluster = 3
kmeans = KMeans(n_clusters=cluster, random_state=1)
kmeans.fit(tsne_df[['x','y']])
tsne_df['label'] = kmeans.labels_

alt.Chart(tsne_df).mark_circle(opacity=.7).encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
)

In [0]:
# Top keywords (by mean TF-IDF) of each k-means cluster.
TOP_N = 10
group_key_words = []
for label in range(cluster):
    # Reuse keyword_tfidf so clusters are ranked exactly like the per-track charts.
    # Slicing (instead of indexing 0..9) also copes with fewer than TOP_N terms.
    ranked = keyword_tfidf(wordtfidf[kmeans.labels_ == label])
    group_key_words.extend([label, word, score] for word, score in ranked[:TOP_N])

keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

# One bar chart per cluster, concatenated side by side.
chart = alt.hconcat()
for label in range(cluster):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.05]), title='cluster'+str(label)),
        y=alt.Y('keyword:N', sort='-x')
    ).properties(
        width=100
    )

chart

In [0]:
# Attach the abstract-based cluster label to each paper. This relies on `kmeans.labels_`
# having one entry per row of `all_tracks`, in the same order.
# The title-clustering section later writes to this same 'label' column.
all_tracks = all_tracks.assign(label=kmeans.labels_)

In [0]:
# Share of each cluster label within every conference (bars normalized to 100%).
bars = alt.Chart().mark_bar().encode(
    x=alt.X('Conference:N', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
    order=alt.Order('count(label):Q', sort='descending')
)

# Text layer derived from `bars` that prints the number of papers in each segment.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='.0f')
)

# Neither layer carries its own data; the shared frame is supplied here.
alt.layer(bars, texts, data=all_tracks).properties(
    width=300,
    height=300
)#.facet(facet='Year', columns=3)

In [0]:
# Share of each cluster label per year, faceted by conference.
# Note: despite its name, `bars` is an area mark in this cell.
bars = alt.Chart().mark_area().encode(
    x=alt.X('Year:O', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
)

# Text layer derived from `bars` that prints the paper count for each label and year.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='.0f')
)

# Layer first, then facet: one panel per conference, three panels per row.
alt.layer(bars, texts, data=all_tracks).properties(
    width=200,
    height=200
).facet(facet='Conference', columns=3)

### Author Analysis

In [0]:
# One row per (paper, author): split the ';'-separated author string and explode it.
author_df = (
    all_tracks
    .assign(Author=all_tracks['AuthorNames'].str.split(';'))
    .explode('Author')
)
# The ten authors with the most papers that have a non-null abstract.
top10_author = (
    author_df.groupby(['Author'])['Abstract']
    .count()
    .reset_index()
    .sort_values('Abstract', ascending=False)
    .head(10)['Author']
    .tolist()
)

In [0]:
# Share of each cluster label among the papers of the top-10 authors.
# `bars` and `texts` defined here are reused by the faceted chart in the next cell.
bars = alt.Chart().mark_bar().encode(
    x=alt.X('Author:N', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
    order=alt.Order('count(label):Q', sort='descending')
)

# Text layer with the paper count per segment; note the empty number format here,
# whereas the other count labels in this notebook use '.0f'.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='')
)

# Restrict the exploded author frame to the top-10 authors before plotting.
alt.layer(bars, texts, data=author_df[author_df['Author'].isin(top10_author)]).properties(
    width=600,
    height=300
)#.facet(facet='Year', columns=3)

In [0]:
# Same author chart as the previous cell, split into one panel per conference.
# Depends on `bars`, `texts`, `author_df` and `top10_author` from the cells above.
alt.layer(bars, texts, data=author_df[author_df['Author'].isin(top10_author)]).properties(
    width=300,
    height=200
).facet(facet='Conference', columns=3)

## Cluster Analysis - Title

In [0]:
id2word, corpus, X, wordtfidf = get_words_ready(all_tracks, 'Title')
group_key_words = keyword_tfidf(wordtfidf)

# X has one row per term, so X.T gives one row per title. t-SNE is stochastic;
# fixing random_state keeps the embedding, and the k-means clusters computed from
# it, reproducible across runs.
result_tsne = TSNE(n_components=2, perplexity=10, random_state=1).fit_transform(X.T)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])

alt.Chart(tsne_df).mark_circle().encode(
    x='x:Q',
    y='y:Q'
)


In [0]:
# Cluster the 2-D t-SNE coordinates of the titles into `cluster` groups and store each point's label.
cluster = 3
kmeans = KMeans(n_clusters=cluster, random_state=1)
kmeans.fit(tsne_df[['x','y']])
tsne_df['label'] = kmeans.labels_

alt.Chart(tsne_df).mark_circle(opacity=.7).encode(
    alt.X('x:Q'),
    alt.Y('y:Q'),
    alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
)

In [0]:
# Top keywords (by mean TF-IDF) of each k-means cluster of titles.
TOP_N = 10
group_key_words = []
for label in range(cluster):
    # Reuse keyword_tfidf so clusters are ranked exactly like the per-track charts.
    # Slicing (instead of indexing 0..9) also copes with fewer than TOP_N terms.
    ranked = keyword_tfidf(wordtfidf[kmeans.labels_ == label])
    group_key_words.extend([label, word, score] for word, score in ranked[:TOP_N])

keyword_df = pd.DataFrame(data=group_key_words, columns=['label', 'keyword', 'tfidf'])

# One bar chart per cluster, concatenated side by side.
chart = alt.hconcat()
for label in range(cluster):
    chart |= alt.Chart(keyword_df[keyword_df['label']==label]).mark_bar().encode(
        x=alt.X('tfidf:Q', scale=alt.Scale(domain=[0, 0.05]), title='cluster'+str(label)),
        y=alt.Y('keyword:N', sort='-x')
    ).properties(
        width=100
    )

chart

In [0]:
# Attach the title-based cluster label to each paper. This overwrites the 'label'
# column that the abstract-clustering section stored on `all_tracks` earlier.
all_tracks = all_tracks.assign(label=kmeans.labels_)

In [0]:
# Share of each title-cluster label within every conference (bars normalized to 100%).
bars = alt.Chart().mark_bar().encode(
    x=alt.X('Conference:N', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
    order=alt.Order('count(label):Q', sort='descending')
)

# Text layer derived from `bars` that prints the number of papers in each segment.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='.0f')
)

# Neither layer carries its own data; the shared frame is supplied here.
alt.layer(bars, texts, data=all_tracks).properties(
    width=300,
    height=300
)#.facet(facet='Year', columns=3)

In [0]:
# Share of each title-cluster label per year, faceted by conference.
# Note: despite its name, `bars` is an area mark in this cell.
bars = alt.Chart().mark_area().encode(
    x=alt.X('Year:O', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
)

# Text layer derived from `bars` that prints the paper count for each label and year.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='.0f')
)

# Layer first, then facet: one panel per conference, three panels per row.
alt.layer(bars, texts, data=all_tracks).properties(
    width=200,
    height=200
).facet(facet='Conference', columns=3)

### Author Analysis

In [0]:
# One row per (paper, author): split the ';'-separated author string and explode it.
author_df = (
    all_tracks
    .assign(Author=all_tracks['AuthorNames'].str.split(';'))
    .explode('Author')
)
# The ten authors with the most papers that have a non-null title.
top10_author = (
    author_df.groupby(['Author'])['Title']
    .count()
    .reset_index()
    .sort_values('Title', ascending=False)
    .head(10)['Author']
    .tolist()
)

In [0]:
# Share of each title-cluster label among the papers of the top-10 authors.
# `bars` and `texts` defined here are reused by the faceted chart in the next cell.
bars = alt.Chart().mark_bar().encode(
    x=alt.X('Author:N', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count(label):Q', stack="normalize", axis=alt.Axis(format='%'), title='Percentage of Publication'),
    color=alt.Color('label:N', scale=alt.Scale(scheme='tableau20')),
    order=alt.Order('count(label):Q', sort='descending')
)

# Text layer with the paper count per segment; note the empty number format here,
# whereas the other count labels in this notebook use '.0f'.
texts = bars.mark_text(
    align='center',
    baseline='bottom',
).encode(
    text=alt.Text('count(label):Q', format='')
)

# Restrict the exploded author frame to the top-10 authors before plotting.
alt.layer(bars, texts, data=author_df[author_df['Author'].isin(top10_author)]).properties(
    width=600,
    height=300
)

In [0]:
# Same author chart as the previous cell, split into one panel per conference.
# Depends on `bars`, `texts`, `author_df` and `top10_author` from the cells above.
alt.layer(bars, texts, data=author_df[author_df['Author'].isin(top10_author)]).properties(
    width=300,
    height=200,
).facet(facet='Conference', columns=3)