In [2]:
import re
import string
# natural language toolkit
import nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [4]:
# Fetch the tokenizer model and the stop-word corpus used by cleaning();
# nltk reports "already up-to-date" when they are cached locally.
nltk.download('punkt')
nltk.download('stopwords')

# Stored as a frozenset: cleaning() runs a membership test for every token, and
# scanning a list each time is needlessly slow. Membership results are unchanged.
# NOTE(review): no language is passed to words(), which should return the stop
# words of every language shipped with the corpus - confirm whether 'english'
# was intended. Narrowing it would change the keyword ranks used further down.
STOP_WORDS = frozenset(stopwords.words())

[nltk_data] Downloading package punkt to /Users/simon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/simon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def cleaning(text):
    """
    Normalise a piece of text for keyword counting and matching.

    Steps, in order:
      1. convert to lowercase;
      2. remove URLs, HTML-style tags, ASCII punctuation and the
         typographic characters ’ “ ” …;
      3. tokenize with nltk's word_tokenize;
      4. drop every token found in the module-level STOP_WORDS;
      5. re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text (a README line or a paper title).

    Returns
    -------
    str
        The cleaned, space-separated tokens ('' if nothing survives).
    """
    # Lowercase here, as the docstring has always promised. Stop-word removal
    # and the later keyword matching are case-sensitive, so a caller passing
    # mixed-case text would otherwise silently miss matches.
    text = text.lower()

    # Raw strings keep the regex escapes (\S, \.) away from Python's own
    # string-escape processing, which warns about them on recent versions.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than deleting them, so the last word
    # of one line is not fused with the first word of the next.
    text = text.replace('\n', ' ')
    text = re.sub('[’“”…]', '', text)

    # removing the stop-words
    text_tokens = word_tokenize(text)
    return " ".join(word for word in text_tokens if word not in STOP_WORDS)

In [None]:
# README.md taken from https://github.com/iml-wg/HEPML-LivingReview
# Decode explicitly as UTF-8: the text contains non-ASCII characters (cleaning()
# strips ’ “ ” …), so relying on the platform default encoding can fail or
# mis-decode them on some systems.
with open('README.md', 'r', encoding='utf-8') as file:
    data = file.read()

In [None]:
# One row per README line; each line is broken into columns at every ';'.
df = pd.DataFrame(
    data=[readme_line.split(';') for readme_line in data.split('\n')]
)

In [None]:
# Lowercased copy of the first ';'-separated field, stored as column 'text'.
df.loc[:, 'text'] = df[0].str.lower()

In [None]:
# Run every lowercased README line through cleaning(), element by element.
dt = df['text'].map(cleaning)

In [None]:
# Count every whitespace-separated token across all cleaned lines and keep
# the 25 most frequent as (word, count) pairs.
word_count = Counter(
    token for cleaned_line in dt for token in cleaned_line.split()
).most_common(25)

# Tabulate the ranking; row 0 is the most frequent word.
word_frequency = pd.DataFrame(data=word_count, columns=['Word', 'Frequency'])
print(word_frequency)

In [None]:
# READMEtest.md is a CSV prepared by hand from README.md in vim using the
# commands below (explanations precede each command):
#
# 1. delete every line that does not contain '* [' (keep only list entries with a link):
# :g!/\* \[/d
# 2. rewrite each '* [title](...DDDD.digits)...' entry as '"title", DD, DD';
#    the two 2-digit groups become the year and month columns
#    (presumably arXiv-style identifiers - confirm against README.md):
# :%s/* \[\([^]]*\)\]([^0-9]*.\([0-9][0-9]\)\([0-9][0-9]\).[0-9]*).*$/"\1", \2, \3/
# 3. strip leading whitespace from every line:
# :%s/^\s*//
# 4. delete any line that still contains '*' (e.g. entries the substitution did not match):
# :g/\*/d
# 5. add a header line as the first row: title, year, month
#    (the space after each comma ends up in the column names ' year' and ' month')
df2 = pd.read_csv('READMEtest.md')

In [None]:
# Print the column names, dtypes and non-null counts of the parsed titles table.
df2.info()

In [None]:
# watch out for the space in ' year' and ' month' - bad preprocessing
# Lowercase the titles before cleaning, exactly as the README text is lowercased
# before word_frequency is built: the keywords are lowercase and the later
# str.contains() match is case-sensitive, so capitalised titles would be missed.
df2['title-clean'] = df2['title'].str.lower().apply(cleaning)
# Fractional date: the year column as stored (e.g. 22) plus the zero-based
# month expressed as a fraction of a year.
df2['date'] = df2[' year'] + (df2[' month']-1)/12

In [None]:
# Show cleaned title and date for every row whose year is not below 22.
# df2 carries the default unique index from read_csv, so masking the rows is
# equivalent to dropping the below-22 rows by index label.
df2.loc[~(df2[' year'] < 22), ['title-clean', 'date']]

In [None]:
# Display the ranked keywords; the keyword filters below pick entries from
# this column by position (0, 6, 16, 21 and 24).
word_frequency['Word']

In [None]:
# filter by common title keywords
def _titles_with_keyword(rank):
    """Return the rows of df2 whose cleaned title contains a ranked keyword.

    `rank` is the position of the keyword in word_frequency['Word']. Matching
    is done by Series.str.contains with its default settings; rows for which
    the match is explicitly False are dropped by index label.
    """
    keyword = word_frequency['Word'][rank]
    no_match = df2['title-clean'].str.contains(keyword) == False
    return df2.drop(df2[no_match].index)


# One filtered frame per hand-picked keyword rank.
df2filter1, df2filter2, df2filter3, df2filter4, df2filter5 = (
    _titles_with_keyword(rank) for rank in (0, 6, 16, 21, 24)
)

In [None]:
# Histogram of the fractional dates of titles containing the top-ranked keyword.
plt.hist(
    x=df2filter1['date'],
    bins=50,
    density=False,
    alpha=0.5,
    label=word_frequency['Word'][0],
)
plt.xlabel('date')
plt.xlim(7, 23)
plt.legend(loc='upper left')

In [None]:
# Overlay the date distributions of four keywords.
# All series share the same explicit bin edges (32 half-year bins spanning the
# plotted range 7-23). With an integer bin count, matplotlib derives the edges
# from each series' own min/max, so bin widths differ between series and the
# overlaid counts are not comparable.
shared_bins = np.linspace(7, 23, 33)
for frame, rank in (
    (df2filter2, 6),
    (df2filter3, 16),
    (df2filter4, 21),
    (df2filter5, 24),
):
    plt.hist(frame['date'], density=False, bins=shared_bins, alpha=0.5,
             label=word_frequency['Word'][rank])
plt.xlim([7,23])
plt.xlabel('date')
plt.legend(loc='upper left')