In [100]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

Reference: "Text Summarization in NLP" (Topcoder Thrive article) — https://www.topcoder.com/thrive/articles/text-summarization-in-nlp

In [107]:
import string
from typing import List


# Cache for the removal set so it is built once instead of once per document
# (process_text is applied to every row of a DataFrame).
_REMOVAL_TOKENS_CACHE: dict = {}


def _removal_tokens() -> frozenset:
    """Return the cached set of lowercase tokens that process_text discards."""
    if "english" not in _REMOVAL_TOKENS_CACHE:
        # string.punctuation already contains ( ) , : ; & " and ', so the
        # separate literal list of those characters is not needed.
        _REMOVAL_TOKENS_CACHE["english"] = frozenset(
            nltk.corpus.stopwords.words("english")
        ) | frozenset(string.punctuation)
    return _REMOVAL_TOKENS_CACHE["english"]


def process_text(text: str) -> List[str]:
    """Tokenize ``text`` and return its lowercase tokens without stopwords.

    The text is split with ``nltk.tokenize.word_tokenize``; each token is
    lowercased and dropped when it exactly equals an English NLTK stopword or
    a single character from ``string.punctuation``. Multi-character tokens
    that merely contain punctuation (e.g. ``"'s"`` or ``"==route"``) are kept.

    Args:
        text: Raw document text.

    Returns:
        A list of lowercase tokens in their original order (duplicates kept).
    """
    removal = _removal_tokens()
    lowered = (token.lower() for token in nltk.tokenize.word_tokenize(text))
    return [token for token in lowered if token not in removal]

def read_prep_data(filename: str, text_col: str) -> pd.DataFrame:
    """Load a JSON or CSV file and add a ``"tokenized"`` column.

    The reader is chosen from the file extension (case-insensitive):
    ``.json`` uses ``pd.read_json`` and ``.csv`` uses ``pd.read_csv``.
    ``process_text`` is then applied to every value of ``text_col`` and the
    resulting token lists are stored in a new ``"tokenized"`` column.

    Args:
        filename: Path of the file to load; must end in ``.json`` or ``.csv``.
        text_col: Name of the column holding the raw text to tokenize.

    Returns:
        The loaded DataFrame with the extra ``"tokenized"`` column.

    Raises:
        ValueError: If the file extension is neither ``json`` nor ``csv``.
        KeyError: If ``text_col`` is not a column of the loaded data.
    """
    file_format = filename.rsplit(".", 1)[-1].lower()

    if file_format == "json":
        df = pd.read_json(filename)
    elif file_format == "csv":
        df = pd.read_csv(filename)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``df``.
        raise ValueError(
            f"Unsupported file format {file_format!r} for {filename!r}; "
            "expected a .json or .csv file"
        )

    if text_col not in df.columns:
        raise KeyError(
            f"Column {text_col!r} not found; available columns: {list(df.columns)}"
        )

    # Tokenize column-wise: only the text column is needed, so there is no
    # reason to build a full row object per article with axis=1.
    df["tokenized"] = df[text_col].apply(process_text)

    return df

In [89]:
# Load a single sample shard and tokenize its "text" column, then preview it.
filename = "text_data/00c2bfc7-57db-496e-9d5c-d62f8d8119e3.json/00c2bfc7-57db-496e-9d5c-d62f8d8119e3.json"
test = read_prep_data(filename=filename, text_col="text")
test.head(5)

Unnamed: 0,id,text,title,tokenized
0,7751000,M-137 was a state trunkline highway in the US ...,M-137 (Michigan highway),"[m-137, state, trunkline, highway, us, state, ..."
1,7751001,"In sociology, dynamic density refers to the co...",Dynamic density,"[sociology, dynamic, density, refers, combinat..."
2,7751042,"Bert Robert Shepard (June 20, 1920 – June 16, ...",Bert Shepard,"[bert, robert, shepard, june, 20, 1920, –, jun..."
3,7751048,"Marc Fein (born Marc Alan Fein October 21, 196...",Marc Fein,"[marc, fein, born, marc, alan, fein, october, ..."
4,7751062,Ghelamco Arena panorama indoor. The Ghelamco A...,Ghelamco Arena,"[ghelamco, arena, panorama, indoor, ghelamco, ..."


In [123]:
# Tally how often each token occurs in the first article (row label 0).
data = {}
for token in test.loc[0, "tokenized"]:
    data[token] = data.get(token, 0) + 1
data

{'m-137': 12,
 'state': 12,
 'trunkline': 2,
 'highway': 11,
 'us': 6,
 'michigan': 5,
 'served': 1,
 'spur': 1,
 'route': 3,
 'interlochen': 9,
 'center': 4,
 'arts': 4,
 'park': 7,
 'started': 1,
 'south': 5,
 'ran': 2,
 'north': 5,
 'two': 1,
 'lakes': 2,
 'area': 2,
 'community': 2,
 '31': 5,
 'grand': 3,
 'traverse': 3,
 'county': 4,
 'first': 2,
 'shown': 1,
 'without': 1,
 'number': 1,
 'label': 2,
 'maps': 3,
 '1930': 2,
 'labeled': 1,
 'extension': 1,
 'next': 1,
 'year': 1,
 "'s": 2,
 'current': 1,
 'routing': 2,
 'established': 1,
 '1950s': 1,
 'jurisdiction': 3,
 'roadway': 4,
 'transferred': 2,
 'department': 2,
 'transportation': 2,
 'mdot': 4,
 'road': 7,
 'commission': 1,
 'june': 3,
 '2020': 5,
 'designation': 3,
 'decommissioned': 1,
 'process': 1,
 'signage': 2,
 'removed': 2,
 'august': 2,
 'reflect': 1,
 'changeover': 1,
 '==route': 1,
 'description==': 1,
 'began': 1,
 'southern': 1,
 'end': 1,
 'intersection': 1,
 'vagabond': 1,
 'lane': 1,
 'farther': 1,
 'conti

In [103]:
# Show the token list of the first row as a one-element object array.
test["tokenized"].iloc[:1].to_numpy()

array([list(['m-137', 'state', 'trunkline', 'highway', 'us', 'state', 'michigan', 'served', 'spur', 'route', 'interlochen', 'center', 'arts', 'interlochen', 'state', 'park', 'started', 'south', 'park', 'ran', 'north', 'two', 'lakes', 'area', 'community', 'interlochen', 'us', 'highway', '31', 'us', '31', 'grand', 'traverse', 'county', 'highway', 'first', 'shown', 'without', 'number', 'label', 'maps', '1930', 'labeled', 'extension', 'next', 'year', 'highway', "'s", 'current', 'routing', 'established', '1950s', 'jurisdiction', 'roadway', 'transferred', 'michigan', 'department', 'transportation', 'mdot', 'grand', 'traverse', 'county', 'road', 'commission', 'june', '2020', 'highway', 'designation', 'decommissioned', 'process', 'signage', 'removed', 'august', '2020', 'reflect', 'changeover', '==route', 'description==', 'm-137', 'began', 'southern', 'end', 'interlochen', 'state', 'park', 'intersection', 'vagabond', 'lane', 'farther', 'south', 'roadway', 'continues', 'toward', 'green', 'lake',

In [106]:
from itertools import chain

# Flatten the token lists of the first two rows into one flat list of tokens.
master_list = list(
    chain.from_iterable(test["tokenized"].iloc[:2])
)
master_list

['m-137',
 'state',
 'trunkline',
 'highway',
 'us',
 'state',
 'michigan',
 'served',
 'spur',
 'route',
 'interlochen',
 'center',
 'arts',
 'interlochen',
 'state',
 'park',
 'started',
 'south',
 'park',
 'ran',
 'north',
 'two',
 'lakes',
 'area',
 'community',
 'interlochen',
 'us',
 'highway',
 '31',
 'us',
 '31',
 'grand',
 'traverse',
 'county',
 'highway',
 'first',
 'shown',
 'without',
 'number',
 'label',
 'maps',
 '1930',
 'labeled',
 'extension',
 'next',
 'year',
 'highway',
 "'s",
 'current',
 'routing',
 'established',
 '1950s',
 'jurisdiction',
 'roadway',
 'transferred',
 'michigan',
 'department',
 'transportation',
 'mdot',
 'grand',
 'traverse',
 'county',
 'road',
 'commission',
 'june',
 '2020',
 'highway',
 'designation',
 'decommissioned',
 'process',
 'signage',
 'removed',
 'august',
 '2020',
 'reflect',
 'changeover',
 '==route',
 'description==',
 'm-137',
 'began',
 'southern',
 'end',
 'interlochen',
 'state',
 'park',
 'intersection',
 'vagabond',
 'la