In [3]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

Reference article on text summarization in NLP (Topcoder Thrive): <https://www.topcoder.com/thrive/articles/text-summarization-in-nlp>

### Legacy Implementation

In [4]:
import string
from typing import List


def process_text(text:str) -> List[str]:
    """Tokenize ``text`` and drop stop words and punctuation tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    List[str]
        Lower-cased tokens from ``nltk.tokenize.word_tokenize``, in their
        original order, excluding NLTK English stop words and any token that
        is a single ``string.punctuation`` character.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned once per token. string.punctuation already contains
    # ( ) , : ; & " and ', so the extra hand-written list was redundant.
    to_remove = set(nltk.corpus.stopwords.words("english")) | set(string.punctuation)
    # Lower-case each token once, then filter on the lowered form.
    lowered = (token.lower() for token in nltk.tokenize.word_tokenize(text))
    return [token for token in lowered if token not in to_remove]

def read_prep_data(filename:str, text_col:str) -> pd.DataFrame:
    """Load a JSON or CSV file and add a ``tokenized`` column.

    Parameters
    ----------
    filename : str
        Path to the data file. The format is taken from the text after the
        last ``.`` (case-insensitive); ``json`` and ``csv`` are supported.
    text_col : str
        Name of the column holding the raw text to tokenize.

    Returns
    -------
    pd.DataFrame
        The loaded frame with an extra ``tokenized`` column holding the
        result of ``process_text`` for each row's ``text_col`` value.

    Raises
    ------
    ValueError
        If the file extension is neither ``json`` nor ``csv``.
    KeyError
        If ``text_col`` is not a column of the loaded frame.
    """
    readers = {"json": pd.read_json, "csv": pd.read_csv}
    file_format = filename.rsplit('.', 1)[-1].lower()

    # Fail loudly on unknown formats; previously `df` was simply never bound
    # and the function died later with an UnboundLocalError.
    if file_format not in readers:
        raise ValueError(
            f"Unsupported file format {file_format!r} for {filename!r}; "
            "expected one of: " + ", ".join(sorted(readers))
        )
    df = readers[file_format](filename)

    # Only the text column is needed, so apply over that Series instead of
    # materialising every full row with DataFrame.apply(axis=1).
    df["tokenized"] = df[text_col].apply(process_text)

    return df

In [5]:
# Relative path to the JSON data file; `filename` is reused by the later
# pd.read_json cell as well.
filename = "text_data/00c2bfc7-57db-496e-9d5c-d62f8d8119e3.json/00c2bfc7-57db-496e-9d5c-d62f8d8119e3.json"
# Load the file and add the "tokenized" column built from the "text" column.
test = read_prep_data(filename, text_col="text")
test.head()

Unnamed: 0,id,text,title,tokenized
0,7751000,M-137 was a state trunkline highway in the US ...,M-137 (Michigan highway),"[m-137, state, trunkline, highway, us, state, ..."
1,7751001,"In sociology, dynamic density refers to the co...",Dynamic density,"[sociology, dynamic, density, refers, combinat..."
2,7751042,"Bert Robert Shepard (June 20, 1920 – June 16, ...",Bert Shepard,"[bert, robert, shepard, june, 20, 1920, –, jun..."
3,7751048,"Marc Fein (born Marc Alan Fein October 21, 196...",Marc Fein,"[marc, fein, born, marc, alan, fein, october, ..."
4,7751062,Ghelamco Arena panorama indoor. The Ghelamco A...,Ghelamco Arena,"[ghelamco, arena, panorama, indoor, ghelamco, ..."


### Sklearn Implementation using CountVectorizer (much easier)

In [6]:
# Load the same file again as a plain frame, without the legacy tokenization step.
df = pd.read_json(filename)
df.head()

Unnamed: 0,id,text,title
0,7751000,M-137 was a state trunkline highway in the US ...,M-137 (Michigan highway)
1,7751001,"In sociology, dynamic density refers to the co...",Dynamic density
2,7751042,"Bert Robert Shepard (June 20, 1920 – June 16, ...",Bert Shepard
3,7751048,"Marc Fein (born Marc Alan Fein October 21, 196...",Marc Fein
4,7751062,Ghelamco Arena panorama indoor. The Ghelamco A...,Ghelamco Arena


In [47]:
# (rows, columns) of the loaded frame
df.shape

(9982, 3)

In [53]:
# Removes English stop words, grabs unigrams (single words) and bigrams (two words)
cv = CountVectorizer(stop_words='english', ngram_range=(1,2))
# Run it on a small slice only (we have 9,982 rows, and we get a memory error using all).
# NOTE: iloc[0:11, 1] selects 11 rows (positions 0-10) of column position 1 ("text"),
# not 10 -- change the slice to 0:10 if exactly ten rows are wanted.
results = cv.fit_transform(df.iloc[0:11,1])

In [54]:
# Pair every vocabulary entry with its count in the FIRST document only:
# results.toarray() densifies the whole document-term matrix and [0] keeps row 0.
test_df = pd.DataFrame({
    "Features": cv.get_feature_names_out(),
    "Count": results.toarray()[0]
})
# Show the five highest counts for that document
test_df.sort_values(by="Count", ascending=False).head()

Unnamed: 0,Features,Count
33,137,15
3661,state,12
1865,highway,11
2053,interlochen,9
2769,park,7


In [56]:
# One row per feature returned by cv.get_feature_names_out()
test_df.shape

(4142, 2)

In [106]:
def get_counts(cv:CountVectorizer, text:str) -> pd.DataFrame:
    """Fit ``cv`` on a single document and tabulate its term counts.

    Parameters
    ----------
    cv : CountVectorizer
        Vectorizer to use. ``fit_transform`` is called on it, so it is
        re-fitted on ``text`` every time this function runs.
    text : str
        The document to count terms in.

    Returns
    -------
    pd.DataFrame
        Two columns: ``Features`` (the names from
        ``cv.get_feature_names_out()``) and ``Count`` (the matching counts
        for ``text``).
    """
    doc_term_matrix = cv.fit_transform([text])
    return pd.DataFrame({
        "Features": cv.get_feature_names_out(),
        # Only one document was passed in, so row 0 carries all of its counts.
        "Count": doc_term_matrix.toarray()[0],
    })

def rack_em_up(sentence:str, words:pd.DataFrame) -> int:
    """Score a sentence by summing the counts of the features it contains.

    Parameters
    ----------
    sentence : str
        Sentence to score.
    words : pd.DataFrame
        Frame with a ``Features`` column (terms) and a ``Count`` column
        (their counts), as produced by ``get_counts``. Features are expected
        to be unique.

    Returns
    -------
    int
        Sum of ``Count`` over every feature that occurs in ``sentence``.
        Each feature contributes at most once, however many times it
        appears in the sentence. ``0`` when nothing matches.

    Notes
    -----
    Matching is a case-insensitive substring test, so a feature also matches
    inside a longer word (e.g. ``"art"`` in ``"party"``).
    """
    # CountVectorizer lower-cases its features by default while sentences keep
    # their original casing; compare on lowered text so that "Interlochen" in
    # the sentence matches the feature "interlochen".
    haystack = sentence.lower()
    # Walk both columns in step instead of re-filtering the whole frame with a
    # boolean mask for every word (which made the loop quadratic).
    total = sum(
        count
        for word, count in zip(words["Features"], words["Count"])
        if word.lower() in haystack
    )
    # Counts are numpy integers; convert so the declared return type holds.
    return int(total)

In [105]:
# One vectorizer shared by every document; get_counts() calls fit_transform
# on it, so it is re-fitted on each row's text.
cv_summ = CountVectorizer(stop_words='english', ngram_range=(1,2))

# Collects one DataFrame of scored sentences per document.
docs = []

# range(1) restricts the run to the first row of df only.
for i in range(1):
    # Fit on this row's text (column position 1 is "text") to get its term counts
    result = get_counts(cv_summ, df.iloc[i, 1])
    # Column position 0 is the document "id"
    text_id = df.iloc[i, 0]
    sentences = nltk.sent_tokenize(df.iloc[i, 1])
    # One row per sentence, tagged with the id of the document it came from
    tada = pd.DataFrame({
        "TextID": [text_id]*len(sentences),
        "Sentence": sentences
    })
    # Score every sentence against this document's term counts
    tada["Value"] = tada["Sentence"].apply(lambda x: rack_em_up(x, result))
    docs.append(tada)
# Stack the per-document frames into one table with a fresh 0..n-1 index
final = pd.concat(docs, ignore_index=True)

In [110]:
# Highest-scoring sentence: sort by Value descending, then take row 0 at
# column position 1, which is "Sentence" (columns are TextID, Sentence, Value).
final.sort_values(by="Value", ascending=False).iloc[0,1]

'==History== reassurance marker near Diamond Park Road and the entrance to Interlochen Center for the Arts, May 2018 A highway along the route of M-137 connecting US 31 south to the state park was added to the state highway system during the first half of 1930, initially lacking a designation label on the state maps of the time.'

In [82]:
# NOTE(review): this cell's execution count (82) predates the loop cell (105).
# The current loop builds `docs` as a list of DataFrames, so a dict-shaped
# output shown here comes from an earlier version -- re-run to refresh it.
docs

{'M-137 (Michigan highway)': {'M-137 was a state trunkline highway in the US state of Michigan that served as a spur route to the Interlochen Center for the Arts and Interlochen State Park.': 50,
  'It started south of the park and ran north between two lakes in the area and through the community of Interlochen to US Highway 31 (US 31) in Grand Traverse County.': 37,
  'The highway was first shown without a number label on maps in 1930 and labeled after an extension the next year.': 25,
  "The highway's current routing was established in the 1950s.": 17,
  'Jurisdiction of the roadway was transferred from the Michigan Department of Transportation (MDOT) to the Grand Traverse County Road Commission in June 2020, and the highway designation was decommissioned in the process; signage was removed by August 2020 to reflect the changeover.': 44,
  '==Route description== M-137 began at the southern end of Interlochen State Park at an intersection with Vagabond Lane.': 30,
  'Farther south, th