In [27]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re  
import spacy

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fangn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fangn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from google.cloud import bigquery

In [11]:
client = bigquery.Client()

query = """
SELECT title, type, score, descendants AS comments, timestamp, url
FROM `bigquery-public-data.hacker_news.full`
WHERE REGEXP_CONTAINS(title, r"(?:(a|A)g)(ri)(?:(\-|\ |))?") OR REGEXP_CONTAINS(title, r"\\b(a|A)g(\-)?\\b") OR REGEXP_CONTAINS(title, r"(f|F)arm")
ORDER BY timestamp
"""

ag_farm_df = client.query(query).to_dataframe()
ag_farm_df

Unnamed: 0,title,type,score,comments,timestamp,url
0,Possibility of running single-system-image clu...,story,1,1.0,2007-05-05 03:43:55+00:00,
1,Life of a Chinese Gold Farmer,story,21,43.0,2007-06-17 18:45:50+00:00,http://www.nytimes.com/2007/06/17/magazine/17l...
2,What's the best health insurance company for c...,story,1,3.0,2007-06-23 23:06:55+00:00,
3,"""Crowd Farm"": turn the mechanical energy of pe...",story,1,0.0,2007-08-03 19:36:19+00:00,http://www.physorg.com/news104679881.html
4,Folding Farmer - Folding @ Home As A Service,story,2,0.0,2007-08-30 21:14:45+00:00,http://foldingfarmer.com/products/faas-folding...
...,...,...,...,...,...,...
4893,"Soluzioni ecotecnologiche per allevamenti, agr...",story,1,,2021-02-10 08:28:10+00:00,http://francosrl.com/it/inizio/
4894,GPU Mining Farms Are Causing Power Outages in ...,story,6,1.0,2021-02-10 08:42:08+00:00,https://videocardz.com/newz/gpu-mining-farms-a...
4895,Israeli Farm Cultivates Lab-Grown Ribeye Steak...,story,2,0.0,2021-02-10 19:55:12+00:00,https://www.bloomberg.com/news/articles/2021-0...
4896,Rebuilding soil microbiomes in high-tunnel agr...,story,1,0.0,2021-02-11 23:20:49+00:00,https://phys.org/news/2021-02-rebuilding-soil-...


proposed sequence of steps:  
1. lowercase all words 
2. expand contractions
3. lemmatise (i.e. make them into root words)
4. create bigrams
5. prune bigrams containing stopwords

In [12]:
# convert all characters to lowercase in 
ag_farm_df['title']=ag_farm_df['title'].str.lower()
ag_farm_df

Unnamed: 0,title,type,score,comments,timestamp,url
0,possibility of running single-system-image clu...,story,1,1.0,2007-05-05 03:43:55+00:00,
1,life of a chinese gold farmer,story,21,43.0,2007-06-17 18:45:50+00:00,http://www.nytimes.com/2007/06/17/magazine/17l...
2,what's the best health insurance company for c...,story,1,3.0,2007-06-23 23:06:55+00:00,
3,"""crowd farm"": turn the mechanical energy of pe...",story,1,0.0,2007-08-03 19:36:19+00:00,http://www.physorg.com/news104679881.html
4,folding farmer - folding @ home as a service,story,2,0.0,2007-08-30 21:14:45+00:00,http://foldingfarmer.com/products/faas-folding...
...,...,...,...,...,...,...
4893,"soluzioni ecotecnologiche per allevamenti, agr...",story,1,,2021-02-10 08:28:10+00:00,http://francosrl.com/it/inizio/
4894,gpu mining farms are causing power outages in ...,story,6,1.0,2021-02-10 08:42:08+00:00,https://videocardz.com/newz/gpu-mining-farms-a...
4895,israeli farm cultivates lab-grown ribeye steak...,story,2,0.0,2021-02-10 19:55:12+00:00,https://www.bloomberg.com/news/articles/2021-0...
4896,rebuilding soil microbiomes in high-tunnel agr...,story,1,0.0,2021-02-11 23:20:49+00:00,https://phys.org/news/2021-02-rebuilding-soil-...


In [13]:
# remove punctuation from title column
ag_farm_df['title'] = ag_farm_df['title'].str.replace('[^\w\s]','')
ag_farm_df

  


Unnamed: 0,title,type,score,comments,timestamp,url
0,possibility of running singlesystemimage clust...,story,1,1.0,2007-05-05 03:43:55+00:00,
1,life of a chinese gold farmer,story,21,43.0,2007-06-17 18:45:50+00:00,http://www.nytimes.com/2007/06/17/magazine/17l...
2,whats the best health insurance company for ca...,story,1,3.0,2007-06-23 23:06:55+00:00,
3,crowd farm turn the mechanical energy of peopl...,story,1,0.0,2007-08-03 19:36:19+00:00,http://www.physorg.com/news104679881.html
4,folding farmer folding home as a service,story,2,0.0,2007-08-30 21:14:45+00:00,http://foldingfarmer.com/products/faas-folding...
...,...,...,...,...,...,...
4893,soluzioni ecotecnologiche per allevamenti agri...,story,1,,2021-02-10 08:28:10+00:00,http://francosrl.com/it/inizio/
4894,gpu mining farms are causing power outages in ...,story,6,1.0,2021-02-10 08:42:08+00:00,https://videocardz.com/newz/gpu-mining-farms-a...
4895,israeli farm cultivates labgrown ribeye steak ...,story,2,0.0,2021-02-10 19:55:12+00:00,https://www.bloomberg.com/news/articles/2021-0...
4896,rebuilding soil microbiomes in hightunnel agri...,story,1,0.0,2021-02-11 23:20:49+00:00,https://phys.org/news/2021-02-rebuilding-soil-...


In [15]:
# tokenise title column
ag_farm_df['title_tokens'] = ag_farm_df['title'].apply(word_tokenize)

In [17]:
ag_farm_df.dtypes

title                        object
type                         object
score                         int64
comments                    float64
timestamp       datetime64[ns, UTC]
url                          object
title_tokens                 object
dtype: object

In [20]:
# lemmatise title_tokens column
# which may be a bit tricky since the title_tokens contains lists of the tokens
lmtzr = WordNetLemmatizer()
ag_farm_df['title_lemmas'] = ag_farm_df['title_tokens'].apply(lambda lst:[lmtzr.lemmatize(word) for word in lst])
ag_farm_df

Unnamed: 0,title,type,score,comments,timestamp,url,title_tokens,title_lemmas
0,possibility of running singlesystemimage clust...,story,1,1.0,2007-05-05 03:43:55+00:00,,"[possibility, of, running, singlesystemimage, ...","[possibility, of, running, singlesystemimage, ..."
1,life of a chinese gold farmer,story,21,43.0,2007-06-17 18:45:50+00:00,http://www.nytimes.com/2007/06/17/magazine/17l...,"[life, of, a, chinese, gold, farmer]","[life, of, a, chinese, gold, farmer]"
2,whats the best health insurance company for ca...,story,1,3.0,2007-06-23 23:06:55+00:00,,"[whats, the, best, health, insurance, company,...","[whats, the, best, health, insurance, company,..."
3,crowd farm turn the mechanical energy of peopl...,story,1,0.0,2007-08-03 19:36:19+00:00,http://www.physorg.com/news104679881.html,"[crowd, farm, turn, the, mechanical, energy, o...","[crowd, farm, turn, the, mechanical, energy, o..."
4,folding farmer folding home as a service,story,2,0.0,2007-08-30 21:14:45+00:00,http://foldingfarmer.com/products/faas-folding...,"[folding, farmer, folding, home, as, a, service]","[folding, farmer, folding, home, a, a, service]"
...,...,...,...,...,...,...,...,...
4893,soluzioni ecotecnologiche per allevamenti agri...,story,1,,2021-02-10 08:28:10+00:00,http://francosrl.com/it/inizio/,"[soluzioni, ecotecnologiche, per, allevamenti,...","[soluzioni, ecotecnologiche, per, allevamenti,..."
4894,gpu mining farms are causing power outages in ...,story,6,1.0,2021-02-10 08:42:08+00:00,https://videocardz.com/newz/gpu-mining-farms-a...,"[gpu, mining, farms, are, causing, power, outa...","[gpu, mining, farm, are, causing, power, outag..."
4895,israeli farm cultivates labgrown ribeye steak ...,story,2,0.0,2021-02-10 19:55:12+00:00,https://www.bloomberg.com/news/articles/2021-0...,"[israeli, farm, cultivates, labgrown, ribeye, ...","[israeli, farm, cultivates, labgrown, ribeye, ..."
4896,rebuilding soil microbiomes in hightunnel agri...,story,1,0.0,2021-02-11 23:20:49+00:00,https://phys.org/news/2021-02-rebuilding-soil-...,"[rebuilding, soil, microbiomes, in, hightunnel...","[rebuilding, soil, microbiomes, in, hightunnel..."


Note to self: define functions to do text pre-processing/ normalisation, comprising of all the steps above.

In [32]:
# create bigrams from title_lemmas column
def toBigram(ag_farm_df):
    bigram = ag_farm_df['title_lemmas'].apply(lambda row: list(map(lambda x:ngrams(x,2), row)))
    return bigram

toBigram(ag_farm_df)

0       [<generator object ngrams at 0x000001B3B5F04AC...
1       [<generator object ngrams at 0x000001B3B5F04EC...
2       [<generator object ngrams at 0x000001B3B5F0C24...
3       [<generator object ngrams at 0x000001B3B5F0C8C...
4       [<generator object ngrams at 0x000001B3B5F0D04...
                              ...                        
4893    [<generator object ngrams at 0x000001B3B846B54...
4894    [<generator object ngrams at 0x000001B3B846B8C...
4895    [<generator object ngrams at 0x000001B3B846BD4...
4896    [<generator object ngrams at 0x000001B3B846C24...
4897    [<generator object ngrams at 0x000001B3B846C74...
Name: title_lemmas, Length: 4898, dtype: object

In [29]:
toBigram(ag_farm_df)

0       [<generator object ngrams at 0x000001B3A27CC7C...
1       [<generator object ngrams at 0x000001B3A27CC44...
2       [<generator object ngrams at 0x000001B3A27CC5C...
3       [<generator object ngrams at 0x000001B3AB47C4C...
4       [<generator object ngrams at 0x000001B3AB4779C...
                              ...                        
4893    [<generator object ngrams at 0x000001B3B5939AC...
4894    [<generator object ngrams at 0x000001B3B5939E4...
4895    [<generator object ngrams at 0x000001B3B593A34...
4896    [<generator object ngrams at 0x000001B3B593A7C...
4897    [<generator object ngrams at 0x000001B3B593ACC...
Name: title_lemmas, Length: 4898, dtype: object

The below cells show how to get rid of the generators when generating ngrams.

In [36]:
ag_farm_df_backup = ag_farm_df.copy()

In [38]:
ag_farm_df['title_bigrams'] = ag_farm_df['title_lemmas'].apply(lambda row: list(nltk.ngrams(row, 2)))
ag_farm_df

Unnamed: 0,title,type,score,comments,timestamp,url,title_tokens,title_lemmas,title_bigrams
0,possibility of running singlesystemimage clust...,story,1,1.0,2007-05-05 03:43:55+00:00,,"[possibility, of, running, singlesystemimage, ...","[possibility, of, running, singlesystemimage, ...","[(possibility, of), (of, running), (running, s..."
1,life of a chinese gold farmer,story,21,43.0,2007-06-17 18:45:50+00:00,http://www.nytimes.com/2007/06/17/magazine/17l...,"[life, of, a, chinese, gold, farmer]","[life, of, a, chinese, gold, farmer]","[(life, of), (of, a), (a, chinese), (chinese, ..."
2,whats the best health insurance company for ca...,story,1,3.0,2007-06-23 23:06:55+00:00,,"[whats, the, best, health, insurance, company,...","[whats, the, best, health, insurance, company,...","[(whats, the), (the, best), (best, health), (h..."
3,crowd farm turn the mechanical energy of peopl...,story,1,0.0,2007-08-03 19:36:19+00:00,http://www.physorg.com/news104679881.html,"[crowd, farm, turn, the, mechanical, energy, o...","[crowd, farm, turn, the, mechanical, energy, o...","[(crowd, farm), (farm, turn), (turn, the), (th..."
4,folding farmer folding home as a service,story,2,0.0,2007-08-30 21:14:45+00:00,http://foldingfarmer.com/products/faas-folding...,"[folding, farmer, folding, home, as, a, service]","[folding, farmer, folding, home, a, a, service]","[(folding, farmer), (farmer, folding), (foldin..."
...,...,...,...,...,...,...,...,...,...
4893,soluzioni ecotecnologiche per allevamenti agri...,story,1,,2021-02-10 08:28:10+00:00,http://francosrl.com/it/inizio/,"[soluzioni, ecotecnologiche, per, allevamenti,...","[soluzioni, ecotecnologiche, per, allevamenti,...","[(soluzioni, ecotecnologiche), (ecotecnologich..."
4894,gpu mining farms are causing power outages in ...,story,6,1.0,2021-02-10 08:42:08+00:00,https://videocardz.com/newz/gpu-mining-farms-a...,"[gpu, mining, farms, are, causing, power, outa...","[gpu, mining, farm, are, causing, power, outag...","[(gpu, mining), (mining, farm), (farm, are), (..."
4895,israeli farm cultivates labgrown ribeye steak ...,story,2,0.0,2021-02-10 19:55:12+00:00,https://www.bloomberg.com/news/articles/2021-0...,"[israeli, farm, cultivates, labgrown, ribeye, ...","[israeli, farm, cultivates, labgrown, ribeye, ...","[(israeli, farm), (farm, cultivates), (cultiva..."
4896,rebuilding soil microbiomes in hightunnel agri...,story,1,0.0,2021-02-11 23:20:49+00:00,https://phys.org/news/2021-02-rebuilding-soil-...,"[rebuilding, soil, microbiomes, in, hightunnel...","[rebuilding, soil, microbiomes, in, hightunnel...","[(rebuilding, soil), (soil, microbiomes), (mic..."


In [39]:
ag_farm_df.sample(5)

Unnamed: 0,title,type,score,comments,timestamp,url,title_tokens,title_lemmas,title_bigrams
1341,cowspiracy how the planet is being destroyed ...,story,18,5.0,2014-06-19 10:56:05+00:00,http://cowspiracy.com,"[cowspiracy, how, the, planet, is, being, dest...","[cowspiracy, how, the, planet, is, being, dest...","[(cowspiracy, how), (how, the), (the, planet),..."
442,content farmers react to algorithm update sea...,story,2,0.0,2011-02-25 06:23:03+00:00,http://www.seanpercival.com/blog/2011/02/25/co...,"[content, farmers, react, to, algorithm, updat...","[content, farmer, react, to, algorithm, update...","[(content, farmer), (farmer, react), (react, t..."
1190,vertical farms sprouting in a few places,story,6,0.0,2014-01-19 22:24:09+00:00,http://www.newscientist.com/article/mg22129524...,"[vertical, farms, sprouting, in, a, few, places]","[vertical, farm, sprouting, in, a, few, place]","[(vertical, farm), (farm, sprouting), (sprouti..."
1135,farm confessional im an undocumented farm worker,story,1,0.0,2013-11-06 15:24:33+00:00,http://modernfarmer.com/2013/11/farmworker-con...,"[farm, confessional, im, an, undocumented, far...","[farm, confessional, im, an, undocumented, far...","[(farm, confessional), (confessional, im), (im..."
4075,wisconsin wind farm decommissioned after just ...,story,3,,2019-12-02 17:55:07+00:00,https://www.americanexperiment.org/2018/11/ano...,"[wisconsin, wind, farm, decommissioned, after,...","[wisconsin, wind, farm, decommissioned, after,...","[(wisconsin, wind), (wind, farm), (farm, decom..."
