In [6]:
# Install LanguageTool Library
# !pip install language-tool-python



In [7]:
# Library imports
import re, pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

import spacy
from spacy import displacy
import language_tool_python

In [8]:
# Custom request header sent with every page download (browser-like
# User-Agent and Accept values).
hdr = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

# Initialize the NLP objects reused by the later cells
tool = language_tool_python.LanguageTool('en-US')  # grammar / spelling checker
nlp = spacy.load('en_core_web_sm')                 # spaCy pipeline used for NER

In [9]:
# Load urls into dataframe
# Each record pairs a site label with the address of one post on that site.
post_sources = [
  ("StackOverflow", "https://stackoverflow.com/questions/20109391/how-to-make-good-reproducible-pandas-examples"),
  ("StackOverflow", "https://stackoverflow.com/questions/240178/list-of-lists-changes-reflected-across-sublists-unexpectedly"),
  ("Hardwarezone", "https://forums.hardwarezone.com.sg/threads/discuss-naive-sinkies-think-they-are-successful-in-life-by-having-these-3-items-in-sg.6624726/post-137385099"),
  ("Hardwarezone", "https://forums.hardwarezone.com.sg/threads/well-done-ong-ye-kung-usa-classify-singapore-as-cat-iv-very-high-risk.6624719/post-137385434"),
  ("ChannelNewsAsia", "https://www.channelnewsasia.com/business/energy-bogs-estonian-scientists-use-peat-make-batteries-2236431"),
  ("ChannelNewsAsia", "https://www.channelnewsasia.com/asia/hong-kong-storm-cyclone-kompasu-weather-t8-warning-2238766"),
]
url = [address for _, address in post_sources]
site = [label for label, _ in post_sources]
df = pd.DataFrame({'url': url, 'site': site})
df

Unnamed: 0,url,site
0,https://stackoverflow.com/questions/20109391/h...,StackOverflow
1,https://stackoverflow.com/questions/240178/lis...,StackOverflow
2,https://forums.hardwarezone.com.sg/threads/dis...,Hardwarezone
3,https://forums.hardwarezone.com.sg/threads/wel...,Hardwarezone
4,https://www.channelnewsasia.com/business/energ...,ChannelNewsAsia
5,https://www.channelnewsasia.com/asia/hong-kong...,ChannelNewsAsia


In [10]:
# Function to extract post content from website
def extract_post(url, site):
  """Download `url` and return the text of the post found on that page.

  Args:
    url: Address of the page to fetch. For 'Hardwarezone' it must contain
      a ``post-<digits>`` fragment, which is used to locate the post.
    site: Which extraction rule to apply: 'StackOverflow', 'Hardwarezone'
      or 'ChannelNewsAsia'. Any other value extracts nothing.

  Returns:
    The extracted text as a single string. An empty string is returned when
    the site is not recognised, when the server answers with an HTTP error
    (a message is printed), or when the expected markup is not on the page.
  """
  # Imported here because the notebook's import cell only brings in
  # Request/urlopen; without this the except clause itself raised NameError.
  from urllib.error import HTTPError

  try:
    # The context manager closes the HTTP response once it has been read.
    with urlopen(Request(url, headers=hdr)) as page:
      content = page.read()
  except HTTPError as e:
    # Report and give up on this page; there is no response body to parse.
    print(f"HTTP {e.code} while fetching {url}: {e.reason}")
    return ""

  soup = BeautifulSoup(content, 'lxml')
  post_text = ""
  post = []

  # stackoverflow
  if site == 'StackOverflow':
    body = soup.find('div',{'class':'s-prose js-post-body'})
    if body is not None:
      post = body.find_all(['p','pre','ul'])
  # Hardwarezone
  elif site == 'Hardwarezone':
    # group(0) is the whole "post-<digits>" token, which is the value
    # looked up in the data-lb-id attribute.
    id_match = re.search(r"post-([0-9]+)", url)
    container = soup.find('div',{'data-lb-id':id_match.group(0)}) if id_match else None
    inner = container.find('div') if container is not None else None
    if inner is not None:
      post_text = inner.text
  # CNA
  elif site == 'ChannelNewsAsia':
    article = soup.find('div',{'class':'text-long'})
    if article is not None:
      post = article.find_all('p')

  result = [p.get_text() for p in post]
  post_text += " ".join(result)
  return post_text

In [11]:
# Extract post content and load into dataframe
# Walk the url and site columns together so each call receives that row's
# own site label as a plain string. The previous lookup by URL handed over
# an array of every matching row, which is ambiguous when a URL appears
# more than once and rescans the frame for each row.
df['body'] = [extract_post(u, s) for u, s in zip(df['url'], df['site'])]
df

Unnamed: 0,url,site,body
0,https://stackoverflow.com/questions/20109391/h...,StackOverflow,Having spent a decent amount of time watching ...
1,https://stackoverflow.com/questions/240178/lis...,StackOverflow,"I needed to create a list of lists in Python, ..."
2,https://forums.hardwarezone.com.sg/threads/dis...,Hardwarezone,I dun care about that. Other people's standard...
3,https://forums.hardwarezone.com.sg/threads/wel...,Hardwarezone,\n\ncet87 said:\n\n\n\n\t\t\the is not the goa...
4,https://www.channelnewsasia.com/business/energ...,ChannelNewsAsia,"TARTU, Estonia : Peat, plentiful in bogs i..."
5,https://www.channelnewsasia.com/asia/hong-kong...,ChannelNewsAsia,HONG KONG: Hong Kong battened down on Tuesday ...


In [12]:
def clean_text(text):
  """Collapse line breaks, tabs and repeated spaces into single spaces.

  Args:
    text: The string to normalise.

  Returns:
    `text` with every run of carriage returns, newlines and tabs replaced by
    one space, and every run of spaces reduced to one space. All other
    characters are left untouched.
  """
  # Inside [...] a '|' is a literal pipe, not alternation, so the class lists
  # only the three control characters; otherwise pipes in posts were erased.
  text = re.sub(r"[\r\n\t]+", " ", text)
  text = re.sub(' +', ' ', text)
  return text

In [13]:
def first_word(text):
  """Return the first whitespace-delimited token of each sentence in `text`.

  Sentences are split at a whitespace character that follows '.' or '?',
  except where the lookbehinds rule the split out (after a "x.y." style
  abbreviation, or after a capital + lowercase letter + '.', e.g. "Mr.").

  Args:
    text: The string to split into sentences.

  Returns:
    A list with one entry per non-empty sentence, in order. Pieces that
    contain no tokens (empty input, or the remainder after a trailing
    separator) are skipped instead of raising IndexError.
  """
  pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
  first_words = []
  for sent in re.split(pattern, text):
    tokens = sent.split()
    if tokens:  # re.split yields '' for empty text or a trailing separator
      first_words.append(tokens[0])
  return first_words

In [14]:
# Clean extra newlines and whitespaces
# clean_text already takes a single string, so it is handed to the
# element-wise mapper directly.
df['body'] = df['body'].map(clean_text)

In [15]:
# Get the first words of each sentence
# Each cleaned body becomes the list returned by first_word.
df['first_words'] = df['body'].map(first_word)

In [16]:
# Display the first words extracted
# Use the full option name: abbreviated keys are resolved by pattern
# matching, which pandas warns can start failing once another option with
# a similar name exists.
pd.set_option('display.max_colwidth', 0)
df[['site','first_words']]

Unnamed: 0,site,first_words
0,StackOverflow,"[Having, This, People, How, Simple]"
1,StackOverflow,"[I, Can]"
2,Hardwarezone,"[I, Other, My]"
3,Hardwarezone,"[cet87, Click, yes, and, the, instead]"
4,ChannelNewsAsia,"[TARTU,, Sodium-ion, Scientists, ""Peat, The, The, Distillers, As, But, Sodium-ion, China's, ""I, Less, (Reporting]"
5,ChannelNewsAsia,"[HONG, Tropical, As, Hong, The, Schools, On, Although, Many, One, Hong, The, ""It, ""In, Social]"


In [24]:
# LanguageTool to check grammar and character casing
# Prints one report per post: the post number, then every match returned by
# tool.check, numbered from 0 within that post. enumerate supplies both
# counters instead of hand-maintained p / i variables.
for p, text in enumerate(df['body'].to_list()):
  matches = tool.check(text)
  print('post: ', p)
  for i, match in enumerate(matches):
    print(i, '\tcontext: ', match.context, '\n\tmessage: ', match.message, '\n\tissue: ', match.ruleIssueType, '\n\tsuggested replacement: ', match.replacements, end='\n')
  print('\n')

post:  0
0 	context:  ...e examples for pandas questions? Simple dataframes can be put together, e.g.: import panda... 
	message:  Possible spelling mistake found. 
	issue:  misspelling 
	suggested replacement:  ['data frames']
1 	context:  ...put together, e.g.: import pandas as pd df = pd.DataFrame({'user': ['Bob', 'Jane',... 
	message:  Possible spelling mistake found. 
	issue:  misspelling 
	suggested replacement:  ['of', 'if', 'do', 'Dr', 'IDF', 'SF', 'cf', 'AF', 'DA', 'DL', 'DM', 'DT', 'HF', 'PDF', 'dB', 'DG', 'MF', 'UDF', 'GDF', 'sf', 'DCF', 'MDF', 'dz', 'DJ', 'ADF', 'BDF', 'BF', 'CDF', 'CF', 'Cf', 'D', 'D8', 'DAF', 'DBF', 'DC', 'DD', 'DE', 'DFA', 'DFB', 'DFE', 'DFF', 'DFG', 'DFI', 'DFL', 'DFM', 'DFN', 'DFO', 'DFP', 'DFS', 'DFT', 'DFW', 'DGF', 'DH', 'DI', 'DIF', 'DJF', 'DK', 'DLF', 'DMF', 'DN', 'DNF', 'DO', 'DOF', 'DP', 'DR', 'DS', 'DSF', 'DXF', 'Di', 'Du', 'Dy', 'EDF', 'F', 'FDF', 'FF', 'HDF', 'Hf', 'IF', 'NDF', 'NF', 'ODF', 'QF', 'RDF', 'RF', 'Rf', 'SDF', 'TDF', 'TF', 'VF', 'W

In [34]:
# Named-Entity Recognition
# Runs the spaCy pipeline over each post body and renders the entities it
# finds inline, under a "post: <n>" label. enumerate supplies the post number
# instead of a hand-maintained counter.
for p, text in enumerate(df['body'].to_list()):
  doc = nlp(text)
  print('\npost: ', p)
  displacy.render(doc, style='ent', jupyter=True)


post:  0



post:  1



post:  2


  "__main__", mod_spec)



post:  3


  "__main__", mod_spec)



post:  4



post:  5
